//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/ErrorHandling.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}
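
// Illustrative inputs recognized by isExtractHiElt above (both yield the
// containing 32-bit value as Out):
//   (extract_vector_elt (v2i16 %x), 1)
//   (trunc (srl (i32 %x), 16))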

static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
                                        llvm::SelectionDAG *CurDAG,
                                        const GCNSubtarget *Subtarget) {
  if (!Subtarget->useRealTrue16Insts()) {
    return Lo;
  }

  SDValue NewSrc;
  SDLoc SL(Lo);

  if (Lo->isDivergent()) {
    SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   SL, Lo.getValueType()),
                            0);
    const SDValue Ops[] = {
        CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
        CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
        CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};

    NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
                                            Src.getValueType(), Ops),
                     0);
  } else {
    // The S_MOV is needed since Lo could still be a VGPR16. With S_MOV, isel
    // inserts a "sgpr32 = copy vgpr16" and we rely on the VGPR-to-SGPR copy
    // fixup pass to legalize it.
    NewSrc = SDValue(
        CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
        0);
  }

  return NewSrc;
}

// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Idx = In.getOperand(1);
    if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
      return In.getOperand(0);
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}
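
// Illustrative: stripExtractLoElt turns (extract_vector_elt (v2i16 %x), 0)
// or (trunc (i32 %x)) back into the wider value %x, so callers can reason
// about the low 16 bits of the containing register directly.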

} // end anonymous namespace

135 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
136 false)
138INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
140#ifdef EXPENSIVE_CHECKS
143#endif
145 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
146 false)
147
148/// This pass converts a legalized DAG into a AMDGPU-specific
149// DAG, ready for instruction scheduling.
151 CodeGenOptLevel OptLevel) {
152 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
153}
154
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}

bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  return SelectionDAGISelLegacy::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}
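
// Worked example for matchLoadD16FromBuildVector above (illustrative, not
// taken from a specific lit test): for
//   %hi = load i16, ptr addrspace(3) %p
//   %v  = build_vector %lo, %hi
// the selector can emit a d16-hi load (e.g. ds_read_u16_d16_hi), which writes
// the loaded value into the high half of the destination register while the
// low half stays tied to %lo.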

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
    return TII->isInlineConstant(C->getAPIntValue());

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
    return TII->isInlineConstant(C->getValueAPF());

  return false;
}
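
// For reference, the inline constant set on GCN covers the small integers
// -16..64 and a handful of floats (+/-0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0,
// plus 1/(2*pi) on newer subtargets); any other value has to be encoded as a
// literal operand.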

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(
          N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
  SDNode *Hi = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}

void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
  if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
      CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
    uint64_t C = 0;
    bool AllConst = true;
    unsigned EltSize = EltVT.getSizeInBits();
    for (unsigned I = 0; I < NumVectorElts; ++I) {
      SDValue Op = N->getOperand(I);
      if (Op.isUndef()) {
        AllConst = false;
        break;
      }
      uint64_t Val;
      if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) {
        Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
      } else
        Val = cast<ConstantSDNode>(Op)->getZExtValue();
      C |= Val << (EltSize * I);
    }
    if (AllConst) {
      SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
      MachineSDNode *Copy =
          CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
      CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
                           RegClass);
      return;
    }
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}
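
// For example, a uniform (v4i32 (build_vector a, b, c, d)) with the SGPR_128
// class becomes roughly:
//   %v = REG_SEQUENCE SGPR_128, a, sub0, b, sub1, c, sub2, d, sub3
// (illustrative; the register class ID is supplied by the caller).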

void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // TODO: Handle 16-bit element vectors with even aligned masks.
  if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
      VT.getVectorNumElements() != 2) {
    SelectCode(N);
    return;
  }

  auto *SVN = cast<ShuffleVectorSDNode>(N);

  SDValue Src0 = SVN->getOperand(0);
  SDValue Src1 = SVN->getOperand(1);
  ArrayRef<int> Mask = SVN->getMask();
  SDLoc DL(N);

  assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
         Mask[0] < 4 && Mask[1] < 4);

  SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
  SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
  unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
  unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;

  if (Mask[0] < 0) {
    Src0SubReg = Src1SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc0 = SDValue(ImpDef, 0);
  }

  if (Mask[1] < 0) {
    Src1SubReg = Src0SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc1 = SDValue(ImpDef, 0);
  }

  // SGPR case needs to lower to copies.
  //
  // Also use subregister extract when we can directly blend the registers with
  // a simple subregister copy.
  //
  // TODO: Maybe we should fold this out earlier
  if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
      Src1SubReg == AMDGPU::sub0) {
    // The low element of the result always comes from src0.
    // The high element of the result always comes from src1.
    // op_sel selects the high half of src0.
    // op_sel_hi selects the high half of src1.

    unsigned Src0OpSel =
        Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
    unsigned Src1OpSel =
        Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;

    // Enable op_sel_hi to avoid printing it. This should have no effect on the
    // result.
    Src0OpSel |= SISrcMods::OP_SEL_1;
    Src1OpSel |= SISrcMods::OP_SEL_1;

    SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
    SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
    SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);

    CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
                         {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
                          ZeroMods, // clamp
                          ZeroMods, // op_sel
                          ZeroMods, // op_sel_hi
                          ZeroMods, // neg_lo
                          ZeroMods}); // neg_hi
    return;
  }

  SDValue ResultElt0 =
      CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
  SDValue ResultElt1 =
      CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);

  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
  CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
}
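
// Illustrative: a divergent (vector_shuffle<1,2> %a, %b) takes the
// V_PK_MOV_B32 path above, with op_sel selecting the high half of %a for the
// low result element and op_sel_hi selecting the low half of %b for the high
// result element; a uniform shuffle instead becomes two subregister copies.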

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lower it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::VECTOR_SHUFFLE:
    SelectVectorShuffle(N);
    return;
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
        Subtarget->has64BitLiterals())
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = N->getConstantOperandAPInt(1);
  if (RHS.countr_one() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}
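
// Example: in (srl i32 %x, (and i32 %amt, 31)) the AND keeps at least
// ShAmtBits = 5 trailing one bits, so the mask is redundant for the shift
// amount and the shift can use %amt directly.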

static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, it's a complicated pattern to match,
    // i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle uaddo_carry/usubo_carry.
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo, 0),
    Sub0,
    SDValue(AddHi, 0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}
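
// Example: a uniform i64 addc is selected as s_add_u32 on the low halves,
// s_addc_u32 on the high halves (consuming the carry in SCC), and a
// REG_SEQUENCE that reassembles the 64-bit result from the two 32-bit parts.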

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

  for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if (UI->isMachineOpcode()) {
        if (UI->getMachineOpcode() !=
            (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
          IsVALU = true;
          break;
        }
      } else {
        if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
          IsVALU = true;
          break;
        }
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1);
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else if (UseNoCarry)
    Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };

  if (UseNoCarry) {
    MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
    ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
    CurDAG->RemoveDeadNode(N);
    return;
  }

  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  SDVTList VTList;
  unsigned Opc;
  if (Subtarget->hasMadU64U32NoCarry()) {
    VTList = CurDAG->getVTList(MVT::i64);
    Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
  } else {
    VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
    if (Subtarget->hasMADIntraFwdBug()) {
      Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                   : AMDGPU::V_MAD_U64_U32_gfx11_e64;
    } else {
      Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
    }
  }

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}
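
// Example: (umul_lohi %a, %b) becomes v_mad_u64_u32 %a, %b, 0; the low half
// of the product is then extracted as sub0 of the 64-bit result and the high
// half as sub1.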

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}
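
// Example: a DS access at (%base + 65532) fits the unsigned 16-bit offset
// field (e.g. ds_read_b32 %r, %base offset:65532), while an offset of 65536
// does not fit and must instead be folded into the base address.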

bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

// Return whether the operation has NoUnsignedWrap property.
static bool isNoUnsignedWrap(SDValue Addr) {
  return (Addr.getOpcode() == ISD::ADD &&
          Addr->getFlags().hasNoUnsignedWrap()) ||
         Addr->getOpcode() == ISD::OR;
}
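
// An ISD::OR qualifies here because address arithmetic that was rewritten
// from add to or (the operands have disjoint known bits) can never carry,
// which gives the same no-unsigned-wrap guarantee as an add with the nuw
// flag.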

// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  return CurDAG->SignBitIsZero(LHS);
}

// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}

bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 =
              CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
          Offset1 =
              CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
  return true;
}
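
// Example: a 2 x 32-bit DS access at (%base + 520) with Size == 4 selects
// base = %base, offset0 = 130 and offset1 = 131, matching the ds_read2_b32 /
// ds_write2_b32 encoding where each 8-bit offset field is scaled by the
// element size.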

bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0->isAnyAdd()) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    uint64_t C1 = Addr.getConstantOperandVal(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}
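
// Example: with a 4095-byte immediate offset field, a private access at the
// constant address 0x1234 is split above into a v_mov_b32 of 0x1000 for vaddr
// plus an immediate offset of 0x234.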

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
  if (!Reg.isPhysical())
    return false;
  const auto *RC = TRI.getPhysRegBaseClass(Reg);
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    maskTrailingOnes<uint64_t>(32); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}
1786
1787bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1788 SDValue &SOffset) const {
1789 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1790 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1791 return true;
1792 }
1793
1794 SOffset = ByteOffsetNode;
1795 return true;
1796}
1797
1798// Find a load or store from corresponding pattern root.
1799// Roots may be build_vector, bitconvert or their combinations.
1800static MemSDNode *findMemSDNode(SDNode *N) {
1802 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1803 return MN;
1804 assert(isa<BuildVectorSDNode>(N));
1805 for (SDValue V : N->op_values())
1806 if (MemSDNode *MN =
1807 dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1808 return MN;
1809 llvm_unreachable("cannot find MemSDNode in the pattern!");
1810}
1811
1812bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1813 SDValue &VAddr, SDValue &Offset,
1814 uint64_t FlatVariant) const {
1815 int64_t OffsetVal = 0;
1816
1817 unsigned AS = findMemSDNode(N)->getAddressSpace();
1818
1819 bool CanHaveFlatSegmentOffsetBug =
1820 Subtarget->hasFlatSegmentOffsetBug() &&
1821 FlatVariant == SIInstrFlags::FLAT &&
1822 (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
1823
1824 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1825 SDValue N0, N1;
1826 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1827 (FlatVariant != SIInstrFlags::FlatScratch ||
1828 isFlatScratchBaseLegal(Addr))) {
1829 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1830
1831 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1832 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1833 Addr = N0;
1834 OffsetVal = COffsetVal;
1835 } else {
1836 // If the offset doesn't fit, put the low bits into the offset field and
1837 // add the rest.
1838 //
1839 // For a FLAT instruction the hardware decides whether to access
1840 // global/scratch/shared memory based on the high bits of vaddr,
1841 // ignoring the offset field, so we have to ensure that when we add
1842 // remainder to vaddr it still points into the same underlying object.
1843 // The easiest way to do that is to make sure that we split the offset
1844 // into two pieces that are both >= 0 or both <= 0.
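 // Illustrative sketch (the legal immediate range is subtarget-dependent): a
 // positive COffsetVal that is too large is split so that OffsetVal keeps the
 // low bits that still fit the offset field, and the non-negative
 // RemainderOffset = COffsetVal - OffsetVal is added to vaddr below.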
1845
1846 SDLoc DL(N);
1847 uint64_t RemainderOffset;
1848
1849 std::tie(OffsetVal, RemainderOffset) =
1850 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1851
1852 SDValue AddOffsetLo =
1853 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1854 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1855
1856 if (Addr.getValueType().getSizeInBits() == 32) {
1857 SmallVector<SDValue, 3> Opnds;
1858 Opnds.push_back(N0);
1859 Opnds.push_back(AddOffsetLo);
1860 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1861 if (Subtarget->hasAddNoCarry()) {
1862 AddOp = AMDGPU::V_ADD_U32_e64;
1863 Opnds.push_back(Clamp);
1864 }
1865 Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1866 } else {
1867 // TODO: Should this try to use a scalar add pseudo if the base address
1868 // is uniform and saddr is usable?
1869 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1870 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1871
1872 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1873 DL, MVT::i32, N0, Sub0);
1874 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1875 DL, MVT::i32, N0, Sub1);
1876
1877 SDValue AddOffsetHi =
1878 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1879
1880 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1881
1882 SDNode *Add =
1883 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1884 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1885
1886 SDNode *Addc = CurDAG->getMachineNode(
1887 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1888 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1889
1890 SDValue RegSequenceArgs[] = {
1891 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1892 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1893
1894 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1895 MVT::i64, RegSequenceArgs),
1896 0);
1897 }
1898 }
1899 }
1900 }
1901
1902 VAddr = Addr;
1903 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1904 return true;
1905}
1906
1907bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1908 SDValue &VAddr,
1909 SDValue &Offset) const {
1910 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1911}
1912
1913bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1914 SDValue &VAddr,
1915 SDValue &Offset) const {
1916 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1917}
1918
1919bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1920 SDValue &VAddr,
1921 SDValue &Offset) const {
1922 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1923 SIInstrFlags::FlatScratch);
1924}
1925
1926 // If this matches *_extend i32:x, return x.
1927 // Otherwise, if the value is already i32, return it unchanged.
1928static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
1929 const SelectionDAG *DAG) {
1930 if (Op.getValueType() == MVT::i32)
1931 return Op;
1932
1933 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
1934 Op.getOpcode() != ISD::ANY_EXTEND &&
1935 !(DAG->SignBitIsZero(Op) &&
1936 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
1937 return SDValue();
1938
1939 SDValue ExtSrc = Op.getOperand(0);
1940 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1941}
1942
1943// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1944// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
1945bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
1946 SDValue &SAddr, SDValue &VOffset,
1947 SDValue &Offset, bool &ScaleOffset,
1948 bool NeedIOffset) const {
1949 int64_t ImmOffset = 0;
1950 ScaleOffset = false;
1951
1952 // Match the immediate offset first, which canonically is moved as low as
1953 // possible.
1954
1955 SDValue LHS, RHS;
1956 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1957 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1958 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1959
1960 if (NeedIOffset &&
1961 TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1962 SIInstrFlags::FlatGlobal)) {
1963 Addr = LHS;
1964 ImmOffset = COffsetVal;
1965 } else if (!LHS->isDivergent()) {
1966 if (COffsetVal > 0) {
1967 SDLoc SL(N);
1968 // saddr + large_offset -> saddr +
1969 // (voffset = large_offset & ~MaxOffset) +
1970 // (large_offset & MaxOffset);
1971 int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
1972 if (NeedIOffset) {
1973 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1974 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1975 }
1976
1977 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
1978 : isUInt<32>(RemainderOffset)) {
1979 SDNode *VMov = CurDAG->getMachineNode(
1980 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1981 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1982 VOffset = SDValue(VMov, 0);
1983 SAddr = LHS;
1984 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1985 return true;
1986 }
1987 }
1988
1989 // We are adding a 64 bit SGPR and a constant. If constant bus limit
1990 // is 1 we would need to perform 1 or 2 extra moves for each half of
1991 // the constant and it is better to do a scalar add and then issue a
1992 // single VALU instruction to materialize zero. Otherwise it takes fewer
1993 // instructions to perform VALU adds with immediates or inline literals.
1994 unsigned NumLiterals =
1995 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
1996 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
1997 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1998 return false;
1999 }
2000 }
2001
2002 // Match the variable offset.
2003 if (Addr->isAnyAdd()) {
2004 LHS = Addr.getOperand(0);
2005
2006 if (!LHS->isDivergent()) {
2007 // add (i64 sgpr), (*_extend (i32 vgpr))
2008 RHS = Addr.getOperand(1);
2009 ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
2010 if (SDValue ExtRHS = matchExtFromI32orI32(
2011 RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2012 SAddr = LHS;
2013 VOffset = ExtRHS;
2014 }
2015 }
2016
2017 RHS = Addr.getOperand(1);
2018 if (!SAddr && !RHS->isDivergent()) {
2019 // add (*_extend (i32 vgpr)), (i64 sgpr)
2020 ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
2021 if (SDValue ExtLHS = matchExtFromI32orI32(
2022 LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2023 SAddr = RHS;
2024 VOffset = ExtLHS;
2025 }
2026 }
2027
2028 if (SAddr) {
2029 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2030 return true;
2031 }
2032 }
2033
2034 if (Subtarget->hasScaleOffset() &&
2035 (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
2036 ? AMDGPUISD::MAD_I64_I32
2037 : AMDGPUISD::MAD_U64_U32) ||
2038 (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
2039 CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
2040 Addr.getOperand(0)->isDivergent() &&
2041 isa<ConstantSDNode>(Addr.getOperand(1)) &&
2042 !Addr.getOperand(2)->isDivergent()) {
2043 // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
2044 unsigned Size =
2045 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2046 ScaleOffset = Addr.getConstantOperandVal(1) == Size;
2047 if (ScaleOffset) {
2048 SAddr = Addr.getOperand(2);
2049 VOffset = Addr.getOperand(0);
2050 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2051 return true;
2052 }
2053 }
2054
2055 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
2056 isa<ConstantSDNode>(Addr))
2057 return false;
2058
2059 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
2060 // moves required to copy a 64-bit SGPR to VGPR.
2061 SAddr = Addr;
2062 SDNode *VMov =
2063 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
2064 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
2065 VOffset = SDValue(VMov, 0);
2066 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2067 return true;
2068}
2069
2070bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2071 SDValue &SAddr, SDValue &VOffset,
2072 SDValue &Offset,
2073 SDValue &CPol) const {
2074 bool ScaleOffset;
2075 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2076 return false;
2077
2078 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2079 SDLoc(), MVT::i32);
2080 return true;
2081}
2082
2083bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2084 SDValue &SAddr, SDValue &VOffset,
2085 SDValue &Offset,
2086 SDValue &CPol) const {
2087 bool ScaleOffset;
2088 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2089 return false;
2090
2091 // We are assuming CPol is always the last operand of the intrinsic.
2092 auto PassedCPol =
2093 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2094 CPol = CurDAG->getTargetConstant(
2095 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2096 return true;
2097}
2098
2099bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2100 SDValue &SAddr,
2101 SDValue &VOffset,
2102 SDValue &Offset,
2103 SDValue &CPol) const {
2104 bool ScaleOffset;
2105 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2106 return false;
2107
2108 // We are assuming CPol is second from last operand of the intrinsic.
2109 auto PassedCPol =
2110 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2111 CPol = CurDAG->getTargetConstant(
2112 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2113 return true;
2114}
2115
2116bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2117 SDValue &SAddr, SDValue &VOffset,
2118 SDValue &Offset,
2119 SDValue &CPol) const {
2120 bool ScaleOffset;
2121 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2122 return false;
2123
2124 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2125 CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
2126 return true;
2127}
2128
2129bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2130 SDValue &SAddr,
2131 SDValue &VOffset,
2132 SDValue &CPol) const {
2133 bool ScaleOffset;
2134 SDValue DummyOffset;
2135 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2136 false))
2137 return false;
2138
2139 // We are assuming CPol is always the last operand of the intrinsic.
2140 auto PassedCPol =
2141 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2142 CPol = CurDAG->getTargetConstant(
2143 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2144 return true;
2145}
2146
2147bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2148 SDValue &SAddr,
2149 SDValue &VOffset,
2150 SDValue &CPol) const {
2151 bool ScaleOffset;
2152 SDValue DummyOffset;
2153 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2154 false))
2155 return false;
2156
2157 // We are assuming CPol is second from last operand of the intrinsic.
2158 auto PassedCPol =
2159 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2160 CPol = CurDAG->getTargetConstant(
2161 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2162 return true;
2163}
2164
2165static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
2166 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
2167 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
2168 } else if (SAddr.getOpcode() == ISD::ADD &&
2169 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
2170 // Materialize this into a scalar move for scalar address to avoid
2171 // readfirstlane.
2172 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
2173 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
2174 FI->getValueType(0));
2175 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
2176 MVT::i32, TFI, SAddr.getOperand(1)),
2177 0);
2178 }
2179
2180 return SAddr;
2181}
2182
2183// Match (32-bit SGPR base) + sext(imm offset)
2184bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
2185 SDValue &SAddr,
2186 SDValue &Offset) const {
2187 if (Addr->isDivergent())
2188 return false;
2189
2190 SDLoc DL(Addr);
2191
2192 int64_t COffsetVal = 0;
2193
2194 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
2195 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
2196 SAddr = Addr.getOperand(0);
2197 } else {
2198 SAddr = Addr;
2199 }
2200
2201 SAddr = SelectSAddrFI(CurDAG, SAddr);
2202
2203 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2204
2205 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2206 SIInstrFlags::FlatScratch)) {
2207 int64_t SplitImmOffset, RemainderOffset;
2208 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2209 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
2210
2211 COffsetVal = SplitImmOffset;
2212
2213 SDValue AddOffset =
2214 SAddr.getOpcode() == ISD::TargetFrameIndex
2215 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
2216 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
2217 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
2218 SAddr, AddOffset),
2219 0);
2220 }
2221
2222 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
2223
2224 return true;
2225}
2226
2227// Check whether the flat scratch SVS swizzle bug affects this access.
2228bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2229 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2230 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2231 return false;
2232
2233 // The bug affects the swizzling of SVS accesses if there is any carry out
2234 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2235 // voffset to (soffset + inst_offset).
2236 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2237 KnownBits SKnown =
2238 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2239 KnownBits::makeConstant(APInt(32, ImmOffset,
2240 /*isSigned=*/true)));
2241 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2242 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2243 return (VMax & 3) + (SMax & 3) >= 4;
2244}
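// Worked example for the check above: if the low two bits of VAddr may be 0b11
// (VMax & 3 == 3) and the low two bits of SAddr + ImmOffset may be 0b01
// (SMax & 3 == 1), then 3 + 1 >= 4, so a carry from bit 1 into bit 2 cannot be
// ruled out and the SVS form is conservatively rejected by the callers.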
2245
2246bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2247 SDValue &VAddr, SDValue &SAddr,
2248 SDValue &Offset,
2249 SDValue &CPol) const {
2250 int64_t ImmOffset = 0;
2251
2252 SDValue LHS, RHS;
2253 SDValue OrigAddr = Addr;
2254 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2255 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2256 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2257
2258 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2259 SIInstrFlags::FlatScratch)) {
2260 Addr = LHS;
2261 ImmOffset = COffsetVal;
2262 } else if (!LHS->isDivergent() && COffsetVal > 0) {
2263 SDLoc SL(N);
2264 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2265 // (large_offset & MaxOffset);
2266 int64_t SplitImmOffset, RemainderOffset;
2267 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2268 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
2269
2270 if (isUInt<32>(RemainderOffset)) {
2271 SDNode *VMov = CurDAG->getMachineNode(
2272 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2273 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2274 VAddr = SDValue(VMov, 0);
2275 SAddr = LHS;
2276 if (!isFlatScratchBaseLegal(Addr))
2277 return false;
2278 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
2279 return false;
2280 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2281 CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2282 return true;
2283 }
2284 }
2285 }
2286
2287 if (Addr.getOpcode() != ISD::ADD)
2288 return false;
2289
2290 LHS = Addr.getOperand(0);
2291 RHS = Addr.getOperand(1);
2292
2293 if (!LHS->isDivergent() && RHS->isDivergent()) {
2294 SAddr = LHS;
2295 VAddr = RHS;
2296 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
2297 SAddr = RHS;
2298 VAddr = LHS;
2299 } else {
2300 return false;
2301 }
2302
2303 if (OrigAddr != Addr) {
2304 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
2305 return false;
2306 } else {
2307 if (!isFlatScratchBaseLegalSV(OrigAddr))
2308 return false;
2309 }
2310
2311 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2312 return false;
2313 SAddr = SelectSAddrFI(CurDAG, SAddr);
2314 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2315
2316 bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
2317 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2318 SDLoc(), MVT::i32);
2319 return true;
2320}
2321
2322// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2323 // negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2324// Handle the case where the Immediate Offset + SOffset is negative.
2325bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2326 bool Imm32Only,
2327 bool IsBuffer,
2328 int64_t ImmOffset) const {
2329 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2330 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2331 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2332 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2333 return false;
2334 }
2335
2336 return true;
2337}
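// For example, with a signed SMRD immediate offset of -16 and an SOffset whose
// known minimum value is 8, the sum -16 + 8 = -8 is negative, so this pairing
// is rejected and a different addressing form must be chosen.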
2338
2339 // Given \p Offset and load node \p N, check if \p Offset is a multiple of
2340 // the load byte size. If it is, update \p Offset to a pre-scaled value and
2341 // return true.
2342bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
2343 bool IsSigned) const {
2344 bool ScaleOffset = false;
2345 if (!Subtarget->hasScaleOffset() || !Offset)
2346 return false;
2347
2348 unsigned Size =
2349 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2350
2351 SDValue Off = Offset;
2352 if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
2353 Off = Ext;
2354
2355 if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
2356 if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
2357 ScaleOffset = C->getZExtValue() == Log2_32(Size);
2358 } else if (Offset.getOpcode() == ISD::MUL ||
2359 (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
2360 Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
2361 (Offset.isMachineOpcode() &&
2362 Offset.getMachineOpcode() ==
2363 (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
2364 : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
2365 if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
2366 ScaleOffset = C->getZExtValue() == Size;
2367 }
2368
2369 if (ScaleOffset)
2370 Offset = Off.getOperand(0);
2371
2372 return ScaleOffset;
2373}
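// For example, for a 4-byte load an offset of (shl x, 2) or (mul x, 4) is
// recognized here: Offset is rewritten to x and the caller sets the SCAL bit
// in the cache-policy operand so the offset is treated as pre-scaled.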
2374
2375// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2376// not null) offset. If Imm32Only is true, match only 32-bit immediate
2377// offsets available on CI.
2378bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
2379 SDValue *SOffset, SDValue *Offset,
2380 bool Imm32Only, bool IsBuffer,
2381 bool HasSOffset, int64_t ImmOffset,
2382 bool *ScaleOffset) const {
2383 assert((!SOffset || !Offset) &&
2384 "Cannot match both soffset and offset at the same time!");
2385
2386 if (ScaleOffset) {
2387 assert(N && SOffset);
2388
2389 *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
2390 }
2391
2392 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2393 if (!C) {
2394 if (!SOffset)
2395 return false;
2396
2397 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2398 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2399 *SOffset = ByteOffsetNode;
2400 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2401 ImmOffset);
2402 }
2403 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2404 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2405 *SOffset = ByteOffsetNode.getOperand(0);
2406 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2407 ImmOffset);
2408 }
2409 }
2410 return false;
2411 }
2412
2413 SDLoc SL(ByteOffsetNode);
2414
2415 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2416 // offset for S_BUFFER instructions is unsigned.
2417 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2418 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2419 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2420 if (EncodedOffset && Offset && !Imm32Only) {
2421 *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2422 return true;
2423 }
2424
2425 // SGPR and literal offsets are unsigned.
2426 if (ByteOffset < 0)
2427 return false;
2428
2429 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2430 if (EncodedOffset && Offset && Imm32Only) {
2431 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2432 return true;
2433 }
2434
2435 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2436 return false;
2437
2438 if (SOffset) {
2439 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2440 *SOffset = SDValue(
2441 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2442 return true;
2443 }
2444
2445 return false;
2446}
2447
2448SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2449 if (Addr.getValueType() != MVT::i32)
2450 return Addr;
2451
2452 // Zero-extend a 32-bit address.
2453 SDLoc SL(Addr);
2454
2455 const MachineFunction &MF = CurDAG->getMachineFunction();
2456 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2457 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2458 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2459
2460 const SDValue Ops[] = {
2461 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2462 Addr,
2463 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2464 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2465 0),
2466 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2467 };
2468
2469 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2470 Ops), 0);
2471}
2472
2473// Match a base and an immediate (if Offset is not null) or an SGPR (if
2474// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2475// true, match only 32-bit immediate offsets available on CI.
2476bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
2477 SDValue &SBase, SDValue *SOffset,
2478 SDValue *Offset, bool Imm32Only,
2479 bool IsBuffer, bool HasSOffset,
2480 int64_t ImmOffset,
2481 bool *ScaleOffset) const {
2482 if (SOffset && Offset) {
2483 assert(!Imm32Only && !IsBuffer);
2484 SDValue B;
2485
2486 if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
2487 return false;
2488
2489 int64_t ImmOff = 0;
2490 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2491 ImmOff = C->getSExtValue();
2492
2493 return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
2494 true, ImmOff, ScaleOffset);
2495 }
2496
2497 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2498 // wraparound, because s_load instructions perform the addition in 64 bits.
2499 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2500 !Addr->getFlags().hasNoUnsignedWrap())
2501 return false;
2502
2503 SDValue N0, N1;
2504 // Extract the base and offset if possible.
2505 if (Addr->isAnyAdd() || CurDAG->isADDLike(Addr)) {
2506 N0 = Addr.getOperand(0);
2507 N1 = Addr.getOperand(1);
2508 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2509 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2510 }
2511 if (!N0 || !N1)
2512 return false;
2513
2514 if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2515 ImmOffset, ScaleOffset)) {
2516 SBase = N0;
2517 return true;
2518 }
2519 if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2520 ImmOffset, ScaleOffset)) {
2521 SBase = N1;
2522 return true;
2523 }
2524 return false;
2525}
2526
2527bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
2528 SDValue *SOffset, SDValue *Offset,
2529 bool Imm32Only, bool *ScaleOffset) const {
2530 if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
2531 /* IsBuffer */ false, /* HasSOffset */ false,
2532 /* ImmOffset */ 0, ScaleOffset)) {
2533 SBase = Expand32BitAddress(SBase);
2534 return true;
2535 }
2536
2537 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2538 SBase = Expand32BitAddress(Addr);
2539 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2540 return true;
2541 }
2542
2543 return false;
2544}
2545
2546bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2547 SDValue &Offset) const {
2548 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2549 &Offset);
2550}
2551
2552bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2553 SDValue &Offset) const {
2554 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2555 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2556 &Offset, /* Imm32Only */ true);
2557}
2558
2559bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2560 SDValue &SOffset, SDValue &CPol) const {
2561 bool ScaleOffset;
2562 if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
2563 /* Imm32Only */ false, &ScaleOffset))
2564 return false;
2565
2566 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2567 SDLoc(N), MVT::i32);
2568 return true;
2569}
2570
2571bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2572 SDValue &SBase, SDValue &SOffset,
2573 SDValue &Offset,
2574 SDValue &CPol) const {
2575 bool ScaleOffset;
2576 if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
2577 return false;
2578
2579 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2580 SDLoc(N), MVT::i32);
2581 return true;
2582}
2583
2584bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2585 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2586 /* Imm32Only */ false, /* IsBuffer */ true);
2587}
2588
2589bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2590 SDValue &Offset) const {
2591 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2592 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2593 /* Imm32Only */ true, /* IsBuffer */ true);
2594}
2595
2596bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2597 SDValue &Offset) const {
2598 // Match the (soffset + offset) pair as a 32-bit register base and
2599 // an immediate offset.
2600 return N.getValueType() == MVT::i32 &&
2601 SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
2602 /* SOffset*/ nullptr, &Offset,
2603 /* Imm32Only */ false, /* IsBuffer */ true);
2604}
2605
2606bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2607 SDValue &Base,
2608 SDValue &Offset) const {
2609 SDLoc DL(Index);
2610
2611 if (CurDAG->isBaseWithConstantOffset(Index)) {
2612 SDValue N0 = Index.getOperand(0);
2613 SDValue N1 = Index.getOperand(1);
2614 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2615
2616 // (add n0, c0)
2617 // Don't peel off the offset (c0) if doing so could possibly lead
2618 // the base (n0) to be negative.
2619 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2620 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2621 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2622 Base = N0;
2623 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2624 return true;
2625 }
2626 }
2627
2628 if (isa<ConstantSDNode>(Index))
2629 return false;
2630
2631 Base = Index;
2632 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2633 return true;
2634}
2635
2636SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2637 SDValue Val, uint32_t Offset,
2638 uint32_t Width) {
2639 if (Val->isDivergent()) {
2640 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2641 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2642 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2643
2644 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2645 }
2646 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2647 // Transformation function, pack the offset and width of a BFE into
2648 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2649 // source, bits [5:0] contain the offset and bits [22:16] the width.
2650 uint32_t PackedVal = Offset | (Width << 16);
2651 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2652
2653 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2654}
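// For example, a uniform unsigned extract of bits [23:16] would be
// getBFE32(false, DL, Val, /*Offset=*/16, /*Width=*/8), which packs
// 16 | (8 << 16) = 0x00080010 into the second source of S_BFE_U32.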
2655
2656void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2657 // "((a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)"
2658 // "((a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)"
2659 // Predicate: 0 < b <= c < 32
2660
2661 const SDValue &Shl = N->getOperand(0);
2662 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2663 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2664
2665 if (B && C) {
2666 uint32_t BVal = B->getZExtValue();
2667 uint32_t CVal = C->getZExtValue();
2668
2669 if (0 < BVal && BVal <= CVal && CVal < 32) {
2670 bool Signed = N->getOpcode() == ISD::SRA;
2671 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2672 32 - CVal));
2673 return;
2674 }
2675 }
2676 SelectCode(N);
2677}
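// For example, (srl (shl a, 8), 24) has b = 8 and c = 24, so it is selected as
// BFE_U32 a, 16, 8 (offset c - b = 16, width 32 - c = 8), i.e. an unsigned
// extract of bits [23:16] of a.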
2678
2679void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2680 switch (N->getOpcode()) {
2681 case ISD::AND:
2682 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2683 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2684 // Predicate: isMask(mask)
2685 const SDValue &Srl = N->getOperand(0);
2686 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2687 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2688
2689 if (Shift && Mask) {
2690 uint32_t ShiftVal = Shift->getZExtValue();
2691 uint32_t MaskVal = Mask->getZExtValue();
2692
2693 if (isMask_32(MaskVal)) {
2694 uint32_t WidthVal = llvm::popcount(MaskVal);
2695 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2696 WidthVal));
2697 return;
2698 }
2699 }
2700 }
2701 break;
2702 case ISD::SRL:
2703 if (N->getOperand(0).getOpcode() == ISD::AND) {
2704 // "((a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2705 // Predicate: isMask(mask >> b)
2706 const SDValue &And = N->getOperand(0);
2707 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2708 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2709
2710 if (Shift && Mask) {
2711 uint32_t ShiftVal = Shift->getZExtValue();
2712 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2713
2714 if (isMask_32(MaskVal)) {
2715 uint32_t WidthVal = llvm::popcount(MaskVal);
2716 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2717 WidthVal));
2718 return;
2719 }
2720 }
2721 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2722 SelectS_BFEFromShifts(N);
2723 return;
2724 }
2725 break;
2726 case ISD::SRA:
2727 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2728 SelectS_BFEFromShifts(N);
2729 return;
2730 }
2731 break;
2732
2733 case ISD::SIGN_EXTEND_INREG: {
2734 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2735 SDValue Src = N->getOperand(0);
2736 if (Src.getOpcode() != ISD::SRL)
2737 break;
2738
2739 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2740 if (!Amt)
2741 break;
2742
2743 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2744 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2745 Amt->getZExtValue(), Width));
2746 return;
2747 }
2748 }
2749
2750 SelectCode(N);
2751}
2752
2753bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2754 assert(N->getOpcode() == ISD::BRCOND);
2755 if (!N->hasOneUse())
2756 return false;
2757
2758 SDValue Cond = N->getOperand(1);
2759 if (Cond.getOpcode() == ISD::CopyToReg)
2760 Cond = Cond.getOperand(2);
2761
2762 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2763 return false;
2764
2765 MVT VT = Cond.getOperand(0).getSimpleValueType();
2766 if (VT == MVT::i32)
2767 return true;
2768
2769 if (VT == MVT::i64) {
2770 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2771 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2772 Subtarget->hasScalarCompareEq64();
2773 }
2774
2775 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2776 return true;
2777
2778 return false;
2779}
2780
2781static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2782 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2783 // Special case for amdgcn.ballot:
2784 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2785 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2786 // =>
2787 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2788 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2789 // Cond becomes a i(WaveSize) full mask value.
2790 // Note that ballot doesn't use the SETEQ condition, but it's easy to support
2791 // here for completeness, so in this case Negate is set true on return.
2792 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2793 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2794 isNullConstant(VCMP.getOperand(1))) {
2795
2796 auto Cond = VCMP.getOperand(0);
2797 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2798 Cond = Cond.getOperand(0);
2799
2800 if (isBoolSGPR(Cond)) {
2801 Negate = VCMP_CC == ISD::SETEQ;
2802 return Cond;
2803 }
2804 }
2805 return SDValue();
2806}
2807
2808void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2809 SDValue Cond = N->getOperand(1);
2810
2811 if (Cond.isUndef()) {
2812 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2813 N->getOperand(2), N->getOperand(0));
2814 return;
2815 }
2816
2817 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2818
2819 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2820 bool AndExec = !UseSCCBr;
2821 bool Negate = false;
2822
2823 if (Cond.getOpcode() == ISD::SETCC &&
2824 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2825 SDValue VCMP = Cond->getOperand(0);
2826 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2827 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2828 isNullConstant(Cond->getOperand(1)) &&
2829 // We may encounter ballot.i64 in wave32 mode on -O0.
2830 VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2831 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2832 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2833 // BRCOND i1 %C, %BB
2834 // =>
2835 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2836 // VCC = COPY i(WaveSize) %VCMP
2837 // S_CBRANCH_VCCNZ/VCCZ %BB
2838 Negate = CC == ISD::SETEQ;
2839 bool NegatedBallot = false;
2840 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2841 Cond = BallotCond;
2842 UseSCCBr = !BallotCond->isDivergent();
2843 Negate = Negate ^ NegatedBallot;
2844 } else {
2845 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2846 // selected as V_CMP, but this may change for uniform condition.
2847 Cond = VCMP;
2848 UseSCCBr = false;
2849 }
2850 }
2851 // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, or a combination
2852 // of V_CMPs resulting from ballot, or ballot has a uniform condition and
2853 // SCC is used.
2854 AndExec = false;
2855 }
2856
2857 unsigned BrOp =
2858 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2859 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2860 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2861 SDLoc SL(N);
2862
2863 if (AndExec) {
2864 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2865 // analyzed what generates the vcc value, so we do not know whether vcc
2866 // bits for disabled lanes are 0. Thus we need to mask out bits for
2867 // disabled lanes.
2868 //
2869 // For the case that we select S_CBRANCH_SCC1 and it gets
2870 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2871 // SIInstrInfo::moveToVALU which inserts the S_AND.
2872 //
2873 // We could add an analysis of what generates the vcc value here and omit
2874 // the S_AND when it is unnecessary. But it would be better to add a separate
2875 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2876 // catches both cases.
2877 Cond = SDValue(
2878 CurDAG->getMachineNode(
2879 Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
2880 MVT::i1,
2881 CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
2882 : AMDGPU::EXEC,
2883 MVT::i1),
2884 Cond),
2885 0);
2886 }
2887
2888 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2889 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2890 N->getOperand(2), // Basic Block
2891 VCC.getValue(0));
2892}
2893
2894void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2895 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2896 !N->isDivergent()) {
2897 SDValue Src = N->getOperand(0);
2898 if (Src.getValueType() == MVT::f16) {
2899 if (isExtractHiElt(Src, Src)) {
2900 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2901 {Src});
2902 return;
2903 }
2904 }
2905 }
2906
2907 SelectCode(N);
2908}
2909
2910void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2911 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2912 // be copied to an SGPR with readfirstlane.
2913 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2914 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2915
2916 SDValue Chain = N->getOperand(0);
2917 SDValue Ptr = N->getOperand(2);
2918 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2919 MachineMemOperand *MMO = M->getMemOperand();
2920 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2921
2922 SDValue Offset;
2923 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2924 SDValue PtrBase = Ptr.getOperand(0);
2925 SDValue PtrOffset = Ptr.getOperand(1);
2926
2927 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2928 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2929 N = glueCopyToM0(N, PtrBase);
2930 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2931 }
2932 }
2933
2934 if (!Offset) {
2935 N = glueCopyToM0(N, Ptr);
2936 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2937 }
2938
2939 SDValue Ops[] = {
2940 Offset,
2941 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2942 Chain,
2943 N->getOperand(N->getNumOperands() - 1) // New glue
2944 };
2945
2946 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2947 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2948}
2949
2950// We need to handle this here because tablegen doesn't support matching
2951// instructions with multiple outputs.
2952void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
2953 unsigned Opc;
2954 switch (IntrID) {
2955 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2956 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2957 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2958 break;
2959 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2960 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2961 break;
2962 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2963 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2964 break;
2965 }
2966 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2967 N->getOperand(5), N->getOperand(0)};
2968
2969 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2970 MachineMemOperand *MMO = M->getMemOperand();
2971 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2972 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2973}
2974
2975static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2976 switch (IntrID) {
2977 case Intrinsic::amdgcn_ds_gws_init:
2978 return AMDGPU::DS_GWS_INIT;
2979 case Intrinsic::amdgcn_ds_gws_barrier:
2980 return AMDGPU::DS_GWS_BARRIER;
2981 case Intrinsic::amdgcn_ds_gws_sema_v:
2982 return AMDGPU::DS_GWS_SEMA_V;
2983 case Intrinsic::amdgcn_ds_gws_sema_br:
2984 return AMDGPU::DS_GWS_SEMA_BR;
2985 case Intrinsic::amdgcn_ds_gws_sema_p:
2986 return AMDGPU::DS_GWS_SEMA_P;
2987 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2988 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2989 default:
2990 llvm_unreachable("not a gws intrinsic");
2991 }
2992}
2993
2994void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2995 if (!Subtarget->hasGWS() ||
2996 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2997 !Subtarget->hasGWSSemaReleaseAll())) {
2998 // Let this error.
2999 SelectCode(N);
3000 return;
3001 }
3002
3003 // Chain, intrinsic ID, vsrc, offset
3004 const bool HasVSrc = N->getNumOperands() == 4;
3005 assert(HasVSrc || N->getNumOperands() == 3);
3006
3007 SDLoc SL(N);
3008 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
3009 int ImmOffset = 0;
3010 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3011 MachineMemOperand *MMO = M->getMemOperand();
3012
3013 // Don't worry if the offset ends up in a VGPR. Only one lane will have
3014 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
3015
3016 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
3017 // offset field) % 64. Some versions of the programming guide omit the m0
3018 // part, or claim it's from offset 0.
3019 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
3020 // If we have a constant offset, try to use the 0 in m0 as the base.
3021 // TODO: Look into changing the default m0 initialization value. If the
3022 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
3023 // the immediate offset.
3024 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
3025 ImmOffset = ConstOffset->getZExtValue();
3026 } else {
3027 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
3028 ImmOffset = BaseOffset.getConstantOperandVal(1);
3029 BaseOffset = BaseOffset.getOperand(0);
3030 }
3031
3032 // Prefer to do the shift in an SGPR since it should be possible to use m0
3033 // as the result directly. If it's already an SGPR, it will be eliminated
3034 // later.
3035 SDNode *SGPROffset
3036 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
3037 BaseOffset);
3038 // Shift to offset in m0
3039 SDNode *M0Base
3040 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3041 SDValue(SGPROffset, 0),
3042 CurDAG->getTargetConstant(16, SL, MVT::i32));
3043 glueCopyToM0(N, SDValue(M0Base, 0));
3044 }
3045
3046 SDValue Chain = N->getOperand(0);
3047 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
3048
3049 const unsigned Opc = gwsIntrinToOpcode(IntrID);
3050 SmallVector<SDValue, 5> Ops;
3051 if (HasVSrc)
3052 Ops.push_back(N->getOperand(2));
3053 Ops.push_back(OffsetField);
3054 Ops.push_back(Chain);
3055
3056 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3057 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3058}
3059
3060void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
3061 if (Subtarget->getLDSBankCount() != 16) {
3062 // This is a single instruction with a pattern.
3063 SelectCode(N);
3064 return;
3065 }
3066
3067 SDLoc DL(N);
3068
3069 // This requires 2 instructions. It is possible to write a pattern to support
3070 // this, but the generated isel emitter doesn't correctly deal with multiple
3071 // output instructions using the same physical register input. The copy to m0
3072 // is incorrectly placed before the second instruction.
3073 //
3074 // TODO: Match source modifiers.
3075 //
3076 // def : Pat <
3077 // (int_amdgcn_interp_p1_f16
3078 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
3079 // (i32 timm:$attrchan), (i32 timm:$attr),
3080 // (i1 timm:$high), M0),
3081 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
3082 // timm:$attrchan, 0,
3083 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
3084 // let Predicates = [has16BankLDS];
3085 // }
3086
3087 // 16 bank LDS
3088 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
3089 N->getOperand(5), SDValue());
3090
3091 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
3092
3093 SDNode *InterpMov =
3094 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
3095 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
3096 N->getOperand(3), // Attr
3097 N->getOperand(2), // Attrchan
3098 ToM0.getValue(1) // In glue
3099 });
3100
3101 SDNode *InterpP1LV =
3102 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
3103 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
3104 N->getOperand(1), // Src0
3105 N->getOperand(3), // Attr
3106 N->getOperand(2), // Attrchan
3107 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
3108 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
3109 N->getOperand(4), // high
3110 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
3111 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
3112 SDValue(InterpMov, 1)
3113 });
3114
3115 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
3116}
3117
3118void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3119 unsigned IntrID = N->getConstantOperandVal(1);
3120 switch (IntrID) {
3121 case Intrinsic::amdgcn_ds_append:
3122 case Intrinsic::amdgcn_ds_consume: {
3123 if (N->getValueType(0) != MVT::i32)
3124 break;
3125 SelectDSAppendConsume(N, IntrID);
3126 return;
3127 }
3128 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3129 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3130 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3131 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3132 SelectDSBvhStackIntrinsic(N, IntrID);
3133 return;
3134 case Intrinsic::amdgcn_init_whole_wave:
3135 CurDAG->getMachineFunction()
3136 .getInfo<SIMachineFunctionInfo>()
3137 ->setInitWholeWave();
3138 break;
3139 }
3140
3141 SelectCode(N);
3142}
3143
3144void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
3145 unsigned IntrID = N->getConstantOperandVal(0);
3146 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
3147 SDNode *ConvGlueNode = N->getGluedNode();
3148 if (ConvGlueNode) {
3149 // FIXME: Possibly iterate over multiple glue nodes?
3150 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
3151 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
3152 ConvGlueNode =
3153 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
3154 MVT::Glue, SDValue(ConvGlueNode, 0));
3155 } else {
3156 ConvGlueNode = nullptr;
3157 }
3158 switch (IntrID) {
3159 case Intrinsic::amdgcn_wqm:
3160 Opcode = AMDGPU::WQM;
3161 break;
3162 case Intrinsic::amdgcn_softwqm:
3163 Opcode = AMDGPU::SOFT_WQM;
3164 break;
3165 case Intrinsic::amdgcn_wwm:
3166 case Intrinsic::amdgcn_strict_wwm:
3167 Opcode = AMDGPU::STRICT_WWM;
3168 break;
3169 case Intrinsic::amdgcn_strict_wqm:
3170 Opcode = AMDGPU::STRICT_WQM;
3171 break;
3172 case Intrinsic::amdgcn_interp_p1_f16:
3173 SelectInterpP1F16(N);
3174 return;
3175 case Intrinsic::amdgcn_permlane16_swap:
3176 case Intrinsic::amdgcn_permlane32_swap: {
3177 if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
3178 !Subtarget->hasPermlane16Swap()) ||
3179 (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3180 !Subtarget->hasPermlane32Swap())) {
3181 SelectCode(N); // Hit the default error
3182 return;
3183 }
3184
3185 Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3186 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3187 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3188
3189 SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
3190 if (ConvGlueNode)
3191 NewOps.push_back(SDValue(ConvGlueNode, 0));
3192
3193 bool FI = N->getConstantOperandVal(3);
3194 NewOps[2] = CurDAG->getTargetConstant(
3195 FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);
3196
3197 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
3198 return;
3199 }
3200 default:
3201 SelectCode(N);
3202 break;
3203 }
3204
3205 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
3206 SDValue Src = N->getOperand(1);
3207 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
3208 }
3209
3210 if (ConvGlueNode) {
3211 SmallVector<SDValue, 4> NewOps(N->ops());
3212 NewOps.push_back(SDValue(ConvGlueNode, 0));
3213 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
3214 }
3215}
3216
3217void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3218 unsigned IntrID = N->getConstantOperandVal(1);
3219 switch (IntrID) {
3220 case Intrinsic::amdgcn_ds_gws_init:
3221 case Intrinsic::amdgcn_ds_gws_barrier:
3222 case Intrinsic::amdgcn_ds_gws_sema_v:
3223 case Intrinsic::amdgcn_ds_gws_sema_br:
3224 case Intrinsic::amdgcn_ds_gws_sema_p:
3225 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3226 SelectDS_GWS(N, IntrID);
3227 return;
3228 default:
3229 break;
3230 }
3231
3232 SelectCode(N);
3233}
3234
3235void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3236 SDValue Log2WaveSize =
3237 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
3238 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
3239 {N->getOperand(0), Log2WaveSize});
3240}
3241
3242void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
3243 SDValue SrcVal = N->getOperand(1);
3244 if (SrcVal.getValueType() != MVT::i32) {
3245 SelectCode(N); // Emit default error
3246 return;
3247 }
3248
3249 SDValue CopyVal;
3250 Register SP = TLI->getStackPointerRegisterToSaveRestore();
3251 SDLoc SL(N);
3252
3253 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
3254 CopyVal = SrcVal.getOperand(0);
3255 } else {
3256 SDValue Log2WaveSize = CurDAG->getTargetConstant(
3257 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
3258
3259 if (N->isDivergent()) {
3260 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
3261 MVT::i32, SrcVal),
3262 0);
3263 }
3264
3265 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3266 {SrcVal, Log2WaveSize}),
3267 0);
3268 }
3269
3270 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
3271 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
3272}
3273
3274bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3275 unsigned &Mods,
3276 bool IsCanonicalizing,
3277 bool AllowAbs) const {
3278 Mods = SISrcMods::NONE;
3279 Src = In;
3280
3281 if (Src.getOpcode() == ISD::FNEG) {
3282 Mods |= SISrcMods::NEG;
3283 Src = Src.getOperand(0);
3284 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3285 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3286 // denormal mode, but we're implicitly canonicalizing in a source operand.
3287 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3288 if (LHS && LHS->isZero()) {
3289 Mods |= SISrcMods::NEG;
3290 Src = Src.getOperand(1);
3291 }
3292 }
3293
3294 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3295 Mods |= SISrcMods::ABS;
3296 Src = Src.getOperand(0);
3297 }
3298
3299 if (Mods != SISrcMods::NONE)
3300 return true;
3301
3302 // Convert various sign-bit masks on integers to src mods. Currently disabled
3303 // for 16-bit types as the codegen replaces the operand without adding a
3304 // srcmod. This is intentionally finding the cases where we are performing
3305 // float neg and abs on int types, the goal is not to obtain two's complement
3306 // neg or abs. Limit conversion to select operands via the non-canonicalizing
3307 // pattern.
3308 // TODO: Add 16-bit support.
3309 if (IsCanonicalizing)
3310 return true;
3311
3312 // v2i32 xor/or/and are legal. A vselect using these instructions as operands
3313 // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
3314 // through the extract to the bitwise op.
3315 SDValue PeekSrc =
3316 Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
3317 // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
3318 // types as the codegen replaces the operand without adding a srcmod.
3319 // This is intentionally finding the cases where we are performing float neg
3320 // and abs on int types, the goal is not to obtain two's complement neg or
3321 // abs.
3322 // TODO: Add 16-bit support.
3323 unsigned Opc = PeekSrc.getOpcode();
3324 EVT VT = Src.getValueType();
3325 if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3326 (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
3327 return true;
3328
3329 ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc->getOperand(1));
3330 if (!CRHS)
3331 return true;
3332
3333 auto ReplaceSrc = [&]() -> SDValue {
3334 if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3335 return Src.getOperand(0);
3336
3337 SDValue LHS = PeekSrc->getOperand(0);
3338 SDValue Index = Src->getOperand(1);
3339 return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
3340 Src.getValueType(), LHS, Index);
3341 };
3342
3343 // Recognise Srcmods:
3344 // (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
3345 // (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
3346 // (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
3347 // SrcModifiers.
3348 if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
3349 Mods |= SISrcMods::NEG;
3350 Src = ReplaceSrc();
3351 } else if (Opc == ISD::AND && AllowAbs &&
3352 CRHS->getAPIntValue().isMaxSignedValue()) {
3353 Mods |= SISrcMods::ABS;
3354 Src = ReplaceSrc();
3355 } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
3356 Mods |= SISrcMods::NEG | SISrcMods::ABS;
3357 Src = ReplaceSrc();
3358 }
3359
3360 return true;
3361}
3362
3363bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3364 SDValue &SrcMods) const {
3365 unsigned Mods;
3366 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3367 /*AllowAbs=*/true)) {
3368 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3369 return true;
3370 }
3371
3372 return false;
3373}
3374
3375bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3376 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3377 unsigned Mods;
3378 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3379 /*AllowAbs=*/true)) {
3380 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3381 return true;
3382 }
3383
3384 return false;
3385}
3386
3387bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3388 SDValue &SrcMods) const {
3389 unsigned Mods;
3390 if (SelectVOP3ModsImpl(In, Src, Mods,
3391 /*IsCanonicalizing=*/true,
3392 /*AllowAbs=*/false)) {
3393 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3394 return true;
3395 }
3396
3397 return false;
3398}
3399
3400bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3401 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3402 return false;
3403
3404 Src = In;
3405 return true;
3406}
3407
3408bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3409 SDValue &SrcMods,
3410 bool OpSel) const {
3411 unsigned Mods;
3412 if (SelectVOP3ModsImpl(In, Src, Mods,
3413 /*IsCanonicalizing=*/true,
3414 /*AllowAbs=*/false)) {
3415 if (OpSel)
3416 Mods |= SISrcMods::OP_SEL_0;
3417 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3418 return true;
3419 }
3420
3421 return false;
3422}
3423
3424bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3425 SDValue &SrcMods) const {
3426 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3427}
3428
3429bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3430 SDValue &SrcMods) const {
3431 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3432}
3433
3434bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3435 SDValue &SrcMods, SDValue &Clamp,
3436 SDValue &Omod) const {
3437 SDLoc DL(In);
3438 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3439 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3440
3441 return SelectVOP3Mods(In, Src, SrcMods);
3442}
3443
3444bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3445 SDValue &SrcMods, SDValue &Clamp,
3446 SDValue &Omod) const {
3447 SDLoc DL(In);
3448 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3449 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3450
3451 return SelectVOP3BMods(In, Src, SrcMods);
3452}
3453
3454bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3455 SDValue &Clamp, SDValue &Omod) const {
3456 Src = In;
3457
3458 SDLoc DL(In);
3459 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3460 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3461
3462 return true;
3463}
3464
3465bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3466 SDValue &SrcMods, bool IsDOT) const {
3467 unsigned Mods = SISrcMods::NONE;
3468 Src = In;
3469
3470 // TODO: Handle G_FSUB 0 as fneg
3471 if (Src.getOpcode() == ISD::FNEG) {
3472     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3473     Src = Src.getOperand(0);
3474 }
3475
3476 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3477 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3478 unsigned VecMods = Mods;
3479
3480 SDValue Lo = stripBitcast(Src.getOperand(0));
3481 SDValue Hi = stripBitcast(Src.getOperand(1));
3482
3483 if (Lo.getOpcode() == ISD::FNEG) {
3484 Lo = stripBitcast(Lo.getOperand(0));
3485 Mods ^= SISrcMods::NEG;
3486 }
3487
3488 if (Hi.getOpcode() == ISD::FNEG) {
3489 Hi = stripBitcast(Hi.getOperand(0));
3490 Mods ^= SISrcMods::NEG_HI;
3491 }
3492
3493 if (isExtractHiElt(Lo, Lo))
3494 Mods |= SISrcMods::OP_SEL_0;
3495
3496 if (isExtractHiElt(Hi, Hi))
3497 Mods |= SISrcMods::OP_SEL_1;
3498
3499 unsigned VecSize = Src.getValueSizeInBits();
3500 Lo = stripExtractLoElt(Lo);
3501 Hi = stripExtractLoElt(Hi);
3502
3503 if (Lo.getValueSizeInBits() > VecSize) {
3504 Lo = CurDAG->getTargetExtractSubreg(
3505 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3506 MVT::getIntegerVT(VecSize), Lo);
3507 }
3508
3509 if (Hi.getValueSizeInBits() > VecSize) {
3510 Hi = CurDAG->getTargetExtractSubreg(
3511 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3512 MVT::getIntegerVT(VecSize), Hi);
3513 }
3514
3515 assert(Lo.getValueSizeInBits() <= VecSize &&
3516 Hi.getValueSizeInBits() <= VecSize);
3517
3518 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3519 // Really a scalar input. Just select from the low half of the register to
3520 // avoid packing.
3521
3522 if (VecSize == Lo.getValueSizeInBits()) {
3523 Src = Lo;
3524 } else if (VecSize == 32) {
3525 Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
3526 } else {
3527 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3528
3529 SDLoc SL(In);
3530         SDValue Undef = SDValue(
3531           CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3532 Lo.getValueType()), 0);
3533 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3534 : AMDGPU::SReg_64RegClassID;
3535 const SDValue Ops[] = {
3536 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3537 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3538 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3539
3540 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3541 Src.getValueType(), Ops), 0);
3542 }
3543 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3544 return true;
3545 }
3546
3547 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3548 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3549 .bitcastToAPInt().getZExtValue();
3550 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3551 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3552 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3553 return true;
3554 }
3555 }
3556
3557 Mods = VecMods;
3558 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3559 Src.getNumOperands() == 2) {
3560
3561 // TODO: We should repeat the build_vector source check above for the
3562 // vector_shuffle for negates and casts of individual elements.
3563
3564 auto *SVN = cast<ShuffleVectorSDNode>(Src);
3565 ArrayRef<int> Mask = SVN->getMask();
3566
3567 if (Mask[0] < 2 && Mask[1] < 2) {
3568 // src1 should be undef.
3569 SDValue ShuffleSrc = SVN->getOperand(0);
3570
3571 if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3572 ShuffleSrc = ShuffleSrc.getOperand(0);
3573         Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3574       }
3575
3576 if (Mask[0] == 1)
3577 Mods |= SISrcMods::OP_SEL_0;
3578 if (Mask[1] == 1)
3579 Mods |= SISrcMods::OP_SEL_1;
3580
3581 Src = ShuffleSrc;
3582 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3583 return true;
3584 }
3585 }
3586
3587 // Packed instructions do not have abs modifiers.
3588 Mods |= SISrcMods::OP_SEL_1;
3589
3590 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3591 return true;
3592}
3593
3594bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3595 SDValue &SrcMods) const {
3596 return SelectVOP3PMods(In, Src, SrcMods, true);
3597}
3598
3599bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3600 SDValue &Src) const {
3601 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3602 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3603
3604 unsigned Mods = SISrcMods::OP_SEL_1;
3605 unsigned SrcVal = C->getZExtValue();
3606 if (SrcVal == 1)
3607 Mods |= SISrcMods::OP_SEL_0;
3608
3609 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3610 return true;
3611}
3612
3613 static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3614                                          llvm::SelectionDAG *CurDAG,
3615 const SDLoc &DL) {
3616 unsigned DstRegClass;
3617 EVT DstTy;
3618 switch (Elts.size()) {
3619 case 8:
3620 DstRegClass = AMDGPU::VReg_256RegClassID;
3621 DstTy = MVT::v8i32;
3622 break;
3623 case 4:
3624 DstRegClass = AMDGPU::VReg_128RegClassID;
3625 DstTy = MVT::v4i32;
3626 break;
3627 case 2:
3628 DstRegClass = AMDGPU::VReg_64RegClassID;
3629 DstTy = MVT::v2i32;
3630 break;
3631 default:
3632 llvm_unreachable("unhandled Reg sequence size");
3633 }
3634
3635   SmallVector<SDValue, 8 + 1> Ops;
3636   Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3637 for (unsigned i = 0; i < Elts.size(); ++i) {
3638 Ops.push_back(Elts[i]);
3639 Ops.push_back(CurDAG->getTargetConstant(
3640         SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3641   }
3642 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3643}
3644
3645 static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3646                                          llvm::SelectionDAG *CurDAG,
3647 const SDLoc &DL) {
3648 SmallVector<SDValue, 8> PackedElts;
3649   assert((Elts.size() == 8 || Elts.size() == 16) &&
3650          "unhandled Reg sequence size");
3651
3652   // Pack 16-bit elements in pairs into a 32-bit register. If both elements are
3653   // unpacked from the same 32-bit source, use it; otherwise pack them using v_perm.
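  // (Illustrative note: the PackLoLo selector 0x05040100 used below effectively
  // packs the low 16 bits of Elts[i] into the low half of the result and the
  // low 16 bits of Elts[i + 1] into the high half.)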
3654 for (unsigned i = 0; i < Elts.size(); i += 2) {
3655 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3656 SDValue HiSrc;
3657 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3658 PackedElts.push_back(HiSrc);
3659 } else {
3660 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3661 MachineSDNode *Packed =
3662 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3663 {Elts[i + 1], Elts[i], PackLoLo});
3664 PackedElts.push_back(SDValue(Packed, 0));
3665 }
3666 }
3667
3668 return buildRegSequence32(PackedElts, CurDAG, DL);
3669}
3670
3671 static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3672                                        llvm::SelectionDAG *CurDAG,
3673 const SDLoc &DL, unsigned ElementSize) {
3674 if (ElementSize == 16)
3675 return buildRegSequence16(Elts, CurDAG, DL);
3676 if (ElementSize == 32)
3677 return buildRegSequence32(Elts, CurDAG, DL);
3678 llvm_unreachable("Unhandled element size");
3679}
3680
3681static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3682                                  SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3683                                  llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3684 unsigned ElementSize) {
3685 if (ModOpcode == ISD::FNEG) {
3686 Mods |= SISrcMods::NEG;
3687 // Check if all elements also have abs modifier
3688 SmallVector<SDValue, 8> NegAbsElts;
3689 for (auto El : Elts) {
3690 if (El.getOpcode() != ISD::FABS)
3691 break;
3692 NegAbsElts.push_back(El->getOperand(0));
3693 }
3694 if (Elts.size() != NegAbsElts.size()) {
3695 // Neg
3696 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3697 } else {
3698 // Neg and Abs
3699 Mods |= SISrcMods::NEG_HI;
3700 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3701 }
3702 } else {
3703 assert(ModOpcode == ISD::FABS);
3704 // Abs
3705 Mods |= SISrcMods::NEG_HI;
3706 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3707 }
3708}
3709
3710// Check all f16 elements for modifiers while looking through b32 and v2b16
3711 // build vectors; stop if an element does not satisfy ModifierCheck.
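// For example (illustrative): a v8f16 WMMA operand typically reaches this code
// as a build_vector whose operands are bitcasts of v2f16 build_vectors; the
// callback is then invoked on each of the underlying f16 elements.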
3712static void
3713 checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3714                               std::function<bool(SDValue)> ModifierCheck) {
3715 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3716 if (auto *F16Pair =
3717 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3718 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3719 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3720 if (!ModifierCheck(ElF16))
3721 break;
3722 }
3723 }
3724 }
3725}
3726
3727bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3728 SDValue &SrcMods) const {
3729 Src = In;
3730 unsigned Mods = SISrcMods::OP_SEL_1;
3731
3732 // mods are on f16 elements
3733 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3734     SmallVector<SDValue, 8> EltsF16;
3735
3736 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3737 if (Element.getOpcode() != ISD::FNEG)
3738 return false;
3739 EltsF16.push_back(Element.getOperand(0));
3740 return true;
3741 });
3742
3743 // All elements have neg modifier
3744 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3745 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3746 Mods |= SISrcMods::NEG;
3747 Mods |= SISrcMods::NEG_HI;
3748 }
3749 }
3750
3751 // mods are on v2f16 elements
3752 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3753 SmallVector<SDValue, 8> EltsV2F16;
3754 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3755 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3756       // Stop at the first element that does not have a neg modifier.
3757 if (ElV2f16.getOpcode() != ISD::FNEG)
3758 break;
3759 EltsV2F16.push_back(ElV2f16.getOperand(0));
3760 }
3761
3762 // All pairs of elements have neg modifier
3763 if (BV->getNumOperands() == EltsV2F16.size()) {
3764 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3765 Mods |= SISrcMods::NEG;
3766 Mods |= SISrcMods::NEG_HI;
3767 }
3768 }
3769
3770 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3771 return true;
3772}
3773
3774bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3775 SDValue &SrcMods) const {
3776 Src = In;
3777 unsigned Mods = SISrcMods::OP_SEL_1;
3778 unsigned ModOpcode;
3779
3780 // mods are on f16 elements
3781 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3782     SmallVector<SDValue, 8> EltsF16;
3783     checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3784 // Based on first element decide which mod we match, neg or abs
3785 if (EltsF16.empty())
3786 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3787 if (ElF16.getOpcode() != ModOpcode)
3788 return false;
3789 EltsF16.push_back(ElF16.getOperand(0));
3790 return true;
3791 });
3792
3793 // All elements have ModOpcode modifier
3794 if (BV->getNumOperands() * 2 == EltsF16.size())
3795 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3796 16);
3797 }
3798
3799 // mods are on v2f16 elements
3800 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3801 SmallVector<SDValue, 8> EltsV2F16;
3802
3803 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3804 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3805 // Based on first element decide which mod we match, neg or abs
3806 if (EltsV2F16.empty())
3807 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3808 if (ElV2f16->getOpcode() != ModOpcode)
3809 break;
3810 EltsV2F16.push_back(ElV2f16->getOperand(0));
3811 }
3812
3813 // All elements have ModOpcode modifier
3814 if (BV->getNumOperands() == EltsV2F16.size())
3815 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3816 32);
3817 }
3818
3819 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3820 return true;
3821}
3822
3823bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3824 SDValue &SrcMods) const {
3825 Src = In;
3826 unsigned Mods = SISrcMods::OP_SEL_1;
3827   SmallVector<SDValue, 8> EltsF32;
3828
3829 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3830 assert(BV->getNumOperands() > 0);
3831 // Based on first element decide which mod we match, neg or abs
3832 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3833 unsigned ModOpcode =
3834 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3835 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3836 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3837 if (ElF32.getOpcode() != ModOpcode)
3838 break;
3839 EltsF32.push_back(ElF32.getOperand(0));
3840 }
3841
3842 // All elements had ModOpcode modifier
3843 if (BV->getNumOperands() == EltsF32.size())
3844 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3845 32);
3846 }
3847
3848 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3849 return true;
3850}
3851
3852bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3853 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3854 BitVector UndefElements;
3855 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3856 if (isInlineImmediate(Splat.getNode())) {
3857 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3858 unsigned Imm = C->getAPIntValue().getSExtValue();
3859 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3860 return true;
3861 }
3862 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3863 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3864 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3865 return true;
3866 }
3867 llvm_unreachable("unhandled Constant node");
3868 }
3869 }
3870
3871 // 16 bit splat
3872 SDValue SplatSrc32 = stripBitcast(In);
3873 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3874 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3875 SDValue SplatSrc16 = stripBitcast(Splat32);
3876 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3877 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3878 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3879 std::optional<APInt> RawValue;
3880 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3881 RawValue = C->getValueAPF().bitcastToAPInt();
3882 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3883 RawValue = C->getAPIntValue();
3884
3885 if (RawValue.has_value()) {
3886 EVT VT = In.getValueType().getScalarType();
3887 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3888 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3889                                  ? APFloat::IEEEhalf()
3890                                  : APFloat::BFloat(),
3891                              RawValue.value());
3892 if (TII->isInlineConstant(FloatVal)) {
3893 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3894 MVT::i16);
3895 return true;
3896 }
3897 } else if (VT.getSimpleVT() == MVT::i16) {
3898 if (TII->isInlineConstant(RawValue.value())) {
3899 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3900 MVT::i16);
3901 return true;
3902 }
3903 } else
3904 llvm_unreachable("unknown 16-bit type");
3905 }
3906 }
3907 }
3908
3909 return false;
3910}
3911
3912bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3913 SDValue &IndexKey) const {
3914 unsigned Key = 0;
3915 Src = In;
3916
3917 if (In.getOpcode() == ISD::SRL) {
3918 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3919 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3920 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3921 ShiftAmt->getZExtValue() % 8 == 0) {
3922 Key = ShiftAmt->getZExtValue() / 8;
3923 Src = ShiftSrc;
3924 }
3925 }
3926
3927 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3928 return true;
3929}
3930
3931bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3932 SDValue &IndexKey) const {
3933 unsigned Key = 0;
3934 Src = In;
3935
3936 if (In.getOpcode() == ISD::SRL) {
3937 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3938 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3939 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3940 ShiftAmt->getZExtValue() == 16) {
3941 Key = 1;
3942 Src = ShiftSrc;
3943 }
3944 }
3945
3946 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3947 return true;
3948}
3949
3950bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
3951 SDValue &IndexKey) const {
3952 unsigned Key = 0;
3953 Src = In;
3954
3955 SDValue InI32;
3956
3957 if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
3958 const SDValue &ExtendSrc = In.getOperand(0);
3959 if (ExtendSrc.getValueSizeInBits() == 32)
3960 InI32 = ExtendSrc;
3961 } else if (In->getOpcode() == ISD::BITCAST) {
3962 const SDValue &CastSrc = In.getOperand(0);
3963 if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
3964 CastSrc.getOperand(0).getValueSizeInBits() == 32) {
3965 ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
3966 if (Zero && Zero->getZExtValue() == 0)
3967 InI32 = CastSrc.getOperand(0);
3968 }
3969 }
3970
3971 if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
3972 const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
3973 ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
3974 if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
3975 EltIdx->getZExtValue() == 1) {
3976 Key = 1;
3977 Src = ExtractVecEltSrc;
3978 }
3979 }
3980
3981 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3982 return true;
3983}
3984
3985bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3986 SDValue &SrcMods) const {
3987 Src = In;
3988 // FIXME: Handle op_sel
3989 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3990 return true;
3991}
3992
3993bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3994 SDValue &SrcMods) const {
3995 // FIXME: Handle op_sel
3996 return SelectVOP3Mods(In, Src, SrcMods);
3997}
3998
3999// Match lowered fpext from bf16 to f32. This is a bit operation extending
4000 // a 16-bit value with 16 bits of zeroes at the LSB:
4001//
4002// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
4003// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
4004// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
4005static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
4006 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
4007 return SDValue();
4008 Op = Op.getOperand(0);
4009
4010 IsExtractHigh = false;
4011 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
4012 auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
4013 if (!Low16 || !Low16->isZero())
4014 return SDValue();
4015 Op = stripBitcast(Op.getOperand(1));
4016 if (Op.getValueType() != MVT::bf16)
4017 return SDValue();
4018 return Op;
4019 }
4020
4021 if (Op.getValueType() != MVT::i32)
4022 return SDValue();
4023
4024 if (Op.getOpcode() == ISD::AND) {
4025 if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4026 if (Mask->getZExtValue() == 0xffff0000) {
4027 IsExtractHigh = true;
4028 return Op.getOperand(0);
4029 }
4030 }
4031 return SDValue();
4032 }
4033
4034 if (Op.getOpcode() == ISD::SHL) {
4035 if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4036 if (Amt->getZExtValue() == 16)
4037 return Op.getOperand(0);
4038 }
4039 }
4040
4041 return SDValue();
4042}
4043
4044// The return value is not whether the match is possible (which it always is),
4045 // but whether or not a conversion is really used.
4046bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
4047 unsigned &Mods,
4048 MVT VT) const {
4049 Mods = 0;
4050 SelectVOP3ModsImpl(In, Src, Mods);
4051
4052 bool IsExtractHigh = false;
4053 if (Src.getOpcode() == ISD::FP_EXTEND) {
4054 Src = Src.getOperand(0);
4055 } else if (VT == MVT::bf16) {
4056 SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
4057 if (!B16)
4058 return false;
4059 Src = B16;
4060 } else
4061 return false;
4062
4063 if (Src.getValueType() != VT &&
4064 (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
4065 return false;
4066
4067 Src = stripBitcast(Src);
4068
4069 // Be careful about folding modifiers if we already have an abs. fneg is
4070 // applied last, so we don't want to apply an earlier fneg.
4071 if ((Mods & SISrcMods::ABS) == 0) {
4072 unsigned ModsTmp;
4073 SelectVOP3ModsImpl(Src, Src, ModsTmp);
4074
4075 if ((ModsTmp & SISrcMods::NEG) != 0)
4076 Mods ^= SISrcMods::NEG;
4077
4078 if ((ModsTmp & SISrcMods::ABS) != 0)
4079 Mods |= SISrcMods::ABS;
4080 }
4081
4082 // op_sel/op_sel_hi decide the source type and source.
4083 // If the source's op_sel_hi is set, it indicates to do a conversion from
4084 // fp16. If the source's op_sel is set, it picks the high half of the source
4085 // register.
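// For example (illustrative): a source that is an fpext of the high f16 element
// of a v2f16 value gets OP_SEL_1 | OP_SEL_0 (convert from f16, taking the high
// half), while an fpext of a plain f16 value only gets OP_SEL_1.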
4086
4087 Mods |= SISrcMods::OP_SEL_1;
4088 if (Src.getValueSizeInBits() == 16) {
4089 if (isExtractHiElt(Src, Src)) {
4090 Mods |= SISrcMods::OP_SEL_0;
4091
4092 // TODO: Should we try to look for neg/abs here?
4093 return true;
4094 }
4095
4096 if (Src.getOpcode() == ISD::TRUNCATE &&
4097 Src.getOperand(0).getValueType() == MVT::i32) {
4098 Src = Src.getOperand(0);
4099 return true;
4100 }
4101
4102 if (Subtarget->useRealTrue16Insts())
4103       // In true16 mode, pack src into a 32-bit register.
4104 Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
4105 } else if (IsExtractHigh)
4106 Mods |= SISrcMods::OP_SEL_0;
4107
4108 return true;
4109}
4110
4111bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4112 SDValue &SrcMods) const {
4113 unsigned Mods = 0;
4114 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
4115 return false;
4116 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4117 return true;
4118}
4119
4120bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4121 SDValue &SrcMods) const {
4122 unsigned Mods = 0;
4123 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
4124 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4125 return true;
4126}
4127
4128bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4129 SDValue &SrcMods) const {
4130 unsigned Mods = 0;
4131 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
4132 return false;
4133 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4134 return true;
4135}
4136
4137bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4138 SDValue &SrcMods) const {
4139 unsigned Mods = 0;
4140 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
4141 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4142 return true;
4143}
4144
4145 // Match a BITOP3 operation and return the number of matched instructions plus
4146 // the truth table.
4147static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
4148                                               SmallVectorImpl<SDValue> &Src) {
4149   unsigned NumOpcodes = 0;
4150 uint8_t LHSBits, RHSBits;
4151
4152 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
4153 // Define truth table given Src0, Src1, Src2 bits permutations:
4154 // 0 0 0
4155 // 0 0 1
4156 // 0 1 0
4157 // 0 1 1
4158 // 1 0 0
4159 // 1 0 1
4160 // 1 1 0
4161 // 1 1 1
4162 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
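    // Worked example (illustrative): the function (src0 & src1) ^ src2 has the
    // truth table (0xf0 & 0xcc) ^ 0xaa = 0xc0 ^ 0xaa = 0x6a, where bit
    // (src0<<2 | src1<<1 | src2) of the table holds the result for that input.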
4163
4164 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
4165 if (C->isAllOnes()) {
4166 Bits = 0xff;
4167 return true;
4168 }
4169 if (C->isZero()) {
4170 Bits = 0;
4171 return true;
4172 }
4173 }
4174
4175 for (unsigned I = 0; I < Src.size(); ++I) {
4176 // Try to find existing reused operand
4177 if (Src[I] == Op) {
4178 Bits = SrcBits[I];
4179 return true;
4180 }
4181 // Try to replace parent operator
4182 if (Src[I] == In) {
4183 Bits = SrcBits[I];
4184 Src[I] = Op;
4185 return true;
4186 }
4187 }
4188
4189 if (Src.size() == 3) {
4190       // No room left for operands. Try one last time; there can be a 'not' of
4191       // one of our source operands. In this case we can compute the bits
4192       // without growing the Src vector.
4193 if (Op.getOpcode() == ISD::XOR) {
4194 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4195 if (C->isAllOnes()) {
4196 SDValue LHS = Op.getOperand(0);
4197 for (unsigned I = 0; I < Src.size(); ++I) {
4198 if (Src[I] == LHS) {
4199 Bits = ~SrcBits[I];
4200 return true;
4201 }
4202 }
4203 }
4204 }
4205 }
4206
4207 return false;
4208 }
4209
4210 Bits = SrcBits[Src.size()];
4211 Src.push_back(Op);
4212 return true;
4213 };
4214
4215 switch (In.getOpcode()) {
4216 case ISD::AND:
4217 case ISD::OR:
4218 case ISD::XOR: {
4219 SDValue LHS = In.getOperand(0);
4220 SDValue RHS = In.getOperand(1);
4221
4222 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
4223 if (!getOperandBits(LHS, LHSBits) ||
4224 !getOperandBits(RHS, RHSBits)) {
4225 Src = Backup;
4226 return std::make_pair(0, 0);
4227 }
4228
4229 // Recursion is naturally limited by the size of the operand vector.
4230 auto Op = BitOp3_Op(LHS, Src);
4231 if (Op.first) {
4232 NumOpcodes += Op.first;
4233 LHSBits = Op.second;
4234 }
4235
4236 Op = BitOp3_Op(RHS, Src);
4237 if (Op.first) {
4238 NumOpcodes += Op.first;
4239 RHSBits = Op.second;
4240 }
4241 break;
4242 }
4243 default:
4244 return std::make_pair(0, 0);
4245 }
4246
4247 uint8_t TTbl;
4248 switch (In.getOpcode()) {
4249 case ISD::AND:
4250 TTbl = LHSBits & RHSBits;
4251 break;
4252 case ISD::OR:
4253 TTbl = LHSBits | RHSBits;
4254 break;
4255 case ISD::XOR:
4256 TTbl = LHSBits ^ RHSBits;
4257 break;
4258 default:
4259 break;
4260 }
4261
4262 return std::make_pair(NumOpcodes + 1, TTbl);
4263}
4264
4265bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4266 SDValue &Src2, SDValue &Tbl) const {
4267   SmallVector<SDValue, 3> Src;
4268   uint8_t TTbl;
4269 unsigned NumOpcodes;
4270
4271 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
4272
4273   // The Src.empty() case can happen if all operands are all-zeros or all-ones
4274   // constants. Normally this should have been optimized out before reaching here.
4275 if (NumOpcodes < 2 || Src.empty())
4276 return false;
4277
4278   // For the uniform case the threshold should be higher to account for moves
4279   // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two can
4280   // be in SGPRs, with a readfirstlane afterwards.
4281 if (NumOpcodes < 4 && !In->isDivergent())
4282 return false;
4283
4284 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
4285 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4286 // asm more readable. This cannot be modeled with AddedComplexity because
4287     // the selector does not know how many operations we matched.
4288 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4289 (In.getOperand(0).getOpcode() == In.getOpcode() ||
4290 In.getOperand(1).getOpcode() == In.getOpcode()))
4291 return false;
4292
4293 if (In.getOpcode() == ISD::OR &&
4294 (In.getOperand(0).getOpcode() == ISD::AND ||
4295 In.getOperand(1).getOpcode() == ISD::AND))
4296 return false;
4297 }
4298
4299   // The last operand can be ignored, turning a ternary operation into a binary
4300   // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4301   // 'c' with 'a' here without changing the answer. In some pathological
4302   // cases it should even be possible to end up with a single-operand operation,
4303   // if the optimizer did not catch it.
4304 while (Src.size() < 3)
4305 Src.push_back(Src[0]);
4306
4307 Src0 = Src[0];
4308 Src1 = Src[1];
4309 Src2 = Src[2];
4310
4311 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
4312 return true;
4313}
4314
4315SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4316 if (In.isUndef())
4317 return CurDAG->getUNDEF(MVT::i32);
4318
4319 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
4320 SDLoc SL(In);
4321 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
4322 }
4323
4324 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
4325 SDLoc SL(In);
4326 return CurDAG->getConstant(
4327 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
4328 }
4329
4330 SDValue Src;
4331 if (isExtractHiElt(In, Src))
4332 return Src;
4333
4334 return SDValue();
4335}
4336
4337bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
4338 assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
4339
4340 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
4341 const SIInstrInfo *SII = Subtarget->getInstrInfo();
4342
4343 unsigned Limit = 0;
4344 bool AllUsesAcceptSReg = true;
4345 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
4346 Limit < 10 && U != E; ++U, ++Limit) {
4347 const TargetRegisterClass *RC =
4348 getOperandRegClass(U->getUser(), U->getOperandNo());
4349
4350     // If the register class is unknown, it could be an unknown register
4351     // class that needs to be an SGPR, e.g. one imposed by an inline asm
4352     // constraint.
4353 if (!RC || SIRI->isSGPRClass(RC))
4354 return false;
4355
4356 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
4357 AllUsesAcceptSReg = false;
4358 SDNode *User = U->getUser();
4359 if (User->isMachineOpcode()) {
4360 unsigned Opc = User->getMachineOpcode();
4361 const MCInstrDesc &Desc = SII->get(Opc);
4362 if (Desc.isCommutable()) {
4363 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
4364 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
4365 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
4366 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
4367 const TargetRegisterClass *CommutedRC =
4368 getOperandRegClass(U->getUser(), CommutedOpNo);
4369 if (CommutedRC == &AMDGPU::VS_32RegClass ||
4370 CommutedRC == &AMDGPU::VS_64RegClass)
4371 AllUsesAcceptSReg = true;
4372 }
4373 }
4374 }
4375       // If "AllUsesAcceptSReg == false" we have so far not succeeded in
4376       // commuting the current user. This means we have at least one use
4377       // that strictly requires a VGPR, so we will not attempt to commute
4378       // other user instructions.
4379 if (!AllUsesAcceptSReg)
4380 break;
4381 }
4382 }
4383 return !AllUsesAcceptSReg && (Limit < 10);
4384}
4385
4386bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
4387 const auto *Ld = cast<LoadSDNode>(N);
4388
4389 const MachineMemOperand *MMO = Ld->getMemOperand();
4390 if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO))
4391 return false;
4392
4393 return MMO->getSize().hasValue() &&
4394 Ld->getAlign() >=
4395 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
4396 uint64_t(4))) &&
4397 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4398 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
4399 (Subtarget->getScalarizeGlobalBehavior() &&
4400 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
4401 Ld->isSimple() &&
4402 static_cast<const SITargetLowering *>(getTargetLowering())
4403 ->isMemOpHasNoClobberedMemOperand(N)));
4404}
4405
4406 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
4407   const AMDGPUTargetLowering& Lowering =
4408       *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4409 bool IsModified = false;
4410 do {
4411 IsModified = false;
4412
4413 // Go over all selected nodes and try to fold them a bit more
4414 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4415 while (Position != CurDAG->allnodes_end()) {
4416 SDNode *Node = &*Position++;
4417       MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
4418       if (!MachineNode)
4419 continue;
4420
4421 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
4422 if (ResNode != Node) {
4423 if (ResNode)
4424 ReplaceUses(Node, ResNode);
4425 IsModified = true;
4426 }
4427 }
4428 CurDAG->RemoveDeadNodes();
4429 } while (IsModified);
4430}
4431
4436