LLVM 23.0.0git
AMDGPUISelDAGToDAG.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32
33#ifdef EXPENSIVE_CHECKS
35#include "llvm/IR/Dominators.h"
36#endif
37
38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Srl.getOperand(0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
79
80static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
81 llvm::SelectionDAG *CurDAG,
82 const GCNSubtarget *Subtarget) {
83 if (!Subtarget->useRealTrue16Insts()) {
84 return Lo;
85 }
86
87 SDValue NewSrc;
88 SDLoc SL(Lo);
89
90 if (Lo->isDivergent()) {
91 SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
92 SL, Lo.getValueType()),
93 0);
94 const SDValue Ops[] = {
95 CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
96 CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
97 CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};
98
99 NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
100 Src.getValueType(), Ops),
101 0);
102 } else {
103 // the S_MOV is needed since the Lo could still be a VGPR16.
104 // With S_MOV, isel insert a "sgpr32 = copy vgpr16" and we reply on
105 // the fixvgpr2sgprcopy pass to legalize it
106 NewSrc = SDValue(
107 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
108 0);
109 }
110
111 return NewSrc;
112}
113
114// Look through operations that obscure just looking at the low 16-bits of the
115// same register.
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(1);
119 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Src);
127 }
128
129 return In;
130}
131
132static SDValue emitRegSequence(llvm::SelectionDAG &CurDAG, unsigned DstRegClass,
133 EVT DstTy, ArrayRef<SDValue> Elts,
134 ArrayRef<unsigned> SubRegClass,
135 const SDLoc &DL) {
136 assert(Elts.size() == SubRegClass.size() && "array size mismatch");
137 unsigned NumElts = Elts.size();
138 SmallVector<SDValue, 17> Ops(2 * NumElts + 1);
139 Ops[0] = (CurDAG.getTargetConstant(DstRegClass, DL, MVT::i32));
140 for (unsigned i = 0; i < NumElts; ++i) {
141 Ops[2 * i + 1] = Elts[i];
142 Ops[2 * i + 2] = CurDAG.getTargetConstant(SubRegClass[i], DL, MVT::i32);
143 }
144 return SDValue(
145 CurDAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops), 0);
146}
147
148} // end anonymous namespace
149
151 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
152 false)
153INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
155#ifdef EXPENSIVE_CHECKS
158#endif
160 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
161 false)
162
163/// This pass converts a legalized DAG into an AMDGPU-specific
164/// DAG, ready for instruction scheduling.
166 CodeGenOptLevel OptLevel) {
167 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
168}
169
173
175 Subtarget = &MF.getSubtarget<GCNSubtarget>();
176 Subtarget->checkSubtargetFeatures(MF.getFunction());
177 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
179}
180
/// Classify the opcode \p Opc by whether its 16-bit result is known to have
/// zeroed high bits (purpose taken from the function name and the inline
/// comments below — confirm against callers).
///
/// \param Opc an ISD or AMDGPUISD opcode.
/// \returns for the first group of opcodes, true exactly when the subtarget
/// generation is at most GFX9; false for opcodes not listed.
///
/// NOTE(review): original lines 189, 232 and 236 are absent from this listing.
/// One case label and the return statements of the FP_ROUND and
/// FMA/FMAD/DIV_FIXUP groups are therefore not visible; as printed here those
/// groups would fall through to the default. Restore from the upstream file
/// before relying on this text.
181bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
182 // XXX - only need to list legal operations.
183 switch (Opc) {
184 case ISD::FADD:
185 case ISD::FSUB:
186 case ISD::FMUL:
187 case ISD::FDIV:
188 case ISD::FREM:
 // NOTE(review): original line 189 (presumably another case label) is missing.
190 case ISD::UINT_TO_FP:
191 case ISD::SINT_TO_FP:
192 case ISD::FABS:
193 // Fabs is lowered to a bit operation, but it's an and which will clear the
194 // high bits anyway.
195 case ISD::FSQRT:
196 case ISD::FSIN:
197 case ISD::FCOS:
198 case ISD::FPOWI:
199 case ISD::FPOW:
200 case ISD::FLOG:
201 case ISD::FLOG2:
202 case ISD::FLOG10:
203 case ISD::FEXP:
204 case ISD::FEXP2:
205 case ISD::FCEIL:
206 case ISD::FTRUNC:
207 case ISD::FRINT:
208 case ISD::FNEARBYINT:
209 case ISD::FROUNDEVEN:
210 case ISD::FROUND:
211 case ISD::FFLOOR:
212 case ISD::FMINNUM:
213 case ISD::FMAXNUM:
214 case ISD::FLDEXP:
215 case AMDGPUISD::FRACT:
216 case AMDGPUISD::CLAMP:
217 case AMDGPUISD::COS_HW:
218 case AMDGPUISD::SIN_HW:
219 case AMDGPUISD::FMIN3:
220 case AMDGPUISD::FMAX3:
221 case AMDGPUISD::FMED3:
222 case AMDGPUISD::FMAD_FTZ:
223 case AMDGPUISD::RCP:
224 case AMDGPUISD::RSQ:
225 case AMDGPUISD::RCP_IFLAG:
 // Every opcode listed above shares one answer, which depends only on the
 // subtarget generation.
226 // On gfx10, all 16-bit instructions preserve the high bits.
227 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
228 case ISD::FP_ROUND:
229 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
230 // high bits on gfx9.
231 // TODO: If we had the source node we could see if the source was fma/mad
 // NOTE(review): original line 232 (the statement ending this case) is
 // missing from the listing.
233 case ISD::FMA:
234 case ISD::FMAD:
235 case AMDGPUISD::DIV_FIXUP:
 // NOTE(review): original line 236 (the statement ending this group) is
 // missing from the listing.
237 default:
238 // fcopysign, select and others may be lowered to 32-bit bit operations
239 // which don't zero the high bits.
240 return false;
241 }
242}
243
245#ifdef EXPENSIVE_CHECKS
247 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
248 for (auto &L : LI->getLoopsInPreorder()) {
249 assert(L->isLCSSAForm(DT));
250 }
251#endif
253}
254
263
265 assert(Subtarget->d16PreservesUnusedBits());
266 MVT VT = N->getValueType(0).getSimpleVT();
267 if (VT != MVT::v2i16 && VT != MVT::v2f16)
268 return false;
269
270 SDValue Lo = N->getOperand(0);
271 SDValue Hi = N->getOperand(1);
272
273 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
274
275 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
276 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
277 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
278
279 // Need to check for possible indirect dependencies on the other half of the
280 // vector to avoid introducing a cycle.
281 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
282 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
283
284 SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
285 SDValue Ops[] = {
286 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
287 };
288
289 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
290 if (LdHi->getMemoryVT() == MVT::i8) {
291 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
292 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
293 } else {
294 assert(LdHi->getMemoryVT() == MVT::i16);
295 }
296
297 SDValue NewLoadHi =
298 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
299 Ops, LdHi->getMemoryVT(),
300 LdHi->getMemOperand());
301
302 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
303 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
304 return true;
305 }
306
307 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
308 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
309 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
310 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
311 if (LdLo && Lo.hasOneUse()) {
312 SDValue TiedIn = getHi16Elt(Hi);
313 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
314 return false;
315
316 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
317 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
318 if (LdLo->getMemoryVT() == MVT::i8) {
319 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
320 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
321 } else {
322 assert(LdLo->getMemoryVT() == MVT::i16);
323 }
324
325 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
326
327 SDValue Ops[] = {
328 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
329 };
330
331 SDValue NewLoadLo =
332 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
333 Ops, LdLo->getMemoryVT(),
334 LdLo->getMemOperand());
335
336 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
337 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
338 return true;
339 }
340
341 return false;
342}
343
345 if (!Subtarget->d16PreservesUnusedBits())
346 return;
347
348 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
349
350 bool MadeChange = false;
351 while (Position != CurDAG->allnodes_begin()) {
352 SDNode *N = &*--Position;
353 if (N->use_empty())
354 continue;
355
356 switch (N->getOpcode()) {
358 // TODO: Match load d16 from shl (extload:i16), 16
359 MadeChange |= matchLoadD16FromBuildVector(N);
360 break;
361 default:
362 break;
363 }
364 }
365
366 if (MadeChange) {
367 CurDAG->RemoveDeadNodes();
368 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
369 CurDAG->dump(););
370 }
371}
372
/// Check whether node \p N can be encoded as an inline constant.
///
/// \param N node to test.
/// \returns true for an undef node, or when SIInstrInfo::isInlineConstant
/// accepts the node's integer or floating-point constant value; false
/// otherwise.
///
/// NOTE(review): original lines 378 and 381 are absent from this listing. They
/// presumably hold the guards that bind `C` to an integer constant node and a
/// floating-point constant node respectively — confirm against the upstream
/// file.
373bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
 // Undef is always accepted.
374 if (N->isUndef())
375 return true;
376
377 const SIInstrInfo *TII = Subtarget->getInstrInfo();
 // Integer constant path (guard line 378 not visible).
379 return TII->isInlineConstant(C->getAPIntValue());
380
 // Floating-point constant path (guard line 381 not visible).
382 return TII->isInlineConstant(C->getValueAPF());
383
384 return false;
385}
386
387/// Determine the register class for \p OpNo
388/// \returns The register class of the virtual register that will be used for
389/// the given operand number \p OpNo or NULL if the register class cannot be
390/// determined.
// Three situations are handled:
//   * N is not a machine node: only ISD::CopyToReg is understood, and the
//     class is derived from the register in operand 1; anything else yields
//     nullptr.
//   * N is a REG_SEQUENCE: the class is the sub-class of the sequence's
//     register class that supports the relevant sub-register index.
//   * Any other machine node: the class comes from the instruction
//     description of the operand.
//
// NOTE(review): original line 397 is absent from this listing; it presumably
// declares `MRI`, which is used on the following line — confirm upstream.
391const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
392 unsigned OpNo) const {
393 if (!N->isMachineOpcode()) {
394 if (N->getOpcode() == ISD::CopyToReg) {
395 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
396 if (Reg.isVirtual()) {
 // Virtual register: ask the register info held in MRI (declaration on
 // the missing line 397).
398 return MRI.getRegClass(Reg);
399 }
400
 // Physical register: use its base class.
401 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
402 return TRI->getPhysRegBaseClass(Reg);
403 }
404
405 return nullptr;
406 }
407
408 switch (N->getMachineOpcode()) {
409 default: {
410 const SIInstrInfo *TII = Subtarget->getInstrInfo();
411 const MCInstrDesc &Desc = TII->get(N->getMachineOpcode());
 // OpNo counts from the first use operand, so skip over the defs in the
 // instruction description.
412 unsigned OpIdx = Desc.getNumDefs() + OpNo;
413 if (OpIdx >= Desc.getNumOperands())
414 return nullptr;
415
 // -1 means the description names no register class for this operand.
416 int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]);
417 if (RegClass == -1)
418 return nullptr;
419
420 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
421 }
422 case AMDGPU::REG_SEQUENCE: {
 // Operand 0 of a REG_SEQUENCE is the register class ID of the result.
423 unsigned RCID = N->getConstantOperandVal(0);
424 const TargetRegisterClass *SuperRC =
425 Subtarget->getRegisterInfo()->getRegClass(RCID);
426
 // The operand right after OpNo is read as the sub-register index
 // (presumably OpNo names a value operand of a (value, subreg) pair —
 // verify against callers).
427 SDValue SubRegOp = N->getOperand(OpNo + 1);
428 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
429 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
430 SubRegIdx);
431 }
432 }
433}
434
/// Rebuild \p N in place with a new chain and an extra trailing glue operand.
///
/// \param N        node to morph; opcode and value types are kept.
/// \param NewChain replaces operand 0 of \p N.
/// \param Glue     appended after the remaining original operands.
/// \returns the node produced by SelectionDAG::MorphNodeTo.
///
/// NOTE(review): original line 437 is absent from this listing; it presumably
/// declares the `Ops` container filled below — confirm upstream.
435SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
436 SDValue Glue) const {
438 Ops.push_back(NewChain); // Replace the chain.
 // Operands 1..e-1 are carried over unchanged.
439 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
440 Ops.push_back(N->getOperand(i));
441
442 Ops.push_back(Glue);
443 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
444}
445
446SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
447 const SITargetLowering& Lowering =
448 *static_cast<const SITargetLowering*>(getTargetLowering());
449
450 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
451
452 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
453 return glueCopyToOp(N, M0, M0.getValue(1));
454}
455
456SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
457 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
458 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
459 if (Subtarget->ldsRequiresM0Init())
460 return glueCopyToM0(
461 N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
462 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
463 MachineFunction &MF = CurDAG->getMachineFunction();
464 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
465 return
466 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
467 }
468 return N;
469}
470
471MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
472 EVT VT) const {
473 SDNode *Lo = CurDAG->getMachineNode(
474 AMDGPU::S_MOV_B32, DL, MVT::i32,
475 CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
476 SDNode *Hi = CurDAG->getMachineNode(
477 AMDGPU::S_MOV_B32, DL, MVT::i32,
478 CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
479 const SDValue Ops[] = {
480 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
481 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
482 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
483
484 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
485}
486
487SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
488 SelectionDAG &DAG) const {
489 // TODO: Handle undef as zero
490
491 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
492 uint32_t LHSVal, RHSVal;
493 if (getConstantValue(N->getOperand(0), LHSVal) &&
494 getConstantValue(N->getOperand(1), RHSVal)) {
495 SDLoc SL(N);
496 uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
497 return DAG.getMachineNode(
498 isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL,
499 N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32));
500 }
501
502 return nullptr;
503}
504
/// Select a vector-building node \p N into machine nodes.
///
/// \param N          node to select; the assert below shows that
///                   ISD::SCALAR_TO_VECTOR is accepted when fewer operands
///                   than result elements are present.
/// \param RegClassID register class ID used for the result.
///
/// Strategy, in order:
///   1. A single-element vector becomes COPY_TO_REGCLASS of its operand.
///   2. On AMDGCN with 64-bit literals, a 64-bit all-constant vector without
///      undef elements becomes S_MOV_B64_IMM_PSEUDO plus COPY_TO_REGCLASS.
///   3. Otherwise a REG_SEQUENCE with one (value, sub-register) pair per
///      element is emitted, padding absent elements with IMPLICIT_DEF.
///
/// NOTE(review): original lines 531, 567 and 579 are absent from this listing
/// (the guard binding `CF`, and the else-arm of both `Sub` conditionals).
505void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
506 EVT VT = N->getValueType(0);
507 unsigned NumVectorElts = VT.getVectorNumElements();
508 EVT EltVT = VT.getVectorElementType();
509 SDLoc DL(N);
510 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
511
 // Single element: no sequence needed, just constrain the register class.
512 if (NumVectorElts == 1) {
513 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
514 RegClass);
515 return;
516 }
517
518 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
519 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
520 CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
 // Accumulate the element bit patterns into one 64-bit literal, element I
 // landing at bit offset EltSize * I.
521 uint64_t C = 0;
522 bool AllConst = true;
523 unsigned EltSize = EltVT.getSizeInBits();
524 for (unsigned I = 0; I < NumVectorElts; ++I) {
525 SDValue Op = N->getOperand(I);
 // An undef element abandons the literal path.
526 if (Op.isUndef()) {
527 AllConst = false;
528 break;
529 }
530 uint64_t Val;
 // NOTE(review): original line 531 (presumably the floating-point constant
 // check that binds `CF`) is missing from the listing.
532 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
533 } else
534 Val = cast<ConstantSDNode>(Op)->getZExtValue();
535 C |= Val << (EltSize * I);
536 }
537 if (AllConst) {
538 SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
539 MachineSDNode *Copy =
540 CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
541 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
542 RegClass);
543 return;
544 }
545 }
546
547 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
548 "supported yet");
549 // 32 = Max Num Vector Elements
550 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
551 // 1 = Vector Register Class
552 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
553
554 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
555 bool IsRegSeq = true;
556 unsigned NOps = N->getNumOperands();
 // Number of 32-bit registers occupied by one element.
557 unsigned EltSizeInRegs = EltVT.getSizeInBits() / 32;
558 assert(IsGCN || EltSizeInRegs == 1);
559 for (unsigned i = 0; i < NOps; i++) {
560 // XXX: Why is this here?
561 if (isa<RegisterSDNode>(N->getOperand(i))) {
562 IsRegSeq = false;
563 break;
564 }
 // NOTE(review): the else-arm of this conditional (original line 567) is
 // missing from the listing.
565 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(
566 i * EltSizeInRegs, EltSizeInRegs)
568 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
569 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
570 }
571 if (NOps != NumVectorElts) {
572 // Fill in the missing undef elements if this was a scalar_to_vector.
573 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
 // One IMPLICIT_DEF node is shared by every padded element.
574 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
575 DL, EltVT);
576 for (unsigned i = NOps; i < NumVectorElts; ++i) {
 // NOTE(review): the else-arm of this conditional (original line 579) is
 // missing from the listing.
577 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(
578 i * EltSizeInRegs, EltSizeInRegs)
580 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
581 RegSeqArgs[1 + (2 * i) + 1] =
582 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
583 }
584 }
585
 // NOTE(review): there is no return after SelectCode(N), so the SelectNodeTo
 // below also runs when IsRegSeq is false — confirm this is intended.
586 if (!IsRegSeq)
587 SelectCode(N);
588 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
589}
590
592 EVT VT = N->getValueType(0);
593 EVT EltVT = VT.getVectorElementType();
594
595 // TODO: Handle 16-bit element vectors with even aligned masks.
596 if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
597 VT.getVectorNumElements() != 2) {
598 SelectCode(N);
599 return;
600 }
601
602 auto *SVN = cast<ShuffleVectorSDNode>(N);
603
604 SDValue Src0 = SVN->getOperand(0);
605 SDValue Src1 = SVN->getOperand(1);
606 ArrayRef<int> Mask = SVN->getMask();
607 SDLoc DL(N);
608
609 assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
610 Mask[0] < 4 && Mask[1] < 4);
611
612 SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
613 SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
614 unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
615 unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
616
617 if (Mask[0] < 0) {
618 Src0SubReg = Src1SubReg;
619 MachineSDNode *ImpDef =
620 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
621 VSrc0 = SDValue(ImpDef, 0);
622 }
623
624 if (Mask[1] < 0) {
625 Src1SubReg = Src0SubReg;
626 MachineSDNode *ImpDef =
627 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
628 VSrc1 = SDValue(ImpDef, 0);
629 }
630
631 // SGPR case needs to lower to copies.
632 //
633 // Also use subregister extract when we can directly blend the registers with
634 // a simple subregister copy.
635 //
636 // TODO: Maybe we should fold this out earlier
637 if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
638 Src1SubReg == AMDGPU::sub0) {
639 // The low element of the result always comes from src0.
640 // The high element of the result always comes from src1.
641 // op_sel selects the high half of src0.
642 // op_sel_hi selects the high half of src1.
643
644 unsigned Src0OpSel =
645 Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
646 unsigned Src1OpSel =
647 Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
648
649 // Enable op_sel_hi to avoid printing it. This should have no effect on the
650 // result.
651 Src0OpSel |= SISrcMods::OP_SEL_1;
652 Src1OpSel |= SISrcMods::OP_SEL_1;
653
654 SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
655 SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
656 SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
657
658 CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
659 {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
660 ZeroMods, // clamp
661 ZeroMods, // op_sel
662 ZeroMods, // op_sel_hi
663 ZeroMods, // neg_lo
664 ZeroMods}); // neg_hi
665 return;
666 }
667
668 SDValue ResultElt0 =
669 CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
670 SDValue ResultElt1 =
671 CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
672
673 const SDValue Ops[] = {
674 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
675 ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
676 ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
677 CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
678}
679
681 unsigned int Opc = N->getOpcode();
682 if (N->isMachineOpcode()) {
683 N->setNodeId(-1);
684 return; // Already selected.
685 }
686
687 // isa<MemSDNode> almost works but is slightly too permissive for some DS
688 // intrinsics.
689 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
690 N = glueCopyToM0LDSInit(N);
691 SelectCode(N);
692 return;
693 }
694
695 switch (Opc) {
696 default:
697 break;
698 case ISD::UADDO_CARRY:
699 case ISD::USUBO_CARRY:
700 if (N->getValueType(0) == MVT::i64) {
701 SelectAddcSubbI64(N);
702 return;
703 }
704
705 if (N->getValueType(0) != MVT::i32)
706 break;
707
708 SelectAddcSubb(N);
709 return;
710 case ISD::UADDO:
711 case ISD::USUBO: {
712 if (N->getValueType(0) == MVT::i64) {
713 SelectAddcSubbI64(N);
714 return;
715 }
716
717 SelectUADDO_USUBO(N);
718 return;
719 }
720 case AMDGPUISD::FMUL_W_CHAIN: {
721 SelectFMUL_W_CHAIN(N);
722 return;
723 }
724 case AMDGPUISD::FMA_W_CHAIN: {
725 SelectFMA_W_CHAIN(N);
726 return;
727 }
728
730 case ISD::BUILD_VECTOR: {
731 EVT VT = N->getValueType(0);
732 unsigned NumVectorElts = VT.getVectorNumElements();
733 if (VT.getScalarSizeInBits() == 16) {
734 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
735 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
736 ReplaceNode(N, Packed);
737 return;
738 }
739 }
740
741 break;
742 }
743
744 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
745 EVT EltTy = VT.getVectorElementType();
746 assert(EltTy.bitsEq(MVT::i32) || EltTy.bitsEq(MVT::i64));
747 unsigned VecInBits = NumVectorElts * EltTy.getScalarSizeInBits();
748 const TargetRegisterClass *RegClass =
749 N->isDivergent() ? TRI->getDefaultVectorSuperClassForBitWidth(VecInBits)
751
752 SelectBuildVector(N, RegClass->getID());
753 return;
754 }
757 return;
758 case ISD::BUILD_PAIR: {
759 SDValue RC, SubReg0, SubReg1;
760 SDLoc DL(N);
761 if (N->getValueType(0) == MVT::i128) {
762 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
763 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
764 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
765 } else if (N->getValueType(0) == MVT::i64) {
766 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
767 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
768 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
769 } else {
770 llvm_unreachable("Unhandled value type for BUILD_PAIR");
771 }
772 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
773 N->getOperand(1), SubReg1 };
774 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
775 N->getValueType(0), Ops));
776 return;
777 }
778
779 case ISD::Constant:
780 case ISD::ConstantFP: {
781 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
782 Subtarget->has64BitLiterals())
783 break;
784
785 uint64_t Imm;
787 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
788 if (AMDGPU::isValid32BitLiteral(Imm, true))
789 break;
790 } else {
792 Imm = C->getZExtValue();
793 if (AMDGPU::isValid32BitLiteral(Imm, false))
794 break;
795 }
796
797 SDLoc DL(N);
798 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
799 return;
800 }
801 case AMDGPUISD::BFE_I32:
802 case AMDGPUISD::BFE_U32: {
803 // There is a scalar version available, but unlike the vector version which
804 // has a separate operand for the offset and width, the scalar version packs
805 // the width and offset into a single operand. Try to move to the scalar
806 // version if the offsets are constant, so that we can try to keep extended
807 // loads of kernel arguments in SGPRs.
808
809 // TODO: Technically we could try to pattern match scalar bitshifts of
810 // dynamic values, but it's probably not useful.
812 if (!Offset)
813 break;
814
815 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
816 if (!Width)
817 break;
818
819 bool Signed = Opc == AMDGPUISD::BFE_I32;
820
821 uint32_t OffsetVal = Offset->getZExtValue();
822 uint32_t WidthVal = Width->getZExtValue();
823
824 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
825 WidthVal));
826 return;
827 }
828 case AMDGPUISD::DIV_SCALE: {
829 SelectDIV_SCALE(N);
830 return;
831 }
834 SelectMAD_64_32(N);
835 return;
836 }
837 case ISD::SMUL_LOHI:
838 case ISD::UMUL_LOHI:
839 return SelectMUL_LOHI(N);
840 case ISD::CopyToReg: {
842 *static_cast<const SITargetLowering*>(getTargetLowering());
843 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
844 break;
845 }
846 case ISD::AND:
847 case ISD::SRL:
848 case ISD::SRA:
850 if (N->getValueType(0) != MVT::i32)
851 break;
852
853 SelectS_BFE(N);
854 return;
855 case ISD::BRCOND:
856 SelectBRCOND(N);
857 return;
858 case ISD::FP_EXTEND:
859 SelectFP_EXTEND(N);
860 return;
861 case AMDGPUISD::CVT_PKRTZ_F16_F32:
862 case AMDGPUISD::CVT_PKNORM_I16_F32:
863 case AMDGPUISD::CVT_PKNORM_U16_F32:
864 case AMDGPUISD::CVT_PK_U16_U32:
865 case AMDGPUISD::CVT_PK_I16_I32: {
866 // Hack around using a legal type if f16 is illegal.
867 if (N->getValueType(0) == MVT::i32) {
868 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
869 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
870 { N->getOperand(0), N->getOperand(1) });
871 SelectCode(N);
872 return;
873 }
874
875 break;
876 }
878 SelectINTRINSIC_W_CHAIN(N);
879 return;
880 }
882 SelectINTRINSIC_WO_CHAIN(N);
883 return;
884 }
885 case ISD::INTRINSIC_VOID: {
886 SelectINTRINSIC_VOID(N);
887 return;
888 }
890 SelectWAVE_ADDRESS(N);
891 return;
892 }
893 case ISD::STACKRESTORE: {
894 SelectSTACKRESTORE(N);
895 return;
896 }
897 }
898
899 SelectCode(N);
900}
901
903 if (!Subtarget->hasSDWA())
904 return false;
905
906 if (N->getOpcode() == ISD::SIGN_EXTEND_INREG) {
907 EVT VT = cast<VTSDNode>(N->getOperand(1))->getVT();
908 return VT.getScalarSizeInBits() == 8 || VT.getScalarSizeInBits() == 16;
909 }
910
911 if (N->getOpcode() == ISD::AND)
912 if (auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)))
913 return RHS->getZExtValue() == 0xFF || RHS->getZExtValue() == 0xFFFF;
914
915 if (N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL)
916 if (auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)))
917 return (RHS->getZExtValue() % 8) == 0;
918
919 return false;
920}
921
922bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
923 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
924 const Instruction *Term = BB->getTerminator();
925 return Term->getMetadata("amdgpu.uniform") ||
926 Term->getMetadata("structurizecfg.uniform");
927}
928
929bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
930 unsigned ShAmtBits) const {
931 assert(N->getOpcode() == ISD::AND);
932
933 const APInt &RHS = N->getConstantOperandAPInt(1);
934 if (RHS.countr_one() >= ShAmtBits)
935 return true;
936
937 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
938 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
939}
940
942 SDValue &N0, SDValue &N1) {
943 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
945 // As we split 64-bit `or` earlier, it's a complicated pattern to match, i.e.
946 // (i64 (bitcast (v2i32 (build_vector
947 // (or (extract_vector_elt V, 0), OFFSET),
948 // (extract_vector_elt V, 1)))))
949 SDValue Lo = Addr.getOperand(0).getOperand(0);
950 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
951 SDValue BaseLo = Lo.getOperand(0);
952 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
953 // Check that split base (Lo and Hi) are extracted from the same one.
954 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
956 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
957 // Lo is statically extracted from index 0.
958 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
959 BaseLo.getConstantOperandVal(1) == 0 &&
960 // Hi is statically extracted from index 1.
961 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
962 BaseHi.getConstantOperandVal(1) == 1) {
963 N0 = BaseLo.getOperand(0).getOperand(0);
964 N1 = Lo.getOperand(1);
965 return true;
966 }
967 }
968 }
969 return false;
970}
971
/// Split \p Addr into a base and a constant offset.
///
/// \param Addr address to decompose.
/// \param LHS  receives operand 0 of \p Addr when the generic
///             SelectionDAG::isBaseWithConstantOffset check succeeds.
/// \param RHS  receives operand 1 of \p Addr in that same case.
/// \returns true when a decomposition was found, false otherwise.
///
/// NOTE(review): original lines 980-981 are absent from this listing; they
/// presumably open a second, fallback match whose body is the `return true`
/// below — confirm against the upstream file.
972bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
973 SDValue &RHS) const {
 // Fast path: the generic base-plus-constant form.
974 if (CurDAG->isBaseWithConstantOffset(Addr)) {
975 LHS = Addr.getOperand(0);
976 RHS = Addr.getOperand(1);
977 return true;
978 }
979
 // Fallback match (its condition, lines 980-981, is not visible here).
982 return true;
983 }
984
985 return false;
986}
987
989 return "AMDGPU DAG->DAG Pattern Instruction Selection";
990}
991
995
999#ifdef EXPENSIVE_CHECKS
1001 .getManager();
1002 auto &F = MF.getFunction();
1003 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
1004 LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
1005 for (auto &L : LI.getLoopsInPreorder())
1006 assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
1007#endif
1008 return SelectionDAGISelPass::run(MF, MFAM);
1009}
1010
1011//===----------------------------------------------------------------------===//
1012// Complex Patterns
1013//===----------------------------------------------------------------------===//
1014
1015bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
1016 SDValue &Offset) {
1017 return false;
1018}
1019
/// Split an indirect address into a base and a 32-bit target-constant offset.
///
/// \param Addr   address to decompose.
/// \param Base   receives the base: the R600 INDIRECT_BASE_ADDR register for
///               purely constant addresses, operand 0 of an ADD/OR with a
///               constant right-hand side, or \p Addr itself.
/// \param Offset receives the constant offset, or 0 when none was found.
/// \returns always true.
///
/// NOTE(review): original line 1022 is absent from this listing; it presumably
/// declares the `C` pointer assigned in the conditions below — confirm
/// upstream.
1020bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
1021 SDValue &Offset) {
1023 SDLoc DL(Addr);
1024
 // Case 1: the address is itself a constant.
1025 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
1026 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1027 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
 // Case 2: a DWORDADDR node wrapping a constant.
1028 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
1029 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
1030 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1031 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
 // Case 3: ADD or OR with a constant second operand.
1032 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
1033 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
1034 Base = Addr.getOperand(0);
1035 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
 // Fallback: use the whole address as the base with a zero offset.
1036 } else {
1037 Base = Addr;
1038 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1039 }
1040
1041 return true;
1042}
1043
1044SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1045 const SDLoc &DL) const {
1046 SDNode *Mov = CurDAG->getMachineNode(
1047 AMDGPU::S_MOV_B32, DL, MVT::i32,
1048 CurDAG->getTargetConstant(Val, DL, MVT::i32));
1049 return SDValue(Mov, 0);
1050}
1051
1052void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1053 SDValue LHS = N->getOperand(0);
1054 SDValue RHS = N->getOperand(1);
1055 SDValue CI = N->getOperand(2);
1056
1057 if (N->isDivergent()) {
1058 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1059 : AMDGPU::V_SUBB_U32_e64;
1060 CurDAG->SelectNodeTo(
1061 N, Opc, N->getVTList(),
1062 {LHS, RHS, CI,
1063 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1064 } else {
1065 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1066 : AMDGPU::S_SUB_CO_PSEUDO;
1067 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1068 }
1069}
1070
1071void AMDGPUDAGToDAGISel::SelectAddcSubbI64(SDNode *N) {
1072 SDLoc DL(N);
1073 SDValue LHS = N->getOperand(0);
1074 SDValue RHS = N->getOperand(1);
1075
1076 unsigned Opcode = N->getOpcode();
1077 bool ConsumeCarry = Opcode == ISD::UADDO_CARRY || Opcode == ISD::USUBO_CARRY;
1078 bool IsAdd = Opcode == ISD::UADDO || Opcode == ISD::UADDO_CARRY;
1079
1080 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1081 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1082
1083 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
1084 MVT::i32, LHS, Sub0);
1085 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
1086 MVT::i32, LHS, Sub1);
1087
1088 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
1089 MVT::i32, RHS, Sub0);
1090 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
1091 MVT::i32, RHS, Sub1);
1092
1093 SDVTList VTList = CurDAG->getVTList(MVT::i32, N->getValueType(1));
1094
1095 static const unsigned NoCarryOpcMap[2][2] = {
1096 {AMDGPU::S_USUBO_PSEUDO, AMDGPU::S_UADDO_PSEUDO},
1097 {AMDGPU::V_SUB_CO_U32_e64, AMDGPU::V_ADD_CO_U32_e64}};
1098 static const unsigned CarryOpcMap[2][2] = {
1099 {AMDGPU::S_SUB_CO_PSEUDO, AMDGPU::S_ADD_CO_PSEUDO},
1100 {AMDGPU::V_SUBB_U32_e64, AMDGPU::V_ADDC_U32_e64}};
1101
1102 bool IsVALU = N->isDivergent();
1103
1104 unsigned NoCarryOpc = NoCarryOpcMap[IsVALU][IsAdd];
1105 unsigned CarryOpc = CarryOpcMap[IsVALU][IsAdd];
1106 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1107
1108 SDNode *AddLo;
1109 if (!ConsumeCarry) {
1110 if (IsVALU) {
1111 SDValue Args[] = {SDValue(Lo0, 0), SDValue(Lo1, 0), Clamp};
1112 AddLo = CurDAG->getMachineNode(NoCarryOpc, DL, VTList, Args);
1113 } else {
1114 SDValue Args[] = {SDValue(Lo0, 0), SDValue(Lo1, 0)};
1115 AddLo = CurDAG->getMachineNode(NoCarryOpc, DL, VTList, Args);
1116 }
1117 } else {
1118 if (IsVALU) {
1119 SDValue Args[] = {SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2),
1120 Clamp};
1121 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1122 } else {
1123 SDValue Args[] = {SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2)};
1124 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1125 }
1126 }
1127
1128 SDNode *AddHi;
1129 if (IsVALU) {
1130 SDValue Args[] = {SDValue(Hi0, 0), SDValue(Hi1, 0), SDValue(AddLo, 1),
1131 Clamp};
1132 AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1133 } else {
1134 SDValue Args[] = {SDValue(Hi0, 0), SDValue(Hi1, 0), SDValue(AddLo, 1)};
1135 AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1136 }
1137
1138 unsigned RC = IsVALU ? AMDGPU::VReg_64RegClassID : AMDGPU::SReg_64RegClassID;
1139 SDValue RegSequenceArgs[] = {CurDAG->getTargetConstant(RC, DL, MVT::i32),
1140 SDValue(AddLo, 0), Sub0, SDValue(AddHi, 0),
1141 Sub1};
1142 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1143 MVT::i64, RegSequenceArgs);
1144
1145 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
1146 ReplaceNode(N, RegSequence);
1147}
1148
1149void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1150 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1151 // carry out despite the _i32 name. These were renamed in VI to _U32.
1152 // FIXME: We should probably rename the opcodes here.
1153 bool IsAdd = N->getOpcode() == ISD::UADDO;
1154 bool IsVALU = N->isDivergent();
1155
1156 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1157 ++UI)
1158 if (UI.getUse().getResNo() == 1) {
1159 if (UI->isMachineOpcode()) {
1160 if (UI->getMachineOpcode() !=
1161 (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
1162 IsVALU = true;
1163 break;
1164 }
1165 } else {
1166 if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
1167 IsVALU = true;
1168 break;
1169 }
1170 }
1171 }
1172
1173 if (IsVALU) {
1174 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1175
1176 CurDAG->SelectNodeTo(
1177 N, Opc, N->getVTList(),
1178 {N->getOperand(0), N->getOperand(1),
1179 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1180 } else {
1181 unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
1182
1183 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1184 {N->getOperand(0), N->getOperand(1)});
1185 }
1186}
1187
1188void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1189 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1190 SDValue Ops[10];
1191
1192 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1193 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1194 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1195 Ops[8] = N->getOperand(0);
1196 Ops[9] = N->getOperand(4);
1197
1198 // If there are no source modifiers, prefer fmac over fma because it can use
1199 // the smaller VOP2 encoding.
1200 bool UseFMAC = Subtarget->hasDLInsts() &&
1201 cast<ConstantSDNode>(Ops[0])->isZero() &&
1202 cast<ConstantSDNode>(Ops[2])->isZero() &&
1203 cast<ConstantSDNode>(Ops[4])->isZero();
1204 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1205 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
1206}
1207
1208void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1209 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1210 SDValue Ops[8];
1211
1212 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1213 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1214 Ops[6] = N->getOperand(0);
1215 Ops[7] = N->getOperand(3);
1216
1217 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1218}
1219
1220// We need to handle this here because tablegen doesn't support matching
1221// instructions with multiple outputs.
1222void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1223 EVT VT = N->getValueType(0);
1224
1225 assert(VT == MVT::f32 || VT == MVT::f64);
1226
1227 unsigned Opc
1228 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1229
1230 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1231 // omod
1232 SDValue Ops[8];
1233 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1234 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1235 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1236 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1237}
1238
1239// We need to handle this here because tablegen doesn't support matching
1240// instructions with multiple outputs.
1241void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1242 SDLoc SL(N);
1243 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1244 unsigned Opc;
1245 bool UseNoCarry = Subtarget->hasMadNC64_32Insts() && !N->hasAnyUseOfValue(1);
1246 if (Subtarget->hasMADIntraFwdBug())
1247 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1248 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1249 else if (UseNoCarry)
1250 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1251 else
1252 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1253
1254 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1255 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1256 Clamp };
1257
1258 if (UseNoCarry) {
1259 MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
1260 ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
1261 CurDAG->RemoveDeadNode(N);
1262 return;
1263 }
1264
1265 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1266}
1267
1268// We need to handle this here because tablegen doesn't support matching
1269// instructions with multiple outputs.
1270void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1271 SDLoc SL(N);
1272 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1273 SDVTList VTList;
1274 unsigned Opc;
1275 if (Subtarget->hasMadNC64_32Insts()) {
1276 VTList = CurDAG->getVTList(MVT::i64);
1277 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1278 } else {
1279 VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
1280 if (Subtarget->hasMADIntraFwdBug()) {
1281 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1282 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1283 } else {
1284 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1285 }
1286 }
1287
1288 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1289 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1290 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1291 SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
1292 if (!SDValue(N, 0).use_empty()) {
1293 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1294 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1295 MVT::i32, SDValue(Mad, 0), Sub0);
1296 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1297 }
1298 if (!SDValue(N, 1).use_empty()) {
1299 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1300 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1301 MVT::i32, SDValue(Mad, 0), Sub1);
1302 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1303 }
1304 CurDAG->RemoveDeadNode(N);
1305}
1306
1307bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1308 if (!isUInt<16>(Offset))
1309 return false;
1310
1311 if (!Base || Subtarget->hasUsableDSOffset() ||
1312 Subtarget->unsafeDSOffsetFoldingEnabled())
1313 return true;
1314
1315 // On Southern Islands instruction with a negative base value and an offset
1316 // don't seem to work.
1317 return CurDAG->SignBitIsZero(Base);
1318}
1319
1320bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1321 SDValue &Offset) const {
1322 SDLoc DL(Addr);
1323 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1324 SDValue N0 = Addr.getOperand(0);
1325 SDValue N1 = Addr.getOperand(1);
1326 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1327 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1328 // (add n0, c0)
1329 Base = N0;
1330 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1331 return true;
1332 }
1333 } else if (Addr.getOpcode() == ISD::SUB) {
1334 // sub C, x -> add (sub 0, x), C
1335 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1336 int64_t ByteOffset = C->getSExtValue();
1337 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1338 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1339
1340 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1341 // the known bits in isDSOffsetLegal. We need to emit the selected node
1342 // here, so this is thrown away.
1343 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1344 Zero, Addr.getOperand(1));
1345
1346 if (isDSOffsetLegal(Sub, ByteOffset)) {
1348 Opnds.push_back(Zero);
1349 Opnds.push_back(Addr.getOperand(1));
1350
1351 // FIXME: Select to VOP3 version for with-carry.
1352 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1353 if (Subtarget->hasAddNoCarryInsts()) {
1354 SubOp = AMDGPU::V_SUB_U32_e64;
1355 Opnds.push_back(
1356 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1357 }
1358
1359 MachineSDNode *MachineSub =
1360 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1361
1362 Base = SDValue(MachineSub, 0);
1363 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1364 return true;
1365 }
1366 }
1367 }
1368 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1369 // If we have a constant address, prefer to put the constant into the
1370 // offset. This can save moves to load the constant address since multiple
1371 // operations can share the zero base address register, and enables merging
1372 // into read2 / write2 instructions.
1373
1374 SDLoc DL(Addr);
1375
1376 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1377 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1378 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1379 DL, MVT::i32, Zero);
1380 Base = SDValue(MovZero, 0);
1381 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1382 return true;
1383 }
1384 }
1385
1386 // default case
1387 Base = Addr;
1388 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1389 return true;
1390}
1391
1392bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1393 unsigned Offset1,
1394 unsigned Size) const {
1395 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1396 return false;
1397 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1398 return false;
1399
1400 if (!Base || Subtarget->hasUsableDSOffset() ||
1401 Subtarget->unsafeDSOffsetFoldingEnabled())
1402 return true;
1403
1404 // On Southern Islands instruction with a negative base value and an offset
1405 // don't seem to work.
1406 return CurDAG->SignBitIsZero(Base);
1407}
1408
1409// Return whether the operation has NoUnsignedWrap property.
1410static bool isNoUnsignedWrap(SDValue Addr) {
1411 return (Addr.getOpcode() == ISD::ADD &&
1412 Addr->getFlags().hasNoUnsignedWrap()) ||
1413 Addr->getOpcode() == ISD::OR;
1414}
1415
1416// Check that the base address of flat scratch load/store in the form of `base +
1417// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1418// requirement). We always treat the first operand as the base address here.
1419bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1420 if (isNoUnsignedWrap(Addr))
1421 return true;
1422
1423 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1424 // values.
1425 if (Subtarget->hasSignedScratchOffsets())
1426 return true;
1427
1428 auto LHS = Addr.getOperand(0);
1429 auto RHS = Addr.getOperand(1);
1430
1431 // If the immediate offset is negative and within certain range, the base
1432 // address cannot also be negative. If the base is also negative, the sum
1433 // would be either negative or much larger than the valid range of scratch
1434 // memory a thread can access.
1435 ConstantSDNode *ImmOp = nullptr;
1436 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1437 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1438 return true;
1439 }
1440
1441 return CurDAG->SignBitIsZero(LHS);
1442}
1443
1444// Check address value in SGPR/VGPR are legal for flat scratch in the form
1445// of: SGPR + VGPR.
1446bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1447 if (isNoUnsignedWrap(Addr))
1448 return true;
1449
1450 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1451 // values.
1452 if (Subtarget->hasSignedScratchOffsets())
1453 return true;
1454
1455 auto LHS = Addr.getOperand(0);
1456 auto RHS = Addr.getOperand(1);
1457 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1458}
1459
1460// Check address value in SGPR/VGPR are legal for flat scratch in the form
1461// of: SGPR + VGPR + Imm.
1462bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1463 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1464 // values.
1465 if (AMDGPU::isGFX12Plus(*Subtarget))
1466 return true;
1467
1468 auto Base = Addr.getOperand(0);
1469 auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1470 // If the immediate offset is negative and within certain range, the base
1471 // address cannot also be negative. If the base is also negative, the sum
1472 // would be either negative or much larger than the valid range of scratch
1473 // memory a thread can access.
1474 if (isNoUnsignedWrap(Base) &&
1475 (isNoUnsignedWrap(Addr) ||
1476 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1477 return true;
1478
1479 auto LHS = Base.getOperand(0);
1480 auto RHS = Base.getOperand(1);
1481 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1482}
1483
1484// TODO: If offset is too big, put low 16-bit into offset.
1485bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1486 SDValue &Offset0,
1487 SDValue &Offset1) const {
1488 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1489}
1490
1491bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1492 SDValue &Offset0,
1493 SDValue &Offset1) const {
1494 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1495}
1496
1497bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1498 SDValue &Offset0, SDValue &Offset1,
1499 unsigned Size) const {
1500 SDLoc DL(Addr);
1501
1502 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1503 SDValue N0 = Addr.getOperand(0);
1504 SDValue N1 = Addr.getOperand(1);
1505 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1506 unsigned OffsetValue0 = C1->getZExtValue();
1507 unsigned OffsetValue1 = OffsetValue0 + Size;
1508
1509 // (add n0, c0)
1510 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1511 Base = N0;
1512 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1513 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1514 return true;
1515 }
1516 } else if (Addr.getOpcode() == ISD::SUB) {
1517 // sub C, x -> add (sub 0, x), C
1518 if (const ConstantSDNode *C =
1520 unsigned OffsetValue0 = C->getZExtValue();
1521 unsigned OffsetValue1 = OffsetValue0 + Size;
1522
1523 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1524 SDLoc DL(Addr);
1525 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1526
1527 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1528 // the known bits in isDSOffsetLegal. We need to emit the selected node
1529 // here, so this is thrown away.
1530 SDValue Sub =
1531 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1532
1533 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1535 Opnds.push_back(Zero);
1536 Opnds.push_back(Addr.getOperand(1));
1537 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1538 if (Subtarget->hasAddNoCarryInsts()) {
1539 SubOp = AMDGPU::V_SUB_U32_e64;
1540 Opnds.push_back(
1541 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1542 }
1543
1544 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1545 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1546
1547 Base = SDValue(MachineSub, 0);
1548 Offset0 =
1549 CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1550 Offset1 =
1551 CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1552 return true;
1553 }
1554 }
1555 }
1556 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1557 unsigned OffsetValue0 = CAddr->getZExtValue();
1558 unsigned OffsetValue1 = OffsetValue0 + Size;
1559
1560 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1561 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1562 MachineSDNode *MovZero =
1563 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1564 Base = SDValue(MovZero, 0);
1565 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1566 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1567 return true;
1568 }
1569 }
1570
1571 // default case
1572
1573 Base = Addr;
1574 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1575 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1576 return true;
1577}
1578
1579bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1580 SDValue &SOffset, SDValue &Offset,
1581 SDValue &Offen, SDValue &Idxen,
1582 SDValue &Addr64) const {
1583 // Subtarget prefers to use flat instruction
1584 // FIXME: This should be a pattern predicate and not reach here
1585 if (Subtarget->useFlatForGlobal())
1586 return false;
1587
1588 SDLoc DL(Addr);
1589
1590 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1591 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1592 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1593 SOffset = Subtarget->hasRestrictedSOffset()
1594 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1595 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1596
1597 ConstantSDNode *C1 = nullptr;
1598 SDValue N0 = Addr;
1599 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1600 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1601 if (isUInt<32>(C1->getZExtValue()))
1602 N0 = Addr.getOperand(0);
1603 else
1604 C1 = nullptr;
1605 }
1606
1607 if (N0->isAnyAdd()) {
1608 // (add N2, N3) -> addr64, or
1609 // (add (add N2, N3), C1) -> addr64
1610 SDValue N2 = N0.getOperand(0);
1611 SDValue N3 = N0.getOperand(1);
1612 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1613
1614 if (N2->isDivergent()) {
1615 if (N3->isDivergent()) {
1616 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1617 // addr64, and construct the resource from a 0 address.
1618 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1619 VAddr = N0;
1620 } else {
1621 // N2 is divergent, N3 is not.
1622 Ptr = N3;
1623 VAddr = N2;
1624 }
1625 } else {
1626 // N2 is not divergent.
1627 Ptr = N2;
1628 VAddr = N3;
1629 }
1630 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1631 } else if (N0->isDivergent()) {
1632 // N0 is divergent. Use it as the addr64, and construct the resource from a
1633 // 0 address.
1634 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1635 VAddr = N0;
1636 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1637 } else {
1638 // N0 -> offset, or
1639 // (N0 + C1) -> offset
1640 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1641 Ptr = N0;
1642 }
1643
1644 if (!C1) {
1645 // No offset.
1646 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1647 return true;
1648 }
1649
1650 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1651 if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1652 // Legal offset for instruction.
1653 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1654 return true;
1655 }
1656
1657 // Illegal offset, store it in soffset.
1658 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1659 SOffset =
1660 SDValue(CurDAG->getMachineNode(
1661 AMDGPU::S_MOV_B32, DL, MVT::i32,
1662 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1663 0);
1664 return true;
1665}
1666
1667bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1668 SDValue &VAddr, SDValue &SOffset,
1669 SDValue &Offset) const {
1670 SDValue Ptr, Offen, Idxen, Addr64;
1671
1672 // addr64 bit was removed for volcanic islands.
1673 // FIXME: This should be a pattern predicate and not reach here
1674 if (!Subtarget->hasAddr64())
1675 return false;
1676
1677 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1678 return false;
1679
1680 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1681 if (C->getSExtValue()) {
1682 SDLoc DL(Addr);
1683
1684 const SITargetLowering& Lowering =
1685 *static_cast<const SITargetLowering*>(getTargetLowering());
1686
1687 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1688 return true;
1689 }
1690
1691 return false;
1692}
1693
1694std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1695 SDLoc DL(N);
1696
1697 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1698 SDValue TFI =
1699 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1700
1701 // We rebase the base address into an absolute stack address and hence
1702 // use constant 0 for soffset. This value must be retained until
1703 // frame elimination and eliminateFrameIndex will choose the appropriate
1704 // frame register if need be.
1705 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1706}
1707
// Matches a scratch address for a MUBUF access that uses a vector address.
//
// Rsrc is always set to the function's scratch resource register. The address
// is then split into VAddr, SOffset and ImmOffset in one of three ways:
//  - a non-null constant: bits above the maximum MUBUF immediate go into VAddr
//    through a V_MOV_B32, the remaining low bits into ImmOffset;
//  - (base + constant) with a legal immediate: the base goes through
//    foldFrameIndex and the constant becomes ImmOffset;
//  - anything else: the whole address goes through foldFrameIndex with a zero
//    ImmOffset.
// Always returns true. Parent is not referenced in this body.
1708bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1709 SDValue Addr, SDValue &Rsrc,
1710 SDValue &VAddr, SDValue &SOffset,
1711 SDValue &ImmOffset) const {
1712
1713 SDLoc DL(Addr);
1714 MachineFunction &MF = CurDAG->getMachineFunction();
1715 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1716
1717 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1718
1719 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1720 int64_t Imm = CAddr->getSExtValue();
// NOTE(review): the initializer expression of NullPtr is missing from this
// listing (the line following "=" is absent). Presumably it is the null
// pointer value for the private address space - confirm against upstream.
1721 const int64_t NullPtr =
1723 // Don't fold null pointer.
1724 if (Imm != NullPtr) {
1725 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
// Split the constant: bits above MaxOffset are moved into a register for
// VAddr, the bits within MaxOffset become the immediate.
1726 SDValue HighBits =
1727 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1728 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1729 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1730 VAddr = SDValue(MovHighBits, 0);
1731
1732 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1733 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1734 return true;
1735 }
1736 }
1737
1738 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1739 // (add n0, c1)
1740
1741 SDValue N0 = Addr.getOperand(0);
1742 uint64_t C1 = Addr.getConstantOperandVal(1);
1743
1744 // Offsets in vaddr must be positive if range checking is enabled.
1745 //
1746 // The total computation of vaddr + soffset + offset must not overflow. If
1747 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1748 // overflowing.
1749 //
1750 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1751 // always perform a range check. If a negative vaddr base index was used,
1752 // this would fail the range check. The overall address computation would
1753 // compute a valid address, but this doesn't happen due to the range
1754 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1755 //
1756 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1757 // MUBUF vaddr, but not on older subtargets which can only do this if the
1758 // sign bit is known 0.
1759 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1760 if (TII->isLegalMUBUFImmOffset(C1) &&
1761 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1762 CurDAG->SignBitIsZero(N0))) {
1763 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1764 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1765 return true;
1766 }
1767 }
1768
1769 // (node)
1770 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1771 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1772 return true;
1773}
1774
1775static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1776 if (Val.getOpcode() != ISD::CopyFromReg)
1777 return false;
1778 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1779 if (!Reg.isPhysical())
1780 return false;
1781 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1782 return RC && TRI.isSGPRClass(RC);
1783}
1784
1785bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1786 SDValue Addr,
1787 SDValue &SRsrc,
1788 SDValue &SOffset,
1789 SDValue &Offset) const {
1790 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1791 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1792 MachineFunction &MF = CurDAG->getMachineFunction();
1793 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1794 SDLoc DL(Addr);
1795
1796 // CopyFromReg <sgpr>
1797 if (IsCopyFromSGPR(*TRI, Addr)) {
1798 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1799 SOffset = Addr;
1800 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1801 return true;
1802 }
1803
1804 ConstantSDNode *CAddr;
1805 if (Addr.getOpcode() == ISD::ADD) {
1806 // Add (CopyFromReg <sgpr>) <constant>
1807 CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1808 if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1809 return false;
1810 if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1811 return false;
1812
1813 SOffset = Addr.getOperand(0);
1814 } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1815 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1816 // <constant>
1817 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1818 } else {
1819 return false;
1820 }
1821
1822 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1823
1824 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1825 return true;
1826}
1827
1828bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1829 SDValue &SOffset, SDValue &Offset
1830 ) const {
1831 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1832 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1833
1834 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1835 return false;
1836
1837 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1838 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1839 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1840 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1841 maskTrailingOnes<uint64_t>(32); // Size
1842 SDLoc DL(Addr);
1843
1844 const SITargetLowering& Lowering =
1845 *static_cast<const SITargetLowering*>(getTargetLowering());
1846
1847 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1848 return true;
1849 }
1850 return false;
1851}
1852
1853bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1854 SDValue &SOffset) const {
1855 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1856 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1857 return true;
1858 }
1859
1860 SOffset = ByteOffsetNode;
1861 return true;
1862}
1863
1864// Find a load or store from corresponding pattern root.
1865// Roots may be build_vector, bitconvert or their combinations.
// NOTE(review): this listing is missing the helper's signature line(s) as well
// as two lines inside the body (before the loop, and the initializer of the
// inner MN). Presumably this is the findMemSDNode helper called from
// SelectFlatOffsetImpl - restore the missing lines from upstream.
//
// Visible behavior: if N itself is a MemSDNode it is returned directly;
// otherwise the operands of N are searched, and failing to find a memory node
// is treated as unreachable.
1868 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1869 return MN;
1871 for (SDValue V : N->op_values())
1872 if (MemSDNode *MN =
1874 return MN;
1875 llvm_unreachable("cannot find MemSDNode in the pattern!");
1876}
1877
// Splits Addr into a vector address (VAddr) and a signed i32 immediate
// (Offset) for a FLAT-family memory instruction of kind FlatVariant. N is the
// pattern root used to look up the memory node and its address space.
//
// When the subtarget has FLAT instruction offsets and the segment-offset bug
// does not apply, a constant addend is folded: entirely into the immediate if
// it is legal there, otherwise split so the low part goes into the immediate
// and the remainder is added to the base with explicit VALU adds. In every
// other case VAddr is Addr and the immediate is 0. Always returns true.
//
// NOTE(review): this listing is missing three lines of the body (one right
// after the signature, the last clause of CanHaveFlatSegmentOffsetBug, and the
// declaration of Opnds in the 32-bit path). Restore them from upstream.
1878bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(
1879 SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset,
1880 AMDGPU::FlatAddrSpace FlatVariant) const {
1882 int64_t OffsetVal = 0;
1883
1884 unsigned AS = findMemSDNode(N)->getAddressSpace();
1885
// NOTE(review): the final operand of this && chain is absent from the listing.
1886 bool CanHaveFlatSegmentOffsetBug =
1887 Subtarget->hasFlatSegmentOffsetBug() &&
1888 FlatVariant == FlatAddrSpace::FLAT &&
1890
1891 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1892 SDValue N0, N1;
1893 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1894 (FlatVariant != FlatAddrSpace::FlatScratch ||
1895 isFlatScratchBaseLegal(Addr))) {
1896 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1897
1898 // Adding the offset to the base address in a FLAT instruction must not
1899 // change the memory aperture in which the address falls. Therefore we can
1900 // only fold offsets from inbounds GEPs into FLAT instructions.
1901 bool IsInBounds =
1902 Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
1903 if (COffsetVal == 0 || FlatVariant != FlatAddrSpace::FLAT || IsInBounds) {
1904 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1905 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1906 Addr = N0;
1907 OffsetVal = COffsetVal;
1908 } else {
1909 // If the offset doesn't fit, put the low bits into the offset field
1910 // and add the rest.
1911 //
1912 // For a FLAT instruction the hardware decides whether to access
1913 // global/scratch/shared memory based on the high bits of vaddr,
1914 // ignoring the offset field, so we have to ensure that when we add
1915 // remainder to vaddr it still points into the same underlying object.
1916 // The easiest way to do that is to make sure that we split the offset
1917 // into two pieces that are both >= 0 or both <= 0.
1918
1919 SDLoc DL(N);
1920 uint64_t RemainderOffset;
1921
1922 std::tie(OffsetVal, RemainderOffset) =
1923 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1924
1925 SDValue AddOffsetLo =
1926 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1927 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1928
// 32-bit address: a single add of the low remainder suffices.
// NOTE(review): the declaration of Opnds is absent from the listing.
1929 if (Addr.getValueType().getSizeInBits() == 32) {
1931 Opnds.push_back(N0);
1932 Opnds.push_back(AddOffsetLo);
1933 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1934 if (Subtarget->hasAddNoCarryInsts()) {
1935 AddOp = AMDGPU::V_ADD_U32_e64;
1936 Opnds.push_back(Clamp);
1937 }
1938 Addr =
1939 SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1940 } else {
1941 // TODO: Should this try to use a scalar add pseudo if the base
1942 // address is uniform and saddr is usable?
1943 SDValue Sub0 =
1944 CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1945 SDValue Sub1 =
1946 CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1947
// Otherwise add the remainder as two 32-bit halves chained through the
// carry, then rebuild the 64-bit address with a REG_SEQUENCE.
1948 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1949 DL, MVT::i32, N0, Sub0);
1950 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1951 DL, MVT::i32, N0, Sub1);
1952
1953 SDValue AddOffsetHi =
1954 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1955
1956 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1957
1958 SDNode *Add =
1959 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1960 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1961
1962 SDNode *Addc = CurDAG->getMachineNode(
1963 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1964 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1965
1966 SDValue RegSequenceArgs[] = {
1967 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
1968 MVT::i32),
1969 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1970
1971 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1972 MVT::i64, RegSequenceArgs),
1973 0);
1974 }
1975 }
1976 }
1977 }
1978 }
1979
1980 VAddr = Addr;
1981 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1982 return true;
1983}
1984
1985bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1986 SDValue &VAddr,
1987 SDValue &Offset) const {
1988 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1990}
1991
1992bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1993 SDValue &VAddr,
1994 SDValue &Offset) const {
1995 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1997}
1998
1999bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
2000 SDValue &VAddr,
2001 SDValue &Offset) const {
2002 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
2004}
2005
2006// If this matches *_extend i32:x, return x
2007// Otherwise if the value is I32 returns x.
2009 const SelectionDAG *DAG) {
2010 if (Op.getValueType() == MVT::i32)
2011 return Op;
2012
2013 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
2014 Op.getOpcode() != ISD::ANY_EXTEND &&
2015 !(DAG->SignBitIsZero(Op) &&
2016 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
2017 return SDValue();
2018
2019 SDValue ExtSrc = Op.getOperand(0);
2020 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
2021}
2022
2023// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
2024// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
//
// On success SAddr receives the uniform base, VOffset the 32-bit variable
// offset and Offset the immediate offset as a target constant. ScaleOffset is
// reset to false on entry and set when the variable offset is matched in a
// form pre-multiplied by the memory access size (see SelectScaleOffset and the
// MAD handling below). When NeedIOffset is false the constant part of the
// address is never folded into the immediate field.
//
// NOTE(review): this listing skips some embedded line numbers inside this
// function (one right after the opening brace and several in the MAD
// condition near the end), so those statements appear truncated here.
2025bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2026 SDValue &SAddr, SDValue &VOffset,
2027 SDValue &Offset, bool &ScaleOffset,
2028 bool NeedIOffset) const {
2030 int64_t ImmOffset = 0;
2031 ScaleOffset = false;
2032
2033 // Match the immediate offset first, which canonically is moved as low as
2034 // possible.
2035
2036 SDValue LHS, RHS;
2037 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2038 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2039 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2040
2041 if (NeedIOffset &&
2042 TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
2043 FlatAddrSpace::FlatGlobal)) {
 // The whole constant fits the immediate field: peel it off and keep
 // matching the remaining address below.
2044 Addr = LHS;
2045 ImmOffset = COffsetVal;
2046 } else if (!LHS->isDivergent()) {
2047 if (COffsetVal > 0) {
2048 SDLoc SL(N);
2049 // saddr + large_offset -> saddr +
2050 // (voffset = large_offset & ~MaxOffset) +
2051 // (large_offset & MaxOffset);
2052 int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
2053 if (NeedIOffset) {
2054 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2055 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, FlatAddrSpace::FlatGlobal);
2056 }
2057
 // The remainder is materialized with V_MOV_B32, so it must fit in 32
 // bits (signed or unsigned depending on hasSignedGVSOffset()).
2058 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
2059 : isUInt<32>(RemainderOffset)) {
2060 SDNode *VMov = CurDAG->getMachineNode(
2061 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2062 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2063 VOffset = SDValue(VMov, 0);
2064 SAddr = LHS;
2065 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2066 return true;
2067 }
2068 }
2069
2070 // We are adding a 64 bit SGPR and a constant. If constant bus limit
2071 // is 1 we would need to perform 1 or 2 extra moves for each half of
2072 // the constant and it is better to do a scalar add and then issue a
2073 // single VALU instruction to materialize zero. Otherwise it is less
2074 // instructions to perform VALU adds with immediates or inline literals.
2075 unsigned NumLiterals =
2076 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
2077 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
2078 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
2079 return false;
2080 }
2081 }
2082
2083 // Match the variable offset.
2084 if (Addr->isAnyAdd()) {
2085 LHS = Addr.getOperand(0);
2086
2087 if (!LHS->isDivergent()) {
2088 // add (i64 sgpr), (*_extend (i32 vgpr))
2089 RHS = Addr.getOperand(1);
2090 ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
2091 if (SDValue ExtRHS = matchExtFromI32orI32(
2092 RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2093 SAddr = LHS;
2094 VOffset = ExtRHS;
2095 }
2096 }
2097
 // RHS is re-read because SelectScaleOffset above may have rewritten it.
2098 RHS = Addr.getOperand(1);
2099 if (!SAddr && !RHS->isDivergent()) {
2100 // add (*_extend (i32 vgpr)), (i64 sgpr)
2101 ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
2102 if (SDValue ExtLHS = matchExtFromI32orI32(
2103 LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2104 SAddr = RHS;
2105 VOffset = ExtLHS;
2106 }
2107 }
2108
2109 if (SAddr) {
2110 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2111 return true;
2112 }
2113 }
2114
2115 if (Subtarget->hasScaleOffset() &&
2116 (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
2119 (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
2120 CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
2121 Addr.getOperand(0)->isDivergent() &&
2123 !Addr.getOperand(2)->isDivergent()) {
2124 // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
2125 unsigned Size =
2126 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
 // Only a multiplier equal to the access size in bytes can be expressed
 // as a scaled offset.
2127 ScaleOffset = Addr.getConstantOperandVal(1) == Size;
2128 if (ScaleOffset) {
2129 SAddr = Addr.getOperand(2);
2130 VOffset = Addr.getOperand(0);
 // NOTE(review): ImmOffset is a signed int64_t and the other exits of
 // this function emit it with getSignedTargetConstant; confirm whether
 // getTargetConstant is intended here for a negative ImmOffset.
2131 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2132 return true;
2133 }
2134 }
2135
2136 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
2137 isa<ConstantSDNode>(Addr))
2138 return false;
2139
2140 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
2141 // moves required to copy a 64-bit SGPR to VGPR.
2142 SAddr = Addr;
2143 SDNode *VMov =
2144 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
2145 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
2146 VOffset = SDValue(VMov, 0);
2147 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2148 return true;
2149}
2150
2151bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2152 SDValue &SAddr, SDValue &VOffset,
2153 SDValue &Offset,
2154 SDValue &CPol) const {
2155 bool ScaleOffset;
2156 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2157 return false;
2158
2159 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2160 SDLoc(), MVT::i32);
2161 return true;
2162}
2163
2164bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2165 SDValue &SAddr, SDValue &VOffset,
2166 SDValue &Offset,
2167 SDValue &CPol) const {
2168 bool ScaleOffset;
2169 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2170 return false;
2171
2172 // We are assuming CPol is always the last operand of the intrinsic.
2173 auto PassedCPol =
2174 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2175 CPol = CurDAG->getTargetConstant(
2176 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2177 return true;
2178}
2179
2180bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2181 SDValue &SAddr,
2182 SDValue &VOffset,
2183 SDValue &Offset,
2184 SDValue &CPol) const {
2185 bool ScaleOffset;
2186 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2187 return false;
2188
2189 // We are assuming CPol is second from last operand of the intrinsic.
2190 auto PassedCPol =
2191 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2192 CPol = CurDAG->getTargetConstant(
2193 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2194 return true;
2195}
2196
2197bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2198 SDValue &SAddr, SDValue &VOffset,
2199 SDValue &Offset,
2200 SDValue &CPol) const {
2201 bool ScaleOffset;
2202 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2203 return false;
2204
2205 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2206 CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
2207 return true;
2208}
2209
2210bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2211 SDValue &SAddr,
2212 SDValue &VOffset,
2213 SDValue &CPol) const {
2214 bool ScaleOffset;
2215 SDValue DummyOffset;
2216 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2217 false))
2218 return false;
2219
2220 // We are assuming CPol is always the last operand of the intrinsic.
2221 auto PassedCPol =
2222 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2223 CPol = CurDAG->getTargetConstant(
2224 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2225 return true;
2226}
2227
2228bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2229 SDValue &SAddr,
2230 SDValue &VOffset,
2231 SDValue &CPol) const {
2232 bool ScaleOffset;
2233 SDValue DummyOffset;
2234 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2235 false))
2236 return false;
2237
2238 // We are assuming CPol is second from last operand of the intrinsic.
2239 auto PassedCPol =
2240 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2241 CPol = CurDAG->getTargetConstant(
2242 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2243 return true;
2244}
2245
2247 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
2248 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
2249 } else if (SAddr.getOpcode() == ISD::ADD &&
2251 // Materialize this into a scalar move for scalar address to avoid
2252 // readfirstlane.
2253 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
2254 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
2255 FI->getValueType(0));
2256 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
2257 MVT::i32, TFI, SAddr.getOperand(1)),
2258 0);
2259 }
2260
2261 return SAddr;
2262}
2263
2264// Match (32-bit SGPR base) + sext(imm offset)
//
// Fails only for divergent addresses. On success SAddr receives the scalar
// base (with frame indices rewritten by SelectSAddrFI) and Offset the
// immediate offset. A constant that is not a legal scratch immediate is split:
// the legal part stays in Offset and the remainder is added to SAddr with
// S_ADD_I32.
//
// NOTE(review): this listing skips an embedded line number right after the
// opening brace and one inside the AddOffset initializer, so those statements
// appear truncated here.
2265bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
2266 SDValue &SAddr,
2267 SDValue &Offset) const {
2269 if (Addr->isDivergent())
2270 return false;
2271
2272 SDLoc DL(Addr);
2273
2274 int64_t COffsetVal = 0;
2275
 // Peel off a constant offset only when the base is legal for flat scratch;
 // otherwise the whole address is used as the base.
2276 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
2277 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
2278 SAddr = Addr.getOperand(0);
2279 } else {
2280 SAddr = Addr;
2281 }
2282
2283 SAddr = SelectSAddrFI(CurDAG, SAddr);
2284
2285 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2286
2287 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2288 FlatAddrSpace::FlatScratch)) {
2289 int64_t SplitImmOffset, RemainderOffset;
2290 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2291 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, FlatAddrSpace::FlatScratch);
2292
 // Keep the encodable part as the immediate; the remainder is folded into
 // the scalar base below.
2293 COffsetVal = SplitImmOffset;
2294
2295 SDValue AddOffset =
2297 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
2298 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
2299 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
2300 SAddr, AddOffset),
2301 0);
2302 }
2303
2304 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
2305
2306 return true;
2307}
2308
2309// Check whether the flat scratch SVS swizzle bug affects this access.
2310bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2311 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2312 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2313 return false;
2314
2315 // The bug affects the swizzling of SVS accesses if there is any carry out
2316 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2317 // voffset to (soffset + inst_offset).
2318 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2319 KnownBits SKnown =
2320 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2321 KnownBits::makeConstant(APInt(32, ImmOffset,
2322 /*isSigned=*/true)));
2323 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2324 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2325 return (VMax & 3) + (SMax & 3) >= 4;
2326}
2327
2328bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2329 SDValue &VAddr, SDValue &SAddr,
2330 SDValue &Offset,
2331 SDValue &CPol) const {
2332 int64_t ImmOffset = 0;
2333
2334 SDValue LHS, RHS;
2335 SDValue OrigAddr = Addr;
2336 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2337 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2338 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2339
2340 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2342 Addr = LHS;
2343 ImmOffset = COffsetVal;
2344 } else if (!LHS->isDivergent() && COffsetVal > 0) {
2345 SDLoc SL(N);
2346 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2347 // (large_offset & MaxOffset);
2348 int64_t SplitImmOffset, RemainderOffset;
2349 std::tie(SplitImmOffset, RemainderOffset) =
2350 TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2352
2353 if (isUInt<32>(RemainderOffset)) {
2354 SDNode *VMov = CurDAG->getMachineNode(
2355 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2356 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2357 VAddr = SDValue(VMov, 0);
2358 SAddr = LHS;
2359 if (!isFlatScratchBaseLegal(Addr))
2360 return false;
2361 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
2362 return false;
2363 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2364 CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2365 return true;
2366 }
2367 }
2368 }
2369
2370 if (Addr.getOpcode() != ISD::ADD)
2371 return false;
2372
2373 LHS = Addr.getOperand(0);
2374 RHS = Addr.getOperand(1);
2375
2376 if (!LHS->isDivergent() && RHS->isDivergent()) {
2377 SAddr = LHS;
2378 VAddr = RHS;
2379 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
2380 SAddr = RHS;
2381 VAddr = LHS;
2382 } else {
2383 return false;
2384 }
2385
2386 if (OrigAddr != Addr) {
2387 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
2388 return false;
2389 } else {
2390 if (!isFlatScratchBaseLegalSV(OrigAddr))
2391 return false;
2392 }
2393
2394 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2395 return false;
2396 SAddr = SelectSAddrFI(CurDAG, SAddr);
2397 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2398
2399 bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
2400 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2401 SDLoc(), MVT::i32);
2402 return true;
2403}
2404
2405// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2406// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2407// Handle the case where the Immediate Offset + SOffset is negative.
2408bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2409 bool Imm32Only,
2410 bool IsBuffer,
2411 int64_t ImmOffset) const {
2412 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2413 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2414 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2415 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2416 return false;
2417 }
2418
2419 return true;
2420}
2421
2422// Given \p Offset and load node \p N check if an \p Offset is a multiple of
2423// the load byte size. If it is update \p Offset to a pre-scaled value and
2424// return true.
2425bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
2426 bool IsSigned) const {
2427 bool ScaleOffset = false;
2428 if (!Subtarget->hasScaleOffset() || !Offset)
2429 return false;
2430
2431 unsigned Size =
2432 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2433
2434 SDValue Off = Offset;
2435 if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
2436 Off = Ext;
2437
2438 if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
2439 if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
2440 ScaleOffset = C->getZExtValue() == Log2_32(Size);
2441 } else if (Offset.getOpcode() == ISD::MUL ||
2442 (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
2443 Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
2444 (Offset.isMachineOpcode() &&
2445 Offset.getMachineOpcode() ==
2446 (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
2447 : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
2448 if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
2449 ScaleOffset = C->getZExtValue() == Size;
2450 }
2451
2452 if (ScaleOffset)
2453 Offset = Off.getOperand(0);
2454
2455 return ScaleOffset;
2456}
2457
2458// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2459// not null) offset. If Imm32Only is true, match only 32-bit immediate
2460// offsets available on CI.
2461bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
2462 SDValue *SOffset, SDValue *Offset,
2463 bool Imm32Only, bool IsBuffer,
2464 bool HasSOffset, int64_t ImmOffset,
2465 bool *ScaleOffset) const {
2466 assert((!SOffset || !Offset) &&
2467 "Cannot match both soffset and offset at the same time!");
2468
2469 if (ScaleOffset) {
2470 assert(N && SOffset);
2471
2472 *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
2473 }
2474
2475 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2476 if (!C) {
2477 if (!SOffset)
2478 return false;
2479
2480 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2481 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2482 *SOffset = ByteOffsetNode;
2483 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2484 ImmOffset);
2485 }
2486 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2487 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2488 *SOffset = ByteOffsetNode.getOperand(0);
2489 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2490 ImmOffset);
2491 }
2492 }
2493 return false;
2494 }
2495
2496 SDLoc SL(ByteOffsetNode);
2497
2498 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2499 // offset for S_BUFFER instructions is unsigned.
2500 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2501 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2502 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2503 if (EncodedOffset && Offset && !Imm32Only) {
2504 *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2505 return true;
2506 }
2507
2508 // SGPR and literal offsets are unsigned.
2509 if (ByteOffset < 0)
2510 return false;
2511
2512 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2513 if (EncodedOffset && Offset && Imm32Only) {
2514 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2515 return true;
2516 }
2517
2518 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2519 return false;
2520
2521 if (SOffset) {
2522 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2523 *SOffset = SDValue(
2524 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2525 return true;
2526 }
2527
2528 return false;
2529}
2530
2531SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2532 if (Addr.getValueType() != MVT::i32)
2533 return Addr;
2534
2535 // Zero-extend a 32-bit address.
2536 SDLoc SL(Addr);
2537
2538 const MachineFunction &MF = CurDAG->getMachineFunction();
2539 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2540 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2541 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2542
2543 const SDValue Ops[] = {
2544 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2545 Addr,
2546 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2547 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2548 0),
2549 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2550 };
2551
2552 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2553 Ops), 0);
2554}
2555
2556// Match a base and an immediate (if Offset is not null) or an SGPR (if
2557// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2558// true, match only 32-bit immediate offsets available on CI.
2559bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
2560 SDValue &SBase, SDValue *SOffset,
2561 SDValue *Offset, bool Imm32Only,
2562 bool IsBuffer, bool HasSOffset,
2563 int64_t ImmOffset,
2564 bool *ScaleOffset) const {
2565 if (SOffset && Offset) {
2566 assert(!Imm32Only && !IsBuffer);
2567 SDValue B;
2568
2569 if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
2570 return false;
2571
2572 int64_t ImmOff = 0;
2573 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2574 ImmOff = C->getSExtValue();
2575
2576 return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
2577 true, ImmOff, ScaleOffset);
2578 }
2579
2580 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2581 // wraparound, because s_load instructions perform the addition in 64 bits.
2582 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2583 !Addr->getFlags().hasNoUnsignedWrap())
2584 return false;
2585
2586 SDValue N0, N1;
2587 // Extract the base and offset if possible.
2588 if (Addr->isAnyAdd() || CurDAG->isADDLike(Addr)) {
2589 N0 = Addr.getOperand(0);
2590 N1 = Addr.getOperand(1);
2591 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2592 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2593 }
2594 if (!N0 || !N1)
2595 return false;
2596
2597 if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2598 ImmOffset, ScaleOffset)) {
2599 SBase = N0;
2600 return true;
2601 }
2602 if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2603 ImmOffset, ScaleOffset)) {
2604 SBase = N1;
2605 return true;
2606 }
2607 return false;
2608}
2609
2610bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
2611 SDValue *SOffset, SDValue *Offset,
2612 bool Imm32Only, bool *ScaleOffset) const {
2613 if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
2614 /* IsBuffer */ false, /* HasSOffset */ false,
2615 /* ImmOffset */ 0, ScaleOffset)) {
2616 SBase = Expand32BitAddress(SBase);
2617 return true;
2618 }
2619
2620 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2621 SBase = Expand32BitAddress(Addr);
2622 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2623 return true;
2624 }
2625
2626 return false;
2627}
2628
2629bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2630 SDValue &Offset) const {
2631 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2632 &Offset);
2633}
2634
2635bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2636 SDValue &Offset) const {
2637 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2638 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2639 &Offset, /* Imm32Only */ true);
2640}
2641
2642bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2643 SDValue &SOffset, SDValue &CPol) const {
2644 bool ScaleOffset;
2645 if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
2646 /* Imm32Only */ false, &ScaleOffset))
2647 return false;
2648
2649 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2650 SDLoc(N), MVT::i32);
2651 return true;
2652}
2653
2654bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2655 SDValue &SBase, SDValue &SOffset,
2656 SDValue &Offset,
2657 SDValue &CPol) const {
2658 bool ScaleOffset;
2659 if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
2660 return false;
2661
2662 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2663 SDLoc(N), MVT::i32);
2664 return true;
2665}
2666
2667bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2668 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2669 /* Imm32Only */ false, /* IsBuffer */ true);
2670}
2671
2672bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2673 SDValue &Offset) const {
2674 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2675 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2676 /* Imm32Only */ true, /* IsBuffer */ true);
2677}
2678
2679bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2680 SDValue &Offset) const {
2681 // Match the (soffset + offset) pair as a 32-bit register base and
2682 // an immediate offset.
2683 return N.getValueType() == MVT::i32 &&
2684 SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
2685 /* SOffset*/ nullptr, &Offset,
2686 /* Imm32Only */ false, /* IsBuffer */ true);
2687}
2688
2689bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2690 SDValue &Base,
2691 SDValue &Offset) const {
2692 SDLoc DL(Index);
2693
2694 if (CurDAG->isBaseWithConstantOffset(Index)) {
2695 SDValue N0 = Index.getOperand(0);
2696 SDValue N1 = Index.getOperand(1);
2697 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2698
2699 // (add n0, c0)
2700 // Don't peel off the offset (c0) if doing so could possibly lead
2701 // the base (n0) to be negative.
2702 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2703 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2704 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2705 Base = N0;
2706 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2707 return true;
2708 }
2709 }
2710
2711 if (isa<ConstantSDNode>(Index))
2712 return false;
2713
2714 Base = Index;
2715 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2716 return true;
2717}
2718
2719SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2720 SDValue Val, uint32_t Offset,
2721 uint32_t Width) {
2722 if (Val->isDivergent()) {
2723 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2724 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2725 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2726
2727 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2728 }
2729 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2730 // Transformation function, pack the offset and width of a BFE into
2731 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2732 // source, bits [5:0] contain the offset and bits [22:16] the width.
2733 uint32_t PackedVal = Offset | (Width << 16);
2734 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2735
2736 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2737}
2738
2739void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2740 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2741 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2742 // Predicate: 0 < b <= c < 32
2743
2744 const SDValue &Shl = N->getOperand(0);
2745 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2746 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2747
2748 if (B && C) {
2749 uint32_t BVal = B->getZExtValue();
2750 uint32_t CVal = C->getZExtValue();
2751
2752 if (0 < BVal && BVal <= CVal && CVal < 32) {
2753 bool Signed = N->getOpcode() == ISD::SRA;
2754 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2755 32 - CVal));
2756 return;
2757 }
2758 }
2759 SelectCode(N);
2760}
2761
2762void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2763 switch (N->getOpcode()) {
2764 case ISD::AND:
2765 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2766 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2767 // Predicate: isMask(mask)
2768 const SDValue &Srl = N->getOperand(0);
2769 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2770 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2771
2772 if (Shift && Mask) {
2773 uint32_t ShiftVal = Shift->getZExtValue();
2774 uint32_t MaskVal = Mask->getZExtValue();
2775
2776 if (isMask_32(MaskVal)) {
2777 uint32_t WidthVal = llvm::popcount(MaskVal);
2778 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2779 WidthVal));
2780 return;
2781 }
2782 }
2783 }
2784 break;
2785 case ISD::SRL:
2786 if (N->getOperand(0).getOpcode() == ISD::AND) {
2787 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2788 // Predicate: isMask(mask >> b)
2789 const SDValue &And = N->getOperand(0);
2790 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2791 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2792
2793 if (Shift && Mask) {
2794 uint32_t ShiftVal = Shift->getZExtValue();
2795 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2796
2797 if (isMask_32(MaskVal)) {
2798 uint32_t WidthVal = llvm::popcount(MaskVal);
2799 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2800 WidthVal));
2801 return;
2802 }
2803 }
2804 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2805 SelectS_BFEFromShifts(N);
2806 return;
2807 }
2808 break;
2809 case ISD::SRA:
2810 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2811 SelectS_BFEFromShifts(N);
2812 return;
2813 }
2814 break;
2815
2817 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2818 SDValue Src = N->getOperand(0);
2819 if (Src.getOpcode() != ISD::SRL)
2820 break;
2821
2822 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2823 if (!Amt)
2824 break;
2825
2826 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2827 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2828 Amt->getZExtValue(), Width));
2829 return;
2830 }
2831 }
2832
2833 SelectCode(N);
2834}
2835
2836bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2837 assert(N->getOpcode() == ISD::BRCOND);
2838 if (!N->hasOneUse())
2839 return false;
2840
2841 SDValue Cond = N->getOperand(1);
2842 if (Cond.getOpcode() == ISD::CopyToReg)
2843 Cond = Cond.getOperand(2);
2844
2845 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2846 return false;
2847
2848 MVT VT = Cond.getOperand(0).getSimpleValueType();
2849 if (VT == MVT::i32)
2850 return true;
2851
2852 if (VT == MVT::i64) {
2853 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2854 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2855 Subtarget->hasScalarCompareEq64();
2856 }
2857
2858 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2859 return true;
2860
2861 return false;
2862}
2863
2864static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2865 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2866 // Special case for amdgcn.ballot:
2867 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2868 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2869 // =>
2870 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2871 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2872 // Cond becomes a i(WaveSize) full mask value.
2873 // Note that ballot doesn't use SETEQ condition but its easy to support it
2874 // here for completeness, so in this case Negate is set true on return.
2875 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2876 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2877 isNullConstant(VCMP.getOperand(1))) {
2878
2879 auto Cond = VCMP.getOperand(0);
2880 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2881 Cond = Cond.getOperand(0);
2882
2883 if (isBoolSGPR(Cond)) {
2884 Negate = VCMP_CC == ISD::SETEQ;
2885 return Cond;
2886 }
2887 }
2888 return SDValue();
2889}
2890
2891void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2892 SDValue Cond = N->getOperand(1);
2893
2894 if (Cond.isUndef()) {
2895 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2896 N->getOperand(2), N->getOperand(0));
2897 return;
2898 }
2899
2900 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2901
2902 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2903 bool AndExec = !UseSCCBr;
2904 bool Negate = false;
2905
2906 if (Cond.getOpcode() == ISD::SETCC &&
2907 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2908 SDValue VCMP = Cond->getOperand(0);
2909 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2910 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2911 isNullConstant(Cond->getOperand(1)) &&
2912 // We may encounter ballot.i64 in wave32 mode on -O0.
2913 VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2914 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2915 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2916 // BRCOND i1 %C, %BB
2917 // =>
2918 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2919 // VCC = COPY i(WaveSize) %VCMP
2920 // S_CBRANCH_VCCNZ/VCCZ %BB
2921 Negate = CC == ISD::SETEQ;
2922 bool NegatedBallot = false;
2923 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2924 Cond = BallotCond;
2925 UseSCCBr = !BallotCond->isDivergent();
2926 Negate = Negate ^ NegatedBallot;
2927 } else {
2928 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2929 // selected as V_CMP, but this may change for uniform condition.
2930 Cond = VCMP;
2931 UseSCCBr = false;
2932 }
2933 }
2934 // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
2935 // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
2936 // used.
2937 AndExec = false;
2938 }
2939
2940 unsigned BrOp =
2941 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2942 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2943 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2944 SDLoc SL(N);
2945
2946 if (AndExec) {
2947 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2948 // analyzed what generates the vcc value, so we do not know whether vcc
2949 // bits for disabled lanes are 0. Thus we need to mask out bits for
2950 // disabled lanes.
2951 //
2952 // For the case that we select S_CBRANCH_SCC1 and it gets
2953 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2954 // SIInstrInfo::moveToVALU which inserts the S_AND).
2955 //
2956 // We could add an analysis of what generates the vcc value here and omit
2957 // the S_AND when is unnecessary. But it would be better to add a separate
2958 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2959 // catches both cases.
2960 Cond = SDValue(
2961 CurDAG->getMachineNode(
2962 Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
2963 MVT::i1,
2964 CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
2965 : AMDGPU::EXEC,
2966 MVT::i1),
2967 Cond),
2968 0);
2969 }
2970
2971 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2972 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2973 N->getOperand(2), // Basic Block
2974 VCC.getValue(0));
2975}
2976
2977void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2978 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2979 !N->isDivergent()) {
2980 SDValue Src = N->getOperand(0);
2981 if (Src.getValueType() == MVT::f16) {
2982 if (isExtractHiElt(Src, Src)) {
2983 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2984 {Src});
2985 return;
2986 }
2987 }
2988 }
2989
2990 SelectCode(N);
2991}
2992
2993void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2994 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2995 // be copied to an SGPR with readfirstlane.
2996 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2997 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2998
2999 SDValue Chain = N->getOperand(0);
3000 SDValue Ptr = N->getOperand(2);
3001 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3002 MachineMemOperand *MMO = M->getMemOperand();
3003 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
3004
3006 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
3007 SDValue PtrBase = Ptr.getOperand(0);
3008 SDValue PtrOffset = Ptr.getOperand(1);
3009
3010 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
3011 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
3012 N = glueCopyToM0(N, PtrBase);
3013 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
3014 }
3015 }
3016
3017 if (!Offset) {
3018 N = glueCopyToM0(N, Ptr);
3019 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
3020 }
3021
3022 SDValue Ops[] = {
3023 Offset,
3024 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
3025 Chain,
3026 N->getOperand(N->getNumOperands() - 1) // New glue
3027 };
3028
3029 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3030 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3031}
3032
3033// We need to handle this here because tablegen doesn't support matching
3034// instructions with multiple outputs.
3035void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
3036 unsigned Opc;
3037 switch (IntrID) {
3038 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3039 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3040 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
3041 break;
3042 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3043 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
3044 break;
3045 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3046 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
3047 break;
3048 }
3049 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
3050 N->getOperand(5), N->getOperand(0)};
3051
3052 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3053 MachineMemOperand *MMO = M->getMemOperand();
3054 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3055 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3056}
3057
3058void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
3059 bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
3060 unsigned Opc =
3061 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3062
3063 SmallVector<SDValue, 7> TensorOps;
3064 // First two groups
3065 TensorOps.push_back(N->getOperand(2)); // D# group 0
3066 TensorOps.push_back(N->getOperand(3)); // D# group 1
3067
3068 // Use _D2 version if both group 2 and 3 are zero-initialized.
3069 SDValue Group2 = N->getOperand(4);
3070 SDValue Group3 = N->getOperand(5);
3071 if (ISD::isBuildVectorAllZeros(Group2.getNode()) &&
3073 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3074 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3075 } else { // Has at least 4 groups
3076 TensorOps.push_back(Group2); // D# group 2
3077 TensorOps.push_back(Group3); // D# group 3
3078 }
3079
3080 // TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
3081 // for now because all existing targets only support up to 4 groups.
3082 TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128
3083 TensorOps.push_back(N->getOperand(7)); // cache policy
3084 TensorOps.push_back(N->getOperand(0)); // chain
3085
3086 (void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps);
3087}
3088
3089static unsigned gwsIntrinToOpcode(unsigned IntrID) {
3090 switch (IntrID) {
3091 case Intrinsic::amdgcn_ds_gws_init:
3092 return AMDGPU::DS_GWS_INIT;
3093 case Intrinsic::amdgcn_ds_gws_barrier:
3094 return AMDGPU::DS_GWS_BARRIER;
3095 case Intrinsic::amdgcn_ds_gws_sema_v:
3096 return AMDGPU::DS_GWS_SEMA_V;
3097 case Intrinsic::amdgcn_ds_gws_sema_br:
3098 return AMDGPU::DS_GWS_SEMA_BR;
3099 case Intrinsic::amdgcn_ds_gws_sema_p:
3100 return AMDGPU::DS_GWS_SEMA_P;
3101 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3102 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
3103 default:
3104 llvm_unreachable("not a gws intrinsic");
3105 }
3106}
3107
3108void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
3109 if (!Subtarget->hasGWS() ||
3110 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
3111 !Subtarget->hasGWSSemaReleaseAll())) {
3112 // Let this error.
3113 SelectCode(N);
3114 return;
3115 }
3116
3117 // Chain, intrinsic ID, vsrc, offset
3118 const bool HasVSrc = N->getNumOperands() == 4;
3119 assert(HasVSrc || N->getNumOperands() == 3);
3120
3121 SDLoc SL(N);
3122 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
3123 int ImmOffset = 0;
3124 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3125 MachineMemOperand *MMO = M->getMemOperand();
3126
3127 // Don't worry if the offset ends up in a VGPR. Only one lane will have
3128 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
3129
3130 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
3131 // offset field) % 64. Some versions of the programming guide omit the m0
3132 // part, or claim it's from offset 0.
3133 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
3134 // If we have a constant offset, try to use the 0 in m0 as the base.
3135 // TODO: Look into changing the default m0 initialization value. If the
3136 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
3137 // the immediate offset.
3138 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
3139 ImmOffset = ConstOffset->getZExtValue();
3140 } else {
3141 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
3142 ImmOffset = BaseOffset.getConstantOperandVal(1);
3143 BaseOffset = BaseOffset.getOperand(0);
3144 }
3145
3146 // Prefer to do the shift in an SGPR since it should be possible to use m0
3147 // as the result directly. If it's already an SGPR, it will be eliminated
3148 // later.
3149 SDNode *SGPROffset
3150 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
3151 BaseOffset);
3152 // Shift to offset in m0
3153 SDNode *M0Base
3154 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3155 SDValue(SGPROffset, 0),
3156 CurDAG->getTargetConstant(16, SL, MVT::i32));
3157 glueCopyToM0(N, SDValue(M0Base, 0));
3158 }
3159
3160 SDValue Chain = N->getOperand(0);
3161 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
3162
3163 const unsigned Opc = gwsIntrinToOpcode(IntrID);
3164
3165 const MCInstrDesc &InstrDesc = TII->get(Opc);
3166 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
3167
3168 const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx);
3169
3171 if (HasVSrc) {
3172 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3173
3174 SDValue Data = N->getOperand(2);
3175 MVT DataVT = Data.getValueType().getSimpleVT();
3176 if (TRI->isTypeLegalForClass(*DataRC, DataVT)) {
3177 // Normal 32-bit case.
3178 Ops.push_back(N->getOperand(2));
3179 } else {
3180 // Operand is really 32-bits, but requires 64-bit alignment, so use the
3181 // even aligned 64-bit register class.
3182 const SDValue RegSeqOps[] = {
3183 CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data,
3184 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3185 SDValue(
3186 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32),
3187 0),
3188 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)};
3189
3190 Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
3191 SL, MVT::v2i32, RegSeqOps),
3192 0));
3193 }
3194 }
3195
3196 Ops.push_back(OffsetField);
3197 Ops.push_back(Chain);
3198
3199 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3200 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3201}
3202
3203void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
3204 if (Subtarget->getLDSBankCount() != 16) {
3205 // This is a single instruction with a pattern.
3206 SelectCode(N);
3207 return;
3208 }
3209
3210 SDLoc DL(N);
3211
3212 // This requires 2 instructions. It is possible to write a pattern to support
3213 // this, but the generated isel emitter doesn't correctly deal with multiple
3214 // output instructions using the same physical register input. The copy to m0
3215 // is incorrectly placed before the second instruction.
3216 //
3217 // TODO: Match source modifiers.
3218 //
3219 // def : Pat <
3220 // (int_amdgcn_interp_p1_f16
3221 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
3222 // (i32 timm:$attrchan), (i32 timm:$attr),
3223 // (i1 timm:$high), M0),
3224 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
3225 // timm:$attrchan, 0,
3226 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
3227 // let Predicates = [has16BankLDS];
3228 // }
3229
3230 // 16 bank LDS
3231 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
3232 N->getOperand(5), SDValue());
3233
3234 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
3235
3236 SDNode *InterpMov =
3237 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
3238 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
3239 N->getOperand(3), // Attr
3240 N->getOperand(2), // Attrchan
3241 ToM0.getValue(1) // In glue
3242 });
3243
3244 SDNode *InterpP1LV =
3245 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
3246 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
3247 N->getOperand(1), // Src0
3248 N->getOperand(3), // Attr
3249 N->getOperand(2), // Attrchan
3250 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
3251 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
3252 N->getOperand(4), // high
3253 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
3254 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
3255 SDValue(InterpMov, 1)
3256 });
3257
3258 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
3259}
3260
3261void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3262 unsigned IntrID = N->getConstantOperandVal(1);
3263 switch (IntrID) {
3264 case Intrinsic::amdgcn_ds_append:
3265 case Intrinsic::amdgcn_ds_consume: {
3266 if (N->getValueType(0) != MVT::i32)
3267 break;
3268 SelectDSAppendConsume(N, IntrID);
3269 return;
3270 }
3271 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3272 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3273 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3274 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3275 SelectDSBvhStackIntrinsic(N, IntrID);
3276 return;
3277 case Intrinsic::amdgcn_init_whole_wave:
3278 CurDAG->getMachineFunction()
3279 .getInfo<SIMachineFunctionInfo>()
3280 ->setInitWholeWave();
3281 break;
3282 }
3283
3284 SelectCode(N);
3285}
3286
3287void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
3288 unsigned IntrID = N->getConstantOperandVal(0);
3289 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
3290 SDNode *ConvGlueNode = N->getGluedNode();
3291 if (ConvGlueNode) {
3292 // FIXME: Possibly iterate over multiple glue nodes?
3293 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
3294 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
3295 ConvGlueNode =
3296 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
3297 MVT::Glue, SDValue(ConvGlueNode, 0));
3298 } else {
3299 ConvGlueNode = nullptr;
3300 }
3301 switch (IntrID) {
3302 case Intrinsic::amdgcn_wqm:
3303 Opcode = AMDGPU::WQM;
3304 break;
3305 case Intrinsic::amdgcn_softwqm:
3306 Opcode = AMDGPU::SOFT_WQM;
3307 break;
3308 case Intrinsic::amdgcn_wwm:
3309 case Intrinsic::amdgcn_strict_wwm:
3310 Opcode = AMDGPU::STRICT_WWM;
3311 break;
3312 case Intrinsic::amdgcn_strict_wqm:
3313 Opcode = AMDGPU::STRICT_WQM;
3314 break;
3315 case Intrinsic::amdgcn_interp_p1_f16:
3316 SelectInterpP1F16(N);
3317 return;
3318 case Intrinsic::amdgcn_permlane16_swap:
3319 case Intrinsic::amdgcn_permlane32_swap: {
3320 if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
3321 !Subtarget->hasPermlane16Swap()) ||
3322 (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3323 !Subtarget->hasPermlane32Swap())) {
3324 SelectCode(N); // Hit the default error
3325 return;
3326 }
3327
3328 Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3329 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3330 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3331
3332 SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
3333 if (ConvGlueNode)
3334 NewOps.push_back(SDValue(ConvGlueNode, 0));
3335
3336 bool FI = N->getConstantOperandVal(3);
3337 NewOps[2] = CurDAG->getTargetConstant(
3338 FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);
3339
3340 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
3341 return;
3342 }
3343 default:
3344 SelectCode(N);
3345 break;
3346 }
3347
3348 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
3349 SDValue Src = N->getOperand(1);
3350 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
3351 }
3352
3353 if (ConvGlueNode) {
3354 SmallVector<SDValue, 4> NewOps(N->ops());
3355 NewOps.push_back(SDValue(ConvGlueNode, 0));
3356 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
3357 }
3358}
3359
3360void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3361 unsigned IntrID = N->getConstantOperandVal(1);
3362 switch (IntrID) {
3363 case Intrinsic::amdgcn_ds_gws_init:
3364 case Intrinsic::amdgcn_ds_gws_barrier:
3365 case Intrinsic::amdgcn_ds_gws_sema_v:
3366 case Intrinsic::amdgcn_ds_gws_sema_br:
3367 case Intrinsic::amdgcn_ds_gws_sema_p:
3368 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3369 SelectDS_GWS(N, IntrID);
3370 return;
3371 case Intrinsic::amdgcn_tensor_load_to_lds:
3372 case Intrinsic::amdgcn_tensor_store_from_lds:
3373 SelectTensorLoadStore(N, IntrID);
3374 return;
3375 default:
3376 break;
3377 }
3378
3379 SelectCode(N);
3380}
3381
3382void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3383 SDValue Log2WaveSize =
3384 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
3385 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
3386 {N->getOperand(0), Log2WaveSize});
3387}
3388
3389void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
3390 SDValue SrcVal = N->getOperand(1);
3391 if (SrcVal.getValueType() != MVT::i32) {
3392 SelectCode(N); // Emit default error
3393 return;
3394 }
3395
3396 SDValue CopyVal;
3397 Register SP = TLI->getStackPointerRegisterToSaveRestore();
3398 SDLoc SL(N);
3399
3400 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
3401 CopyVal = SrcVal.getOperand(0);
3402 } else {
3403 SDValue Log2WaveSize = CurDAG->getTargetConstant(
3404 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
3405
3406 if (N->isDivergent()) {
3407 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
3408 MVT::i32, SrcVal),
3409 0);
3410 }
3411
3412 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3413 {SrcVal, Log2WaveSize}),
3414 0);
3415 }
3416
3417 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
3418 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
3419}
3420
3421bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3422 unsigned &Mods,
3423 bool IsCanonicalizing,
3424 bool AllowAbs) const {
3425 Mods = SISrcMods::NONE;
3426 Src = In;
3427
3428 if (Src.getOpcode() == ISD::FNEG) {
3429 Mods |= SISrcMods::NEG;
3430 Src = Src.getOperand(0);
3431 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3432 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3433 // denormal mode, but we're implicitly canonicalizing in a source operand.
3434 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3435 if (LHS && LHS->isZero()) {
3436 Mods |= SISrcMods::NEG;
3437 Src = Src.getOperand(1);
3438 }
3439 }
3440
3441 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3442 Mods |= SISrcMods::ABS;
3443 Src = Src.getOperand(0);
3444 }
3445
3446 if (Mods != SISrcMods::NONE)
3447 return true;
3448
3449 // Convert various sign-bit masks on integers to src mods. Currently disabled
3450 // for 16-bit types as the codegen replaces the operand without adding a
3451 // srcmod. This is intentionally finding the cases where we are performing
3452 // float neg and abs on int types, the goal is not to obtain two's complement
3453 // neg or abs. Limit converison to select operands via the nonCanonalizing
3454 // pattern.
3455 // TODO: Add 16-bit support.
3456 if (IsCanonicalizing)
3457 return true;
3458
3459 // v2i32 xor/or/and are legal. A vselect using these instructions as operands
3460 // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
3461 // through the extract to the bitwise op.
3462 SDValue PeekSrc =
3463 Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
3464 // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
3465 // types as the codegen replaces the operand without adding a srcmod.
3466 // This is intentionally finding the cases where we are performing float neg
3467 // and abs on int types, the goal is not to obtain two's complement neg or
3468 // abs.
3469 // TODO: Add 16-bit support.
3470 unsigned Opc = PeekSrc.getOpcode();
3471 EVT VT = Src.getValueType();
3472 if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3473 (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
3474 return true;
3475
3476 ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc->getOperand(1));
3477 if (!CRHS)
3478 return true;
3479
3480 auto ReplaceSrc = [&]() -> SDValue {
3481 if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3482 return Src.getOperand(0);
3483
3484 SDValue LHS = PeekSrc->getOperand(0);
3485 SDValue Index = Src->getOperand(1);
3486 return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
3487 Src.getValueType(), LHS, Index);
3488 };
3489
3490 // Recognise Srcmods:
3491 // (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
3492 // (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
3493 // (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
3494 // SrcModifiers.
3495 if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
3496 Mods |= SISrcMods::NEG;
3497 Src = ReplaceSrc();
3498 } else if (Opc == ISD::AND && AllowAbs &&
3499 CRHS->getAPIntValue().isMaxSignedValue()) {
3500 Mods |= SISrcMods::ABS;
3501 Src = ReplaceSrc();
3502 } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
3504 Src = ReplaceSrc();
3505 }
3506
3507 return true;
3508}
3509
3510bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3511 SDValue &SrcMods) const {
3512 unsigned Mods;
3513 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3514 /*AllowAbs=*/true)) {
3515 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3516 return true;
3517 }
3518
3519 return false;
3520}
3521
3522bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3523 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3524 unsigned Mods;
3525 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3526 /*AllowAbs=*/true)) {
3527 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3528 return true;
3529 }
3530
3531 return false;
3532}
3533
3534bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3535 SDValue &SrcMods) const {
3536 unsigned Mods;
3537 if (SelectVOP3ModsImpl(In, Src, Mods,
3538 /*IsCanonicalizing=*/true,
3539 /*AllowAbs=*/false)) {
3540 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3541 return true;
3542 }
3543
3544 return false;
3545}
3546
3547bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3548 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3549 return false;
3550
3551 Src = In;
3552 return true;
3553}
3554
3555bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3556 SDValue &SrcMods,
3557 bool OpSel) const {
3558 unsigned Mods;
3559 if (SelectVOP3ModsImpl(In, Src, Mods,
3560 /*IsCanonicalizing=*/true,
3561 /*AllowAbs=*/false)) {
3562 if (OpSel)
3563 Mods |= SISrcMods::OP_SEL_0;
3564 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3565 return true;
3566 }
3567
3568 return false;
3569}
3570
3571bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3572 SDValue &SrcMods) const {
3573 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3574}
3575
3576bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3577 SDValue &SrcMods) const {
3578 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3579}
3580
3581bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3582 SDValue &SrcMods, SDValue &Clamp,
3583 SDValue &Omod) const {
3584 SDLoc DL(In);
3585 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3586 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3587
3588 return SelectVOP3Mods(In, Src, SrcMods);
3589}
3590
3591bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3592 SDValue &SrcMods, SDValue &Clamp,
3593 SDValue &Omod) const {
3594 SDLoc DL(In);
3595 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3596 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3597
3598 return SelectVOP3BMods(In, Src, SrcMods);
3599}
3600
3601bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3602 SDValue &Clamp, SDValue &Omod) const {
3603 Src = In;
3604
3605 SDLoc DL(In);
3606 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3607 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3608
3609 return true;
3610}
3611
3612bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3613 SDValue &SrcMods, bool IsDOT) const {
3614 unsigned Mods = SISrcMods::NONE;
3615 Src = In;
3616
3617 // TODO: Handle G_FSUB 0 as fneg
3618 if (Src.getOpcode() == ISD::FNEG) {
3620 Src = Src.getOperand(0);
3621 }
3622
3623 // 64-bit VOP3P instructions do not have OPSEL or ABS.
3624 bool HasOpSel = Src.getValueSizeInBits() != 128;
3625
3626 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3627 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3628 unsigned VecMods = Mods;
3629
3630 SDValue Lo = stripBitcast(Src.getOperand(0));
3631 SDValue Hi = stripBitcast(Src.getOperand(1));
3632
3633 if (Lo.getOpcode() == ISD::FNEG) {
3634 Lo = stripBitcast(Lo.getOperand(0));
3635 Mods ^= SISrcMods::NEG;
3636 }
3637
3638 if (Hi.getOpcode() == ISD::FNEG) {
3639 Hi = stripBitcast(Hi.getOperand(0));
3640 Mods ^= SISrcMods::NEG_HI;
3641 }
3642
3643 if (HasOpSel) {
3644 if (isExtractHiElt(Lo, Lo))
3645 Mods |= SISrcMods::OP_SEL_0;
3646
3647 if (isExtractHiElt(Hi, Hi))
3648 Mods |= SISrcMods::OP_SEL_1;
3649 }
3650
3651 unsigned VecSize = Src.getValueSizeInBits();
3652 Lo = stripExtractLoElt(Lo);
3653 Hi = stripExtractLoElt(Hi);
3654
3655 if (Lo.getValueSizeInBits() > VecSize) {
3656 Lo = CurDAG->getTargetExtractSubreg(
3657 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3658 MVT::getIntegerVT(VecSize), Lo);
3659 }
3660
3661 if (Hi.getValueSizeInBits() > VecSize) {
3662 Hi = CurDAG->getTargetExtractSubreg(
3663 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3664 MVT::getIntegerVT(VecSize), Hi);
3665 }
3666
3667 assert(Lo.getValueSizeInBits() <= VecSize &&
3668 Hi.getValueSizeInBits() <= VecSize);
3669
3670 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3671 // Really a scalar input. Just select from the low half of the register to
3672 // avoid packing.
3673
3674 if (VecSize == Lo.getValueSizeInBits()) {
3675 Src = Lo;
3676 } else if (VecSize == 32) {
3677 Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
3678 } else {
3679 assert((Lo.getValueSizeInBits() == 32 && VecSize == 64) ||
3680 (Lo.getValueSizeInBits() == 64 && VecSize == 128));
3681
3682 SDLoc SL(In);
3684 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3685 Lo.getValueType()), 0);
3686 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3687 // <2 x 64> instructions do not have OPSEL and also replicate low 64
3688 // bits of a scalar input into high 64 bits. Use VGPRs in this case.
3689 // TODO: This fact can be exploited but we need to set proper OPSEL for
3690 // codegen folding purposes. It will not affect a final instruction.
3691 auto RC = (Lo->isDivergent() || !HasOpSel)
3692 ? TRI->getVGPRClassForBitWidth(VecSize)
3693 : TRI->getSGPRClassForBitWidth(VecSize);
3694 unsigned NumRegs = Lo.getValueSizeInBits() == 32 ? 1 : 2;
3695 const SDValue Ops[] = {
3696 CurDAG->getTargetConstant(RC->getID(), SL, MVT::i32), Lo,
3697 CurDAG->getTargetConstant(TRI->getSubRegFromChannel(0, NumRegs), SL,
3698 MVT::i32),
3699 HasOpSel ? Undef : Hi,
3700 CurDAG->getTargetConstant(
3701 TRI->getSubRegFromChannel(NumRegs, NumRegs), SL, MVT::i32)};
3702
3703 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3704 Src.getValueType(), Ops), 0);
3705 }
3706 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3707 return true;
3708 }
3709
3710 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3711 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3712 .bitcastToAPInt().getZExtValue();
3713 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3714 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3715 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3716 return true;
3717 }
3718 }
3719
3720 Mods = VecMods;
3721 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3722 Src.getNumOperands() == 2) {
3723
3724 // TODO: We should repeat the build_vector source check above for the
3725 // vector_shuffle for negates and casts of individual elements.
3726
3727 assert(Src.getValueSizeInBits() != 128 &&
3728 "<2 x 64> VECTOR_SHUFFLE should not be legal.");
3729
3730 auto *SVN = cast<ShuffleVectorSDNode>(Src);
3731 ArrayRef<int> Mask = SVN->getMask();
3732
3733 if (Mask[0] < 2 && Mask[1] < 2) {
3734 // src1 should be undef.
3735 SDValue ShuffleSrc = SVN->getOperand(0);
3736
3737 if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3738 ShuffleSrc = ShuffleSrc.getOperand(0);
3740 }
3741
3742 if (Mask[0] == 1)
3743 Mods |= SISrcMods::OP_SEL_0;
3744 if (Mask[1] == 1)
3745 Mods |= SISrcMods::OP_SEL_1;
3746
3747 Src = ShuffleSrc;
3748 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3749 return true;
3750 }
3751 }
3752
3753 // Packed instructions do not have abs modifiers.
3754 Mods |= SISrcMods::OP_SEL_1;
3755
3756 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3757 return true;
3758}
3759
3760bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3761 SDValue &SrcMods) const {
3762 return SelectVOP3PMods(In, Src, SrcMods, true);
3763}
3764
3765bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const {
3766 SDValue SrcTmp, SrcModsTmp;
3767 SelectVOP3PMods(In, SrcTmp, SrcModsTmp, true);
3768 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3769 Src = SrcTmp;
3770 return true;
3771 }
3772
3773 return false;
3774}
3775
3776bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src,
3777 SDValue &SrcMods) const {
3778 SelectVOP3Mods(In, Src, SrcMods);
3779 unsigned Mods = SISrcMods::OP_SEL_1;
3780 Mods |= cast<ConstantSDNode>(SrcMods)->getZExtValue();
3781 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3782 return true;
3783}
3784
3785bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const {
3786 SDValue SrcTmp, SrcModsTmp;
3787 SelectVOP3PModsF32(In, SrcTmp, SrcModsTmp);
3788 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3789 Src = SrcTmp;
3790 return true;
3791 }
3792
3793 return false;
3794}
3795
3796bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3797 SDValue &Src) const {
3798 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3799 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3800
3801 unsigned Mods = SISrcMods::OP_SEL_1;
3802 unsigned SrcVal = C->getZExtValue();
3803 if (SrcVal == 1)
3804 Mods |= SISrcMods::OP_SEL_0;
3805
3806 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3807 return true;
3808}
3809
// Build a REG_SEQUENCE machine node out of the values in Elts.
// The element count selects the destination register class and result type:
//   8 -> VReg_256 / v8i32, 4 -> VReg_128 / v4i32, 2 -> VReg_64 / v2i32.
// Any other count reaches llvm_unreachable.
// NOTE(review): this listing appears to have lost a few lines in this
// function (the return type, the declaration of Ops and the tail of the second
// push_back call) - confirm against the upstream file before editing.
3811AMDGPUDAGToDAGISel::buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3812 const SDLoc &DL) const {
3813 unsigned DstRegClass;
3814 EVT DstTy;
3815 switch (Elts.size()) {
3816 case 8:
3817 DstRegClass = AMDGPU::VReg_256RegClassID;
3818 DstTy = MVT::v8i32;
3819 break;
3820 case 4:
3821 DstRegClass = AMDGPU::VReg_128RegClassID;
3822 DstTy = MVT::v4i32;
3823 break;
3824 case 2:
3825 DstRegClass = AMDGPU::VReg_64RegClassID;
3826 DstTy = MVT::v2i32;
3827 break;
3828 default:
3829 llvm_unreachable("unhandled Reg sequence size");
3830 }
3831
// Operand list: the register class id first, then for every element the
// value followed by one more target constant.
3833 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3834 for (unsigned i = 0; i < Elts.size(); ++i) {
3835 Ops.push_back(Elts[i]);
3836 Ops.push_back(CurDAG->getTargetConstant(
3838 }
3839 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3840}
3841
// Pack 8 or 16 sixteen-bit values from Elts pairwise into 32-bit values and
// hand the packed list to buildRegSequence32.
// Note that on the true16 path the entries of Elts are overwritten in place.
// NOTE(review): the return-type line and the declaration of Undef are missing
// from this listing - confirm against the upstream file.
3843AMDGPUDAGToDAGISel::buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3844 const SDLoc &DL) const {
3845 SmallVector<SDValue, 8> PackedElts;
3846 assert("unhandled Reg sequence size" &&
3847 (Elts.size() == 8 || Elts.size() == 16));
3848
3849 // Pack 16-bit elements in pairs into 32-bit register. If both elements are
3850 // unpacked from 32-bit source use it, otherwise pack them using v_perm.
3851 for (unsigned i = 0; i < Elts.size(); i += 2) {
3852 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3853 SDValue HiSrc;
// Both halves come from the same 32-bit value: reuse that value directly.
3854 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3855 PackedElts.push_back(HiSrc);
3856 } else {
3857 if (Subtarget->useRealTrue16Insts()) {
3858 // FIXME-TRUE16. For now pack VGPR_32 for 16-bit source before
3859 // passing to v_perm_b32. Eventually we should use replace v_perm_b32
3860 // by reg_sequence.
3862 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i16),
3863 0);
// Widen each 16-bit element to a VGPR_32 whose hi16 half is Undef.
3864 Elts[i] =
3865 emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID, MVT::i32,
3866 {Elts[i], Undef}, {AMDGPU::lo16, AMDGPU::hi16}, DL);
3867 Elts[i + 1] = emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID,
3868 MVT::i32, {Elts[i + 1], Undef},
3869 {AMDGPU::lo16, AMDGPU::hi16}, DL);
3870 }
// V_PERM_B32 with selector 0x05040100 combines the pair; the odd element is
// passed as the first source, the even element as the second.
3871 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3872 MachineSDNode *Packed =
3873 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3874 {Elts[i + 1], Elts[i], PackLoLo});
3875 PackedElts.push_back(SDValue(Packed, 0));
3876 }
3877 }
3878 return buildRegSequence32(PackedElts, DL);
3879}
3880
// Dispatch on ElementSize: 16 -> buildRegSequence16, 32 -> buildRegSequence32.
// Any other element size reaches llvm_unreachable.
// NOTE(review): the return-type line is missing from this listing.
3882AMDGPUDAGToDAGISel::buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3883 const SDLoc &DL,
3884 unsigned ElementSize) const {
3885 if (ElementSize == 16)
3886 return buildRegSequence16(Elts, DL);
3887 if (ElementSize == 32)
3888 return buildRegSequence32(Elts, DL);
3889 llvm_unreachable("Unhandled element size");
3890}
3891
// Fold a modifier that was found on every element (ModOpcode is ISD::FNEG or
// ISD::FABS; Elts holds the elements with that modifier already stripped)
// into Mods and rebuild Src as a register sequence:
//   FNEG only      -> NEG
//   FNEG of FABS   -> NEG | NEG_HI (the FABS is stripped from every element)
//   FABS           -> NEG_HI
// NOTE(review): the parameter line declaring Elts is missing from this listing.
3892void AMDGPUDAGToDAGISel::selectWMMAModsNegAbs(unsigned ModOpcode,
3893 unsigned &Mods,
3895 SDValue &Src, const SDLoc &DL,
3896 unsigned ElementSize) const {
3897 if (ModOpcode == ISD::FNEG) {
3898 Mods |= SISrcMods::NEG;
3899 // Check if all elements also have abs modifier
3900 SmallVector<SDValue, 8> NegAbsElts;
3901 for (auto El : Elts) {
3902 if (El.getOpcode() != ISD::FABS)
3903 break;
3904 NegAbsElts.push_back(El->getOperand(0));
3905 }
// Equal sizes mean the loop above never hit the break, i.e. every element
// was an FABS.
3906 if (Elts.size() != NegAbsElts.size()) {
3907 // Neg
3908 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3909 } else {
3910 // Neg and Abs
3911 Mods |= SISrcMods::NEG_HI;
3912 Src = SDValue(buildRegSequence(NegAbsElts, DL, ElementSize), 0);
3913 }
3914 } else {
3915 assert(ModOpcode == ISD::FABS);
3916 // Abs
3917 Mods |= SISrcMods::NEG_HI;
3918 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3919 }
3920}
3921
3922// Check all f16 elements for modifiers while looking through b32 and v2b16
3923// build vector, stop if element does not satisfy ModifierCheck.
3924static void
3926 std::function<bool(SDValue)> ModifierCheck) {
3927 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3928 if (auto *F16Pair =
3929 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3930 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3931 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3932 if (!ModifierCheck(ElF16))
3933 break;
3934 }
3935 }
3936 }
3937}
3938
// Match a WMMA 16-bit vector source whose elements are all negated.
// Src defaults to In and the modifier word to OP_SEL_1. If In (looked through
// a bitcast) is a build_vector in which every 16-bit element, or every
// 32-bit operand, is an FNEG, the negations are stripped, Src is rebuilt as a
// register sequence and NEG | NEG_HI are added. Always returns true.
// NOTE(review): the declaration of EltsF16 is missing from this listing.
3939bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3940 SDValue &SrcMods) const {
3941 Src = In;
3942 unsigned Mods = SISrcMods::OP_SEL_1;
3943
3944 // mods are on f16 elements
3945 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3947
3948 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3949 if (Element.getOpcode() != ISD::FNEG)
3950 return false;
3951 EltsF16.push_back(Element.getOperand(0));
3952 return true;
3953 });
3954
3955 // All elements have neg modifier
3956 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3957 Src = SDValue(buildRegSequence16(EltsF16, SDLoc(In)), 0);
3958 Mods |= SISrcMods::NEG;
3959 Mods |= SISrcMods::NEG_HI;
3960 }
3961 }
3962
3963 // mods are on v2f16 elements
3964 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3965 SmallVector<SDValue, 8> EltsV2F16;
3966 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3967 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3968 // Only the neg modifier is matched here; stop at the first operand without it
3969 if (ElV2f16.getOpcode() != ISD::FNEG)
3970 break;
3971 EltsV2F16.push_back(ElV2f16.getOperand(0));
3972 }
3973
3974 // All pairs of elements have neg modifier
3975 if (BV->getNumOperands() == EltsV2F16.size()) {
3976 Src = SDValue(buildRegSequence32(EltsV2F16, SDLoc(In)), 0);
3977 Mods |= SISrcMods::NEG;
3978 Mods |= SISrcMods::NEG_HI;
3979 }
3980 }
3981
3982 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3983 return true;
3984}
3985
// Match a WMMA 16-bit vector source whose elements all carry the same
// modifier, either FNEG or FABS. Src defaults to In and the modifier word to
// OP_SEL_1. When every 16-bit element, or every 32-bit operand, of the
// build_vector has the modifier chosen from the first element, the work is
// delegated to selectWMMAModsNegAbs. Always returns true.
// ModOpcode is assigned before its first read as long as the build_vector has
// at least one operand.
// NOTE(review): the declaration of EltsF16 is missing from this listing.
3986bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3987 SDValue &SrcMods) const {
3988 Src = In;
3989 unsigned Mods = SISrcMods::OP_SEL_1;
3990 unsigned ModOpcode;
3991
3992 // mods are on f16 elements
3993 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3995 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3996 // Based on first element decide which mod we match, neg or abs
3997 if (EltsF16.empty())
3998 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3999 if (ElF16.getOpcode() != ModOpcode)
4000 return false;
4001 EltsF16.push_back(ElF16.getOperand(0));
4002 return true;
4003 });
4004
4005 // All elements have ModOpcode modifier
4006 if (BV->getNumOperands() * 2 == EltsF16.size())
4007 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, SDLoc(In), 16);
4008 }
4009
4010 // mods are on v2f16 elements
4011 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
4012 SmallVector<SDValue, 8> EltsV2F16;
4013
4014 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
4015 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
4016 // Based on first element decide which mod we match, neg or abs
4017 if (EltsV2F16.empty())
4018 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
4019 if (ElV2f16->getOpcode() != ModOpcode)
4020 break;
4021 EltsV2F16.push_back(ElV2f16->getOperand(0));
4022 }
4023
4024 // All elements have ModOpcode modifier
4025 if (BV->getNumOperands() == EltsV2F16.size())
4026 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, SDLoc(In), 32);
4027 }
4028
4029 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4030 return true;
4031}
4032
// Match a WMMA 32-bit vector source whose elements all carry the same
// modifier, either FNEG or FABS (chosen from the first element). Src defaults
// to In and the modifier word to OP_SEL_1; when every operand of the
// build_vector has the chosen modifier the work is delegated to
// selectWMMAModsNegAbs with element size 32. Always returns true.
// NOTE(review): the declaration of EltsF32 is missing from this listing.
4033bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
4034 SDValue &SrcMods) const {
4035 Src = In;
4036 unsigned Mods = SISrcMods::OP_SEL_1;
4038
4039 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
4040 assert(BV->getNumOperands() > 0);
4041 // Based on first element decide which mod we match, neg or abs
4042 SDValue ElF32 = stripBitcast(BV->getOperand(0));
4043 unsigned ModOpcode =
4044 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
// Collect the stripped operands; stop at the first one without the modifier.
4045 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
4046 SDValue ElF32 = stripBitcast(BV->getOperand(i));
4047 if (ElF32.getOpcode() != ModOpcode)
4048 break;
4049 EltsF32.push_back(ElF32.getOperand(0));
4050 }
4051
4052 // All elements had ModOpcode modifier
4053 if (BV->getNumOperands() == EltsF32.size())
4054 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, SDLoc(In), 32);
4055 }
4056
4057 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4058 return true;
4059}
4060
4061bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
4062 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
4063 BitVector UndefElements;
4064 if (SDValue Splat = BV->getSplatValue(&UndefElements))
4065 if (isInlineImmediate(Splat.getNode())) {
4066 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
4067 unsigned Imm = C->getAPIntValue().getSExtValue();
4068 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4069 return true;
4070 }
4071 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
4072 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
4073 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4074 return true;
4075 }
4076 llvm_unreachable("unhandled Constant node");
4077 }
4078 }
4079
4080 // 16 bit splat
4081 SDValue SplatSrc32 = stripBitcast(In);
4082 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
4083 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
4084 SDValue SplatSrc16 = stripBitcast(Splat32);
4085 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
4086 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
4087 const SIInstrInfo *TII = Subtarget->getInstrInfo();
4088 std::optional<APInt> RawValue;
4089 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
4090 RawValue = C->getValueAPF().bitcastToAPInt();
4091 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
4092 RawValue = C->getAPIntValue();
4093
4094 if (RawValue.has_value()) {
4095 EVT VT = In.getValueType().getScalarType();
4096 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
4097 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
4100 RawValue.value());
4101 if (TII->isInlineConstant(FloatVal)) {
4102 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4103 MVT::i16);
4104 return true;
4105 }
4106 } else if (VT.getSimpleVT() == MVT::i16) {
4107 if (TII->isInlineConstant(RawValue.value())) {
4108 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4109 MVT::i16);
4110 return true;
4111 }
4112 } else
4113 llvm_unreachable("unknown 16-bit type");
4114 }
4115 }
4116 }
4117
4118 // Currently f64 immediate vectors are represented as vectors of v2i32, with
4119 // different lo and hi 32-bit values even though double values are splated.
4120 // So we have to manually compare to determine whether it is splated.
4121 if (CurDAG->isConstantIntBuildVectorOrConstantInt(SplatSrc32)) {
4122 int64_t Imm64 = 0;
4123 for (unsigned i = 0; i < SplatSrc32->getNumOperands(); i += 2) {
4124 auto Lo32 = cast<ConstantSDNode>(SplatSrc32->getOperand(i));
4125 auto Hi32 = cast<ConstantSDNode>(SplatSrc32->getOperand(i + 1));
4126 int64_t LoImm = Lo32->getAPIntValue().getSExtValue();
4127 int64_t HiImm = Hi32->getAPIntValue().getSExtValue();
4128 int64_t Imm64I = (HiImm << 32) + LoImm;
4129 if (i == 0) {
4130 if (!isInlineImmediate(APInt(64, Imm64I)))
4131 return false;
4132 Imm64 = Imm64I;
4133 } else if (Imm64I != Imm64)
4134 return false;
4135 } // end for
4136
4137 Src = CurDAG->getTargetConstant(Imm64, SDLoc(In), MVT::i64);
4138 return true;
4139 }
4140
4141 return false;
4142}
4143
4144bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
4145 SDValue &IndexKey) const {
4146 unsigned Key = 0;
4147 Src = In;
4148
4149 if (In.getOpcode() == ISD::SRL) {
4150 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4151 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4152 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4153 ShiftAmt->getZExtValue() % 8 == 0) {
4154 Key = ShiftAmt->getZExtValue() / 8;
4155 Src = ShiftSrc;
4156 }
4157 }
4158
4159 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4160 return true;
4161}
4162
4163bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
4164 SDValue &IndexKey) const {
4165 unsigned Key = 0;
4166 Src = In;
4167
4168 if (In.getOpcode() == ISD::SRL) {
4169 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4170 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4171 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4172 ShiftAmt->getZExtValue() == 16) {
4173 Key = 1;
4174 Src = ShiftSrc;
4175 }
4176 }
4177
4178 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4179 return true;
4180}
4181
4182bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
4183 SDValue &IndexKey) const {
4184 unsigned Key = 0;
4185 Src = In;
4186
4187 SDValue InI32;
4188
4189 if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
4190 const SDValue &ExtendSrc = In.getOperand(0);
4191 if (ExtendSrc.getValueSizeInBits() == 32)
4192 InI32 = ExtendSrc;
4193 } else if (In->getOpcode() == ISD::BITCAST) {
4194 const SDValue &CastSrc = In.getOperand(0);
4195 if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
4196 CastSrc.getOperand(0).getValueSizeInBits() == 32) {
4197 ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
4198 if (Zero && Zero->getZExtValue() == 0)
4199 InI32 = CastSrc.getOperand(0);
4200 }
4201 }
4202
4203 if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4204 const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
4205 ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
4206 if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
4207 EltIdx->getZExtValue() == 1) {
4208 Key = 1;
4209 Src = ExtractVecEltSrc;
4210 }
4211 }
4212
4213 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4214 return true;
4215}
4216
4217bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
4218 SDValue &SrcMods) const {
4219 Src = In;
4220 // FIXME: Handle op_sel
4221 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
4222 return true;
4223}
4224
4225bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
4226 SDValue &SrcMods) const {
4227 // FIXME: Handle op_sel
4228 return SelectVOP3Mods(In, Src, SrcMods);
4229}
4230
4231// Match lowered fpext from bf16 to f32. This is a bit operation extending
4232// a 16-bit value with 16-bit of zeroes at LSB:
4233//
4234// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
4235// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
4236// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
4237static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
4238 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
4239 return SDValue();
4240 Op = Op.getOperand(0);
4241
4242 IsExtractHigh = false;
4243 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
4244 auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
4245 if (!Low16 || !Low16->isZero())
4246 return SDValue();
4247 Op = stripBitcast(Op.getOperand(1));
4248 if (Op.getValueType() != MVT::bf16)
4249 return SDValue();
4250 return Op;
4251 }
4252
4253 if (Op.getValueType() != MVT::i32)
4254 return SDValue();
4255
4256 if (Op.getOpcode() == ISD::AND) {
4257 if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4258 if (Mask->getZExtValue() == 0xffff0000) {
4259 IsExtractHigh = true;
4260 return Op.getOperand(0);
4261 }
4262 }
4263 return SDValue();
4264 }
4265
4266 if (Op.getOpcode() == ISD::SHL) {
4267 if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4268 if (Amt->getZExtValue() == 16)
4269 return Op.getOperand(0);
4270 }
4271 }
4272
4273 return SDValue();
4274}
4275
4276// The return value is not whether the match is possible (which it always is),
4277// but whether or not a conversion is really used.
//
// In is first run through SelectVOP3ModsImpl. The result must then be an
// FP_EXTEND or, for VT == bf16 only, one of the bit patterns recognised by
// matchBF16FPExtendLike; otherwise false is returned. On success Src is the
// value feeding the conversion and Mods holds the folded neg/abs bits plus
// OP_SEL_1, and OP_SEL_0 when the high half of a 32-bit value is selected.
4278bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
4279 unsigned &Mods,
4280 MVT VT) const {
4281 Mods = 0;
4282 SelectVOP3ModsImpl(In, Src, Mods);
4283
4284 bool IsExtractHigh = false;
4285 if (Src.getOpcode() == ISD::FP_EXTEND) {
4286 Src = Src.getOperand(0);
4287 } else if (VT == MVT::bf16) {
4288 SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
4289 if (!B16)
4290 return false;
4291 Src = B16;
4292 } else
4293 return false;
4294
// The converted value must have type VT; for bf16 an i32 source (as returned
// by the and/shl forms of matchBF16FPExtendLike) is accepted as well.
4295 if (Src.getValueType() != VT &&
4296 (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
4297 return false;
4298
4299 Src = stripBitcast(Src);
4300
4301 // Be careful about folding modifiers if we already have an abs. fneg is
4302 // applied last, so we don't want to apply an earlier fneg.
4303 if ((Mods & SISrcMods::ABS) == 0) {
4304 unsigned ModsTmp;
4305 SelectVOP3ModsImpl(Src, Src, ModsTmp);
4306
// An inner fneg toggles the outer one; an inner fabs is simply added.
4307 if ((ModsTmp & SISrcMods::NEG) != 0)
4308 Mods ^= SISrcMods::NEG;
4309
4310 if ((ModsTmp & SISrcMods::ABS) != 0)
4311 Mods |= SISrcMods::ABS;
4312 }
4313
4314 // op_sel/op_sel_hi decide the source type and source.
4315 // If the source's op_sel_hi is set, it indicates to do a conversion from
4316 // fp16. If the sources's op_sel is set, it picks the high half of the source
4317 // register.
4318
4319 Mods |= SISrcMods::OP_SEL_1;
4320 if (Src.getValueSizeInBits() == 16) {
4321 if (isExtractHiElt(Src, Src)) {
4322 Mods |= SISrcMods::OP_SEL_0;
4323
4324 // TODO: Should we try to look for neg/abs here?
4325 return true;
4326 }
4327
// A truncate of an i32 reads the low half: use the i32 value directly.
4328 if (Src.getOpcode() == ISD::TRUNCATE &&
4329 Src.getOperand(0).getValueType() == MVT::i32) {
4330 Src = Src.getOperand(0);
4331 return true;
4332 }
4333
4334 if (Subtarget->useRealTrue16Insts())
4335 // In true16 mode, pack src to a 32bit
4336 Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
4337 } else if (IsExtractHigh)
4338 Mods |= SISrcMods::OP_SEL_0;
4339
4340 return true;
4341}
4342
4343bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4344 SDValue &SrcMods) const {
4345 unsigned Mods = 0;
4346 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
4347 return false;
4348 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4349 return true;
4350}
4351
4352bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4353 SDValue &SrcMods) const {
4354 unsigned Mods = 0;
4355 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
4356 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4357 return true;
4358}
4359
4360bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4361 SDValue &SrcMods) const {
4362 unsigned Mods = 0;
4363 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
4364 return false;
4365 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4366 return true;
4367}
4368
4369bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4370 SDValue &SrcMods) const {
4371 unsigned Mods = 0;
4372 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
4373 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4374 return true;
4375}
4376
4377// Match BITOP3 operation and return a number of matched instructions plus
4378// truth table.
//
// In is the root of an AND/OR/XOR tree; Src accumulates up to three distinct
// leaf operands, whose truth-table columns are 0xf0, 0xcc and 0xaa in slot
// order. Returns {0, 0} when In is not AND/OR/XOR or when its operands cannot
// be expressed with the available slots (Src is restored in that case);
// otherwise returns {number of matched opcodes, 8-bit truth table}.
// NOTE(review): the second parameter line (Src) is missing from this listing.
4379static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
4381 unsigned NumOpcodes = 0;
4382 uint8_t LHSBits, RHSBits;
4383
// getOperandBits: compute the truth-table column for Op. Constants map to
// 0x00 / 0xff, a known source maps to its slot's column, the slot currently
// holding In is replaced by Op, and otherwise Op takes a new slot if one is
// free. Returns false when no slot is available.
4384 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
4385 // Define truth table given Src0, Src1, Src2 bits permutations:
4386 // 0 0 0
4387 // 0 0 1
4388 // 0 1 0
4389 // 0 1 1
4390 // 1 0 0
4391 // 1 0 1
4392 // 1 1 0
4393 // 1 1 1
4394 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4395
4396 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
4397 if (C->isAllOnes()) {
4398 Bits = 0xff;
4399 return true;
4400 }
4401 if (C->isZero()) {
4402 Bits = 0;
4403 return true;
4404 }
4405 }
4406
4407 for (unsigned I = 0; I < Src.size(); ++I) {
4408 // Try to find existing reused operand
4409 if (Src[I] == Op) {
4410 Bits = SrcBits[I];
4411 return true;
4412 }
4413 // Try to replace parent operator
4414 if (Src[I] == In) {
4415 Bits = SrcBits[I];
4416 Src[I] = Op;
4417 return true;
4418 }
4419 }
4420
4421 if (Src.size() == 3) {
4422 // No room left for operands. Try one last time, there can be a 'not' of
4423 // one of our source operands. In this case we can compute the bits
4424 // without growing Src vector.
4425 if (Op.getOpcode() == ISD::XOR) {
4426 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4427 if (C->isAllOnes()) {
4428 SDValue LHS = Op.getOperand(0);
4429 for (unsigned I = 0; I < Src.size(); ++I) {
4430 if (Src[I] == LHS) {
4431 Bits = ~SrcBits[I];
4432 return true;
4433 }
4434 }
4435 }
4436 }
4437 }
4438
4439 return false;
4440 }
4441
4442 Bits = SrcBits[Src.size()];
4443 Src.push_back(Op);
4444 return true;
4445 };
4446
4447 switch (In.getOpcode()) {
4448 case ISD::AND:
4449 case ISD::OR:
4450 case ISD::XOR: {
4451 SDValue LHS = In.getOperand(0);
4452 SDValue RHS = In.getOperand(1);
4453
// Keep a copy of Src so a failed match leaves the caller's vector untouched.
4454 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
4455 if (!getOperandBits(LHS, LHSBits) ||
4456 !getOperandBits(RHS, RHSBits)) {
4457 Src = std::move(Backup);
4458 return std::make_pair(0, 0);
4459 }
4460
4461 // Recursion is naturally limited by the size of the operand vector.
4462 //
4463 // When LHS and RHS share a common sub-expression, one side's recursion
4464 // may decompose that sub-expression and replace the Src slot the other
4465 // side occupies with sub-operands via the "replace parent" path in
4466 // getOperandBits. The other side's cached bit-pattern then refers to a
4467 // slot whose contents changed, producing a wrong truth table.
4468 //
4469 // We detect this in three ways:
4470 // (A) If LHS recursed, its truth table is valid against the Src state
4471 // when LHS recursion completed (SrcAfterLHS). If RHS recursion
4472 // then mutates a Src slot that LHSBits depends on, LHSBits is
4473 // stale.
4474 // (B) If RHS did not recurse, RHSBits came from getOperandBits and
4475 // refers to a specific Src slot. If that slot's contents changed
4476 // (by either recursion), RHSBits is stale.
4477 // (C) Symmetrically for LHS if it did not recurse.
4478 SmallVector<SDValue, 3> SrcBeforeRecurse(Src.begin(), Src.end());
4479 uint8_t LHSBitsOrig = LHSBits;
4480 uint8_t RHSBitsOrig = RHSBits;
4481
4482 auto LHSOp = BitOp3_Op(LHS, Src);
4483 if (LHSOp.first) {
4484 NumOpcodes += LHSOp.first;
4485 LHSBits = LHSOp.second;
4486 }
4487
4488 SmallVector<SDValue, 3> SrcAfterLHS(Src.begin(), Src.end());
4489
4490 auto RHSOp = BitOp3_Op(RHS, Src);
4491 if (RHSOp.first) {
4492 NumOpcodes += RHSOp.first;
4493 RHSBits = RHSOp.second;
4494 }
4495
4496 // dependsOnSlot: true iff the truth table TT varies with slot Slot.
4497 auto dependsOnSlot = [](uint8_t TT, int Slot) -> bool {
4498 if (Slot < 0 || Slot > 2)
4499 return false;
4500 const uint8_t Masks[3] = {0x0f, 0x33, 0x55};
4501 const int Shifts[3] = {4, 2, 1};
4502 return ((TT ^ (TT >> Shifts[Slot])) & Masks[Slot]) != 0;
4503 };
4504
4505 // findSlot: locate the Src slot a getOperandBits result depends on,
4506 // including negated (XOR with -1) patterns that getOperandBits
4507 // resolves via the NOT shortcut (~SrcBits[I]).
4508 const uint8_t SrcBitsConst[3] = {0xf0, 0xcc, 0xaa};
4509 auto findSlot = [&](uint8_t Bits, SDValue Op,
4510 const SmallVectorImpl<SDValue> &S) -> int {
4511 SDValue NegatedInner;
4512 bool IsNegationOp =
4513 Op.getOpcode() == ISD::XOR && isAllOnesConstant(Op.getOperand(1));
4514 if (IsNegationOp)
4515 NegatedInner = Op.getOperand(0);
4516 for (int I = 0; I < (int)S.size(); I++) {
4517 if (Bits == SrcBitsConst[I] && S[I] == Op)
4518 return I;
4519 if (IsNegationOp && Bits == (uint8_t)~SrcBitsConst[I] &&
4520 S[I] == NegatedInner)
4521 return I;
4522 }
4523 return -1;
4524 };
4525
4526 bool Stale = false;
4527
4528 // (A) LHS recursed: its truth table is against SrcAfterLHS.
4529 // Check if RHS recursion mutated a slot that LHSBits uses.
4530 if (LHSOp.first) {
4531 for (int I = 0; I < (int)SrcAfterLHS.size() && I < 3; I++) {
4532 if (I < (int)Src.size() && Src[I] != SrcAfterLHS[I] &&
4533 dependsOnSlot(LHSBits, I)) {
4534 Stale = true;
4535 break;
4536 }
4537 }
4538 }
4539
4540 // (B) RHS did not recurse: RHSBits from getOperandBits is against
4541 // SrcBeforeRecurse. Check if that slot was mutated since then.
4542 if (!Stale && !RHSOp.first) {
4543 int Slot = findSlot(RHSBitsOrig, RHS, SrcBeforeRecurse);
4544 if (Slot >= 0 &&
4545 (Slot >= (int)Src.size() || Src[Slot] != SrcBeforeRecurse[Slot]))
4546 Stale = true;
4547 }
4548
4549 // (C) LHS did not recurse: LHSBits from getOperandBits is against
4550 // SrcBeforeRecurse. Check if that slot was mutated since then.
4551 if (!Stale && !LHSOp.first) {
4552 int Slot = findSlot(LHSBitsOrig, LHS, SrcBeforeRecurse);
4553 if (Slot >= 0 &&
4554 (Slot >= (int)Src.size() || Src[Slot] != SrcBeforeRecurse[Slot]))
4555 Stale = true;
4556 }
4557
// On a stale result drop both recursions and treat LHS and RHS as plain
// leaves, so only this one opcode is counted.
4558 if (Stale) {
4559 Src = std::move(SrcBeforeRecurse);
4560 LHSBits = LHSBitsOrig;
4561 RHSBits = RHSBitsOrig;
4562 NumOpcodes = 0;
4563 }
4564 break;
4565 }
4566 default:
4567 return std::make_pair(0, 0);
4568 }
4569
// Combine the operand columns with the root opcode. The default case below
// cannot be taken: the switch above already returned for other opcodes.
4570 uint8_t TTbl;
4571 switch (In.getOpcode()) {
4572 case ISD::AND:
4573 TTbl = LHSBits & RHSBits;
4574 break;
4575 case ISD::OR:
4576 TTbl = LHSBits | RHSBits;
4577 break;
4578 case ISD::XOR:
4579 TTbl = LHSBits ^ RHSBits;
4580 break;
4581 default:
4582 break;
4583 }
4584
4585 return std::make_pair(NumOpcodes + 1, TTbl);
4586}
4587
// Try to select In as a single BITOP3: on success Src0..Src2 receive the
// (up to three) leaf operands found by BitOp3_Op and Tbl the 8-bit truth table
// as an i32 target constant. Returns false when fewer than two opcodes were
// matched, when no leaf operand was found, or when one of the profitability
// checks below rejects the match.
// NOTE(review): the declaration of Src is missing from this listing.
4588bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4589 SDValue &Src2, SDValue &Tbl) const {
4591 uint8_t TTbl;
4592 unsigned NumOpcodes;
4593
4594 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
4595
4596 // Src.empty() case can happen if all operands are all zero or all ones.
4597 // Normally it shall be optimized out before reaching this.
4598 if (NumOpcodes < 2 || Src.empty())
4599 return false;
4600
4601 // For a uniform case threshold should be higher to account for moves between
4602 // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
4603 // and a readtfirstlane after.
4604 if (NumOpcodes < 4 && !In->isDivergent())
4605 return false;
4606
4607 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
4608 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4609 // asm more readable. This cannot be modeled with AddedComplexity because
4610 // selector does not know how many operations did we match.
4611 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4612 (In.getOperand(0).getOpcode() == In.getOpcode() ||
4613 In.getOperand(1).getOpcode() == In.getOpcode()))
4614 return false;
4615
4616 if (In.getOpcode() == ISD::OR &&
4617 (In.getOperand(0).getOpcode() == ISD::AND ||
4618 In.getOperand(1).getOpcode() == ISD::AND))
4619 return false;
4620 }
4621
4622 // Last operand can be ignored, turning a ternary operation into a binary.
4623 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4624 // 'c' with 'a' here without changing the answer. In some pathological
4625 // cases it should be possible to get an operation with a single operand
4626 // too if optimizer would not catch it.
4627 while (Src.size() < 3)
4628 Src.push_back(Src[0]);
4629
4630 Src0 = Src[0];
4631 Src1 = Src[1];
4632 Src2 = Src[2];
4633
4634 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
4635 return true;
4636}
4637
4638SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4639 if (In.isUndef())
4640 return CurDAG->getUNDEF(MVT::i32);
4641
4642 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
4643 SDLoc SL(In);
4644 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
4645 }
4646
4647 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
4648 SDLoc SL(In);
4649 return CurDAG->getConstant(
4650 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
4651 }
4652
4653 SDValue Src;
4654 if (isExtractHiElt(In, Src))
4655 return Src;
4656
4657 return SDValue();
4658}
4659
4660bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
4661 assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
4662
4663 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
4664 const SIInstrInfo *SII = Subtarget->getInstrInfo();
4665
4666 unsigned Limit = 0;
4667 bool AllUsesAcceptSReg = true;
4668 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
4669 Limit < 10 && U != E; ++U, ++Limit) {
4670 const TargetRegisterClass *RC =
4671 getOperandRegClass(U->getUser(), U->getOperandNo());
4672
4673 // If the register class is unknown, it could be an unknown
4674 // register class that needs to be an SGPR, e.g. an inline asm
4675 // constraint
4676 if (!RC || SIRI->isSGPRClass(RC))
4677 return false;
4678
4679 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass &&
4680 RC != &AMDGPU::VS_64_Align2RegClass) {
4681 AllUsesAcceptSReg = false;
4682 SDNode *User = U->getUser();
4683 if (User->isMachineOpcode()) {
4684 unsigned Opc = User->getMachineOpcode();
4685 const MCInstrDesc &Desc = SII->get(Opc);
4686 if (Desc.isCommutable()) {
4687 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
4688 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
4689 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
4690 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
4691 const TargetRegisterClass *CommutedRC =
4692 getOperandRegClass(U->getUser(), CommutedOpNo);
4693 if (CommutedRC == &AMDGPU::VS_32RegClass ||
4694 CommutedRC == &AMDGPU::VS_64RegClass ||
4695 CommutedRC == &AMDGPU::VS_64_Align2RegClass)
4696 AllUsesAcceptSReg = true;
4697 }
4698 }
4699 }
4700 // If "AllUsesAcceptSReg == false" so far we haven't succeeded
4701 // commuting current user. This means have at least one use
4702 // that strictly require VGPR. Thus, we will not attempt to commute
4703 // other user instructions.
4704 if (!AllUsesAcceptSReg)
4705 break;
4706 }
4707 }
4708 return !AllUsesAcceptSReg && (Limit < 10);
4709}
4710
4711bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
4712 const auto *Ld = cast<LoadSDNode>(N);
4713 const MachineMemOperand *MMO = Ld->getMemOperand();
4714
4715 // FIXME: We ought to able able to take the direct isDivergent result. We
4716 // cannot rely on the MMO for a uniformity check, and should stop using
4717 // it. This is a hack for 2 ways that the IR divergence analysis is superior
4718 // to the DAG divergence: Recognizing shift-of-workitem-id as always
4719 // uniform, and isSingleLaneExecution. These should be handled in the DAG
4720 // version, and then this can be dropped.
4721 if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
4722 return false;
4723
4724 return MMO->getSize().hasValue() &&
4725 Ld->getAlign() >=
4726 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
4727 uint64_t(4))) &&
4728 (MMO->isInvariant() ||
4729 (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4730 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
4731 (Subtarget->getScalarizeGlobalBehavior() &&
4732 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
4733 Ld->isSimple() &&
4734 static_cast<const SITargetLowering *>(getTargetLowering())
4735 ->isMemOpHasNoClobberedMemOperand(N)));
4736}
4737
4740 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4741 bool IsModified = false;
4742 do {
4743 IsModified = false;
4744
4745 // Go over all selected nodes and try to fold them a bit more
4746 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4747 while (Position != CurDAG->allnodes_end()) {
4748 SDNode *Node = &*Position++;
4750 if (!MachineNode)
4751 continue;
4752
4753 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
4754 if (ResNode != Node) {
4755 if (ResNode)
4756 ReplaceUses(Node, ResNode);
4757 IsModified = true;
4758 }
4759 }
4760 CurDAG->RemoveDeadNodes();
4761 } while (IsModified);
4762}
4763
4768
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1)
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr)
static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, const SelectionDAG *DAG)
static MemSDNode * findMemSDNode(SDNode *N)
static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val)
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate)
static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh)
static void checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, std::function< bool(SDValue)> ModifierCheck)
Defines an instruction selector for the AMDGPU target.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
const HexagonInstrInfo * TII
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Provides R600 specific target descriptions.
Interface definition for R600RegisterInfo.
const SmallVectorImpl< MachineOperand > & Cond
SI DAG Lowering interface definition.
#define LLVM_DEBUG(...)
Definition Debug.h:119
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
AMDGPUDAGToDAGISelLegacy(TargetMachine &TM, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
AMDGPU specific code to select AMDGPU machine instructions for SelectionDAG operations.
bool isSDWAOperand(const SDNode *N) const
void SelectBuildVector(SDNode *N, unsigned RegClassID)
void Select(SDNode *N) override
Main hook for targets to transform nodes into machine nodes.
bool runOnMachineFunction(MachineFunction &MF) override
void PreprocessISelDAG() override
PreprocessISelDAG - This hook allows targets to hack on the graph before instruction selection starts...
void PostprocessISelDAG() override
PostprocessISelDAG() - This hook allows the target to hack on the graph right after selection.
bool matchLoadD16FromBuildVector(SDNode *N) const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
AMDGPUISelDAGToDAGPass(TargetMachine &TM)
static SDValue stripBitcast(SDValue Val)
static const fltSemantics & BFloat()
Definition APFloat.h:296
static const fltSemantics & IEEEhalf()
Definition APFloat.h:295
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:406
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1679
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
Analysis pass which computes a DominatorTree.
Definition Dominators.h:270
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:306
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const SIInstrInfo * getInstrInfo() const override
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
Generation getGeneration() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool hasValue() const
TypeSize getValue() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:587
SmallVector< LoopT *, 4 > getLoopsInPreorder() const
Return all of the loops in the function in preorder across the loop nests, with siblings in forward p...
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:612
Machine Value Type.
static MVT getIntegerVT(unsigned BitWidth)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isSGPRClass(const TargetRegisterClass *RC)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
SelectionDAGISelLegacy(char &ID, std::unique_ptr< SelectionDAGISel > S)
SelectionDAGISelPass(std::unique_ptr< SelectionDAGISel > Selector)
LLVM_ABI PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
std::unique_ptr< FunctionLoweringInfo > FuncInfo
const TargetLowering * TLI
const TargetInstrInfo * TII
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
SelectionDAGISel(TargetMachine &tm, CodeGenOptLevel OL=CodeGenOptLevel::Default)
virtual bool runOnMachineFunction(MachineFunction &mf)
const TargetLowering * getTargetLowering() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
SDValue getTargetFrameIndex(int FI, EVT VT)
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
ilist< SDNode >::iterator allnodes_iterator
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
static const unsigned CommuteAnyOperandIndex
Primary interface to the complete machine description for the target machine.
unsigned getID() const
Return the register class ID number.
Legacy analysis pass which computes a CycleInfo.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
bool isUniformMMO(const MachineMemOperand *MMO)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:827
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:861
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ TargetFrameIndex
Definition ISDOpcodes.h:187
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:896
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
@ BRCOND
BRCOND - Conditional branch.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
@ Undef
Value of the register doesn't matter.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
static bool getConstantValue(SDValue N, uint32_t &Out)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into an AMDGPU-specific DAG.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:860
#define N
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:279
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:130
static unsigned getSubRegFromChannel(unsigned Channel)
bool hasNoUnsignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.