1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
38 "amdgpu-global-isel-risky-select",
39 cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
40 cl::init(false),
42
43#define GET_GLOBALISEL_IMPL
44#define AMDGPUSubtarget GCNSubtarget
45#include "AMDGPUGenGlobalISel.inc"
46#undef GET_GLOBALISEL_IMPL
47#undef AMDGPUSubtarget
48
49AMDGPUInstructionSelector::AMDGPUInstructionSelector(
50 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
51 const AMDGPUTargetMachine &TM)
52 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
53 STI(STI),
54 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
55#define GET_GLOBALISEL_PREDICATES_INIT
56#include "AMDGPUGenGlobalISel.inc"
57#undef GET_GLOBALISEL_PREDICATES_INIT
58#define GET_GLOBALISEL_TEMPORARIES_INIT
59#include "AMDGPUGenGlobalISel.inc"
60#undef GET_GLOBALISEL_TEMPORARIES_INIT
61{
62}
63
64const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
65
66void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
67 CodeGenCoverage *CoverageInfo,
68 ProfileSummaryInfo *PSI,
69 BlockFrequencyInfo *BFI) {
70 MRI = &MF.getRegInfo();
71 Subtarget = &MF.getSubtarget<GCNSubtarget>();
72 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
73}
74
75// Return the wave level SGPR base address if this is a wave address.
76static Register getWaveAddress(const MachineInstr *Def) {
77 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
78 ? Def->getOperand(1).getReg()
79 : Register();
80}
81
82bool AMDGPUInstructionSelector::isVCC(Register Reg,
83 const MachineRegisterInfo &MRI) const {
84 // The verifier is oblivious to s1 being a valid value for wavesize registers.
85 if (Reg.isPhysical())
86 return false;
87
88 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
89 const TargetRegisterClass *RC =
90 RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
91 if (RC) {
92 const LLT Ty = MRI.getType(Reg);
93 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
94 return false;
95 // G_TRUNC s1 result is never vcc.
96 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
97 RC->hasSuperClassEq(TRI.getBoolRC());
98 }
99
100 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
101 return RB->getID() == AMDGPU::VCCRegBankID;
102}
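// Illustrative examples of the check above (a sketch, not part of the original
// source): a register on the VCC bank is a wavefront-wide condition mask, while
// an s1 value on the SGPR bank is a plain scalar boolean:
//   %c:vcc(s1) = G_ICMP intpred(eq), %a:vgpr(s32), %b:vgpr(s32)   ; isVCC -> true
//   %t:sgpr(s1) = G_TRUNC %x:sgpr(s32)                            ; isVCC -> false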
103
104bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
105 unsigned NewOpc) const {
106 MI.setDesc(TII.get(NewOpc));
107 MI.removeOperand(1); // Remove intrinsic ID.
108 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
109
110 MachineOperand &Dst = MI.getOperand(0);
111 MachineOperand &Src = MI.getOperand(1);
112
113 // TODO: This should be legalized to s32 if needed
114 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
115 return false;
116
117 const TargetRegisterClass *DstRC
118 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
119 const TargetRegisterClass *SrcRC
120 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
121 if (!DstRC || DstRC != SrcRC)
122 return false;
123
124 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
125 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
126}
127
128bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
129 const DebugLoc &DL = I.getDebugLoc();
130 MachineBasicBlock *BB = I.getParent();
131 I.setDesc(TII.get(TargetOpcode::COPY));
132
133 const MachineOperand &Src = I.getOperand(1);
134 MachineOperand &Dst = I.getOperand(0);
135 Register DstReg = Dst.getReg();
136 Register SrcReg = Src.getReg();
137
138 if (isVCC(DstReg, *MRI)) {
139 if (SrcReg == AMDGPU::SCC) {
140 const TargetRegisterClass *RC
141 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
142 if (!RC)
143 return true;
144 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
145 }
146
147 if (!isVCC(SrcReg, *MRI)) {
148 // TODO: Should probably leave the copy and let copyPhysReg expand it.
149 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
150 return false;
151
152 const TargetRegisterClass *SrcRC
153 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
154
155 std::optional<ValueAndVReg> ConstVal =
156 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
157 if (ConstVal) {
158 unsigned MovOpc =
159 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
160 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
161 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
162 } else {
163 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
164
165 // We can't trust the high bits at this point, so clear them.
166
167 // TODO: Skip masking high bits if def is known boolean.
168
169 bool IsSGPR = TRI.isSGPRClass(SrcRC);
170 unsigned AndOpc =
171 IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
172 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
173 .addImm(1)
174 .addReg(SrcReg);
175 if (IsSGPR)
176 And.setOperandDead(3); // Dead scc
177
178 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
179 .addImm(0)
180 .addReg(MaskedReg);
181 }
182
183 if (!MRI->getRegClassOrNull(SrcReg))
184 MRI->setRegClass(SrcReg, SrcRC);
185 I.eraseFromParent();
186 return true;
187 }
188
189 const TargetRegisterClass *RC =
190 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
191 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
192 return false;
193
194 return true;
195 }
196
197 for (const MachineOperand &MO : I.operands()) {
198 if (MO.getReg().isPhysical())
199 continue;
200
201 const TargetRegisterClass *RC =
202 TRI.getConstrainedRegClassForOperand(MO, *MRI);
203 if (!RC)
204 continue;
205 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
206 }
207 return true;
208}
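// Rough shape of the VGPR-to-VCC lowering above (illustrative sketch, assuming
// a wave64 target): the unknown high bits of the source are masked off and the
// boolean is re-materialized as a wavefront mask with a compare:
//   %masked:vgpr_32 = V_AND_B32_e32 1, %src
//   %dst:sreg_64 = V_CMP_NE_U32_e64 0, %masked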
209
210bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
211 const Register DefReg = I.getOperand(0).getReg();
212 const LLT DefTy = MRI->getType(DefReg);
213 if (DefTy == LLT::scalar(1)) {
214 if (!AllowRiskySelect) {
215 LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
216 return false;
217 }
218
219 LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
220 }
221
222 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
223
224 const RegClassOrRegBank &RegClassOrBank =
225 MRI->getRegClassOrRegBank(DefReg);
226
227 const TargetRegisterClass *DefRC
228 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
229 if (!DefRC) {
230 if (!DefTy.isValid()) {
231 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
232 return false;
233 }
234
235 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
236 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
237 if (!DefRC) {
238 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
239 return false;
240 }
241 }
242
243 // TODO: Verify that all registers have the same bank
244 I.setDesc(TII.get(TargetOpcode::PHI));
245 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
246}
247
248MachineOperand
249AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
250 const TargetRegisterClass &SubRC,
251 unsigned SubIdx) const {
252
253 MachineInstr *MI = MO.getParent();
254 MachineBasicBlock *BB = MO.getParent()->getParent();
255 Register DstReg = MRI->createVirtualRegister(&SubRC);
256
257 if (MO.isReg()) {
258 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
259 Register Reg = MO.getReg();
260 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
261 .addReg(Reg, 0, ComposedSubIdx);
262
263 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
264 MO.isKill(), MO.isDead(), MO.isUndef(),
265 MO.isEarlyClobber(), 0, MO.isDebug(),
266 MO.isInternalRead());
267 }
268
269 assert(MO.isImm());
270
271 APInt Imm(64, MO.getImm());
272
273 switch (SubIdx) {
274 default:
275 llvm_unreachable("do not know to split immediate with this sub index.");
276 case AMDGPU::sub0:
277 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
278 case AMDGPU::sub1:
279 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
280 }
281}
282
283static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
284 switch (Opc) {
285 case AMDGPU::G_AND:
286 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
287 case AMDGPU::G_OR:
288 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
289 case AMDGPU::G_XOR:
290 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
291 default:
292 llvm_unreachable("not a bit op");
293 }
294}
295
296bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
297 Register DstReg = I.getOperand(0).getReg();
298 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
299
300 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
301 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
302 DstRB->getID() != AMDGPU::VCCRegBankID)
303 return false;
304
305 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
306 STI.isWave64());
307 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
308
309 // Dead implicit-def of scc
310 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
311 true, // isImp
312 false, // isKill
313 true)); // isDead
314 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
315}
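// Illustrative selections from the routine above (sketch, wave64 assumed):
//   %d:sgpr(s32) = G_AND %a:sgpr(s32), %b:sgpr(s32)
//     -> %d:sreg_32 = S_AND_B32 %a, %b, implicit-def dead $scc
//   %m:vcc(s1) = G_OR %x:vcc(s1), %y:vcc(s1)
//     -> %m:sreg_64 = S_OR_B64 %x, %y, implicit-def dead $scc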
316
317bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
318 MachineBasicBlock *BB = I.getParent();
319 MachineFunction *MF = BB->getParent();
320 Register DstReg = I.getOperand(0).getReg();
321 const DebugLoc &DL = I.getDebugLoc();
322 LLT Ty = MRI->getType(DstReg);
323 if (Ty.isVector())
324 return false;
325
326 unsigned Size = Ty.getSizeInBits();
327 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
328 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
329 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
330
331 if (Size == 32) {
332 if (IsSALU) {
333 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
334 MachineInstr *Add =
335 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
336 .add(I.getOperand(1))
337 .add(I.getOperand(2))
338 .setOperandDead(3); // Dead scc
339 I.eraseFromParent();
340 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
341 }
342
343 if (STI.hasAddNoCarry()) {
344 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
345 I.setDesc(TII.get(Opc));
346 I.addOperand(*MF, MachineOperand::CreateImm(0));
347 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
348 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
349 }
350
351 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
352
353 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
354 MachineInstr *Add
355 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
356 .addDef(UnusedCarry, RegState::Dead)
357 .add(I.getOperand(1))
358 .add(I.getOperand(2))
359 .addImm(0);
360 I.eraseFromParent();
361 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
362 }
363
364 assert(!Sub && "illegal sub should not reach here");
365
366 const TargetRegisterClass &RC
367 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
368 const TargetRegisterClass &HalfRC
369 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
370
371 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
372 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
373 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
374 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
375
376 Register DstLo = MRI->createVirtualRegister(&HalfRC);
377 Register DstHi = MRI->createVirtualRegister(&HalfRC);
378
379 if (IsSALU) {
380 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
381 .add(Lo1)
382 .add(Lo2);
383 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
384 .add(Hi1)
385 .add(Hi2)
386 .setOperandDead(3); // Dead scc
387 } else {
388 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
389 Register CarryReg = MRI->createVirtualRegister(CarryRC);
390 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
391 .addDef(CarryReg)
392 .add(Lo1)
393 .add(Lo2)
394 .addImm(0);
395 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
396 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
397 .add(Hi1)
398 .add(Hi2)
399 .addReg(CarryReg, RegState::Kill)
400 .addImm(0);
401
402 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
403 return false;
404 }
405
406 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
407 .addReg(DstLo)
408 .addImm(AMDGPU::sub0)
409 .addReg(DstHi)
410 .addImm(AMDGPU::sub1);
411
412
413 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
414 return false;
415
416 I.eraseFromParent();
417 return true;
418}
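// Rough shape of the 64-bit SALU expansion above (illustrative sketch):
//   %d:sgpr(s64) = G_ADD %a:sgpr(s64), %b:sgpr(s64)
// becomes
//   %lo:sreg_32 = S_ADD_U32 %a.sub0, %b.sub0, implicit-def $scc
//   %hi:sreg_32 = S_ADDC_U32 %a.sub1, %b.sub1, implicit-def dead $scc, implicit $scc
//   %d:sreg_64_xexec = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1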
419
420bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
421 MachineInstr &I) const {
422 MachineBasicBlock *BB = I.getParent();
423 MachineFunction *MF = BB->getParent();
424 const DebugLoc &DL = I.getDebugLoc();
425 Register Dst0Reg = I.getOperand(0).getReg();
426 Register Dst1Reg = I.getOperand(1).getReg();
427 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
428 I.getOpcode() == AMDGPU::G_UADDE;
429 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
430 I.getOpcode() == AMDGPU::G_USUBE;
431
432 if (isVCC(Dst1Reg, *MRI)) {
433 unsigned NoCarryOpc =
434 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
435 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
436 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
437 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
438 I.addOperand(*MF, MachineOperand::CreateImm(0));
439 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
440 }
441
442 Register Src0Reg = I.getOperand(2).getReg();
443 Register Src1Reg = I.getOperand(3).getReg();
444
445 if (HasCarryIn) {
446 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
447 .addReg(I.getOperand(4).getReg());
448 }
449
450 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
451 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
452
453 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
454 .add(I.getOperand(2))
455 .add(I.getOperand(3));
456
457 if (MRI->use_nodbg_empty(Dst1Reg)) {
458 CarryInst.setOperandDead(3); // Dead scc
459 } else {
460 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
461 .addReg(AMDGPU::SCC);
462 if (!MRI->getRegClassOrNull(Dst1Reg))
463 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
464 }
465
466 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
467 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
468 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
469 return false;
470
471 if (HasCarryIn &&
472 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
473 AMDGPU::SReg_32RegClass, *MRI))
474 return false;
475
476 I.eraseFromParent();
477 return true;
478}
479
480bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
481 MachineInstr &I) const {
482 MachineBasicBlock *BB = I.getParent();
483 MachineFunction *MF = BB->getParent();
484 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
485
486 unsigned Opc;
487 if (Subtarget->hasMADIntraFwdBug())
488 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
489 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
490 else
491 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
492 I.setDesc(TII.get(Opc));
493 I.addOperand(*MF, MachineOperand::CreateImm(0));
494 I.addImplicitDefUseOperands(*MF);
495 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
496}
497
498// TODO: We should probably legalize these to only using 32-bit results.
499bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
500 MachineBasicBlock *BB = I.getParent();
501 Register DstReg = I.getOperand(0).getReg();
502 Register SrcReg = I.getOperand(1).getReg();
503 LLT DstTy = MRI->getType(DstReg);
504 LLT SrcTy = MRI->getType(SrcReg);
505 const unsigned SrcSize = SrcTy.getSizeInBits();
506 unsigned DstSize = DstTy.getSizeInBits();
507
508 // TODO: Should handle any multiple of 32 offset.
509 unsigned Offset = I.getOperand(2).getImm();
510 if (Offset % 32 != 0 || DstSize > 128)
511 return false;
512
513 // 16-bit operations really use 32-bit registers.
514 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
515 if (DstSize == 16)
516 DstSize = 32;
517
518 const TargetRegisterClass *DstRC =
519 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
520 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
521 return false;
522
523 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
524 const TargetRegisterClass *SrcRC =
525 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
526 if (!SrcRC)
527 return false;
528 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
529 DstSize / 32);
530 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
531 if (!SrcRC)
532 return false;
533
534 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
535 *SrcRC, I.getOperand(1));
536 const DebugLoc &DL = I.getDebugLoc();
537 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
538 .addReg(SrcReg, 0, SubReg);
539
540 I.eraseFromParent();
541 return true;
542}
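// e.g. (illustrative sketch): a 32-bit extract at bit offset 32 from a 128-bit
// SGPR value is selected to a plain subregister copy:
//   %d:sgpr(s32) = G_EXTRACT %v:sgpr(s128), 32
//     -> %d:sreg_32 = COPY %v.sub1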
543
544bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
545 MachineBasicBlock *BB = MI.getParent();
546 Register DstReg = MI.getOperand(0).getReg();
547 LLT DstTy = MRI->getType(DstReg);
548 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
549
550 const unsigned SrcSize = SrcTy.getSizeInBits();
551 if (SrcSize < 32)
552 return selectImpl(MI, *CoverageInfo);
553
554 const DebugLoc &DL = MI.getDebugLoc();
555 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
556 const unsigned DstSize = DstTy.getSizeInBits();
557 const TargetRegisterClass *DstRC =
558 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
559 if (!DstRC)
560 return false;
561
562 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
563 MachineInstrBuilder MIB =
564 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
565 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
566 MachineOperand &Src = MI.getOperand(I + 1);
567 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
568 MIB.addImm(SubRegs[I]);
569
570 const TargetRegisterClass *SrcRC
571 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
572 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
573 return false;
574 }
575
576 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
577 return false;
578
579 MI.eraseFromParent();
580 return true;
581}
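// e.g. (illustrative sketch): merging two 32-bit SGPR halves into a 64-bit
// value becomes a REG_SEQUENCE over the split subregisters:
//   %d:sgpr(s64) = G_MERGE_VALUES %lo:sgpr(s32), %hi:sgpr(s32)
//     -> %d:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1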
582
583bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
584 MachineBasicBlock *BB = MI.getParent();
585 const int NumDst = MI.getNumOperands() - 1;
586
587 MachineOperand &Src = MI.getOperand(NumDst);
588
589 Register SrcReg = Src.getReg();
590 Register DstReg0 = MI.getOperand(0).getReg();
591 LLT DstTy = MRI->getType(DstReg0);
592 LLT SrcTy = MRI->getType(SrcReg);
593
594 const unsigned DstSize = DstTy.getSizeInBits();
595 const unsigned SrcSize = SrcTy.getSizeInBits();
596 const DebugLoc &DL = MI.getDebugLoc();
597 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
598
599 const TargetRegisterClass *SrcRC =
600 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
601 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
602 return false;
603
604 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
605 // source, and this relies on the fact that the same subregister indices are
606 // used for both.
607 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
608 for (int I = 0, E = NumDst; I != E; ++I) {
609 MachineOperand &Dst = MI.getOperand(I);
610 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
611 .addReg(SrcReg, 0, SubRegs[I]);
612
613 // Make sure the subregister index is valid for the source register.
614 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
615 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
616 return false;
617
618 const TargetRegisterClass *DstRC =
619 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
620 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
621 return false;
622 }
623
624 MI.eraseFromParent();
625 return true;
626}
627
628bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
629 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
630 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
631
632 Register Src0 = MI.getOperand(1).getReg();
633 Register Src1 = MI.getOperand(2).getReg();
634 LLT SrcTy = MRI->getType(Src0);
635 const unsigned SrcSize = SrcTy.getSizeInBits();
636
637 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
638 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
639 return selectG_MERGE_VALUES(MI);
640 }
641
642 // Selection logic below is for V2S16 only.
643 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
644 Register Dst = MI.getOperand(0).getReg();
645 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
646 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
647 SrcTy != LLT::scalar(32)))
648 return selectImpl(MI, *CoverageInfo);
649
650 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
651 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
652 return false;
653
654 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
655 DstBank->getID() == AMDGPU::VGPRRegBankID);
656 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
657
658 const DebugLoc &DL = MI.getDebugLoc();
659 MachineBasicBlock *BB = MI.getParent();
660
661 // First, before trying TableGen patterns, check if both sources are
662 // constants. In those cases, we can trivially compute the final constant
663 // and emit a simple move.
664 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
665 if (ConstSrc1) {
666 auto ConstSrc0 =
667 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
668 if (ConstSrc0) {
669 const int64_t K0 = ConstSrc0->Value.getSExtValue();
670 const int64_t K1 = ConstSrc1->Value.getSExtValue();
671 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
672 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
673 uint32_t Imm = Lo16 | (Hi16 << 16);
674
675 // VALU
676 if (IsVector) {
677 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
678 MI.eraseFromParent();
679 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
680 }
681
682 // SALU
683 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
684 MI.eraseFromParent();
685 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
686 }
687 }
688
689 // Now try TableGen patterns.
690 if (selectImpl(MI, *CoverageInfo))
691 return true;
692
693 // TODO: This should probably be a combine somewhere
694 // (build_vector $src0, undef) -> copy $src0
695 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
696 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
697 MI.setDesc(TII.get(AMDGPU::COPY));
698 MI.removeOperand(2);
699 const auto &RC =
700 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
701 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
702 RBI.constrainGenericRegister(Src0, RC, *MRI);
703 }
704
705 // TODO: Can be improved?
706 if (IsVector) {
707 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
708 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
709 .addImm(0xFFFF)
710 .addReg(Src0);
711 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
712 return false;
713
714 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
715 .addReg(Src1)
716 .addImm(16)
717 .addReg(TmpReg);
718 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
719 return false;
720
721 MI.eraseFromParent();
722 return true;
723 }
724
725 Register ShiftSrc0;
726 Register ShiftSrc1;
727
728 // With multiple uses of the shift, this will duplicate the shift and
729 // increase register pressure.
730 //
731 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
732 // => (S_PACK_HH_B32_B16 $src0, $src1)
733 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
734 // => (S_PACK_HL_B32_B16 $src0, $src1)
735 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
736 // => (S_PACK_LH_B32_B16 $src0, $src1)
737 // (build_vector $src0, $src1)
738 // => (S_PACK_LL_B32_B16 $src0, $src1)
739
740 bool Shift0 = mi_match(
741 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
742
743 bool Shift1 = mi_match(
744 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
745
746 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
747 if (Shift0 && Shift1) {
748 Opc = AMDGPU::S_PACK_HH_B32_B16;
749 MI.getOperand(1).setReg(ShiftSrc0);
750 MI.getOperand(2).setReg(ShiftSrc1);
751 } else if (Shift1) {
752 Opc = AMDGPU::S_PACK_LH_B32_B16;
753 MI.getOperand(2).setReg(ShiftSrc1);
754 } else if (Shift0) {
755 auto ConstSrc1 =
756 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
757 if (ConstSrc1 && ConstSrc1->Value == 0) {
758 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
759 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
760 .addReg(ShiftSrc0)
761 .addImm(16)
762 .setOperandDead(3); // Dead scc
763
764 MI.eraseFromParent();
765 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
766 }
767 if (STI.hasSPackHL()) {
768 Opc = AMDGPU::S_PACK_HL_B32_B16;
769 MI.getOperand(1).setReg(ShiftSrc0);
770 }
771 }
772
773 MI.setDesc(TII.get(Opc));
774 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
775}
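// Illustrative SGPR v2s16 selections from the pack matching above (sketch):
//   (build_vector_trunc (lshr $a, 16), (lshr $b, 16)) -> S_PACK_HH_B32_B16 $a, $b
//   (build_vector_trunc (lshr $a, 16), 0)             -> S_LSHR_B32 $a, 16
//   (build_vector_trunc $a, $b)                       -> S_PACK_LL_B32_B16 $a, $b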
776
777bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
778 return selectG_ADD_SUB(I);
779}
780
781bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
782 const MachineOperand &MO = I.getOperand(0);
783
784 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
785 // regbank check here is to know why getConstrainedRegClassForOperand failed.
786 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
787 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
788 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
789 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
790 return true;
791 }
792
793 return false;
794}
795
796bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
797 MachineBasicBlock *BB = I.getParent();
798
799 Register DstReg = I.getOperand(0).getReg();
800 Register Src0Reg = I.getOperand(1).getReg();
801 Register Src1Reg = I.getOperand(2).getReg();
802 LLT Src1Ty = MRI->getType(Src1Reg);
803
804 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
805 unsigned InsSize = Src1Ty.getSizeInBits();
806
807 int64_t Offset = I.getOperand(3).getImm();
808
809 // FIXME: These cases should have been illegal and unnecessary to check here.
810 if (Offset % 32 != 0 || InsSize % 32 != 0)
811 return false;
812
813 // Currently not handled by getSubRegFromChannel.
814 if (InsSize > 128)
815 return false;
816
817 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
818 if (SubReg == AMDGPU::NoSubRegister)
819 return false;
820
821 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
822 const TargetRegisterClass *DstRC =
823 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
824 if (!DstRC)
825 return false;
826
827 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
828 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
829 const TargetRegisterClass *Src0RC =
830 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
831 const TargetRegisterClass *Src1RC =
832 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
833
834 // Deal with weird cases where the class only partially supports the subreg
835 // index.
836 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
837 if (!Src0RC || !Src1RC)
838 return false;
839
840 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
841 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
842 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
843 return false;
844
845 const DebugLoc &DL = I.getDebugLoc();
846 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
847 .addReg(Src0Reg)
848 .addReg(Src1Reg)
849 .addImm(SubReg);
850
851 I.eraseFromParent();
852 return true;
853}
854
855bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
856 Register DstReg = MI.getOperand(0).getReg();
857 Register SrcReg = MI.getOperand(1).getReg();
858 Register OffsetReg = MI.getOperand(2).getReg();
859 Register WidthReg = MI.getOperand(3).getReg();
860
861 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
862 "scalar BFX instructions are expanded in regbankselect");
863 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
864 "64-bit vector BFX instructions are expanded in regbankselect");
865
866 const DebugLoc &DL = MI.getDebugLoc();
867 MachineBasicBlock *MBB = MI.getParent();
868
869 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
870 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
871 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
872 .addReg(SrcReg)
873 .addReg(OffsetReg)
874 .addReg(WidthReg);
875 MI.eraseFromParent();
876 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
877}
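// e.g. (illustrative sketch):
//   %d:vgpr(s32) = G_UBFX %src, %off, %width
//     -> %d:vgpr_32 = V_BFE_U32_e64 %src, %off, %width
// (G_SBFX selects V_BFE_I32_e64 in the same way.)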
878
879bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
880 if (STI.getLDSBankCount() != 16)
881 return selectImpl(MI, *CoverageInfo);
882
883 Register Dst = MI.getOperand(0).getReg();
884 Register Src0 = MI.getOperand(2).getReg();
885 Register M0Val = MI.getOperand(6).getReg();
886 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
887 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
888 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
889 return false;
890
891 // This requires 2 instructions. It is possible to write a pattern to support
892 // this, but the generated isel emitter doesn't correctly deal with multiple
893 // output instructions using the same physical register input. The copy to m0
894 // is incorrectly placed before the second instruction.
895 //
896 // TODO: Match source modifiers.
897
898 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
899 const DebugLoc &DL = MI.getDebugLoc();
900 MachineBasicBlock *MBB = MI.getParent();
901
902 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
903 .addReg(M0Val);
904 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
905 .addImm(2)
906 .addImm(MI.getOperand(4).getImm()) // $attr
907 .addImm(MI.getOperand(3).getImm()); // $attrchan
908
909 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
910 .addImm(0) // $src0_modifiers
911 .addReg(Src0) // $src0
912 .addImm(MI.getOperand(4).getImm()) // $attr
913 .addImm(MI.getOperand(3).getImm()) // $attrchan
914 .addImm(0) // $src2_modifiers
915 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
916 .addImm(MI.getOperand(5).getImm()) // $high
917 .addImm(0) // $clamp
918 .addImm(0); // $omod
919
920 MI.eraseFromParent();
921 return true;
922}
923
924// Writelane is special in that it can use SGPR and M0 (which would normally
925// count as using the constant bus twice - but in this case it is allowed since
926// the lane selector doesn't count as a use of the constant bus). However, it is
927// still required to abide by the 1 SGPR rule. Fix this up if we might have
928// multiple SGPRs.
929bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
930 // With a constant bus limit of at least 2, there's no issue.
931 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
932 return selectImpl(MI, *CoverageInfo);
933
934 MachineBasicBlock *MBB = MI.getParent();
935 const DebugLoc &DL = MI.getDebugLoc();
936 Register VDst = MI.getOperand(0).getReg();
937 Register Val = MI.getOperand(2).getReg();
938 Register LaneSelect = MI.getOperand(3).getReg();
939 Register VDstIn = MI.getOperand(4).getReg();
940
941 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
942
943 std::optional<ValueAndVReg> ConstSelect =
944 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
945 if (ConstSelect) {
946 // The selector has to be an inline immediate, so we can use whatever for
947 // the other operands.
948 MIB.addReg(Val);
949 MIB.addImm(ConstSelect->Value.getSExtValue() &
950 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
951 } else {
952 std::optional<ValueAndVReg> ConstVal =
953 getIConstantVRegValWithLookThrough(Val, *MRI);
954
955 // If the value written is an inline immediate, we can get away without a
956 // copy to m0.
957 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
958 STI.hasInv2PiInlineImm())) {
959 MIB.addImm(ConstVal->Value.getSExtValue());
960 MIB.addReg(LaneSelect);
961 } else {
962 MIB.addReg(Val);
963
964 // If the lane selector was originally in a VGPR and copied with
965 // readfirstlane, there's a hazard to read the same SGPR from the
966 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
967 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
968
969 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
970 .addReg(LaneSelect);
971 MIB.addReg(AMDGPU::M0);
972 }
973 }
974
975 MIB.addReg(VDstIn);
976
977 MI.eraseFromParent();
978 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
979}
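// Illustrative result of the SGPR workaround above (sketch): with a
// non-constant lane select and a constant bus limit of 1, the lane index is
// routed through m0 so only one unique SGPR is read:
//   $m0 = COPY %lane:sreg_32_xm0
//   %vdst:vgpr_32 = V_WRITELANE_B32 %val:sreg_32, $m0, %vdst_in:vgpr_32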
980
981// We need to handle this here because tablegen doesn't support matching
982// instructions with multiple outputs.
983bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
984 Register Dst0 = MI.getOperand(0).getReg();
985 Register Dst1 = MI.getOperand(1).getReg();
986
987 LLT Ty = MRI->getType(Dst0);
988 unsigned Opc;
989 if (Ty == LLT::scalar(32))
990 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
991 else if (Ty == LLT::scalar(64))
992 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
993 else
994 return false;
995
996 // TODO: Match source modifiers.
997
998 const DebugLoc &DL = MI.getDebugLoc();
999 MachineBasicBlock *MBB = MI.getParent();
1000
1001 Register Numer = MI.getOperand(3).getReg();
1002 Register Denom = MI.getOperand(4).getReg();
1003 unsigned ChooseDenom = MI.getOperand(5).getImm();
1004
1005 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1006
1007 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1008 .addDef(Dst1)
1009 .addImm(0) // $src0_modifiers
1010 .addUse(Src0) // $src0
1011 .addImm(0) // $src1_modifiers
1012 .addUse(Denom) // $src1
1013 .addImm(0) // $src2_modifiers
1014 .addUse(Numer) // $src2
1015 .addImm(0) // $clamp
1016 .addImm(0); // $omod
1017
1018 MI.eraseFromParent();
1019 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1020}
1021
1022bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1023 unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1024 switch (IntrinsicID) {
1025 case Intrinsic::amdgcn_if_break: {
1026 MachineBasicBlock *BB = I.getParent();
1027
1028 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1029 // SelectionDAG uses for wave32 vs wave64.
1030 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1031 .add(I.getOperand(0))
1032 .add(I.getOperand(2))
1033 .add(I.getOperand(3));
1034
1035 Register DstReg = I.getOperand(0).getReg();
1036 Register Src0Reg = I.getOperand(2).getReg();
1037 Register Src1Reg = I.getOperand(3).getReg();
1038
1039 I.eraseFromParent();
1040
1041 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1042 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1043
1044 return true;
1045 }
1046 case Intrinsic::amdgcn_interp_p1_f16:
1047 return selectInterpP1F16(I);
1048 case Intrinsic::amdgcn_wqm:
1049 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1050 case Intrinsic::amdgcn_softwqm:
1051 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1052 case Intrinsic::amdgcn_strict_wwm:
1053 case Intrinsic::amdgcn_wwm:
1054 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1055 case Intrinsic::amdgcn_strict_wqm:
1056 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1057 case Intrinsic::amdgcn_writelane:
1058 return selectWritelane(I);
1059 case Intrinsic::amdgcn_div_scale:
1060 return selectDivScale(I);
1061 case Intrinsic::amdgcn_icmp:
1062 case Intrinsic::amdgcn_fcmp:
1063 if (selectImpl(I, *CoverageInfo))
1064 return true;
1065 return selectIntrinsicCmp(I);
1066 case Intrinsic::amdgcn_ballot:
1067 return selectBallot(I);
1068 case Intrinsic::amdgcn_inverse_ballot:
1069 return selectInverseBallot(I);
1070 case Intrinsic::amdgcn_reloc_constant:
1071 return selectRelocConstant(I);
1072 case Intrinsic::amdgcn_groupstaticsize:
1073 return selectGroupStaticSize(I);
1074 case Intrinsic::returnaddress:
1075 return selectReturnAddress(I);
1076 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1077 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1078 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1079 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1080 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1081 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1082 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1083 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1084 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1085 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1086 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1087 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1088 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1089 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1090 return selectSMFMACIntrin(I);
1091 default:
1092 return selectImpl(I, *CoverageInfo);
1093 }
1094}
1095
1096static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1097 const GCNSubtarget &ST) {
1098 if (Size != 16 && Size != 32 && Size != 64)
1099 return -1;
1100
1101 if (Size == 16 && !ST.has16BitInsts())
1102 return -1;
1103
1104 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
1105 unsigned S64Opc) {
1106 if (Size == 16)
1107 return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
1108 if (Size == 32)
1109 return S32Opc;
1110 return S64Opc;
1111 };
1112
1113 switch (P) {
1114 default:
1115 llvm_unreachable("Unknown condition code!");
1116 case CmpInst::ICMP_NE:
1117 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1118 AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
1119 case CmpInst::ICMP_EQ:
1120 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1121 AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
1122 case CmpInst::ICMP_SGT:
1123 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1124 AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
1125 case CmpInst::ICMP_SGE:
1126 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1127 AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
1128 case CmpInst::ICMP_SLT:
1129 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1130 AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
1131 case CmpInst::ICMP_SLE:
1132 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1133 AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
1134 case CmpInst::ICMP_UGT:
1135 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1136 AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
1137 case CmpInst::ICMP_UGE:
1138 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1139 AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
1140 case CmpInst::ICMP_ULT:
1141 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1142 AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
1143 case CmpInst::ICMP_ULE:
1144 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1145 AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);
1146
1147 case CmpInst::FCMP_OEQ:
1148 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1149 AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
1150 case CmpInst::FCMP_OGT:
1151 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1152 AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
1153 case CmpInst::FCMP_OGE:
1154 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1155 AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
1156 case CmpInst::FCMP_OLT:
1157 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1158 AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
1159 case CmpInst::FCMP_OLE:
1160 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1161 AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
1162 case CmpInst::FCMP_ONE:
1163 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1164 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1165 case CmpInst::FCMP_ORD:
1166 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1167 AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
1168 case CmpInst::FCMP_UNO:
1169 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1170 AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
1171 case CmpInst::FCMP_UEQ:
1172 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1173 AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
1174 case CmpInst::FCMP_UGT:
1175 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1176 AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
1177 case CmpInst::FCMP_UGE:
1178 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1179 AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
1180 case CmpInst::FCMP_ULT:
1181 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1182 AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
1183 case CmpInst::FCMP_ULE:
1184 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1185 AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
1186 case CmpInst::FCMP_UNE:
1187 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1188 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1189 case CmpInst::FCMP_TRUE:
1190 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1191 AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
1192 case CmpInst::FCMP_FALSE:
1193 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1194 AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
1194 AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
1195 }
1196}
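// e.g. (illustrative): getV_CMPOpcode(ICMP_EQ, 32, ST) returns
// V_CMP_EQ_U32_e64, while a 16-bit compare maps to V_CMP_EQ_U16_t16_e64 on a
// true16 subtarget and to V_CMP_EQ_U16_e64 otherwise.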
1197
1198int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1199 unsigned Size) const {
1200 if (Size == 64) {
1201 if (!STI.hasScalarCompareEq64())
1202 return -1;
1203
1204 switch (P) {
1205 case CmpInst::ICMP_NE:
1206 return AMDGPU::S_CMP_LG_U64;
1207 case CmpInst::ICMP_EQ:
1208 return AMDGPU::S_CMP_EQ_U64;
1209 default:
1210 return -1;
1211 }
1212 }
1213
1214 if (Size == 32) {
1215 switch (P) {
1216 case CmpInst::ICMP_NE:
1217 return AMDGPU::S_CMP_LG_U32;
1218 case CmpInst::ICMP_EQ:
1219 return AMDGPU::S_CMP_EQ_U32;
1220 case CmpInst::ICMP_SGT:
1221 return AMDGPU::S_CMP_GT_I32;
1222 case CmpInst::ICMP_SGE:
1223 return AMDGPU::S_CMP_GE_I32;
1224 case CmpInst::ICMP_SLT:
1225 return AMDGPU::S_CMP_LT_I32;
1226 case CmpInst::ICMP_SLE:
1227 return AMDGPU::S_CMP_LE_I32;
1228 case CmpInst::ICMP_UGT:
1229 return AMDGPU::S_CMP_GT_U32;
1230 case CmpInst::ICMP_UGE:
1231 return AMDGPU::S_CMP_GE_U32;
1232 case CmpInst::ICMP_ULT:
1233 return AMDGPU::S_CMP_LT_U32;
1234 case CmpInst::ICMP_ULE:
1235 return AMDGPU::S_CMP_LE_U32;
1236 case CmpInst::FCMP_OEQ:
1237 return AMDGPU::S_CMP_EQ_F32;
1238 case CmpInst::FCMP_OGT:
1239 return AMDGPU::S_CMP_GT_F32;
1240 case CmpInst::FCMP_OGE:
1241 return AMDGPU::S_CMP_GE_F32;
1242 case CmpInst::FCMP_OLT:
1243 return AMDGPU::S_CMP_LT_F32;
1244 case CmpInst::FCMP_OLE:
1245 return AMDGPU::S_CMP_LE_F32;
1246 case CmpInst::FCMP_ONE:
1247 return AMDGPU::S_CMP_LG_F32;
1248 case CmpInst::FCMP_ORD:
1249 return AMDGPU::S_CMP_O_F32;
1250 case CmpInst::FCMP_UNO:
1251 return AMDGPU::S_CMP_U_F32;
1252 case CmpInst::FCMP_UEQ:
1253 return AMDGPU::S_CMP_NLG_F32;
1254 case CmpInst::FCMP_UGT:
1255 return AMDGPU::S_CMP_NLE_F32;
1256 case CmpInst::FCMP_UGE:
1257 return AMDGPU::S_CMP_NLT_F32;
1258 case CmpInst::FCMP_ULT:
1259 return AMDGPU::S_CMP_NGE_F32;
1260 case CmpInst::FCMP_ULE:
1261 return AMDGPU::S_CMP_NGT_F32;
1262 case CmpInst::FCMP_UNE:
1263 return AMDGPU::S_CMP_NEQ_F32;
1264 default:
1265 llvm_unreachable("Unknown condition code!");
1266 }
1267 }
1268
1269 if (Size == 16) {
1270 if (!STI.hasSALUFloatInsts())
1271 return -1;
1272
1273 switch (P) {
1274 case CmpInst::FCMP_OEQ:
1275 return AMDGPU::S_CMP_EQ_F16;
1276 case CmpInst::FCMP_OGT:
1277 return AMDGPU::S_CMP_GT_F16;
1278 case CmpInst::FCMP_OGE:
1279 return AMDGPU::S_CMP_GE_F16;
1280 case CmpInst::FCMP_OLT:
1281 return AMDGPU::S_CMP_LT_F16;
1282 case CmpInst::FCMP_OLE:
1283 return AMDGPU::S_CMP_LE_F16;
1284 case CmpInst::FCMP_ONE:
1285 return AMDGPU::S_CMP_LG_F16;
1286 case CmpInst::FCMP_ORD:
1287 return AMDGPU::S_CMP_O_F16;
1288 case CmpInst::FCMP_UNO:
1289 return AMDGPU::S_CMP_U_F16;
1290 case CmpInst::FCMP_UEQ:
1291 return AMDGPU::S_CMP_NLG_F16;
1292 case CmpInst::FCMP_UGT:
1293 return AMDGPU::S_CMP_NLE_F16;
1294 case CmpInst::FCMP_UGE:
1295 return AMDGPU::S_CMP_NLT_F16;
1296 case CmpInst::FCMP_ULT:
1297 return AMDGPU::S_CMP_NGE_F16;
1298 case CmpInst::FCMP_ULE:
1299 return AMDGPU::S_CMP_NGT_F16;
1300 case CmpInst::FCMP_UNE:
1301 return AMDGPU::S_CMP_NEQ_F16;
1302 default:
1303 llvm_unreachable("Unknown condition code!");
1304 }
1305 }
1306
1307 return -1;
1308}
1309
1310bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1311
1312 MachineBasicBlock *BB = I.getParent();
1313 const DebugLoc &DL = I.getDebugLoc();
1314
1315 Register SrcReg = I.getOperand(2).getReg();
1316 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1317
1318 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1319
1320 Register CCReg = I.getOperand(0).getReg();
1321 if (!isVCC(CCReg, *MRI)) {
1322 int Opcode = getS_CMPOpcode(Pred, Size);
1323 if (Opcode == -1)
1324 return false;
1325 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1326 .add(I.getOperand(2))
1327 .add(I.getOperand(3));
1328 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1329 .addReg(AMDGPU::SCC);
1330 bool Ret =
1331 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1332 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1333 I.eraseFromParent();
1334 return Ret;
1335 }
1336
1337 if (I.getOpcode() == AMDGPU::G_FCMP)
1338 return false;
1339
1340 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1341 if (Opcode == -1)
1342 return false;
1343
1344 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1345 I.getOperand(0).getReg())
1346 .add(I.getOperand(2))
1347 .add(I.getOperand(3));
1348 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1349 *TRI.getBoolRC(), *MRI);
1350 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1351 I.eraseFromParent();
1352 return Ret;
1353}
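// Illustrative scalar path from above (sketch): a 32-bit compare with a
// non-VCC (SGPR) result becomes an S_CMP plus a copy of SCC:
//   %c:sgpr(s1) = G_ICMP intpred(slt), %a:sgpr(s32), %b:sgpr(s32)
//     -> S_CMP_LT_I32 %a, %b, implicit-def $scc
//        %c:sreg_32 = COPY $scc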
1354
1355bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1356 Register Dst = I.getOperand(0).getReg();
1357 if (isVCC(Dst, *MRI))
1358 return false;
1359
1360 LLT DstTy = MRI->getType(Dst);
1361 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1362 return false;
1363
1364 MachineBasicBlock *BB = I.getParent();
1365 const DebugLoc &DL = I.getDebugLoc();
1366 Register SrcReg = I.getOperand(2).getReg();
1367 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1368
1369 // i1 inputs are not supported in GlobalISel.
1370 if (Size == 1)
1371 return false;
1372
1373 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1374 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1375 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1376 I.eraseFromParent();
1377 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1378 }
1379
1380 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1381 if (Opcode == -1)
1382 return false;
1383
1384 MachineInstrBuilder SelectedMI;
1385 MachineOperand &LHS = I.getOperand(2);
1386 MachineOperand &RHS = I.getOperand(3);
1387 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
1388 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
1389 Register Src0Reg =
1390 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1391 Register Src1Reg =
1392 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1393 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1394 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1395 SelectedMI.addImm(Src0Mods);
1396 SelectedMI.addReg(Src0Reg);
1397 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1398 SelectedMI.addImm(Src1Mods);
1399 SelectedMI.addReg(Src1Reg);
1400 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1401 SelectedMI.addImm(0); // clamp
1402 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1403 SelectedMI.addImm(0); // op_sel
1404
1405 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1406 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1407 return false;
1408
1409 I.eraseFromParent();
1410 return true;
1411}
1412
1413bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1414 MachineBasicBlock *BB = I.getParent();
1415 const DebugLoc &DL = I.getDebugLoc();
1416 Register DstReg = I.getOperand(0).getReg();
1417 const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1418 const bool Is64 = Size == 64;
1419 const bool IsWave32 = (STI.getWavefrontSize() == 32);
1420
1421 // In the common case, the return type matches the wave size.
1422 // However we also support emitting i64 ballots in wave32 mode.
1423 if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1424 return false;
1425
1426 std::optional<ValueAndVReg> Arg =
1427 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1428
1429 const auto BuildCopy = [&](Register SrcReg) {
1430 if (Size == STI.getWavefrontSize()) {
1431 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1432 .addReg(SrcReg);
1433 return;
1434 }
1435
1436 // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
1437 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1438 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1439 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1440 .addReg(SrcReg)
1441 .addImm(AMDGPU::sub0)
1442 .addReg(HiReg)
1443 .addImm(AMDGPU::sub1);
1444 };
1445
1446 if (Arg) {
1447 const int64_t Value = Arg->Value.getSExtValue();
1448 if (Value == 0) {
1449 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1450 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1451 } else if (Value == -1) // all ones
1452 BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1453 else
1454 return false;
1455 } else
1456 BuildCopy(I.getOperand(2).getReg());
1457
1458 I.eraseFromParent();
1459 return true;
1460}
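// Illustrative wave32 cases from above (sketch): a 64-bit ballot of a
// constant-true operand copies exec_lo and zero-fills the high half:
//   %hi:sreg_32 = S_MOV_B32 0
//   %d:sreg_64 = REG_SEQUENCE $exec_lo, %subreg.sub0, %hi, %subreg.sub1
// and a ballot of a constant-false operand is just an S_MOV of 0.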
1461
1462bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
1463 MachineBasicBlock *BB = I.getParent();
1464 const DebugLoc &DL = I.getDebugLoc();
1465 const Register DstReg = I.getOperand(0).getReg();
1466 const Register MaskReg = I.getOperand(2).getReg();
1467
1468 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg);
1469 I.eraseFromParent();
1470 return true;
1471}
1472
1473bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1474 Register DstReg = I.getOperand(0).getReg();
1475 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1476 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1477 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1478 return false;
1479
1480 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1481
1482 Module *M = MF->getFunction().getParent();
1483 const MDNode *Metadata = I.getOperand(2).getMetadata();
1484 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1485 auto RelocSymbol = cast<GlobalVariable>(
1486 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1487
1488 MachineBasicBlock *BB = I.getParent();
1489 BuildMI(*BB, &I, I.getDebugLoc(),
1490 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1491 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1492
1493 I.eraseFromParent();
1494 return true;
1495}
1496
1497bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1498 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1499
1500 Register DstReg = I.getOperand(0).getReg();
1501 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1502 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1503 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1504
1505 MachineBasicBlock *MBB = I.getParent();
1506 const DebugLoc &DL = I.getDebugLoc();
1507
1508 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1509
1510 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1511 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1512 MIB.addImm(MFI->getLDSSize());
1513 } else {
1514 Module *M = MF->getFunction().getParent();
1515 const GlobalValue *GV
1516 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1517 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1518 }
1519
1520 I.eraseFromParent();
1521 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1522}
1523
1524bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1525 MachineBasicBlock *MBB = I.getParent();
1526 MachineFunction &MF = *MBB->getParent();
1527 const DebugLoc &DL = I.getDebugLoc();
1528
1529 MachineOperand &Dst = I.getOperand(0);
1530 Register DstReg = Dst.getReg();
1531 unsigned Depth = I.getOperand(2).getImm();
1532
1533 const TargetRegisterClass *RC
1534 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1535 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1536 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1537 return false;
1538
1539 // Check for kernel and shader functions
1540 if (Depth != 0 ||
1541 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1542 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1543 .addImm(0);
1544 I.eraseFromParent();
1545 return true;
1546 }
1547
1548 MachineFrameInfo &MFI = MF.getFrameInfo();
1549 // There is a call to @llvm.returnaddress in this function
1550 MFI.setReturnAddressIsTaken(true);
1551
1552 // Get the return address reg and mark it as an implicit live-in
1553 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1554 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1555 AMDGPU::SReg_64RegClass, DL);
1556 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1557 .addReg(LiveIn);
1558 I.eraseFromParent();
1559 return true;
1560}
1561
1562bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1563 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1564 // SelectionDAG uses for wave32 vs wave64.
1565 MachineBasicBlock *BB = MI.getParent();
1566 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1567 .add(MI.getOperand(1));
1568
1569 Register Reg = MI.getOperand(1).getReg();
1570 MI.eraseFromParent();
1571
1572 if (!MRI->getRegClassOrNull(Reg))
1573 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1574 return true;
1575}
1576
1577bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1578 MachineInstr &MI, Intrinsic::ID IntrID) const {
1579 MachineBasicBlock *MBB = MI.getParent();
1580 MachineFunction *MF = MBB->getParent();
1581 const DebugLoc &DL = MI.getDebugLoc();
1582
1583 unsigned IndexOperand = MI.getOperand(7).getImm();
1584 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1585 bool WaveDone = MI.getOperand(9).getImm() != 0;
1586
1587 if (WaveDone && !WaveRelease)
1588 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1589
1590 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1591 IndexOperand &= ~0x3f;
1592 unsigned CountDw = 0;
1593
1594 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1595 CountDw = (IndexOperand >> 24) & 0xf;
1596 IndexOperand &= ~(0xf << 24);
1597
1598 if (CountDw < 1 || CountDw > 4) {
1599 report_fatal_error(
1600 "ds_ordered_count: dword count must be between 1 and 4");
1601 }
1602 }
1603
1604 if (IndexOperand)
1605 report_fatal_error("ds_ordered_count: bad index operand");
1606
1607 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1608 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1609
1610 unsigned Offset0 = OrderedCountIndex << 2;
1611 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1612
1613 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1614 Offset1 |= (CountDw - 1) << 6;
1615
1616 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1617 Offset1 |= ShaderType << 2;
1618
1619 unsigned Offset = Offset0 | (Offset1 << 8);
1620
1621 Register M0Val = MI.getOperand(2).getReg();
1622 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1623 .addReg(M0Val);
1624
1625 Register DstReg = MI.getOperand(0).getReg();
1626 Register ValReg = MI.getOperand(3).getReg();
1627 MachineInstrBuilder DS =
1628 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1629 .addReg(ValReg)
1630 .addImm(Offset)
1631 .cloneMemRefs(MI);
1632
1633 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1634 return false;
1635
1636 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1637 MI.eraseFromParent();
1638 return Ret;
1639}
1640
1641static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1642 switch (IntrID) {
1643 case Intrinsic::amdgcn_ds_gws_init:
1644 return AMDGPU::DS_GWS_INIT;
1645 case Intrinsic::amdgcn_ds_gws_barrier:
1646 return AMDGPU::DS_GWS_BARRIER;
1647 case Intrinsic::amdgcn_ds_gws_sema_v:
1648 return AMDGPU::DS_GWS_SEMA_V;
1649 case Intrinsic::amdgcn_ds_gws_sema_br:
1650 return AMDGPU::DS_GWS_SEMA_BR;
1651 case Intrinsic::amdgcn_ds_gws_sema_p:
1652 return AMDGPU::DS_GWS_SEMA_P;
1653 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1654 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1655 default:
1656 llvm_unreachable("not a gws intrinsic");
1657 }
1658}
1659
1660bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1661 Intrinsic::ID IID) const {
1662 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1663 !STI.hasGWSSemaReleaseAll()))
1664 return false;
1665
1666 // intrinsic ID, vsrc, offset
1667 const bool HasVSrc = MI.getNumOperands() == 3;
1668 assert(HasVSrc || MI.getNumOperands() == 2);
1669
1670 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1671 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1672 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1673 return false;
1674
1675 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1676 unsigned ImmOffset;
1677
1678 MachineBasicBlock *MBB = MI.getParent();
1679 const DebugLoc &DL = MI.getDebugLoc();
1680
1681 MachineInstr *Readfirstlane = nullptr;
1682
1683 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1684 // incoming offset, in case there's an add of a constant. We'll have to put it
1685 // back later.
1686 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1687 Readfirstlane = OffsetDef;
1688 BaseOffset = OffsetDef->getOperand(1).getReg();
1689 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1690 }
1691
1692 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1693 // If we have a constant offset, try to use the 0 in m0 as the base.
1694 // TODO: Look into changing the default m0 initialization value. If the
1695 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1696 // the immediate offset.
1697
1698 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1699 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1700 .addImm(0);
1701 } else {
1702 std::tie(BaseOffset, ImmOffset) =
1703 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1704
1705 if (Readfirstlane) {
1706 // We have the constant offset now, so put the readfirstlane back on the
1707 // variable component.
1708 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1709 return false;
1710
1711 Readfirstlane->getOperand(1).setReg(BaseOffset);
1712 BaseOffset = Readfirstlane->getOperand(0).getReg();
1713 } else {
1714 if (!RBI.constrainGenericRegister(BaseOffset,
1715 AMDGPU::SReg_32RegClass, *MRI))
1716 return false;
1717 }
1718
1719 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1720 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1721 .addReg(BaseOffset)
1722 .addImm(16)
1723 .setOperandDead(3); // Dead scc
1724
1725 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1726 .addReg(M0Base);
1727 }
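// E.g. in the variable-offset path above, an offset of (%x + 5) is split into
// BaseOffset = %x, which is shifted left by 16 and copied into M0, and
// ImmOffset = 5, which becomes the instruction's immediate offset operand below.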
1728
1729 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1730 // offset field) % 64. Some versions of the programming guide omit the m0
1731 // part, or claim it's from offset 0.
1732 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1733
1734 if (HasVSrc) {
1735 Register VSrc = MI.getOperand(1).getReg();
1736 MIB.addReg(VSrc);
1737
1738 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1739 return false;
1740 }
1741
1742 MIB.addImm(ImmOffset)
1743 .cloneMemRefs(MI);
1744
1745 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1746
1747 MI.eraseFromParent();
1748 return true;
1749}
1750
1751bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1752 bool IsAppend) const {
1753 Register PtrBase = MI.getOperand(2).getReg();
1754 LLT PtrTy = MRI->getType(PtrBase);
1755 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1756
1757 unsigned Offset;
1758 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1759
1760 // TODO: Should this try to look through readfirstlane like GWS?
1761 if (!isDSOffsetLegal(PtrBase, Offset)) {
1762 PtrBase = MI.getOperand(2).getReg();
1763 Offset = 0;
1764 }
1765
1766 MachineBasicBlock *MBB = MI.getParent();
1767 const DebugLoc &DL = MI.getDebugLoc();
1768 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1769
1770 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1771 .addReg(PtrBase);
1772 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1773 return false;
1774
1775 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1776 .addImm(Offset)
1777 .addImm(IsGDS ? -1 : 0)
1778 .cloneMemRefs(MI);
1779 MI.eraseFromParent();
1780 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1781}
1782
1783bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1784 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1785 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1786 if (WGSize <= STI.getWavefrontSize()) {
1787 MachineBasicBlock *MBB = MI.getParent();
1788 const DebugLoc &DL = MI.getDebugLoc();
1789 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1790 MI.eraseFromParent();
1791 return true;
1792 }
1793 }
1794 return selectImpl(MI, *CoverageInfo);
1795}
1796
1797static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1798 bool &IsTexFail) {
1799 if (TexFailCtrl)
1800 IsTexFail = true;
1801
1802 TFE = (TexFailCtrl & 0x1) ? true : false;
1803 TexFailCtrl &= ~(uint64_t)0x1;
1804 LWE = (TexFailCtrl & 0x2) ? true : false;
1805 TexFailCtrl &= ~(uint64_t)0x2;
1806
1807 return TexFailCtrl == 0;
1808}
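// E.g. TexFailCtrl == 3 sets both TFE and LWE and is accepted; any bits beyond
// the low two leave TexFailCtrl nonzero and the caller rejects the intrinsic.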
1809
1810bool AMDGPUInstructionSelector::selectImageIntrinsic(
1811 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1812 MachineBasicBlock *MBB = MI.getParent();
1813 const DebugLoc &DL = MI.getDebugLoc();
1814
1815 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1816 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1817
1818 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1819 unsigned IntrOpcode = Intr->BaseOpcode;
1820 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1821 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1822
1823 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1824
1825 Register VDataIn, VDataOut;
1826 LLT VDataTy;
1827 int NumVDataDwords = -1;
1828 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1829 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1830
1831 bool Unorm;
1832 if (!BaseOpcode->Sampler)
1833 Unorm = true;
1834 else
1835 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1836
1837 bool TFE;
1838 bool LWE;
1839 bool IsTexFail = false;
1840 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1841 TFE, LWE, IsTexFail))
1842 return false;
1843
1844 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1845 const bool IsA16 = (Flags & 1) != 0;
1846 const bool IsG16 = (Flags & 2) != 0;
1847
1848 // A16 implies 16 bit gradients if subtarget doesn't support G16
1849 if (IsA16 && !STI.hasG16() && !IsG16)
1850 return false;
1851
1852 unsigned DMask = 0;
1853 unsigned DMaskLanes = 0;
1854
1855 if (BaseOpcode->Atomic) {
1856 VDataOut = MI.getOperand(0).getReg();
1857 VDataIn = MI.getOperand(2).getReg();
1858 LLT Ty = MRI->getType(VDataIn);
1859
1860 // Be careful to allow atomic swap on 16-bit element vectors.
1861 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1862 Ty.getSizeInBits() == 128 :
1863 Ty.getSizeInBits() == 64;
1864
1865 if (BaseOpcode->AtomicX2) {
1866 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1867
1868 DMask = Is64Bit ? 0xf : 0x3;
1869 NumVDataDwords = Is64Bit ? 4 : 2;
1870 } else {
1871 DMask = Is64Bit ? 0x3 : 0x1;
1872 NumVDataDwords = Is64Bit ? 2 : 1;
1873 }
1874 } else {
1875 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1876 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1877
1878 if (BaseOpcode->Store) {
1879 VDataIn = MI.getOperand(1).getReg();
1880 VDataTy = MRI->getType(VDataIn);
1881 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1882 } else {
1883 VDataOut = MI.getOperand(0).getReg();
1884 VDataTy = MRI->getType(VDataOut);
1885 NumVDataDwords = DMaskLanes;
1886
1887 if (IsD16 && !STI.hasUnpackedD16VMem())
1888 NumVDataDwords = (DMaskLanes + 1) / 2;
1889 }
1890 }
1891
1892 // Set G16 opcode
1893 if (Subtarget->hasG16() && IsG16) {
1894 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1895 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1896 assert(G16MappingInfo);
1897 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1898 }
1899
1900 // TODO: Check this in verifier.
1901 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1902
1903 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1904 if (BaseOpcode->Atomic)
1905 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1906 if (CPol & ~AMDGPU::CPol::ALL)
1907 return false;
1908
1909 int NumVAddrRegs = 0;
1910 int NumVAddrDwords = 0;
1911 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1912 // Skip the $noregs and 0s inserted during legalization.
1913 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1914 if (!AddrOp.isReg())
1915 continue; // XXX - Break?
1916
1917 Register Addr = AddrOp.getReg();
1918 if (!Addr)
1919 break;
1920
1921 ++NumVAddrRegs;
1922 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1923 }
1924
1925 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1926 // NSA, these should have been packed into a single value in the first
1927 // address register
1928 const bool UseNSA =
1929 NumVAddrRegs != 1 &&
1930 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
1931 : NumVAddrDwords == NumVAddrRegs);
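// E.g. three separate 32-bit address registers (NumVAddrRegs == NumVAddrDwords
// == 3) can use the NSA encoding, while a single packed address register never
// does.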
1932 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1933 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1934 return false;
1935 }
1936
1937 if (IsTexFail)
1938 ++NumVDataDwords;
1939
1940 int Opcode = -1;
1941 if (IsGFX11Plus) {
1942 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1943 UseNSA ? AMDGPU::MIMGEncGfx11NSA
1944 : AMDGPU::MIMGEncGfx11Default,
1945 NumVDataDwords, NumVAddrDwords);
1946 } else if (IsGFX10Plus) {
1947 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1948 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1949 : AMDGPU::MIMGEncGfx10Default,
1950 NumVDataDwords, NumVAddrDwords);
1951 } else {
1952 if (Subtarget->hasGFX90AInsts()) {
1953 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
1954 NumVDataDwords, NumVAddrDwords);
1955 if (Opcode == -1) {
1956 LLVM_DEBUG(
1957 dbgs()
1958 << "requested image instruction is not supported on this GPU\n");
1959 return false;
1960 }
1961 }
1962 if (Opcode == -1 &&
1963 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1964 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1965 NumVDataDwords, NumVAddrDwords);
1966 if (Opcode == -1)
1967 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1968 NumVDataDwords, NumVAddrDwords);
1969 }
1970 if (Opcode == -1)
1971 return false;
1972
1973 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1974 .cloneMemRefs(MI);
1975
1976 if (VDataOut) {
1977 if (BaseOpcode->AtomicX2) {
1978 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1979
1980 Register TmpReg = MRI->createVirtualRegister(
1981 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1982 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1983
1984 MIB.addDef(TmpReg);
1985 if (!MRI->use_empty(VDataOut)) {
1986 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1987 .addReg(TmpReg, RegState::Kill, SubReg);
1988 }
1989
1990 } else {
1991 MIB.addDef(VDataOut); // vdata output
1992 }
1993 }
1994
1995 if (VDataIn)
1996 MIB.addReg(VDataIn); // vdata input
1997
1998 for (int I = 0; I != NumVAddrRegs; ++I) {
1999 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2000 if (SrcOp.isReg()) {
2001 assert(SrcOp.getReg() != 0);
2002 MIB.addReg(SrcOp.getReg());
2003 }
2004 }
2005
2006 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2007 if (BaseOpcode->Sampler)
2008 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2009
2010 MIB.addImm(DMask); // dmask
2011
2012 if (IsGFX10Plus)
2013 MIB.addImm(DimInfo->Encoding);
2014 MIB.addImm(Unorm);
2015
2016 MIB.addImm(CPol);
2017 MIB.addImm(IsA16 && // a16 or r128
2018 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2019 if (IsGFX10Plus)
2020 MIB.addImm(IsA16 ? -1 : 0);
2021
2022 if (!Subtarget->hasGFX90AInsts()) {
2023 MIB.addImm(TFE); // tfe
2024 } else if (TFE) {
2025 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2026 return false;
2027 }
2028
2029 MIB.addImm(LWE); // lwe
2030 if (!IsGFX10Plus)
2031 MIB.addImm(DimInfo->DA ? -1 : 0);
2032 if (BaseOpcode->HasD16)
2033 MIB.addImm(IsD16 ? -1 : 0);
2034
2035 if (IsTexFail) {
2036 // An image load instruction with TFE/LWE only conditionally writes to its
2037 // result registers. Initialize them to zero so that we always get well
2038 // defined result values.
2039 assert(VDataOut && !VDataIn);
2040 Register Tied = MRI->cloneVirtualRegister(VDataOut);
2041 Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2042 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
2043 .addImm(0);
2044 auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
2045 if (STI.usePRTStrictNull()) {
2046 // With enable-prt-strict-null enabled, initialize all result registers to
2047 // zero.
2048 auto RegSeq =
2049 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
2050 for (auto Sub : Parts)
2051 RegSeq.addReg(Zero).addImm(Sub);
2052 } else {
2053 // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
2054 // result register.
2055 Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2056 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
2057 auto RegSeq =
2058 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
2059 for (auto Sub : Parts.drop_back(1))
2060 RegSeq.addReg(Undef).addImm(Sub);
2061 RegSeq.addReg(Zero).addImm(Parts.back());
2062 }
2063 MIB.addReg(Tied, RegState::Implicit);
2064 MIB->tieOperands(0, MIB->getNumOperands() - 1);
2065 }
2066
2067 MI.eraseFromParent();
2068 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2069 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2070 return true;
2071}
2072
2073// We need to handle this here because tablegen doesn't support matching
2074// instructions with multiple outputs.
2075bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2076 MachineInstr &MI) const {
2077 Register Dst0 = MI.getOperand(0).getReg();
2078 Register Dst1 = MI.getOperand(1).getReg();
2079
2080 const DebugLoc &DL = MI.getDebugLoc();
2081 MachineBasicBlock *MBB = MI.getParent();
2082
2083 Register Addr = MI.getOperand(3).getReg();
2084 Register Data0 = MI.getOperand(4).getReg();
2085 Register Data1 = MI.getOperand(5).getReg();
2086 unsigned Offset = MI.getOperand(6).getImm();
2087
2088 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2089 .addDef(Dst1)
2090 .addUse(Addr)
2091 .addUse(Data0)
2092 .addUse(Data1)
2093 .addImm(Offset)
2094 .cloneMemRefs(MI);
2095
2096 MI.eraseFromParent();
2097 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2098}
2099
2100bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2101 MachineInstr &I) const {
2102 unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2103 switch (IntrinsicID) {
2104 case Intrinsic::amdgcn_end_cf:
2105 return selectEndCfIntrinsic(I);
2106 case Intrinsic::amdgcn_ds_ordered_add:
2107 case Intrinsic::amdgcn_ds_ordered_swap:
2108 return selectDSOrderedIntrinsic(I, IntrinsicID);
2109 case Intrinsic::amdgcn_ds_gws_init:
2110 case Intrinsic::amdgcn_ds_gws_barrier:
2111 case Intrinsic::amdgcn_ds_gws_sema_v:
2112 case Intrinsic::amdgcn_ds_gws_sema_br:
2113 case Intrinsic::amdgcn_ds_gws_sema_p:
2114 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2115 return selectDSGWSIntrinsic(I, IntrinsicID);
2116 case Intrinsic::amdgcn_ds_append:
2117 return selectDSAppendConsume(I, true);
2118 case Intrinsic::amdgcn_ds_consume:
2119 return selectDSAppendConsume(I, false);
2120 case Intrinsic::amdgcn_s_barrier:
2121 return selectSBarrier(I);
2122 case Intrinsic::amdgcn_raw_buffer_load_lds:
2123 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2124 case Intrinsic::amdgcn_struct_buffer_load_lds:
2125 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2126 return selectBufferLoadLds(I);
2127 case Intrinsic::amdgcn_global_load_lds:
2128 return selectGlobalLoadLds(I);
2129 case Intrinsic::amdgcn_exp_compr:
2130 if (!STI.hasCompressedExport()) {
2131 Function &F = I.getMF()->getFunction();
2132 DiagnosticInfoUnsupported NoFpRet(
2133 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2134 F.getContext().diagnose(NoFpRet);
2135 return false;
2136 }
2137 break;
2138 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2139 return selectDSBvhStackIntrinsic(I);
2140 }
2141 return selectImpl(I, *CoverageInfo);
2142}
2143
2144bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2145 if (selectImpl(I, *CoverageInfo))
2146 return true;
2147
2148 MachineBasicBlock *BB = I.getParent();
2149 const DebugLoc &DL = I.getDebugLoc();
2150
2151 Register DstReg = I.getOperand(0).getReg();
2152 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2153 assert(Size <= 32 || Size == 64);
2154 const MachineOperand &CCOp = I.getOperand(1);
2155 Register CCReg = CCOp.getReg();
2156 if (!isVCC(CCReg, *MRI)) {
2157 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2158 AMDGPU::S_CSELECT_B32;
2159 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2160 .addReg(CCReg);
2161
2162 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2163 // bank, because it does not cover the register class that we use to represent
2164 // it. So we need to manually set the register class here.
2165 if (!MRI->getRegClassOrNull(CCReg))
2166 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2167 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2168 .add(I.getOperand(2))
2169 .add(I.getOperand(3));
2170
2171 bool Ret = false;
2172 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2173 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2174 I.eraseFromParent();
2175 return Ret;
2176 }
2177
2178 // Wide VGPR select should have been split in RegBankSelect.
2179 if (Size > 32)
2180 return false;
2181
2182 MachineInstr *Select =
2183 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2184 .addImm(0)
2185 .add(I.getOperand(3))
2186 .addImm(0)
2187 .add(I.getOperand(2))
2188 .add(I.getOperand(1));
2189
2190 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2191 I.eraseFromParent();
2192 return Ret;
2193}
2194
2195static int sizeToSubRegIndex(unsigned Size) {
2196 switch (Size) {
2197 case 32:
2198 return AMDGPU::sub0;
2199 case 64:
2200 return AMDGPU::sub0_sub1;
2201 case 96:
2202 return AMDGPU::sub0_sub1_sub2;
2203 case 128:
2204 return AMDGPU::sub0_sub1_sub2_sub3;
2205 case 256:
2206 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2207 default:
2208 if (Size < 32)
2209 return AMDGPU::sub0;
2210 if (Size > 256)
2211 return -1;
2212 return sizeToSubRegIndex(llvm::bit_ceil(Size));
2213 }
2214}
2215
2216bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2217 Register DstReg = I.getOperand(0).getReg();
2218 Register SrcReg = I.getOperand(1).getReg();
2219 const LLT DstTy = MRI->getType(DstReg);
2220 const LLT SrcTy = MRI->getType(SrcReg);
2221 const LLT S1 = LLT::scalar(1);
2222
2223 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2224 const RegisterBank *DstRB;
2225 if (DstTy == S1) {
2226 // This is a special case. We don't treat s1 for legalization artifacts as
2227 // vcc booleans.
2228 DstRB = SrcRB;
2229 } else {
2230 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2231 if (SrcRB != DstRB)
2232 return false;
2233 }
2234
2235 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2236
2237 unsigned DstSize = DstTy.getSizeInBits();
2238 unsigned SrcSize = SrcTy.getSizeInBits();
2239
2240 const TargetRegisterClass *SrcRC =
2241 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2242 const TargetRegisterClass *DstRC =
2243 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2244 if (!SrcRC || !DstRC)
2245 return false;
2246
2247 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2248 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2249 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2250 return false;
2251 }
2252
2253 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2254 MachineBasicBlock *MBB = I.getParent();
2255 const DebugLoc &DL = I.getDebugLoc();
2256
2257 Register LoReg = MRI->createVirtualRegister(DstRC);
2258 Register HiReg = MRI->createVirtualRegister(DstRC);
2259 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2260 .addReg(SrcReg, 0, AMDGPU::sub0);
2261 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2262 .addReg(SrcReg, 0, AMDGPU::sub1);
2263
2264 if (IsVALU && STI.hasSDWA()) {
2265 // Write the low 16-bits of the high element into the high 16-bits of the
2266 // low element.
2267 MachineInstr *MovSDWA =
2268 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2269 .addImm(0) // $src0_modifiers
2270 .addReg(HiReg) // $src0
2271 .addImm(0) // $clamp
2272 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2273 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2274 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2275 .addReg(LoReg, RegState::Implicit);
2276 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2277 } else {
2278 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2279 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2280 Register ImmReg = MRI->createVirtualRegister(DstRC);
2281 if (IsVALU) {
2282 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2283 .addImm(16)
2284 .addReg(HiReg);
2285 } else {
2286 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2287 .addReg(HiReg)
2288 .addImm(16)
2289 .setOperandDead(3); // Dead scc
2290 }
2291
2292 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2293 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2294 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2295
2296 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2297 .addImm(0xffff);
2298 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2299 .addReg(LoReg)
2300 .addReg(ImmReg);
2301 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2302 .addReg(TmpReg0)
2303 .addReg(TmpReg1);
2304
2305 if (!IsVALU) {
2306 And.setOperandDead(3); // Dead scc
2307 Or.setOperandDead(3); // Dead scc
2308 }
2309 }
2310
2311 I.eraseFromParent();
2312 return true;
2313 }
2314
2315 if (!DstTy.isScalar())
2316 return false;
2317
2318 if (SrcSize > 32) {
2319 int SubRegIdx = sizeToSubRegIndex(DstSize);
2320 if (SubRegIdx == -1)
2321 return false;
2322
2323 // Deal with weird cases where the class only partially supports the subreg
2324 // index.
2325 const TargetRegisterClass *SrcWithSubRC
2326 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2327 if (!SrcWithSubRC)
2328 return false;
2329
2330 if (SrcWithSubRC != SrcRC) {
2331 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2332 return false;
2333 }
2334
2335 I.getOperand(1).setSubReg(SubRegIdx);
2336 }
2337
2338 I.setDesc(TII.get(TargetOpcode::COPY));
2339 return true;
2340}
2341
2342/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2343static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2344 Mask = maskTrailingOnes<unsigned>(Size);
2345 int SignedMask = static_cast<int>(Mask);
2346 return SignedMask >= -16 && SignedMask <= 64;
2347}
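// For example, Size == 4 gives Mask = 0xf (inline immediate 15), Size == 16
// gives Mask = 0xffff (65535, not inline), and Size == 32 gives Mask =
// 0xffffffff, which is -1 as a signed value and therefore inline again.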
2348
2349// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2350const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2351 Register Reg, const MachineRegisterInfo &MRI,
2352 const TargetRegisterInfo &TRI) const {
2353 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2354 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2355 return RB;
2356
2357 // Ignore the type, since we don't use vcc in artifacts.
2358 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2359 return &RBI.getRegBankFromRegClass(*RC, LLT());
2360 return nullptr;
2361}
2362
2363bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2364 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2365 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2366 const DebugLoc &DL = I.getDebugLoc();
2367 MachineBasicBlock &MBB = *I.getParent();
2368 const Register DstReg = I.getOperand(0).getReg();
2369 const Register SrcReg = I.getOperand(1).getReg();
2370
2371 const LLT DstTy = MRI->getType(DstReg);
2372 const LLT SrcTy = MRI->getType(SrcReg);
2373 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2374 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2375 const unsigned DstSize = DstTy.getSizeInBits();
2376 if (!DstTy.isScalar())
2377 return false;
2378
2379 // Artifact casts should never use vcc.
2380 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2381
2382 // FIXME: This should probably be illegal and split earlier.
2383 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2384 if (DstSize <= 32)
2385 return selectCOPY(I);
2386
2387 const TargetRegisterClass *SrcRC =
2388 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2389 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2390 const TargetRegisterClass *DstRC =
2391 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2392
2393 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2394 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2395 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2396 .addReg(SrcReg)
2397 .addImm(AMDGPU::sub0)
2398 .addReg(UndefReg)
2399 .addImm(AMDGPU::sub1);
2400 I.eraseFromParent();
2401
2402 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2403 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2404 }
2405
2406 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2407 // 64-bit should have been split up in RegBankSelect
2408
2409 // Try to use an and with a mask if it will save code size.
2410 unsigned Mask;
2411 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2412 MachineInstr *ExtI =
2413 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2414 .addImm(Mask)
2415 .addReg(SrcReg);
2416 I.eraseFromParent();
2417 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2418 }
2419
2420 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2421 MachineInstr *ExtI =
2422 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2423 .addReg(SrcReg)
2424 .addImm(0) // Offset
2425 .addImm(SrcSize); // Width
2426 I.eraseFromParent();
2427 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2428 }
2429
2430 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2431 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2432 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2433 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2434 return false;
2435
2436 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2437 const unsigned SextOpc = SrcSize == 8 ?
2438 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2439 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2440 .addReg(SrcReg);
2441 I.eraseFromParent();
2442 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2443 }
2444
2445 // Using a single 32-bit SALU to calculate the high half is smaller than
2446 // S_BFE with a literal constant operand.
2447 if (DstSize > 32 && SrcSize == 32) {
2448 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2449 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2450 if (Signed) {
2451 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2452 .addReg(SrcReg, 0, SubReg)
2453 .addImm(31)
2454 .setOperandDead(3); // Dead scc
2455 } else {
2456 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2457 .addImm(0);
2458 }
2459 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2460 .addReg(SrcReg, 0, SubReg)
2461 .addImm(AMDGPU::sub0)
2462 .addReg(HiReg)
2463 .addImm(AMDGPU::sub1);
2464 I.eraseFromParent();
2465 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2466 *MRI);
2467 }
2468
2469 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2470 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2471
2472 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
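// The (SrcSize << 16) immediates built below therefore encode an offset of 0
// and a width of SrcSize bits.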
2473 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2474 // We need a 64-bit register source, but the high bits don't matter.
2475 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2476 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2477 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2478
2479 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2480 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2481 .addReg(SrcReg, 0, SubReg)
2482 .addImm(AMDGPU::sub0)
2483 .addReg(UndefReg)
2484 .addImm(AMDGPU::sub1);
2485
2486 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2487 .addReg(ExtReg)
2488 .addImm(SrcSize << 16);
2489
2490 I.eraseFromParent();
2491 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2492 }
2493
2494 unsigned Mask;
2495 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2496 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2497 .addReg(SrcReg)
2498 .addImm(Mask)
2499 .setOperandDead(3); // Dead scc
2500 } else {
2501 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2502 .addReg(SrcReg)
2503 .addImm(SrcSize << 16);
2504 }
2505
2506 I.eraseFromParent();
2507 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2508 }
2509
2510 return false;
2511}
2512
2513static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2514 Register &Out) {
2515 Register LShlSrc;
2516 if (mi_match(In, MRI,
2517 m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2518 Out = LShlSrc;
2519 return true;
2520 }
2521 return false;
2522}
2523
2524bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2525 if (!Subtarget->hasSALUFloatInsts())
2526 return false;
2527
2528 Register Dst = I.getOperand(0).getReg();
2529 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2530 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2531 return false;
2532
2533 Register Src = I.getOperand(1).getReg();
2534
2535 if (MRI->getType(Dst) == LLT::scalar(32) &&
2536 MRI->getType(Src) == LLT::scalar(16)) {
2537 if (isExtractHiElt(*MRI, Src, Src)) {
2538 MachineBasicBlock *BB = I.getParent();
2539 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2540 .addUse(Src);
2541 I.eraseFromParent();
2542 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2543 }
2544 }
2545
2546 return false;
2547}
2548
2549bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2550 MachineBasicBlock *BB = I.getParent();
2551 MachineOperand &ImmOp = I.getOperand(1);
2552 Register DstReg = I.getOperand(0).getReg();
2553 unsigned Size = MRI->getType(DstReg).getSizeInBits();
2554
2555 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2556 if (ImmOp.isFPImm()) {
2557 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2558 ImmOp.ChangeToImmediate(Imm.getZExtValue());
2559 } else if (ImmOp.isCImm()) {
2560 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2561 } else {
2562 llvm_unreachable("Not supported by g_constants");
2563 }
2564
2565 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2566 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2567
2568 unsigned Opcode;
2569 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2570 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2571 } else {
2572 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2573
2574 // We should never produce s1 values on banks other than VCC. If the user of
2575 // this already constrained the register, we may incorrectly think it's VCC
2576 // if it wasn't originally.
2577 if (Size == 1)
2578 return false;
2579 }
2580
2581 if (Size != 64) {
2582 I.setDesc(TII.get(Opcode));
2583 I.addImplicitDefUseOperands(*MF);
2584 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2585 }
2586
2587 const DebugLoc &DL = I.getDebugLoc();
2588
2589 APInt Imm(Size, I.getOperand(1).getImm());
2590
2591 MachineInstr *ResInst;
2592 if (IsSgpr && TII.isInlineConstant(Imm)) {
2593 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2594 .addImm(I.getOperand(1).getImm());
2595 } else {
2596 const TargetRegisterClass *RC = IsSgpr ?
2597 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2598 Register LoReg = MRI->createVirtualRegister(RC);
2599 Register HiReg = MRI->createVirtualRegister(RC);
2600
2601 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2602 .addImm(Imm.trunc(32).getZExtValue());
2603
2604 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2605 .addImm(Imm.ashr(32).getZExtValue());
2606
2607 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2608 .addReg(LoReg)
2609 .addImm(AMDGPU::sub0)
2610 .addReg(HiReg)
2611 .addImm(AMDGPU::sub1);
2612 }
2613
2614 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2615 // work for target independent opcodes
2616 I.eraseFromParent();
2617 const TargetRegisterClass *DstRC =
2618 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2619 if (!DstRC)
2620 return true;
2621 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2622}
2623
2624bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2625 // Only manually handle the f64 SGPR case.
2626 //
2627 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2628 // the bit ops theoretically have a second result due to the implicit def of
2629 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2630 // that is easy by disabling the check. The result works, but uses a
2631 // nonsensical sreg32orlds_and_sreg_1 regclass.
2632 //
2633 // The DAG emitter is more problematic, and incorrectly adds both results of the
2634 // S_XOR_B32 to the variadic REG_SEQUENCE operands.
2635
2636 Register Dst = MI.getOperand(0).getReg();
2637 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2638 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2639 MRI->getType(Dst) != LLT::scalar(64))
2640 return false;
2641
2642 Register Src = MI.getOperand(1).getReg();
2643 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2644 if (Fabs)
2645 Src = Fabs->getOperand(1).getReg();
2646
2647 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2648 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2649 return false;
2650
2651 MachineBasicBlock *BB = MI.getParent();
2652 const DebugLoc &DL = MI.getDebugLoc();
2653 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2654 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2655 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2656 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2657
2658 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2659 .addReg(Src, 0, AMDGPU::sub0);
2660 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2661 .addReg(Src, 0, AMDGPU::sub1);
2662 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2663 .addImm(0x80000000);
2664
2665 // Set or toggle sign bit.
2666 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2667 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2668 .addReg(HiReg)
2669 .addReg(ConstReg)
2670 .setOperandDead(3); // Dead scc
2671 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2672 .addReg(LoReg)
2673 .addImm(AMDGPU::sub0)
2674 .addReg(OpReg)
2675 .addImm(AMDGPU::sub1);
2676 MI.eraseFromParent();
2677 return true;
2678}
2679
2680// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2681bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2682 Register Dst = MI.getOperand(0).getReg();
2683 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2684 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2685 MRI->getType(Dst) != LLT::scalar(64))
2686 return false;
2687
2688 Register Src = MI.getOperand(1).getReg();
2689 MachineBasicBlock *BB = MI.getParent();
2690 const DebugLoc &DL = MI.getDebugLoc();
2691 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2692 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2693 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2694 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2695
2696 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2697 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2698 return false;
2699
2700 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2701 .addReg(Src, 0, AMDGPU::sub0);
2702 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2703 .addReg(Src, 0, AMDGPU::sub1);
2704 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2705 .addImm(0x7fffffff);
2706
2707 // Clear sign bit.
2708 // TODO: Should this use S_BITSET0_*?
2709 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2710 .addReg(HiReg)
2711 .addReg(ConstReg)
2712 .setOperandDead(3); // Dead scc
2713 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2714 .addReg(LoReg)
2715 .addImm(AMDGPU::sub0)
2716 .addReg(OpReg)
2717 .addImm(AMDGPU::sub1);
2718
2719 MI.eraseFromParent();
2720 return true;
2721}
2722
2723static bool isConstant(const MachineInstr &MI) {
2724 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2725}
2726
2727void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2728 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2729
2730 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2731
2732 assert(PtrMI);
2733
2734 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2735 return;
2736
2737 GEPInfo GEPInfo;
2738
2739 for (unsigned i = 1; i != 3; ++i) {
2740 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2741 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2742 assert(OpDef);
2743 if (i == 2 && isConstant(*OpDef)) {
2744 // TODO: Could handle constant base + variable offset, but a combine
2745 // probably should have commuted it.
2746 assert(GEPInfo.Imm == 0);
2747 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2748 continue;
2749 }
2750 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2751 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2752 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2753 else
2754 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2755 }
2756
2757 AddrInfo.push_back(GEPInfo);
2758 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2759}
2760
2761bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2762 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2763}
2764
2765bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2766 if (!MI.hasOneMemOperand())
2767 return false;
2768
2769 const MachineMemOperand *MMO = *MI.memoperands_begin();
2770 const Value *Ptr = MMO->getValue();
2771
2772 // UndefValue means this is a load of a kernel input. These are uniform.
2773 // Sometimes LDS instructions have constant pointers.
2774 // If Ptr is null, then that means this mem operand contains a
2775 // PseudoSourceValue like GOT.
2776 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2777 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2778 return true;
2779
2780 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2781 return true;
2782
2783 const Instruction *I = dyn_cast<Instruction>(Ptr);
2784 return I && I->getMetadata("amdgpu.uniform");
2785}
2786
2787bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2788 for (const GEPInfo &GEPInfo : AddrInfo) {
2789 if (!GEPInfo.VgprParts.empty())
2790 return true;
2791 }
2792 return false;
2793}
2794
2795void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2796 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2797 unsigned AS = PtrTy.getAddressSpace();
2798 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2799 STI.ldsRequiresM0Init()) {
2800 MachineBasicBlock *BB = I.getParent();
2801
2802 // If DS instructions require M0 initialization, insert it before selecting.
2803 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2804 .addImm(-1);
2805 }
2806}
2807
2808bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2809 MachineInstr &I) const {
2810 initM0(I);
2811 return selectImpl(I, *CoverageInfo);
2812}
2813
2814static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2815 if (Reg.isPhysical())
2816 return false;
2817
2818 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2819 const unsigned Opcode = MI.getOpcode();
2820
2821 if (Opcode == AMDGPU::COPY)
2822 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2823
2824 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2825 Opcode == AMDGPU::G_XOR)
2826 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2827 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2828
2829 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2830 return GI->is(Intrinsic::amdgcn_class);
2831
2832 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2833}
2834
2835bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2836 MachineBasicBlock *BB = I.getParent();
2837 MachineOperand &CondOp = I.getOperand(0);
2838 Register CondReg = CondOp.getReg();
2839 const DebugLoc &DL = I.getDebugLoc();
2840
2841 unsigned BrOpcode;
2842 Register CondPhysReg;
2843 const TargetRegisterClass *ConstrainRC;
2844
2845 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2846 // whether the branch is uniform when selecting the instruction. In
2847 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2848 // RegBankSelect knows what it's doing if the branch condition is scc, even
2849 // though it currently does not.
2850 if (!isVCC(CondReg, *MRI)) {
2851 if (MRI->getType(CondReg) != LLT::scalar(32))
2852 return false;
2853
2854 CondPhysReg = AMDGPU::SCC;
2855 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2856 ConstrainRC = &AMDGPU::SReg_32RegClass;
2857 } else {
2858 // FIXME: Should scc->vcc copies be ANDed with exec?
2859
2860 // Unless the value of CondReg is a result of a V_CMP* instruction then we
2861 // need to insert an and with exec.
2862 if (!isVCmpResult(CondReg, *MRI)) {
2863 const bool Is64 = STI.isWave64();
2864 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2865 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2866
2867 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2868 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2869 .addReg(CondReg)
2870 .addReg(Exec)
2871 .setOperandDead(3); // Dead scc
2872 CondReg = TmpReg;
2873 }
2874
2875 CondPhysReg = TRI.getVCC();
2876 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2877 ConstrainRC = TRI.getBoolRC();
2878 }
2879
2880 if (!MRI->getRegClassOrNull(CondReg))
2881 MRI->setRegClass(CondReg, ConstrainRC);
2882
2883 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2884 .addReg(CondReg);
2885 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2886 .addMBB(I.getOperand(1).getMBB());
2887
2888 I.eraseFromParent();
2889 return true;
2890}
2891
2892bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2893 MachineInstr &I) const {
2894 Register DstReg = I.getOperand(0).getReg();
2895 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2896 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2897 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2898 if (IsVGPR)
2899 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2900
2901 return RBI.constrainGenericRegister(
2902 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2903}
2904
2905bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2906 Register DstReg = I.getOperand(0).getReg();
2907 Register SrcReg = I.getOperand(1).getReg();
2908 Register MaskReg = I.getOperand(2).getReg();
2909 LLT Ty = MRI->getType(DstReg);
2910 LLT MaskTy = MRI->getType(MaskReg);
2911 MachineBasicBlock *BB = I.getParent();
2912 const DebugLoc &DL = I.getDebugLoc();
2913
2914 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2915 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2916 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2917 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2918 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2919 return false;
2920
2921 // Try to avoid emitting a bit operation when we only need to touch half of
2922 // the 64-bit pointer.
2923 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2924 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2925 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2926
2927 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2928 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
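// E.g. a constant alignment mask such as 0xffffffffffff0000 has all ones in
// its high half, so CanCopyHi32 is set and only the low half needs a real AND
// below.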
2929
2930 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2931 !CanCopyLow32 && !CanCopyHi32) {
2932 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2933 .addReg(SrcReg)
2934 .addReg(MaskReg)
2935 .setOperandDead(3); // Dead scc
2936 I.eraseFromParent();
2937 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2938 }
2939
2940 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2941 const TargetRegisterClass &RegRC
2942 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2943
2944 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2945 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2946 const TargetRegisterClass *MaskRC =
2947 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2948
2949 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2950 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2951 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2952 return false;
2953
2954 if (Ty.getSizeInBits() == 32) {
2955 assert(MaskTy.getSizeInBits() == 32 &&
2956 "ptrmask should have been narrowed during legalize");
2957
2958 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2959 .addReg(SrcReg)
2960 .addReg(MaskReg);
2961
2962 if (!IsVGPR)
2963 NewOp.setOperandDead(3); // Dead scc
2964 I.eraseFromParent();
2965 return true;
2966 }
2967
2968 Register HiReg = MRI->createVirtualRegister(&RegRC);
2969 Register LoReg = MRI->createVirtualRegister(&RegRC);
2970
2971 // Extract the subregisters from the source pointer.
2972 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2973 .addReg(SrcReg, 0, AMDGPU::sub0);
2974 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2975 .addReg(SrcReg, 0, AMDGPU::sub1);
2976
2977 Register MaskedLo, MaskedHi;
2978
2979 if (CanCopyLow32) {
2980 // If all the bits in the low half are 1, we only need a copy for it.
2981 MaskedLo = LoReg;
2982 } else {
2983 // Extract the mask subregister and apply the and.
2984 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2985 MaskedLo = MRI->createVirtualRegister(&RegRC);
2986
2987 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2988 .addReg(MaskReg, 0, AMDGPU::sub0);
2989 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2990 .addReg(LoReg)
2991 .addReg(MaskLo);
2992 }
2993
2994 if (CanCopyHi32) {
2995 // If all the bits in the high half are 1, we only need a copy for it.
2996 MaskedHi = HiReg;
2997 } else {
2998 Register MaskHi = MRI->createVirtualRegister(&RegRC);
2999 MaskedHi = MRI->createVirtualRegister(&RegRC);
3000
3001 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3002 .addReg(MaskReg, 0, AMDGPU::sub1);
3003 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3004 .addReg(HiReg)
3005 .addReg(MaskHi);
3006 }
3007
3008 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3009 .addReg(MaskedLo)
3010 .addImm(AMDGPU::sub0)
3011 .addReg(MaskedHi)
3012 .addImm(AMDGPU::sub1);
3013 I.eraseFromParent();
3014 return true;
3015}
3016
3017/// Return the register to use for the index value, and the subregister to use
3018/// for the indirectly accessed register.
3019static std::pair<Register, unsigned>
3020computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3021 const TargetRegisterClass *SuperRC, Register IdxReg,
3022 unsigned EltSize, GISelKnownBits &KnownBits) {
3023 Register IdxBaseReg;
3024 int Offset;
3025
3026 std::tie(IdxBaseReg, Offset) =
3027 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3028 if (IdxBaseReg == AMDGPU::NoRegister) {
3029 // This will happen if the index is a known constant. This should ordinarily
3030 // be legalized out, but handle it as a register just in case.
3031 assert(Offset == 0);
3032 IdxBaseReg = IdxReg;
3033 }
3034
3035 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3036
3037 // Skip out of bounds offsets, or else we would end up using an undefined
3038 // register.
3039 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3040 return std::pair(IdxReg, SubRegs[0]);
3041 return std::pair(IdxBaseReg, SubRegs[Offset]);
3042}
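// E.g. indexing 32-bit elements of a 128-bit register splits the super-class
// into {sub0, sub1, sub2, sub3}; a known index of %base + 2 yields the pair
// (%base, sub2).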
3043
3044bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3045 MachineInstr &MI) const {
3046 Register DstReg = MI.getOperand(0).getReg();
3047 Register SrcReg = MI.getOperand(1).getReg();
3048 Register IdxReg = MI.getOperand(2).getReg();
3049
3050 LLT DstTy = MRI->getType(DstReg);
3051 LLT SrcTy = MRI->getType(SrcReg);
3052
3053 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3054 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3055 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3056
3057 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3058 // into a waterfall loop.
3059 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3060 return false;
3061
3062 const TargetRegisterClass *SrcRC =
3063 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3064 const TargetRegisterClass *DstRC =
3065 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3066 if (!SrcRC || !DstRC)
3067 return false;
3068 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3069 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3070 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3071 return false;
3072
3073 MachineBasicBlock *BB = MI.getParent();
3074 const DebugLoc &DL = MI.getDebugLoc();
3075 const bool Is64 = DstTy.getSizeInBits() == 64;
3076
3077 unsigned SubReg;
3078 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3079 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3080
3081 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3082 if (DstTy.getSizeInBits() != 32 && !Is64)
3083 return false;
3084
3085 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3086 .addReg(IdxReg);
3087
3088 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3089 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3090 .addReg(SrcReg, 0, SubReg)
3091 .addReg(SrcReg, RegState::Implicit);
3092 MI.eraseFromParent();
3093 return true;
3094 }
3095
3096 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3097 return false;
3098
3099 if (!STI.useVGPRIndexMode()) {
3100 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3101 .addReg(IdxReg);
3102 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3103 .addReg(SrcReg, 0, SubReg)
3104 .addReg(SrcReg, RegState::Implicit);
3105 MI.eraseFromParent();
3106 return true;
3107 }
3108
3109 const MCInstrDesc &GPRIDXDesc =
3110 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3111 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3112 .addReg(SrcReg)
3113 .addReg(IdxReg)
3114 .addImm(SubReg);
3115
3116 MI.eraseFromParent();
3117 return true;
3118}
3119
3120// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3121bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3122 MachineInstr &MI) const {
3123 Register DstReg = MI.getOperand(0).getReg();
3124 Register VecReg = MI.getOperand(1).getReg();
3125 Register ValReg = MI.getOperand(2).getReg();
3126 Register IdxReg = MI.getOperand(3).getReg();
3127
3128 LLT VecTy = MRI->getType(DstReg);
3129 LLT ValTy = MRI->getType(ValReg);
3130 unsigned VecSize = VecTy.getSizeInBits();
3131 unsigned ValSize = ValTy.getSizeInBits();
3132
3133 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3134 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3135 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3136
3137 assert(VecTy.getElementType() == ValTy);
3138
3139 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3140 // into a waterfall loop.
3141 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3142 return false;
3143
3144 const TargetRegisterClass *VecRC =
3145 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3146 const TargetRegisterClass *ValRC =
3147 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3148
3149 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3150 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3151 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3152 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3153 return false;
3154
3155 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3156 return false;
3157
3158 unsigned SubReg;
3159 std::tie(IdxReg, SubReg) =
3160 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3161
3162 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3163 STI.useVGPRIndexMode();
3164
3165 MachineBasicBlock *BB = MI.getParent();
3166 const DebugLoc &DL = MI.getDebugLoc();
3167
3168 if (!IndexMode) {
3169 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3170 .addReg(IdxReg);
3171
3172 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3173 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3174 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3175 .addReg(VecReg)
3176 .addReg(ValReg)
3177 .addImm(SubReg);
3178 MI.eraseFromParent();
3179 return true;
3180 }
3181
3182 const MCInstrDesc &GPRIDXDesc =
3183 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3184 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3185 .addReg(VecReg)
3186 .addReg(ValReg)
3187 .addReg(IdxReg)
3188 .addImm(SubReg);
3189
3190 MI.eraseFromParent();
3191 return true;
3192}
3193
3194bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3195 unsigned Opc;
3196 unsigned Size = MI.getOperand(3).getImm();
3197
3198 // The struct intrinsic variants add one additional operand over raw.
3199 const bool HasVIndex = MI.getNumOperands() == 9;
3200 Register VIndex;
3201 int OpOffset = 0;
3202 if (HasVIndex) {
3203 VIndex = MI.getOperand(4).getReg();
3204 OpOffset = 1;
3205 }
3206
3207 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3208 std::optional<ValueAndVReg> MaybeVOffset =
3209 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3210 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3211
3212 switch (Size) {
3213 default:
3214 return false;
3215 case 1:
3216 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3217 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3218 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3219 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3220 break;
3221 case 2:
3222 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3223 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3224 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3225 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3226 break;
3227 case 4:
3228 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3229 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3230 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3231 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3232 break;
3233 }
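// E.g. a 4-byte struct-buffer variant with both a vindex and a non-zero
// voffset selects BUFFER_LOAD_DWORD_LDS_BOTHEN.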
3234
3235 MachineBasicBlock *MBB = MI.getParent();
3236 const DebugLoc &DL = MI.getDebugLoc();
3237 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3238 .add(MI.getOperand(2));
3239
3240 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3241
3242 if (HasVIndex && HasVOffset) {
3243 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3244 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3245 .addReg(VIndex)
3246 .addImm(AMDGPU::sub0)
3247 .addReg(VOffset)
3248 .addImm(AMDGPU::sub1);
3249
3250 MIB.addReg(IdxReg);
3251 } else if (HasVIndex) {
3252 MIB.addReg(VIndex);
3253 } else if (HasVOffset) {
3254 MIB.addReg(VOffset);
3255 }
3256
3257 MIB.add(MI.getOperand(1)); // rsrc
3258 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3259 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
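 // The aux operand packs the cache-policy (cpol) bits in its low bits and
 // the swizzle flag in bit 3; the two immediates below split it into the
 // separate cpol and swz machine operands.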
3260 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3261 MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3262 MIB.addImm((Aux >> 3) & 1); // swz
3263
3264 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3265 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3266 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3267 MachinePointerInfo StorePtrI = LoadPtrI;
3268 StorePtrI.V = nullptr;
3269 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3270
3271 auto F = LoadMMO->getFlags() &
3272 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3273 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3274 Size, LoadMMO->getBaseAlign());
3275
3276 MachineMemOperand *StoreMMO =
3277 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3278 sizeof(int32_t), LoadMMO->getBaseAlign());
3279
3280 MIB.setMemRefs({LoadMMO, StoreMMO});
3281
3282 MI.eraseFromParent();
3283 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3284}
3285
3286/// Match a zero extend from a 32-bit value to 64-bits.
3287static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3288 Register ZExtSrc;
3289 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3290 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3291
3292 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3293 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3294 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3295 return Register();
3296
3297 assert(Def->getNumOperands() == 3 &&
3298 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3299 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3300 return Def->getOperand(1).getReg();
3301 }
3302
3303 return Register();
3304}
3305
3306bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3307 unsigned Opc;
3308 unsigned Size = MI.getOperand(3).getImm();
3309
3310 switch (Size) {
3311 default:
3312 return false;
3313 case 1:
3314 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3315 break;
3316 case 2:
3317 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3318 break;
3319 case 4:
3320 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3321 break;
3322 }
3323
3324 MachineBasicBlock *MBB = MI.getParent();
3325 const DebugLoc &DL = MI.getDebugLoc();
3326 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3327 .add(MI.getOperand(2));
3328
3329 Register Addr = MI.getOperand(1).getReg();
3330 Register VOffset;
3331 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3332 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
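 // For example, an address of the form (G_PTR_ADD (s64 sgpr base),
 // (G_ZEXT (s32 off))) is split below into Addr = base and VOffset = off,
 // which enables the SADDR form of the load.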
3333 if (!isSGPR(Addr)) {
3334 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3335 if (isSGPR(AddrDef->Reg)) {
3336 Addr = AddrDef->Reg;
3337 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3338 Register SAddr =
3339 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3340 if (isSGPR(SAddr)) {
3341 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3342 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3343 Addr = SAddr;
3344 VOffset = Off;
3345 }
3346 }
3347 }
3348 }
3349
3350 if (isSGPR(Addr)) {
3351 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3352 if (!VOffset) {
3353 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3354 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3355 .addImm(0);
3356 }
3357 }
3358
3359 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3360 .addReg(Addr);
3361
3362 if (isSGPR(Addr))
3363 MIB.addReg(VOffset);
3364
3365 MIB.add(MI.getOperand(4)) // offset
3366 .add(MI.getOperand(5)); // cpol
3367
3368 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3369 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3370 LoadPtrI.Offset = MI.getOperand(4).getImm();
3371 MachinePointerInfo StorePtrI = LoadPtrI;
3372 StorePtrI.V = nullptr;
3373 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3374 auto F = LoadMMO->getFlags() &
3375 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3376 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3377 Size, LoadMMO->getBaseAlign());
3378 MachineMemOperand *StoreMMO =
3379 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3380 sizeof(int32_t), Align(4));
3381
3382 MIB.setMemRefs({LoadMMO, StoreMMO});
3383
3384 MI.eraseFromParent();
3385 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3386}
3387
3388bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3389 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3390 MI.removeOperand(1);
3391 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3392 return true;
3393}
3394
3395bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3396 unsigned Opc;
3397 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3398 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3399 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3400 break;
3401 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3402 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3403 break;
3404 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3405 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3406 break;
3407 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3408 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3409 break;
3410 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3411 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3412 break;
3413 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3414 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3415 break;
3416 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3417 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3418 break;
3419 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3420 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3421 break;
3422 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3423 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3424 break;
3425 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3426 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3427 break;
3428 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3429 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3430 break;
3431 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3432 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3433 break;
3434 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3435 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3436 break;
3437 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3438 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3439 break;
3440 default:
3441 llvm_unreachable("unhandled smfmac intrinsic");
3442 }
3443
3444 auto VDst_In = MI.getOperand(4);
3445
3446 MI.setDesc(TII.get(Opc));
3447 MI.removeOperand(4); // VDst_In
3448 MI.removeOperand(1); // Intrinsic ID
3449 MI.addOperand(VDst_In); // Readd VDst_In to the end
3450 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3451 return true;
3452}
3453
3454bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3455 Register DstReg = MI.getOperand(0).getReg();
3456 Register SrcReg = MI.getOperand(1).getReg();
3457 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3458 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3459 MachineBasicBlock *MBB = MI.getParent();
3460 const DebugLoc &DL = MI.getDebugLoc();
3461
3462 if (IsVALU) {
3463 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3464 .addImm(Subtarget->getWavefrontSizeLog2())
3465 .addReg(SrcReg);
3466 } else {
3467 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3468 .addReg(SrcReg)
3469 .addImm(Subtarget->getWavefrontSizeLog2())
3470 .setOperandDead(3); // Dead scc
3471 }
3472
3473 const TargetRegisterClass &RC =
3474 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3475 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3476 return false;
3477
3478 MI.eraseFromParent();
3479 return true;
3480}
3481
3482bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3483 Register SrcReg = MI.getOperand(0).getReg();
3484 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3485 return false;
3486
3487 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3488 Register SP =
3489 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3490 Register WaveAddr = getWaveAddress(DefMI);
3491 MachineBasicBlock *MBB = MI.getParent();
3492 const DebugLoc &DL = MI.getDebugLoc();
3493
3494 if (!WaveAddr) {
3495 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3496 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3497 .addReg(SrcReg)
3498 .addImm(Subtarget->getWavefrontSizeLog2())
3499 .setOperandDead(3); // Dead scc
3500 }
3501
3502 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3503 .addReg(WaveAddr);
3504
3505 MI.eraseFromParent();
3506 return true;
3507}
3508
3509bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3510 if (I.isPHI())
3511 return selectPHI(I);
3512
3513 if (!I.isPreISelOpcode()) {
3514 if (I.isCopy())
3515 return selectCOPY(I);
3516 return true;
3517 }
3518
3519 switch (I.getOpcode()) {
3520 case TargetOpcode::G_AND:
3521 case TargetOpcode::G_OR:
3522 case TargetOpcode::G_XOR:
3523 if (selectImpl(I, *CoverageInfo))
3524 return true;
3525 return selectG_AND_OR_XOR(I);
3526 case TargetOpcode::G_ADD:
3527 case TargetOpcode::G_SUB:
3528 if (selectImpl(I, *CoverageInfo))
3529 return true;
3530 return selectG_ADD_SUB(I);
3531 case TargetOpcode::G_UADDO:
3532 case TargetOpcode::G_USUBO:
3533 case TargetOpcode::G_UADDE:
3534 case TargetOpcode::G_USUBE:
3535 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3536 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3537 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3538 return selectG_AMDGPU_MAD_64_32(I);
3539 case TargetOpcode::G_INTTOPTR:
3540 case TargetOpcode::G_BITCAST:
3541 case TargetOpcode::G_PTRTOINT:
3542 return selectCOPY(I);
3543 case TargetOpcode::G_CONSTANT:
3544 case TargetOpcode::G_FCONSTANT:
3545 return selectG_CONSTANT(I);
3546 case TargetOpcode::G_FNEG:
3547 if (selectImpl(I, *CoverageInfo))
3548 return true;
3549 return selectG_FNEG(I);
3550 case TargetOpcode::G_FABS:
3551 if (selectImpl(I, *CoverageInfo))
3552 return true;
3553 return selectG_FABS(I);
3554 case TargetOpcode::G_EXTRACT:
3555 return selectG_EXTRACT(I);
3556 case TargetOpcode::G_MERGE_VALUES:
3557 case TargetOpcode::G_CONCAT_VECTORS:
3558 return selectG_MERGE_VALUES(I);
3559 case TargetOpcode::G_UNMERGE_VALUES:
3560 return selectG_UNMERGE_VALUES(I);
3561 case TargetOpcode::G_BUILD_VECTOR:
3562 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3563 return selectG_BUILD_VECTOR(I);
3564 case TargetOpcode::G_PTR_ADD:
3565 if (selectImpl(I, *CoverageInfo))
3566 return true;
3567 return selectG_PTR_ADD(I);
3568 case TargetOpcode::G_IMPLICIT_DEF:
3569 return selectG_IMPLICIT_DEF(I);
3570 case TargetOpcode::G_FREEZE:
3571 return selectCOPY(I);
3572 case TargetOpcode::G_INSERT:
3573 return selectG_INSERT(I);
3574 case TargetOpcode::G_INTRINSIC:
3575 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3576 return selectG_INTRINSIC(I);
3577 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3578 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3579 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3580 case TargetOpcode::G_ICMP:
3581 case TargetOpcode::G_FCMP:
3582 if (selectG_ICMP_or_FCMP(I))
3583 return true;
3584 return selectImpl(I, *CoverageInfo);
3585 case TargetOpcode::G_LOAD:
3586 case TargetOpcode::G_STORE:
3587 case TargetOpcode::G_ATOMIC_CMPXCHG:
3588 case TargetOpcode::G_ATOMICRMW_XCHG:
3589 case TargetOpcode::G_ATOMICRMW_ADD:
3590 case TargetOpcode::G_ATOMICRMW_SUB:
3591 case TargetOpcode::G_ATOMICRMW_AND:
3592 case TargetOpcode::G_ATOMICRMW_OR:
3593 case TargetOpcode::G_ATOMICRMW_XOR:
3594 case TargetOpcode::G_ATOMICRMW_MIN:
3595 case TargetOpcode::G_ATOMICRMW_MAX:
3596 case TargetOpcode::G_ATOMICRMW_UMIN:
3597 case TargetOpcode::G_ATOMICRMW_UMAX:
3598 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3599 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3600 case TargetOpcode::G_ATOMICRMW_FADD:
3601 case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3602 case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3603 return selectG_LOAD_STORE_ATOMICRMW(I);
3604 case TargetOpcode::G_SELECT:
3605 return selectG_SELECT(I);
3606 case TargetOpcode::G_TRUNC:
3607 return selectG_TRUNC(I);
3608 case TargetOpcode::G_SEXT:
3609 case TargetOpcode::G_ZEXT:
3610 case TargetOpcode::G_ANYEXT:
3611 case TargetOpcode::G_SEXT_INREG:
3612 // This is a workaround. For extension from type i1, `selectImpl()` uses
3613 // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
3614 // i1 can only be held in an SGPR class.
3615 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3616 selectImpl(I, *CoverageInfo))
3617 return true;
3618 return selectG_SZA_EXT(I);
3619 case TargetOpcode::G_FPEXT:
3620 if (selectG_FPEXT(I))
3621 return true;
3622 return selectImpl(I, *CoverageInfo);
3623 case TargetOpcode::G_BRCOND:
3624 return selectG_BRCOND(I);
3625 case TargetOpcode::G_GLOBAL_VALUE:
3626 return selectG_GLOBAL_VALUE(I);
3627 case TargetOpcode::G_PTRMASK:
3628 return selectG_PTRMASK(I);
3629 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3630 return selectG_EXTRACT_VECTOR_ELT(I);
3631 case TargetOpcode::G_INSERT_VECTOR_ELT:
3632 return selectG_INSERT_VECTOR_ELT(I);
3633 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3634 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3635 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3636 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3637 const AMDGPU::ImageDimIntrinsicInfo *Intr =
3638 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
3639 assert(Intr && "not an image intrinsic with image pseudo");
3640 return selectImageIntrinsic(I, Intr);
3641 }
3642 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3643 return selectBVHIntrinsic(I);
3644 case AMDGPU::G_SBFX:
3645 case AMDGPU::G_UBFX:
3646 return selectG_SBFX_UBFX(I);
3647 case AMDGPU::G_SI_CALL:
3648 I.setDesc(TII.get(AMDGPU::SI_CALL));
3649 return true;
3650 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3651 return selectWaveAddress(I);
3652 case AMDGPU::G_STACKRESTORE:
3653 return selectStackRestore(I);
3654 default:
3655 return selectImpl(I, *CoverageInfo);
3656 }
3657 return false;
3658}
3659
3660InstructionSelector::ComplexRendererFns
3661AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3662 return {{
3663 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3664 }};
3665
3666}
3667
3668std::pair<Register, unsigned>
3669AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3670 bool IsCanonicalizing,
3671 bool AllowAbs, bool OpSel) const {
3672 Register Src = Root.getReg();
3673 unsigned Mods = 0;
3674 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3675
3676 if (MI->getOpcode() == AMDGPU::G_FNEG) {
3677 Src = MI->getOperand(1).getReg();
3678 Mods |= SISrcMods::NEG;
3679 MI = getDefIgnoringCopies(Src, *MRI);
3680 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3681 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3682 // denormal mode, but we're implicitly canonicalizing in a source operand.
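 // e.g. (G_FSUB 0.0, %x) is selected here as %x with SISrcMods::NEG set,
 // just as an explicit G_FNEG would be.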
3683 const ConstantFP *LHS =
3684 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3685 if (LHS && LHS->isZero()) {
3686 Mods |= SISrcMods::NEG;
3687 Src = MI->getOperand(2).getReg();
3688 }
3689 }
3690
3691 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3692 Src = MI->getOperand(1).getReg();
3693 Mods |= SISrcMods::ABS;
3694 }
3695
3696 if (OpSel)
3697 Mods |= SISrcMods::OP_SEL_0;
3698
3699 return std::pair(Src, Mods);
3700}
3701
3702Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3703 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3704 bool ForceVGPR) const {
3705 if ((Mods != 0 || ForceVGPR) &&
3706 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3707
3708 // If we looked through copies to find source modifiers on an SGPR operand,
3709 // we now have an SGPR register source. To avoid potentially violating the
3710 // constant bus restriction, we need to insert a copy to a VGPR.
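 // e.g. folding an fneg of an SGPR value into a VOP3 instruction that
 // already reads another SGPR could otherwise exceed the single constant-bus
 // read allowed on older subtargets.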
3711 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3712 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3713 TII.get(AMDGPU::COPY), VGPRSrc)
3714 .addReg(Src);
3715 Src = VGPRSrc;
3716 }
3717
3718 return Src;
3719}
3720
3721///
3722/// This will select either an SGPR or VGPR operand and will save us from
3723/// having to write an extra tablegen pattern.
3724InstructionSelector::ComplexRendererFns
3725AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3726 return {{
3727 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3728 }};
3729}
3730
3731InstructionSelector::ComplexRendererFns
3732AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3733 Register Src;
3734 unsigned Mods;
3735 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3736
3737 return {{
3738 [=](MachineInstrBuilder &MIB) {
3739 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3740 },
3741 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3742 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3743 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3744 }};
3745}
3746
3747InstructionSelector::ComplexRendererFns
3748AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3749 Register Src;
3750 unsigned Mods;
3751 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3752 /*IsCanonicalizing=*/true,
3753 /*AllowAbs=*/false);
3754
3755 return {{
3756 [=](MachineInstrBuilder &MIB) {
3757 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3758 },
3759 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3760 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3761 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3762 }};
3763}
3764
3765InstructionSelector::ComplexRendererFns
3766AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3767 return {{
3768 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3769 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3770 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3771 }};
3772}
3773
3774InstructionSelector::ComplexRendererFns
3775AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3776 Register Src;
3777 unsigned Mods;
3778 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3779
3780 return {{
3781 [=](MachineInstrBuilder &MIB) {
3782 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3783 },
3784 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3785 }};
3786}
3787
3788InstructionSelector::ComplexRendererFns
3789AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3790 MachineOperand &Root) const {
3791 Register Src;
3792 unsigned Mods;
3793 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3794
3795 return {{
3796 [=](MachineInstrBuilder &MIB) {
3797 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3798 },
3799 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3800 }};
3801}
3802
3803InstructionSelector::ComplexRendererFns
3804AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3805 Register Src;
3806 unsigned Mods;
3807 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3808 /*AllowAbs=*/false);
3809
3810 return {{
3811 [=](MachineInstrBuilder &MIB) {
3812 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3813 },
3814 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3815 }};
3816}
3817
3818InstructionSelector::ComplexRendererFns
3819AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3820 Register Reg = Root.getReg();
3821 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3822 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3823 return {};
3824 return {{
3825 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3826 }};
3827}
3828
3829std::pair<Register, unsigned>
3830AMDGPUInstructionSelector::selectVOP3PModsImpl(
3831 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3832 unsigned Mods = 0;
3833 MachineInstr *MI = MRI.getVRegDef(Src);
3834
3835 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3836 // It's possible to see an f32 fneg here, but unlikely.
3837 // TODO: Treat f32 fneg as only high bit.
3838 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3839 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3840 Src = MI->getOperand(1).getReg();
3841 MI = MRI.getVRegDef(Src);
3842 }
3843
3844 // TODO: Handle G_FSUB 0 as fneg
3845
3846 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3847 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3848
3849 // Packed instructions do not have abs modifiers.
3850 Mods |= SISrcMods::OP_SEL_1;
3851
3852 return std::pair(Src, Mods);
3853}
3854
3855InstructionSelector::ComplexRendererFns
3856AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3857 MachineRegisterInfo &MRI
3858 = Root.getParent()->getParent()->getParent()->getRegInfo();
3859
3860 Register Src;
3861 unsigned Mods;
3862 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3863
3864 return {{
3865 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3866 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3867 }};
3868}
3869
3870InstructionSelector::ComplexRendererFns
3871AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3872 MachineRegisterInfo &MRI
3873 = Root.getParent()->getParent()->getParent()->getRegInfo();
3874
3875 Register Src;
3876 unsigned Mods;
3877 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3878
3879 return {{
3880 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3881 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3882 }};
3883}
3884
3885InstructionSelector::ComplexRendererFns
3886AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const {
3887 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3888 // Value is in Imm operand as i1 sign extended to int64_t.
3889 // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
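 // e.g. an immediate of -1 (true) toggles SISrcMods::NEG on top of
 // SISrcMods::OP_SEL_1, marking the packed operands as signed.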
3890 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3891 "expected i1 value");
3892 unsigned Mods = SISrcMods::OP_SEL_1;
3893 if (Root.getImm() == -1)
3894 Mods ^= SISrcMods::NEG;
3895 return {{
3896 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3897 }};
3898}
3899
3900InstructionSelector::ComplexRendererFns
3901AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3902 MachineOperand &Root) const {
3903 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3904 "expected i1 value");
3905 unsigned Mods = SISrcMods::OP_SEL_1;
3906 if (Root.getImm() != 0)
3907 Mods |= SISrcMods::OP_SEL_0;
3908
3909 return {{
3910 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3911 }};
3912}
3913
3914InstructionSelector::ComplexRendererFns
3915AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3916 Register Src;
3917 unsigned Mods;
3918 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3919
3920 // FIXME: Handle op_sel
3921 return {{
3922 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3923 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3924 }};
3925}
3926
3927InstructionSelector::ComplexRendererFns
3928AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
3929 Register Src;
3930 unsigned Mods;
3931 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3932 /*IsCanonicalizing=*/true,
3933 /*AllowAbs=*/false,
3934 /*OpSel=*/false);
3935
3936 return {{
3937 [=](MachineInstrBuilder &MIB) {
3938 MIB.addReg(
3939 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
3940 },
3941 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3942 }};
3943}
3944
3945InstructionSelector::ComplexRendererFns
3946AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
3947 Register Src;
3948 unsigned Mods;
3949 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3950 /*IsCanonicalizing=*/true,
3951 /*AllowAbs=*/false,
3952 /*OpSel=*/true);
3953
3954 return {{
3955 [=](MachineInstrBuilder &MIB) {
3956 MIB.addReg(
3957 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
3958 },
3959 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3960 }};
3961}
3962
3963bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
3964 Register &Base,
3965 Register *SOffset,
3966 int64_t *Offset) const {
3967 MachineInstr *MI = Root.getParent();
3968 MachineBasicBlock *MBB = MI->getParent();
3969
3970 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
3971 // then we can select all ptr + 32-bit offsets.
3972 SmallVector<GEPInfo, 4> AddrInfo;
3973 getAddrModeInfo(*MI, *MRI, AddrInfo);
3974
3975 if (AddrInfo.empty())
3976 return false;
3977
3978 const GEPInfo &GEPI = AddrInfo[0];
3979 std::optional<int64_t> EncodedImm =
3980 AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
3981
3982 if (SOffset && Offset) {
3983 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
3984 AddrInfo.size() > 1) {
3985 const GEPInfo &GEPI2 = AddrInfo[1];
3986 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
3987 if (Register OffsetReg =
3988 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
3989 Base = GEPI2.SgprParts[0];
3990 *SOffset = OffsetReg;
3991 *Offset = *EncodedImm;
3992 return true;
3993 }
3994 }
3995 }
3996 return false;
3997 }
3998
3999 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4000 Base = GEPI.SgprParts[0];
4001 *Offset = *EncodedImm;
4002 return true;
4003 }
4004
4005 // SGPR offset is unsigned.
4006 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4007 GEPI.Imm != 0) {
4008 // If we make it this far we have a load with a 32-bit immediate offset.
4009 // It is OK to select this using a sgpr offset, because we have already
4010 // failed trying to select this load into one of the _IMM variants since
4011 // the _IMM Patterns are considered before the _SGPR patterns.
4012 Base = GEPI.SgprParts[0];
4013 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4014 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4015 .addImm(GEPI.Imm);
4016 return true;
4017 }
4018
4019 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4020 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4021 Base = GEPI.SgprParts[0];
4022 *SOffset = OffsetReg;
4023 return true;
4024 }
4025 }
4026
4027 return false;
4028}
4029
4030InstructionSelector::ComplexRendererFns
4031AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4032 Register Base;
4033 int64_t Offset;
4034 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4035 return std::nullopt;
4036
4037 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4038 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4039}
4040
4041InstructionSelector::ComplexRendererFns
4042AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4043 SmallVector<GEPInfo, 4> AddrInfo;
4044 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4045
4046 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4047 return std::nullopt;
4048
4049 const GEPInfo &GEPInfo = AddrInfo[0];
4050 Register PtrReg = GEPInfo.SgprParts[0];
4051 std::optional<int64_t> EncodedImm =
4052 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4053 if (!EncodedImm)
4054 return std::nullopt;
4055
4056 return {{
4057 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4058 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4059 }};
4060}
4061
4062InstructionSelector::ComplexRendererFns
4063AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4064 Register Base, SOffset;
4065 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4066 return std::nullopt;
4067
4068 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4069 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4070}
4071
4072InstructionSelector::ComplexRendererFns
4073AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4074 Register Base, SOffset;
4075 int64_t Offset;
4076 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4077 return std::nullopt;
4078
4079 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4080 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4081 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4082}
4083
4084std::pair<Register, int>
4085AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4086 uint64_t FlatVariant) const {
4087 MachineInstr *MI = Root.getParent();
4088
4089 auto Default = std::pair(Root.getReg(), 0);
4090
4091 if (!STI.hasFlatInstOffsets())
4092 return Default;
4093
4094 Register PtrBase;
4095 int64_t ConstOffset;
4096 std::tie(PtrBase, ConstOffset) =
4097 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4098 if (ConstOffset == 0 || !isFlatScratchBaseLegal(PtrBase, FlatVariant))
4099 return Default;
4100
4101 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4102 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4103 return Default;
4104
4105 return std::pair(PtrBase, ConstOffset);
4106}
4107
4108InstructionSelector::ComplexRendererFns
4109AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4110 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4111
4112 return {{
4113 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4114 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4115 }};
4116}
4117
4118InstructionSelector::ComplexRendererFns
4119AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4120 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4121
4122 return {{
4123 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4124 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4125 }};
4126}
4127
4128InstructionSelector::ComplexRendererFns
4129AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4130 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4131
4132 return {{
4133 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4134 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4135 }};
4136}
4137
4138// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4139InstructionSelector::ComplexRendererFns
4140AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4141 Register Addr = Root.getReg();
4142 Register PtrBase;
4143 int64_t ConstOffset;
4144 int64_t ImmOffset = 0;
4145
4146 // Match the immediate offset first, which canonically is moved as low as
4147 // possible.
4148 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4149
4150 if (ConstOffset != 0) {
4151 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4152 SIInstrFlags::FlatGlobal)) {
4153 Addr = PtrBase;
4154 ImmOffset = ConstOffset;
4155 } else {
4156 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4157 if (isSGPR(PtrBaseDef->Reg)) {
4158 if (ConstOffset > 0) {
4159 // Offset is too large.
4160 //
4161 // saddr + large_offset -> saddr +
4162 // (voffset = large_offset & ~MaxOffset) +
4163 // (large_offset & MaxOffset);
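 // For example, if the legal immediate field were 12 bits wide
 // (MaxOffset = 0xfff), a constant offset of 0x12345 would be split into a
 // VGPR part of 0x12000 and an immediate part of 0x345.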
4164 int64_t SplitImmOffset, RemainderOffset;
4165 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4166 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4167
4168 if (isUInt<32>(RemainderOffset)) {
4169 MachineInstr *MI = Root.getParent();
4170 MachineBasicBlock *MBB = MI->getParent();
4171 Register HighBits =
4172 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4173
4174 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4175 HighBits)
4176 .addImm(RemainderOffset);
4177
4178 return {{
4179 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4180 [=](MachineInstrBuilder &MIB) {
4181 MIB.addReg(HighBits);
4182 }, // voffset
4183 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4184 }};
4185 }
4186 }
4187
4188 // We are adding a 64 bit SGPR and a constant. If constant bus limit
4189 // is 1 we would need to perform 1 or 2 extra moves for each half of
4190 // the constant and it is better to do a scalar add and then issue a
4191 // single VALU instruction to materialize zero. Otherwise it is less
4192 // instructions to perform VALU adds with immediates or inline literals.
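 // e.g. if both halves of the split constant are inline literals
 // (NumLiterals == 0) and V_ADD_U32_e64 can read more than that from the
 // constant bus, the fold is rejected below and the adds stay on the VALU.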
4193 unsigned NumLiterals =
4194 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4195 !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4196 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4197 return std::nullopt;
4198 }
4199 }
4200 }
4201
4202 // Match the variable offset.
4203 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4204 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4205 // Look through the SGPR->VGPR copy.
4206 Register SAddr =
4207 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4208
4209 if (isSGPR(SAddr)) {
4210 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4211
4212 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4213 // inserted later.
4214 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4215 return {{[=](MachineInstrBuilder &MIB) { // saddr
4216 MIB.addReg(SAddr);
4217 },
4218 [=](MachineInstrBuilder &MIB) { // voffset
4219 MIB.addReg(VOffset);
4220 },
4221 [=](MachineInstrBuilder &MIB) { // offset
4222 MIB.addImm(ImmOffset);
4223 }}};
4224 }
4225 }
4226 }
4227
4228 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4229 // drop this.
4230 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4231 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4232 return std::nullopt;
4233
4234 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4235 // moves required to copy a 64-bit SGPR to VGPR.
4236 MachineInstr *MI = Root.getParent();
4237 MachineBasicBlock *MBB = MI->getParent();
4238 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4239
4240 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4241 .addImm(0);
4242
4243 return {{
4244 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4245 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4246 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4247 }};
4248}
4249
4250InstructionSelector::ComplexRendererFns
4251AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4252 Register Addr = Root.getReg();
4253 Register PtrBase;
4254 int64_t ConstOffset;
4255 int64_t ImmOffset = 0;
4256
4257 // Match the immediate offset first, which canonically is moved as low as
4258 // possible.
4259 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4260
4261 if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) &&
4262 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4263 SIInstrFlags::FlatScratch)) {
4264 Addr = PtrBase;
4265 ImmOffset = ConstOffset;
4266 }
4267
4268 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4269 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4270 int FI = AddrDef->MI->getOperand(1).getIndex();
4271 return {{
4272 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4273 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4274 }};
4275 }
4276
4277 Register SAddr = AddrDef->Reg;
4278
4279 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4280 Register LHS = AddrDef->MI->getOperand(1).getReg();
4281 Register RHS = AddrDef->MI->getOperand(2).getReg();
4282 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4283 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4284
4285 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4286 isSGPR(RHSDef->Reg)) {
4287 int FI = LHSDef->MI->getOperand(1).getIndex();
4288 MachineInstr &I = *Root.getParent();
4289 MachineBasicBlock *BB = I.getParent();
4290 const DebugLoc &DL = I.getDebugLoc();
4291 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4292
4293 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4294 .addFrameIndex(FI)
4295 .addReg(RHSDef->Reg)
4296 .setOperandDead(3); // Dead scc
4297 }
4298 }
4299
4300 if (!isSGPR(SAddr))
4301 return std::nullopt;
4302
4303 return {{
4304 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4305 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4306 }};
4307}
4308
4309// Check whether the flat scratch SVS swizzle bug affects this access.
4310bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4311 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4312 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4313 return false;
4314
4315 // The bug affects the swizzling of SVS accesses if there is any carry out
4316 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4317 // voffset to (soffset + inst_offset).
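 // e.g. if the known bits allow VAddr's low two bits to be 3 and
 // (SAddr + ImmOffset)'s low two bits to be 2, then 3 + 2 = 5 carries out of
 // bit 1, so the conservative check below treats the access as affected.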
4318 auto VKnown = KB->getKnownBits(VAddr);
4319 auto SKnown = KnownBits::computeForAddSub(
4320 true, false, KB->getKnownBits(SAddr),
4321 KnownBits::makeConstant(APInt(32, ImmOffset)));
4322 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4323 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4324 return (VMax & 3) + (SMax & 3) >= 4;
4325}
4326
4327InstructionSelector::ComplexRendererFns
4328AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4329 Register Addr = Root.getReg();
4330 Register PtrBase;
4331 int64_t ConstOffset;
4332 int64_t ImmOffset = 0;
4333
4334 // Match the immediate offset first, which canonically is moved as low as
4335 // possible.
4336 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4337
4338 if (ConstOffset != 0 &&
4339 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4340 Addr = PtrBase;
4341 ImmOffset = ConstOffset;
4342 }
4343
4344 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4345 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4346 return std::nullopt;
4347
4348 Register RHS = AddrDef->MI->getOperand(2).getReg();
4349 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4350 return std::nullopt;
4351
4352 Register LHS = AddrDef->MI->getOperand(1).getReg();
4353 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4354
4355 if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS))
4356 return std::nullopt;
4357
4358 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4359 return std::nullopt;
4360
4361 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4362 int FI = LHSDef->MI->getOperand(1).getIndex();
4363 return {{
4364 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4365 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4366 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4367 }};
4368 }
4369
4370 if (!isSGPR(LHS))
4371 return std::nullopt;
4372
4373 return {{
4374 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4375 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4376 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4377 }};
4378}
4379
4380InstructionSelector::ComplexRendererFns
4381AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4382 MachineInstr *MI = Root.getParent();
4383 MachineBasicBlock *MBB = MI->getParent();
4384 MachineFunction *MF = MBB->getParent();
4385 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4386
4387 int64_t Offset = 0;
4388 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4389 !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
4390 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4391
4392 // TODO: Should this be inside the render function? The iterator seems to
4393 // move.
4394 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset();
4395 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4396 HighBits)
4397 .addImm(Offset & ~MaxOffset);
4398
4399 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4400 MIB.addReg(Info->getScratchRSrcReg());
4401 },
4402 [=](MachineInstrBuilder &MIB) { // vaddr
4403 MIB.addReg(HighBits);
4404 },
4405 [=](MachineInstrBuilder &MIB) { // soffset
4406 // Use constant zero for soffset and rely on eliminateFrameIndex
4407 // to choose the appropriate frame register if need be.
4408 MIB.addImm(0);
4409 },
4410 [=](MachineInstrBuilder &MIB) { // offset
4411 MIB.addImm(Offset & MaxOffset);
4412 }}};
4413 }
4414
4415 assert(Offset == 0 || Offset == -1);
4416
4417 // Try to fold a frame index directly into the MUBUF vaddr field, and any
4418 // offsets.
4419 std::optional<int> FI;
4420 Register VAddr = Root.getReg();
4421 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4422 Register PtrBase;
4423 int64_t ConstOffset;
4424 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4425 if (ConstOffset != 0) {
4426 if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
4427 (!STI.privateMemoryResourceIsRangeChecked() ||
4428 KB->signBitIsZero(PtrBase))) {
4429 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4430 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4431 FI = PtrBaseDef->getOperand(1).getIndex();
4432 else
4433 VAddr = PtrBase;
4434 Offset = ConstOffset;
4435 }
4436 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4437 FI = RootDef->getOperand(1).getIndex();
4438 }
4439 }
4440
4441 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4442 MIB.addReg(Info->getScratchRSrcReg());
4443 },
4444 [=](MachineInstrBuilder &MIB) { // vaddr
4445 if (FI)
4446 MIB.addFrameIndex(*FI);
4447 else
4448 MIB.addReg(VAddr);
4449 },
4450 [=](MachineInstrBuilder &MIB) { // soffset
4451 // Use constant zero for soffset and rely on eliminateFrameIndex
4452 // to choose the appropriate frame register if need be.
4453 MIB.addImm(0);
4454 },
4455 [=](MachineInstrBuilder &MIB) { // offset
4456 MIB.addImm(Offset);
4457 }}};
4458}
4459
4460bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4461 int64_t Offset) const {
4462 if (!isUInt<16>(Offset))
4463 return false;
4464
4465 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4466 return true;
4467
4468 // On Southern Islands, instructions with a negative base value and an offset
4469 // don't seem to work.
4470 return KB->signBitIsZero(Base);
4471}
4472
4473bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4474 int64_t Offset1,
4475 unsigned Size) const {
4476 if (Offset0 % Size != 0 || Offset1 % Size != 0)
4477 return false;
4478 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4479 return false;
4480
4481 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4482 return true;
4483
4484 // On Southern Islands, instructions with a negative base value and an offset
4485 // don't seem to work.
4486 return KB->signBitIsZero(Base);
4487}
4488
4489bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
4490 Register Base, uint64_t FlatVariant) const {
4491 if (FlatVariant != SIInstrFlags::FlatScratch)
4492 return true;
4493
4494 // When the value in the 32-bit Base can be negative, calculate the scratch
4495 // offset using a 32-bit add instruction; otherwise use Base (unsigned) + offset.
4496 return KB->signBitIsZero(Base);
4497}
4498
4499bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4500 unsigned ShAmtBits) const {
4501 assert(MI.getOpcode() == TargetOpcode::G_AND);
4502
4503 std::optional<APInt> RHS =
4504 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4505 if (!RHS)
4506 return false;
4507
4508 if (RHS->countr_one() >= ShAmtBits)
4509 return true;
4510
4511 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
4512 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
4513}
4514
4515InstructionSelector::ComplexRendererFns
4516AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4517 MachineOperand &Root) const {
4518 Register Reg = Root.getReg();
4519 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4520
4521 std::optional<DefinitionAndSourceRegister> Def =
4522 getDefSrcRegIgnoringCopies(Reg, *MRI);
4523 assert(Def && "this shouldn't be an optional result");
4524 Reg = Def->Reg;
4525
4526 if (Register WaveBase = getWaveAddress(Def->MI)) {
4527 return {{
4528 [=](MachineInstrBuilder &MIB) { // rsrc
4529 MIB.addReg(Info->getScratchRSrcReg());
4530 },
4531 [=](MachineInstrBuilder &MIB) { // soffset
4532 MIB.addReg(WaveBase);
4533 },
4534 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4535 }};
4536 }
4537
4538 int64_t Offset = 0;
4539
4540 // FIXME: Copy check is a hack
4541 Register BasePtr;
4542 if (mi_match(Reg, *MRI,
4543 m_GPtrAdd(m_Reg(BasePtr),
4544 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
4545 if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
4546 return {};
4547 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
4548 Register WaveBase = getWaveAddress(BasePtrDef);
4549 if (!WaveBase)
4550 return {};
4551
4552 return {{
4553 [=](MachineInstrBuilder &MIB) { // rsrc
4554 MIB.addReg(Info->getScratchRSrcReg());
4555 },
4556 [=](MachineInstrBuilder &MIB) { // soffset
4557 MIB.addReg(WaveBase);
4558 },
4559 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4560 }};
4561 }
4562
4563 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
4564 !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
4565 return {};
4566
4567 return {{
4568 [=](MachineInstrBuilder &MIB) { // rsrc
4569 MIB.addReg(Info->getScratchRSrcReg());
4570 },
4571 [=](MachineInstrBuilder &MIB) { // soffset
4572 MIB.addImm(0);
4573 },
4574 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4575 }};
4576}
4577
4578std::pair<Register, unsigned>
4579AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4580 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4581 if (!RootDef)
4582 return std::pair(Root.getReg(), 0);
4583
4584 int64_t ConstAddr = 0;
4585
4586 Register PtrBase;
4587 int64_t Offset;
4588 std::tie(PtrBase, Offset) =
4589 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4590
4591 if (Offset) {
4592 if (isDSOffsetLegal(PtrBase, Offset)) {
4593 // (add n0, c0)
4594 return std::pair(PtrBase, Offset);
4595 }
4596 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4597 // TODO
4598
4599
4600 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4601 // TODO
4602
4603 }
4604
4605 return std::pair(Root.getReg(), 0);
4606}
4607
4608InstructionSelector::ComplexRendererFns
4609AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4610 Register Reg;
4611 unsigned Offset;
4612 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
4613 return {{
4614 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4615 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
4616 }};
4617}
4618
4619InstructionSelector::ComplexRendererFns
4620AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4621 return selectDSReadWrite2(Root, 4);
4622}
4623
4624InstructionSelector::ComplexRendererFns
4625AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4626 return selectDSReadWrite2(Root, 8);
4627}
4628
4629InstructionSelector::ComplexRendererFns
4630AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4631 unsigned Size) const {
4632 Register Reg;
4633 unsigned Offset;
4634 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
4635 return {{
4636 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4637 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4638 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4639 }};
4640}
4641
4642std::pair<Register, unsigned>
4643AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4644 unsigned Size) const {
4645 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4646 if (!RootDef)
4647 return std::pair(Root.getReg(), 0);
4648
4649 int64_t ConstAddr = 0;
4650
4651 Register PtrBase;
4652 int64_t Offset;
4653 std::tie(PtrBase, Offset) =
4654 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4655
4656 if (Offset) {
4657 int64_t OffsetValue0 = Offset;
4658 int64_t OffsetValue1 = Offset + Size;
4659 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
4660 // (add n0, c0)
4661 return std::pair(PtrBase, OffsetValue0 / Size);
4662 }