LLVM 23.0.0git
AMDGPUInstructionSelector.cpp
Go to the documentation of this file.
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
49#include "AMDGPUGenGlobalISel.inc"
52#include "AMDGPUGenGlobalISel.inc"
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
78 const Function &F = I.getMF()->getFunction();
79 F.getContext().diagnose(DiagnosticInfoUnsupported(
80 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error));
81}
82
83bool AMDGPUInstructionSelector::isVCC(Register Reg,
84 const MachineRegisterInfo &MRI) const {
85 // The verifier is oblivious to s1 being a valid value for wavesize registers.
86 if (Reg.isPhysical())
87 return false;
88
89 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
90 const TargetRegisterClass *RC =
92 if (RC) {
93 const LLT Ty = MRI.getType(Reg);
94 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
95 return false;
96 // G_TRUNC s1 result is never vcc.
97 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
98 RC->hasSuperClassEq(TRI.getBoolRC());
99 }
100
101 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
102 return RB->getID() == AMDGPU::VCCRegBankID;
103}
104
105bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
106 unsigned NewOpc) const {
107 MI.setDesc(TII.get(NewOpc));
108 MI.removeOperand(1); // Remove intrinsic ID.
109 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
110
111 MachineOperand &Dst = MI.getOperand(0);
112 MachineOperand &Src = MI.getOperand(1);
113
114 // TODO: This should be legalized to s32 if needed
115 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
116 return false;
117
118 const TargetRegisterClass *DstRC
119 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
120 const TargetRegisterClass *SrcRC
121 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
122 if (!DstRC || DstRC != SrcRC)
123 return false;
124
125 if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
126 !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
127 return false;
128 const MCInstrDesc &MCID = MI.getDesc();
129 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
130 MI.getOperand(0).setIsEarlyClobber(true);
131 }
132 return true;
133}
134
/// Selects a copy by turning \p I into a target COPY or expanding it.
///
/// When the destination is a VCC boolean (per isVCC):
///  - an SCC source only constrains the destination;
///  - a non-VCC source is expanded and \p I is erased: a constant source
///    becomes an S_MOV of -1 or 0, any other source is ANDed with 1 and then
///    compared not-equal to 0 into the destination;
///  - a VCC source only constrains the destination.
/// For every other destination, each virtual register operand is constrained
/// to a register class when one can be determined.
135bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
136 const DebugLoc &DL = I.getDebugLoc();
137 MachineBasicBlock *BB = I.getParent();
138 I.setDesc(TII.get(TargetOpcode::COPY));
139
140 const MachineOperand &Src = I.getOperand(1);
141 MachineOperand &Dst = I.getOperand(0);
142 Register DstReg = Dst.getReg();
143 Register SrcReg = Src.getReg();
144
145 if (isVCC(DstReg, *MRI)) {
146 if (SrcReg == AMDGPU::SCC) {
147 const TargetRegisterClass *RC
148 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
149 if (!RC)
150 return true;
151 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
152 }
153
154 if (!isVCC(SrcReg, *MRI)) {
155 // TODO: Should probably leave the copy and let copyPhysReg expand it.
156 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
157 return false;
158
159 const TargetRegisterClass *SrcRC
160 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
161
162 std::optional<ValueAndVReg> ConstVal =
163 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
164 if (ConstVal) {
// Constant source: write the mask directly, -1 for true and 0 for false,
// using the 64-bit move on wave64 and the 32-bit move otherwise.
165 unsigned MovOpc =
166 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
167 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
168 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
169 } else {
170 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
171
172 // We can't trust the high bits at this point, so clear them.
173
174 // TODO: Skip masking high bits if def is known boolean.
175
176 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
// 16-bit source class: use the t16 AND and compare forms.
177 assert(Subtarget->useRealTrue16Insts());
178 const int64_t NoMods = 0;
179 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
180 .addImm(NoMods)
181 .addImm(1)
182 .addImm(NoMods)
183 .addReg(SrcReg)
184 .addImm(NoMods);
185 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
186 .addImm(NoMods)
187 .addImm(0)
188 .addImm(NoMods)
189 .addReg(MaskedReg)
190 .addImm(NoMods);
191 } else {
// Otherwise mask with a scalar or vector 32-bit AND depending on the
// source class, then compare the masked value against 0.
192 bool IsSGPR = TRI.isSGPRClass(SrcRC);
193 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
194 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
195 .addImm(1)
196 .addReg(SrcReg);
197 if (IsSGPR)
198 And.setOperandDead(3); // Dead scc
199
200 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
201 .addImm(0)
202 .addReg(MaskedReg);
203 }
204 }
205
// Give the source a register class if it does not have one yet; the
// original copy is replaced by the instructions built above.
206 if (!MRI->getRegClassOrNull(SrcReg))
207 MRI->setRegClass(SrcReg, SrcRC);
208 I.eraseFromParent();
209 return true;
210 }
211
// VCC to VCC: keep the COPY and only constrain the destination.
212 const TargetRegisterClass *RC =
213 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
214 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
215 return false;
216
217 return true;
218 }
219
// Non-VCC destination: constrain each virtual register operand. The result
// of constrainGenericRegister is not checked here.
220 for (const MachineOperand &MO : I.operands()) {
221 if (MO.getReg().isPhysical())
222 continue;
223
224 const TargetRegisterClass *RC =
225 TRI.getConstrainedRegClassForOperand(MO, *MRI);
226 if (!RC)
227 continue;
228 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
229 }
230 return true;
231}
232
233bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
234 const DebugLoc &DL = I.getDebugLoc();
235 MachineBasicBlock *BB = I.getParent();
236 Register VCCReg = I.getOperand(1).getReg();
237 MachineInstr *Cmp;
238
239 // Set SCC as a side effect with S_CMP or S_OR.
240 if (STI.hasScalarCompareEq64()) {
241 unsigned CmpOpc =
242 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
243 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
244 } else {
245 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
246 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
247 .addReg(VCCReg)
248 .addReg(VCCReg);
249 }
250
251 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
252
253 Register DstReg = I.getOperand(0).getReg();
254 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
255
256 I.eraseFromParent();
257 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
258}
259
260bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
261 const DebugLoc &DL = I.getDebugLoc();
262 MachineBasicBlock *BB = I.getParent();
263
264 Register DstReg = I.getOperand(0).getReg();
265 Register SrcReg = I.getOperand(1).getReg();
266 std::optional<ValueAndVReg> Arg =
267 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
268
269 if (Arg) {
270 const int64_t Value = Arg->Value.getZExtValue();
271 if (Value == 0) {
272 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
273 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
274 } else {
275 assert(Value == 1);
276 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
277 }
278 I.eraseFromParent();
279 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
280 }
281
282 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
283 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
284
285 unsigned SelectOpcode =
286 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
287 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
288 .addReg(TRI.getExec())
289 .addImm(0);
290
291 I.eraseFromParent();
293 return true;
294}
295
296bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
297 Register DstReg = I.getOperand(0).getReg();
298 Register SrcReg = I.getOperand(1).getReg();
299
300 const DebugLoc &DL = I.getDebugLoc();
301 MachineBasicBlock *BB = I.getParent();
302
303 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
304 .addReg(SrcReg);
305
306 I.eraseFromParent();
307 constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
308 return true;
309}
310
311bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
312 const Register DefReg = I.getOperand(0).getReg();
313 const LLT DefTy = MRI->getType(DefReg);
314
315 // S1 G_PHIs should not be selected in instruction-select, instead:
316 // - divergent S1 G_PHI should go through lane mask merging algorithm
317 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
318 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
319 if (DefTy == LLT::scalar(1))
320 return false;
321
322 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
323
324 const RegClassOrRegBank &RegClassOrBank =
325 MRI->getRegClassOrRegBank(DefReg);
326
327 const TargetRegisterClass *DefRC =
329 if (!DefRC) {
330 if (!DefTy.isValid()) {
331 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
332 return false;
333 }
334
335 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
336 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
337 if (!DefRC) {
338 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
339 return false;
340 }
341 }
342
343 // If inputs have register bank, assign corresponding reg class.
344 // Note: registers don't need to have the same reg bank.
345 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
346 const Register SrcReg = I.getOperand(i).getReg();
347
348 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
349 if (RB) {
350 const LLT SrcTy = MRI->getType(SrcReg);
351 const TargetRegisterClass *SrcRC =
352 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
353 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
354 return false;
355 }
356 }
357
358 I.setDesc(TII.get(TargetOpcode::PHI));
359 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
360}
361
363AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
364 const TargetRegisterClass &SubRC,
365 unsigned SubIdx) const {
366
367 MachineInstr *MI = MO.getParent();
368 MachineBasicBlock *BB = MO.getParent()->getParent();
369 Register DstReg = MRI->createVirtualRegister(&SubRC);
370
371 if (MO.isReg()) {
372 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
373 Register Reg = MO.getReg();
374 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
375 .addReg(Reg, {}, ComposedSubIdx);
376
377 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
378 MO.isKill(), MO.isDead(), MO.isUndef(),
379 MO.isEarlyClobber(), 0, MO.isDebug(),
380 MO.isInternalRead());
381 }
382
383 assert(MO.isImm());
384
385 APInt Imm(64, MO.getImm());
386
387 switch (SubIdx) {
388 default:
389 llvm_unreachable("do not know to split immediate with this sub index.");
390 case AMDGPU::sub0:
391 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
392 case AMDGPU::sub1:
393 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
394 }
395}
396
397static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
398 switch (Opc) {
399 case AMDGPU::G_AND:
400 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
401 case AMDGPU::G_OR:
402 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
403 case AMDGPU::G_XOR:
404 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
405 default:
406 llvm_unreachable("not a bit op");
407 }
408}
409
410bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
411 Register DstReg = I.getOperand(0).getReg();
412 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
413
414 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
415 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
416 DstRB->getID() != AMDGPU::VCCRegBankID)
417 return false;
418
419 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
420 STI.isWave64());
421 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
422
423 // Dead implicit-def of scc
424 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
425 true, // isImp
426 false, // isKill
427 true)); // isDead
428 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
429 return true;
430}
431
/// Selects scalar G_ADD / G_SUB. Vector types are rejected.
///
/// 32-bit results:
///  - SGPR bank: S_ADD_U32 / S_SUB_U32 with the SCC def marked dead;
///  - otherwise, with no-carry add support: the instruction is rewritten in
///    place to V_ADD_U32_e64 / V_SUB_U32_e64;
///  - otherwise: V_ADD_CO_U32_e64 / V_SUB_CO_U32_e64 with a dead carry-out.
/// Any other size falls through to an add that is split into sub0/sub1 halves
/// (add, then add-with-carry) and reassembled with REG_SEQUENCE; subtraction
/// is asserted not to reach that path.
432bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
433 MachineBasicBlock *BB = I.getParent();
434 MachineFunction *MF = BB->getParent();
435 Register DstReg = I.getOperand(0).getReg();
436 const DebugLoc &DL = I.getDebugLoc();
437 LLT Ty = MRI->getType(DstReg);
438 if (Ty.isVector())
439 return false;
440
441 unsigned Size = Ty.getSizeInBits();
442 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
443 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
444 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
445
446 if (Size == 32) {
447 if (IsSALU) {
448 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
449 MachineInstr *Add =
450 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
451 .add(I.getOperand(1))
452 .add(I.getOperand(2))
453 .setOperandDead(3); // Dead scc
454 I.eraseFromParent();
455 constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
456 return true;
457 }
458
459 if (STI.hasAddNoCarryInsts()) {
// Rewrite in place: append an immediate 0 operand and an implicit use of
// EXEC to the existing operands.
460 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
461 I.setDesc(TII.get(Opc));
462 I.addOperand(*MF, MachineOperand::CreateImm(0));
463 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
464 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
465 return true;
466 }
467
468 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
469
// The carry-out is required by the opcode but unused, so it is defined dead.
470 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
471 MachineInstr *Add
472 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
473 .addDef(UnusedCarry, RegState::Dead)
474 .add(I.getOperand(1))
475 .add(I.getOperand(2))
476 .addImm(0);
477 I.eraseFromParent();
478 constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
479 return true;
480 }
481
482 assert(!Sub && "illegal sub should not reach here");
483
484 const TargetRegisterClass &RC
485 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
486 const TargetRegisterClass &HalfRC
487 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
488
// Split both sources into low (sub0) and high (sub1) halves.
489 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
490 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
491 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
492 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
493
494 Register DstLo = MRI->createVirtualRegister(&HalfRC);
495 Register DstHi = MRI->createVirtualRegister(&HalfRC);
496
497 if (IsSALU) {
// Scalar path: the low add is not marked dead-SCC because S_ADDC_U32
// follows it; only the S_ADDC_U32's SCC def is marked dead.
498 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
499 .add(Lo1)
500 .add(Lo2);
501 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
502 .add(Hi1)
503 .add(Hi2)
504 .setOperandDead(3); // Dead scc
505 } else {
// Vector path: the carry is passed through an explicit wave-mask register,
// and the high add's own carry-out is defined dead.
506 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
507 Register CarryReg = MRI->createVirtualRegister(CarryRC);
508 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
509 .addDef(CarryReg)
510 .add(Lo1)
511 .add(Lo2)
512 .addImm(0);
513 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
514 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
515 .add(Hi1)
516 .add(Hi2)
517 .addReg(CarryReg, RegState::Kill)
518 .addImm(0);
519
520 constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI);
521 }
522
// Reassemble the full result from the two halves.
523 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
524 .addReg(DstLo)
525 .addImm(AMDGPU::sub0)
526 .addReg(DstHi)
527 .addImm(AMDGPU::sub1);
528
529
530 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
531 return false;
532
533 I.eraseFromParent();
534 return true;
535}
536
/// Selects G_UADDO / G_USUBO / G_UADDE / G_USUBE.
///
/// Operand 0 is the arithmetic result and operand 1 the carry-out; operands 2
/// and 3 are the sources and, for the E variants, operand 4 is the carry-in.
/// If the carry-out is a VCC boolean the instruction is rewritten in place to
/// the vector carry opcode. Otherwise the scalar opcode is emitted, with the
/// carry passed through SCC via COPY instructions on either side.
537bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
538 MachineInstr &I) const {
539 MachineBasicBlock *BB = I.getParent();
540 MachineFunction *MF = BB->getParent();
541 const DebugLoc &DL = I.getDebugLoc();
542 Register Dst0Reg = I.getOperand(0).getReg();
543 Register Dst1Reg = I.getOperand(1).getReg();
544 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
545 I.getOpcode() == AMDGPU::G_UADDE;
546 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
547 I.getOpcode() == AMDGPU::G_USUBE;
548
549 if (isVCC(Dst1Reg, *MRI)) {
// Vector form: keep the existing operands and append an implicit use of
// EXEC and an immediate 0.
550 unsigned NoCarryOpc =
551 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
552 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
553 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
554 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
555 I.addOperand(*MF, MachineOperand::CreateImm(0));
556 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
557 return true;
558 }
559
560 Register Src0Reg = I.getOperand(2).getReg();
561 Register Src1Reg = I.getOperand(3).getReg();
562
563 if (HasCarryIn) {
// Scalar form: copy the carry-in into SCC ahead of the carry-using opcode.
564 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
565 .addReg(I.getOperand(4).getReg());
566 }
567
568 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
569 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
570
571 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
572 .add(I.getOperand(2))
573 .add(I.getOperand(3));
574
// If nothing (other than debug uses) reads the carry-out, mark the SCC def
// dead; otherwise copy SCC into the carry-out register.
575 if (MRI->use_nodbg_empty(Dst1Reg)) {
576 CarryInst.setOperandDead(3); // Dead scc
577 } else {
578 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
579 .addReg(AMDGPU::SCC);
580 if (!MRI->getRegClassOrNull(Dst1Reg))
581 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
582 }
583
584 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
585 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
586 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
587 return false;
588
589 if (HasCarryIn &&
590 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
591 AMDGPU::SReg_32RegClass, *MRI))
592 return false;
593
594 I.eraseFromParent();
595 return true;
596}
597
598bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
599 MachineInstr &I) const {
600 MachineBasicBlock *BB = I.getParent();
601 MachineFunction *MF = BB->getParent();
602 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
603 bool UseNoCarry = Subtarget->hasMadNC64_32Insts() &&
604 MRI->use_nodbg_empty(I.getOperand(1).getReg());
605
606 unsigned Opc;
607 if (Subtarget->hasMADIntraFwdBug())
608 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
609 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
610 else if (UseNoCarry)
611 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
612 : AMDGPU::V_MAD_NC_I64_I32_e64;
613 else
614 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
615
616 if (UseNoCarry)
617 I.removeOperand(1);
618
619 I.setDesc(TII.get(Opc));
620 I.addOperand(*MF, MachineOperand::CreateImm(0));
621 I.addImplicitDefUseOperands(*MF);
622 I.getOperand(0).setIsEarlyClobber(true);
623 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
624 return true;
625}
626
627// TODO: We should probably legalize these to only using 32-bit results.
628bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
629 MachineBasicBlock *BB = I.getParent();
630 Register DstReg = I.getOperand(0).getReg();
631 Register SrcReg = I.getOperand(1).getReg();
632 LLT DstTy = MRI->getType(DstReg);
633 LLT SrcTy = MRI->getType(SrcReg);
634 const unsigned SrcSize = SrcTy.getSizeInBits();
635 unsigned DstSize = DstTy.getSizeInBits();
636
637 // TODO: Should handle any multiple of 32 offset.
638 unsigned Offset = I.getOperand(2).getImm();
639 if (Offset % 32 != 0 || DstSize > 128)
640 return false;
641
642 // 16-bit operations really use 32-bit registers.
643 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
644 if (DstSize == 16)
645 DstSize = 32;
646
647 const TargetRegisterClass *DstRC =
648 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
649 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
650 return false;
651
652 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
653 const TargetRegisterClass *SrcRC =
654 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
655 if (!SrcRC)
656 return false;
657 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
658 DstSize / 32);
659 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
660 if (!SrcRC)
661 return false;
662
663 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
664 *SrcRC, I.getOperand(1));
665 const DebugLoc &DL = I.getDebugLoc();
666 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
667 .addReg(SrcReg, {}, SubReg);
668
669 I.eraseFromParent();
670 return true;
671}
672
/// Selects a two-source merge of 16-bit pieces into one 32-bit register.
///
/// Operand 0 is the result; operand 1 supplies the low half and operand 2 the
/// high half.
///  - VGPR result with both sources being s16 on the VGPR bank: REG_SEQUENCE
///    using the lo16/hi16 subregister indices.
///  - Any other VGPR result: mask operand 1 to 16 bits, then V_LSHL_OR_B32
///    with operand 2 shifted left by 16.
///  - Otherwise (scalar): one of the S_PACK_*_B32_B16 opcodes, folding
///    single-use logical shifts right by 16 of the sources into the H variants
///    (or into a plain S_LSHR_B32 when only operand 1 is shifted and operand 2
///    is constant zero).
673bool AMDGPUInstructionSelector::selectS16MergeToS32(MachineInstr &MI) const {
674 Register Dst = MI.getOperand(0).getReg();
675 Register Src0 = MI.getOperand(1).getReg();
676 Register Src1 = MI.getOperand(2).getReg();
677
678 LLT Src0Ty = MRI->getType(Src0);
679 LLT Src1Ty = MRI->getType(Src1);
680
681 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
682 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, TRI);
683 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, TRI);
684 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
685
686 Register ShiftSrc0;
687 Register ShiftSrc1;
688
689 const DebugLoc &DL = MI.getDebugLoc();
690 MachineBasicBlock *BB = MI.getParent();
691
692 // VGPR case
693 if (IsVector) {
694 // If source are both VGPR16, use REG_SEQUENCE with lo16/hi16 subregisters
695 if (Src0Bank->getID() == AMDGPU::VGPRRegBankID &&
696 Src1Bank->getID() == AMDGPU::VGPRRegBankID &&
697 Src0Ty == LLT::scalar(16) && Src1Ty == LLT::scalar(16)) {
698 BuildMI(*BB, MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), Dst)
699 .addReg(Src0)
700 .addImm(AMDGPU::lo16)
701 .addReg(Src1)
702 .addImm(AMDGPU::hi16);
703
704 if (!RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI))
705 return false;
706
707 MI.eraseFromParent();
708 return true;
709 }
710
711 // Otherwise, use V_LSHL_OR_B32_e64
// First keep only the low 16 bits of Src0 (AND with 0xFFFF) in a temporary.
712 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
713 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
714 .addImm(0xFFFF)
715 .addReg(Src0);
716 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
717
718 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
719 .addReg(Src1)
720 .addImm(16)
721 .addReg(TmpReg);
722 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
723
724 MI.eraseFromParent();
725 return true;
726 }
727
728 // SGPR case -> S_PACK_*_B32_B16
729 // With multiple uses of the shift, this will duplicate the shift and
730 // increase register pressure.
731 //
732 // (merge (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
733 // => (S_PACK_HH_B32_B16 $src0, $src1)
734 // (merge (lshr_oneuse SReg_32:$src0, 16), $src1)
735 // => (S_PACK_HL_B32_B16 $src0, $src1)
736 // (merge $src0, (lshr_oneuse SReg_32:$src1, 16))
737 // => (S_PACK_LH_B32_B16 $src0, $src1)
738 // (merge $src0, $src1)
739 // => (S_PACK_LL_B32_B16 $src0, $src1)
740
741 bool Shift0 = mi_match(
742 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
743
744 bool Shift1 = mi_match(
745 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
746
// Default to LL; the branches below rewrite MI's source operands in place to
// the pre-shift registers when a shift is folded into the pack opcode.
747 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
748 if (Shift0 && Shift1) {
749 Opc = AMDGPU::S_PACK_HH_B32_B16;
750 MI.getOperand(1).setReg(ShiftSrc0);
751 MI.getOperand(2).setReg(ShiftSrc1);
752 } else if (Shift1) {
753 Opc = AMDGPU::S_PACK_LH_B32_B16;
754 MI.getOperand(2).setReg(ShiftSrc1);
755 } else if (Shift0) {
756 auto ConstSrc1 =
757 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
758 if (ConstSrc1 && ConstSrc1->Value == 0) {
759 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
760 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
761 .addReg(ShiftSrc0)
762 .addImm(16)
763 .setOperandDead(3); // Dead scc
764
765 MI.eraseFromParent();
766 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
767 return true;
768 }
// HL is used only when the subtarget reports it; otherwise the LL opcode is
// kept with the original (already shifted) operand 1.
769 if (STI.hasSPackHL()) {
770 Opc = AMDGPU::S_PACK_HL_B32_B16;
771 MI.getOperand(1).setReg(ShiftSrc0);
772 }
773 }
774
775 MI.setDesc(TII.get(Opc));
776 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
777 return true;
778}
779
780bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
781 MachineBasicBlock *BB = MI.getParent();
782 Register DstReg = MI.getOperand(0).getReg();
783 LLT DstTy = MRI->getType(DstReg);
784 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
785
786 const unsigned SrcSize = SrcTy.getSizeInBits();
787 if (SrcSize < 32) {
788 // Handle s32 <- G_MERGE_VALUES s16, s16
789 if (SrcSize == 16 && DstTy.getSizeInBits() == 32 &&
790 MI.getNumOperands() == 3) {
791 return selectS16MergeToS32(MI);
792 }
793 return selectImpl(MI, *CoverageInfo);
794 }
795
796 const DebugLoc &DL = MI.getDebugLoc();
797 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
798 const unsigned DstSize = DstTy.getSizeInBits();
799 const TargetRegisterClass *DstRC =
800 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
801 if (!DstRC)
802 return false;
803
804 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
805 MachineInstrBuilder MIB =
806 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
807 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
808 MachineOperand &Src = MI.getOperand(I + 1);
809 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
810 MIB.addImm(SubRegs[I]);
811
812 const TargetRegisterClass *SrcRC
813 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
814 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
815 return false;
816 }
817
818 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
819 return false;
820
821 MI.eraseFromParent();
822 return true;
823}
824
/// Selects G_UNMERGE_VALUES by emitting one instruction per result.
///
/// The last operand is the source and all earlier operands are results. Each
/// result is a subregister COPY from the source, except that the hi16 part of
/// an SGPR-bank source is produced with S_LSHR_B32 by 16. Returns false when
/// a required register class is unavailable or a constraint fails.
825bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
826 MachineBasicBlock *BB = MI.getParent();
827 const int NumDst = MI.getNumOperands() - 1;
828
829 MachineOperand &Src = MI.getOperand(NumDst);
830
831 Register SrcReg = Src.getReg();
832 Register DstReg0 = MI.getOperand(0).getReg();
833 LLT DstTy = MRI->getType(DstReg0);
834 LLT SrcTy = MRI->getType(SrcReg);
835
836 const unsigned DstSize = DstTy.getSizeInBits();
837 const unsigned SrcSize = SrcTy.getSizeInBits();
838 const DebugLoc &DL = MI.getDebugLoc();
839 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
840
841 const TargetRegisterClass *SrcRC =
842 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
843 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
844 return false;
845
846 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
847 // source, and this relies on the fact that the same subregister indices are
848 // used for both.
// The split size is taken from the first result's type only.
849 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
850 for (int I = 0, E = NumDst; I != E; ++I) {
851 MachineOperand &Dst = MI.getOperand(I);
852 // hi16:sreg_32 is not allowed so explicitly shift upper 16-bits.
853 if (SrcBank->getID() == AMDGPU::SGPRRegBankID &&
854 SubRegs[I] == AMDGPU::hi16) {
855 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst.getReg())
856 .addReg(SrcReg)
857 .addImm(16);
858 } else {
859 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
860 .addReg(SrcReg, {}, SubRegs[I]);
861 }
862
863 // Make sure the subregister index is valid for the source register.
// SrcRC is reassigned here, so it narrows cumulatively across iterations.
864 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
865 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
866 return false;
867
868 const TargetRegisterClass *DstRC =
869 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
870 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
871 return false;
872 }
873
874 MI.eraseFromParent();
875 return true;
876}
877
/// Selects G_BUILD_VECTOR and G_BUILD_VECTOR_TRUNC.
///
/// The strategies are tried in this order:
///  1. G_BUILD_VECTOR with sources of 32 bits or more: selectG_MERGE_VALUES.
///  2. Result is not <2 x s16>, or a G_BUILD_VECTOR_TRUNC whose sources are
///     not s32: the generated selector. AGPR-bank results are rejected.
///  3. Both sources constant: a single V_MOV_B32 / S_MOV_B32 of the packed
///     32-bit immediate.
///  4. The generated selector.
///  5. Second source is G_IMPLICIT_DEF (ignoring copies): a COPY of the first.
///  6. selectS16MergeToS32.
878bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
879 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
880 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
881
882 Register Src0 = MI.getOperand(1).getReg();
883 Register Src1 = MI.getOperand(2).getReg();
884 LLT SrcTy = MRI->getType(Src0);
885 const unsigned SrcSize = SrcTy.getSizeInBits();
886
887 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
888 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
889 return selectG_MERGE_VALUES(MI);
890 }
891
892 // Selection logic below is for V2S16 only.
893 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
894 Register Dst = MI.getOperand(0).getReg();
895 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
896 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
897 SrcTy != LLT::scalar(32)))
898 return selectImpl(MI, *CoverageInfo);
899
900 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
901 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
902 return false;
903
904 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
905 DstBank->getID() == AMDGPU::VGPRRegBankID);
906 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
907
908 const DebugLoc &DL = MI.getDebugLoc();
909 MachineBasicBlock *BB = MI.getParent();
910
911 // First, before trying TableGen patterns, check if both sources are
912 // constants. In those cases, we can trivially compute the final constant
913 // and emit a simple move.
914 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
915 if (ConstSrc1) {
916 auto ConstSrc0 =
917 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
918 if (ConstSrc0) {
// Pack the low 16 bits of each constant: Src0 in bits 0-15 and Src1 in
// bits 16-31.
919 const int64_t K0 = ConstSrc0->Value.getSExtValue();
920 const int64_t K1 = ConstSrc1->Value.getSExtValue();
921 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
922 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
923 uint32_t Imm = Lo16 | (Hi16 << 16);
924
925 // VALU
926 if (IsVector) {
927 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
928 MI.eraseFromParent();
929 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
930 }
931
932 // SALU
933 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
934 MI.eraseFromParent();
935 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
936 }
937 }
938
939 // Now try TableGen patterns.
940 if (selectImpl(MI, *CoverageInfo))
941 return true;
942
943 // TODO: This should probably be a combine somewhere
944 // (build_vector $src0, undef) -> copy $src0
945 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
946 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
// Rewrite MI in place to a COPY and drop the undefined second source.
947 MI.setDesc(TII.get(AMDGPU::COPY));
948 MI.removeOperand(2);
949 const auto &RC =
950 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
951 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
952 RBI.constrainGenericRegister(Src0, RC, *MRI);
953 }
954
955 return selectS16MergeToS32(MI);
956}
957
958bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
959 const MachineOperand &MO = I.getOperand(0);
960
961 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
962 // regbank check here is to know why getConstrainedRegClassForOperand failed.
963 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
964 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
965 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
966 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
967 return true;
968 }
969
970 return false;
971}
972
973bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
974 MachineBasicBlock *BB = I.getParent();
975
976 Register DstReg = I.getOperand(0).getReg();
977 Register Src0Reg = I.getOperand(1).getReg();
978 Register Src1Reg = I.getOperand(2).getReg();
979 LLT Src1Ty = MRI->getType(Src1Reg);
980
981 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
982 unsigned InsSize = Src1Ty.getSizeInBits();
983
984 int64_t Offset = I.getOperand(3).getImm();
985
986 // FIXME: These cases should have been illegal and unnecessary to check here.
987 if (Offset % 32 != 0 || InsSize % 32 != 0)
988 return false;
989
990 // Currently not handled by getSubRegFromChannel.
991 if (InsSize > 128)
992 return false;
993
994 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
995 if (SubReg == AMDGPU::NoSubRegister)
996 return false;
997
998 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
999 const TargetRegisterClass *DstRC =
1000 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
1001 if (!DstRC)
1002 return false;
1003
1004 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
1005 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
1006 const TargetRegisterClass *Src0RC =
1007 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
1008 const TargetRegisterClass *Src1RC =
1009 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
1010
1011 // Deal with weird cases where the class only partially supports the subreg
1012 // index.
1013 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
1014 if (!Src0RC || !Src1RC)
1015 return false;
1016
1017 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
1018 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
1019 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
1020 return false;
1021
1022 const DebugLoc &DL = I.getDebugLoc();
1023 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
1024 .addReg(Src0Reg)
1025 .addReg(Src1Reg)
1026 .addImm(SubReg);
1027
1028 I.eraseFromParent();
1029 return true;
1030}
1031
1032bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
1033 Register DstReg = MI.getOperand(0).getReg();
1034 Register SrcReg = MI.getOperand(1).getReg();
1035 Register OffsetReg = MI.getOperand(2).getReg();
1036 Register WidthReg = MI.getOperand(3).getReg();
1037
1038 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
1039 "scalar BFX instructions are expanded in regbankselect");
1040 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
1041 "64-bit vector BFX instructions are expanded in regbankselect");
1042
1043 const DebugLoc &DL = MI.getDebugLoc();
1044 MachineBasicBlock *MBB = MI.getParent();
1045
1046 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
1047 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
1048 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
1049 .addReg(SrcReg)
1050 .addReg(OffsetReg)
1051 .addReg(WidthReg);
1052 MI.eraseFromParent();
1053 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1054 return true;
1055}
1056
1057bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
1058 if (STI.getLDSBankCount() != 16)
1059 return selectImpl(MI, *CoverageInfo);
1060
1061 Register Dst = MI.getOperand(0).getReg();
1062 Register Src0 = MI.getOperand(2).getReg();
1063 Register M0Val = MI.getOperand(6).getReg();
1064 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
1065 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
1066 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
1067 return false;
1068
1069 // This requires 2 instructions. It is possible to write a pattern to support
1070 // this, but the generated isel emitter doesn't correctly deal with multiple
1071 // output instructions using the same physical register input. The copy to m0
1072 // is incorrectly placed before the second instruction.
1073 //
1074 // TODO: Match source modifiers.
1075
1076 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1077 const DebugLoc &DL = MI.getDebugLoc();
1078 MachineBasicBlock *MBB = MI.getParent();
1079
1080 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1081 .addReg(M0Val);
1082 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1083 .addImm(2)
1084 .addImm(MI.getOperand(4).getImm()) // $attr
1085 .addImm(MI.getOperand(3).getImm()); // $attrchan
1086
1087 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1088 .addImm(0) // $src0_modifiers
1089 .addReg(Src0) // $src0
1090 .addImm(MI.getOperand(4).getImm()) // $attr
1091 .addImm(MI.getOperand(3).getImm()) // $attrchan
1092 .addImm(0) // $src2_modifiers
1093 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1094 .addImm(MI.getOperand(5).getImm()) // $high
1095 .addImm(0) // $clamp
1096 .addImm(0); // $omod
1097
1098 MI.eraseFromParent();
1099 return true;
1100}
1101
1102// Writelane is special in that it can use SGPR and M0 (which would normally
1103// count as using the constant bus twice - but in this case it is allowed since
1104// the lane selector doesn't count as a use of the constant bus). However, it is
1105// still required to abide by the 1 SGPR rule. Fix this up if we might have
1106// multiple SGPRs.
1107bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1108 // With a constant bus limit of at least 2, there's no issue.
1109 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1110 return selectImpl(MI, *CoverageInfo);
1111
1112 MachineBasicBlock *MBB = MI.getParent();
1113 const DebugLoc &DL = MI.getDebugLoc();
1114 Register VDst = MI.getOperand(0).getReg();
1115 Register Val = MI.getOperand(2).getReg();
1116 Register LaneSelect = MI.getOperand(3).getReg();
1117 Register VDstIn = MI.getOperand(4).getReg();
1118
1119 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1120
1121 std::optional<ValueAndVReg> ConstSelect =
1122 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1123 if (ConstSelect) {
1124 // The selector has to be an inline immediate, so we can use whatever for
1125 // the other operands.
1126 MIB.addReg(Val);
1127 MIB.addImm(ConstSelect->Value.getSExtValue() &
1128 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1129 } else {
1130 std::optional<ValueAndVReg> ConstVal =
1132
1133 // If the value written is an inline immediate, we can get away without a
1134 // copy to m0.
1135 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1136 STI.hasInv2PiInlineImm())) {
1137 MIB.addImm(ConstVal->Value.getSExtValue());
1138 MIB.addReg(LaneSelect);
1139 } else {
1140 MIB.addReg(Val);
1141
1142 // If the lane selector was originally in a VGPR and copied with
1143 // readfirstlane, there's a hazard to read the same SGPR from the
1144 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1145 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1146
1147 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1148 .addReg(LaneSelect);
1149 MIB.addReg(AMDGPU::M0);
1150 }
1151 }
1152
1153 MIB.addReg(VDstIn);
1154
1155 MI.eraseFromParent();
1156 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1157 return true;
1158}
1159
1160// We need to handle this here because tablegen doesn't support matching
1161// instructions with multiple outputs.
1162bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1163 Register Dst0 = MI.getOperand(0).getReg();
1164 Register Dst1 = MI.getOperand(1).getReg();
1165
1166 LLT Ty = MRI->getType(Dst0);
1167 unsigned Opc;
1168 if (Ty == LLT::scalar(32))
1169 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1170 else if (Ty == LLT::scalar(64))
1171 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1172 else
1173 return false;
1174
1175 // TODO: Match source modifiers.
1176
1177 const DebugLoc &DL = MI.getDebugLoc();
1178 MachineBasicBlock *MBB = MI.getParent();
1179
1180 Register Numer = MI.getOperand(3).getReg();
1181 Register Denom = MI.getOperand(4).getReg();
1182 unsigned ChooseDenom = MI.getOperand(5).getImm();
1183
1184 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1185
1186 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1187 .addDef(Dst1)
1188 .addImm(0) // $src0_modifiers
1189 .addUse(Src0) // $src0
1190 .addImm(0) // $src1_modifiers
1191 .addUse(Denom) // $src1
1192 .addImm(0) // $src2_modifiers
1193 .addUse(Numer) // $src2
1194 .addImm(0) // $clamp
1195 .addImm(0); // $omod
1196
1197 MI.eraseFromParent();
1198 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1199 return true;
1200}
1201
1202bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1203 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1204 switch (IntrinsicID) {
1205 case Intrinsic::amdgcn_if_break: {
1206 MachineBasicBlock *BB = I.getParent();
1207
1208 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1209 // SelectionDAG uses for wave32 vs wave64.
1210 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1211 .add(I.getOperand(0))
1212 .add(I.getOperand(2))
1213 .add(I.getOperand(3));
1214
1215 Register DstReg = I.getOperand(0).getReg();
1216 Register Src0Reg = I.getOperand(2).getReg();
1217 Register Src1Reg = I.getOperand(3).getReg();
1218
1219 I.eraseFromParent();
1220
1221 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1222 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1223
1224 return true;
1225 }
1226 case Intrinsic::amdgcn_interp_p1_f16:
1227 return selectInterpP1F16(I);
1228 case Intrinsic::amdgcn_wqm:
1229 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1230 case Intrinsic::amdgcn_softwqm:
1231 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1232 case Intrinsic::amdgcn_strict_wwm:
1233 case Intrinsic::amdgcn_wwm:
1234 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1235 case Intrinsic::amdgcn_strict_wqm:
1236 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1237 case Intrinsic::amdgcn_writelane:
1238 return selectWritelane(I);
1239 case Intrinsic::amdgcn_div_scale:
1240 return selectDivScale(I);
1241 case Intrinsic::amdgcn_icmp:
1242 case Intrinsic::amdgcn_fcmp:
1243 if (selectImpl(I, *CoverageInfo))
1244 return true;
1245 return selectIntrinsicCmp(I);
1246 case Intrinsic::amdgcn_ballot:
1247 return selectBallot(I);
1248 case Intrinsic::amdgcn_reloc_constant:
1249 return selectRelocConstant(I);
1250 case Intrinsic::amdgcn_groupstaticsize:
1251 return selectGroupStaticSize(I);
1252 case Intrinsic::returnaddress:
1253 return selectReturnAddress(I);
1254 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1255 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1256 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1257 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1258 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1259 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1260 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1261 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1262 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1263 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1264 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1265 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1266 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1267 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1268 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1269 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1270 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1271 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1272 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1273 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1274 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1275 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1276 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1277 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1278 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1279 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1280 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1281 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1282 return selectSMFMACIntrin(I);
1283 case Intrinsic::amdgcn_permlane16_swap:
1284 case Intrinsic::amdgcn_permlane32_swap:
1285 return selectPermlaneSwapIntrin(I, IntrinsicID);
1286 case Intrinsic::amdgcn_wave_shuffle:
1287 return selectWaveShuffleIntrin(I);
1288 case Intrinsic::amdgcn_fma_legacy:
1289 if (!STI.hasFmaLegacy32Insts()) {
1291 return false;
1292 }
1293 return selectImpl(I, *CoverageInfo);
1294 case Intrinsic::amdgcn_sudot4:
1295 case Intrinsic::amdgcn_sudot8:
1296 if (!STI.hasDot8Insts()) {
1298 return false;
1299 }
1300 return selectImpl(I, *CoverageInfo);
1301 case Intrinsic::amdgcn_permlane16:
1302 case Intrinsic::amdgcn_permlanex16:
1303 if (!STI.hasPermlane16Insts()) {
1305 return false;
1306 }
1307 return selectImpl(I, *CoverageInfo);
1308 case Intrinsic::amdgcn_mov_dpp8:
1309 if (!STI.hasDPP8()) {
1311 return false;
1312 }
1313 return selectImpl(I, *CoverageInfo);
1314 case Intrinsic::amdgcn_tanh:
1315 if (!STI.hasTanhInsts()) {
1317 return false;
1318 }
1319 return selectImpl(I, *CoverageInfo);
1320 default:
1321 return selectImpl(I, *CoverageInfo);
1322 }
1323}
1324
1326 const GCNSubtarget &ST) {
1327 if (Size != 16 && Size != 32 && Size != 64)
1328 return -1;
1329
1330 if (Size == 16 && !ST.has16BitInsts())
1331 return -1;
1332
1333 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1334 unsigned FakeS16Opc, unsigned S32Opc,
1335 unsigned S64Opc) {
1336 if (Size == 16)
1337 return ST.hasTrue16BitInsts()
1338 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1339 : S16Opc;
1340 if (Size == 32)
1341 return S32Opc;
1342 return S64Opc;
1343 };
1344
1345 switch (P) {
1346 default:
1347 llvm_unreachable("Unknown condition code!");
1348 case CmpInst::ICMP_NE:
1349 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1350 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1351 AMDGPU::V_CMP_NE_U64_e64);
1352 case CmpInst::ICMP_EQ:
1353 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1354 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1355 AMDGPU::V_CMP_EQ_U64_e64);
1356 case CmpInst::ICMP_SGT:
1357 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1358 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1359 AMDGPU::V_CMP_GT_I64_e64);
1360 case CmpInst::ICMP_SGE:
1361 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1362 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1363 AMDGPU::V_CMP_GE_I64_e64);
1364 case CmpInst::ICMP_SLT:
1365 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1366 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1367 AMDGPU::V_CMP_LT_I64_e64);
1368 case CmpInst::ICMP_SLE:
1369 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1370 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1371 AMDGPU::V_CMP_LE_I64_e64);
1372 case CmpInst::ICMP_UGT:
1373 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1374 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1375 AMDGPU::V_CMP_GT_U64_e64);
1376 case CmpInst::ICMP_UGE:
1377 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1378 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1379 AMDGPU::V_CMP_GE_U64_e64);
1380 case CmpInst::ICMP_ULT:
1381 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1382 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1383 AMDGPU::V_CMP_LT_U64_e64);
1384 case CmpInst::ICMP_ULE:
1385 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1386 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1387 AMDGPU::V_CMP_LE_U64_e64);
1388
1389 case CmpInst::FCMP_OEQ:
1390 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1391 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1392 AMDGPU::V_CMP_EQ_F64_e64);
1393 case CmpInst::FCMP_OGT:
1394 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1395 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1396 AMDGPU::V_CMP_GT_F64_e64);
1397 case CmpInst::FCMP_OGE:
1398 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1399 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1400 AMDGPU::V_CMP_GE_F64_e64);
1401 case CmpInst::FCMP_OLT:
1402 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1403 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1404 AMDGPU::V_CMP_LT_F64_e64);
1405 case CmpInst::FCMP_OLE:
1406 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1407 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1408 AMDGPU::V_CMP_LE_F64_e64);
1409 case CmpInst::FCMP_ONE:
1410 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1411 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1412 AMDGPU::V_CMP_NEQ_F64_e64);
1413 case CmpInst::FCMP_ORD:
1414 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1415 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1416 AMDGPU::V_CMP_O_F64_e64);
1417 case CmpInst::FCMP_UNO:
1418 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1419 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1420 AMDGPU::V_CMP_U_F64_e64);
1421 case CmpInst::FCMP_UEQ:
1422 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1423 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1424 AMDGPU::V_CMP_NLG_F64_e64);
1425 case CmpInst::FCMP_UGT:
1426 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1427 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1428 AMDGPU::V_CMP_NLE_F64_e64);
1429 case CmpInst::FCMP_UGE:
1430 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1431 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1432 AMDGPU::V_CMP_NLT_F64_e64);
1433 case CmpInst::FCMP_ULT:
1434 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1435 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1436 AMDGPU::V_CMP_NGE_F64_e64);
1437 case CmpInst::FCMP_ULE:
1438 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1439 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1440 AMDGPU::V_CMP_NGT_F64_e64);
1441 case CmpInst::FCMP_UNE:
1442 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1443 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1444 AMDGPU::V_CMP_NEQ_F64_e64);
1445 case CmpInst::FCMP_TRUE:
1446 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1447 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1448 AMDGPU::V_CMP_TRU_F64_e64);
1450 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1451 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1452 AMDGPU::V_CMP_F_F64_e64);
1453 }
1454}
1455
1456int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1457 unsigned Size) const {
1458 if (Size == 64) {
1459 if (!STI.hasScalarCompareEq64())
1460 return -1;
1461
1462 switch (P) {
1463 case CmpInst::ICMP_NE:
1464 return AMDGPU::S_CMP_LG_U64;
1465 case CmpInst::ICMP_EQ:
1466 return AMDGPU::S_CMP_EQ_U64;
1467 default:
1468 return -1;
1469 }
1470 }
1471
1472 if (Size == 32) {
1473 switch (P) {
1474 case CmpInst::ICMP_NE:
1475 return AMDGPU::S_CMP_LG_U32;
1476 case CmpInst::ICMP_EQ:
1477 return AMDGPU::S_CMP_EQ_U32;
1478 case CmpInst::ICMP_SGT:
1479 return AMDGPU::S_CMP_GT_I32;
1480 case CmpInst::ICMP_SGE:
1481 return AMDGPU::S_CMP_GE_I32;
1482 case CmpInst::ICMP_SLT:
1483 return AMDGPU::S_CMP_LT_I32;
1484 case CmpInst::ICMP_SLE:
1485 return AMDGPU::S_CMP_LE_I32;
1486 case CmpInst::ICMP_UGT:
1487 return AMDGPU::S_CMP_GT_U32;
1488 case CmpInst::ICMP_UGE:
1489 return AMDGPU::S_CMP_GE_U32;
1490 case CmpInst::ICMP_ULT:
1491 return AMDGPU::S_CMP_LT_U32;
1492 case CmpInst::ICMP_ULE:
1493 return AMDGPU::S_CMP_LE_U32;
1494 case CmpInst::FCMP_OEQ:
1495 return AMDGPU::S_CMP_EQ_F32;
1496 case CmpInst::FCMP_OGT:
1497 return AMDGPU::S_CMP_GT_F32;
1498 case CmpInst::FCMP_OGE:
1499 return AMDGPU::S_CMP_GE_F32;
1500 case CmpInst::FCMP_OLT:
1501 return AMDGPU::S_CMP_LT_F32;
1502 case CmpInst::FCMP_OLE:
1503 return AMDGPU::S_CMP_LE_F32;
1504 case CmpInst::FCMP_ONE:
1505 return AMDGPU::S_CMP_LG_F32;
1506 case CmpInst::FCMP_ORD:
1507 return AMDGPU::S_CMP_O_F32;
1508 case CmpInst::FCMP_UNO:
1509 return AMDGPU::S_CMP_U_F32;
1510 case CmpInst::FCMP_UEQ:
1511 return AMDGPU::S_CMP_NLG_F32;
1512 case CmpInst::FCMP_UGT:
1513 return AMDGPU::S_CMP_NLE_F32;
1514 case CmpInst::FCMP_UGE:
1515 return AMDGPU::S_CMP_NLT_F32;
1516 case CmpInst::FCMP_ULT:
1517 return AMDGPU::S_CMP_NGE_F32;
1518 case CmpInst::FCMP_ULE:
1519 return AMDGPU::S_CMP_NGT_F32;
1520 case CmpInst::FCMP_UNE:
1521 return AMDGPU::S_CMP_NEQ_F32;
1522 default:
1523 llvm_unreachable("Unknown condition code!");
1524 }
1525 }
1526
1527 if (Size == 16) {
1528 if (!STI.hasSALUFloatInsts())
1529 return -1;
1530
1531 switch (P) {
1532 case CmpInst::FCMP_OEQ:
1533 return AMDGPU::S_CMP_EQ_F16;
1534 case CmpInst::FCMP_OGT:
1535 return AMDGPU::S_CMP_GT_F16;
1536 case CmpInst::FCMP_OGE:
1537 return AMDGPU::S_CMP_GE_F16;
1538 case CmpInst::FCMP_OLT:
1539 return AMDGPU::S_CMP_LT_F16;
1540 case CmpInst::FCMP_OLE:
1541 return AMDGPU::S_CMP_LE_F16;
1542 case CmpInst::FCMP_ONE:
1543 return AMDGPU::S_CMP_LG_F16;
1544 case CmpInst::FCMP_ORD:
1545 return AMDGPU::S_CMP_O_F16;
1546 case CmpInst::FCMP_UNO:
1547 return AMDGPU::S_CMP_U_F16;
1548 case CmpInst::FCMP_UEQ:
1549 return AMDGPU::S_CMP_NLG_F16;
1550 case CmpInst::FCMP_UGT:
1551 return AMDGPU::S_CMP_NLE_F16;
1552 case CmpInst::FCMP_UGE:
1553 return AMDGPU::S_CMP_NLT_F16;
1554 case CmpInst::FCMP_ULT:
1555 return AMDGPU::S_CMP_NGE_F16;
1556 case CmpInst::FCMP_ULE:
1557 return AMDGPU::S_CMP_NGT_F16;
1558 case CmpInst::FCMP_UNE:
1559 return AMDGPU::S_CMP_NEQ_F16;
1560 default:
1561 llvm_unreachable("Unknown condition code!");
1562 }
1563 }
1564
1565 return -1;
1566}
1567
1568bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1569
1570 MachineBasicBlock *BB = I.getParent();
1571 const DebugLoc &DL = I.getDebugLoc();
1572
1573 Register SrcReg = I.getOperand(2).getReg();
1574 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1575
1576 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1577
1578 Register CCReg = I.getOperand(0).getReg();
1579 if (!isVCC(CCReg, *MRI)) {
1580 int Opcode = getS_CMPOpcode(Pred, Size);
1581 if (Opcode == -1)
1582 return false;
1583 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1584 .add(I.getOperand(2))
1585 .add(I.getOperand(3));
1586 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1587 .addReg(AMDGPU::SCC);
1588 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1589 bool Ret =
1590 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1591 I.eraseFromParent();
1592 return Ret;
1593 }
1594
1595 if (I.getOpcode() == AMDGPU::G_FCMP)
1596 return false;
1597
1598 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1599 if (Opcode == -1)
1600 return false;
1601
1602 MachineInstrBuilder ICmp;
1603 // t16 instructions
1604 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1605 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1606 .addImm(0)
1607 .add(I.getOperand(2))
1608 .addImm(0)
1609 .add(I.getOperand(3))
1610 .addImm(0); // op_sel
1611 } else {
1612 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1613 .add(I.getOperand(2))
1614 .add(I.getOperand(3));
1615 }
1616
1617 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1618 *TRI.getBoolRC(), *MRI);
1619 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1620 I.eraseFromParent();
1621 return true;
1622}
1623
1624bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1625 Register Dst = I.getOperand(0).getReg();
1626 if (isVCC(Dst, *MRI))
1627 return false;
1628
1629 LLT DstTy = MRI->getType(Dst);
1630 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1631 return false;
1632
1633 MachineBasicBlock *BB = I.getParent();
1634 const DebugLoc &DL = I.getDebugLoc();
1635 Register SrcReg = I.getOperand(2).getReg();
1636 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1637
1638 // i1 inputs are not supported in GlobalISel.
1639 if (Size == 1)
1640 return false;
1641
1642 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1643 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1644 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1645 I.eraseFromParent();
1646 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1647 }
1648
1649 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1650 if (Opcode == -1)
1651 return false;
1652
1653 MachineInstrBuilder SelectedMI;
1654 MachineOperand &LHS = I.getOperand(2);
1655 MachineOperand &RHS = I.getOperand(3);
1656 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1657 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1658 Register Src0Reg =
1659 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1660 Register Src1Reg =
1661 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1662 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1663 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1664 SelectedMI.addImm(Src0Mods);
1665 SelectedMI.addReg(Src0Reg);
1666 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1667 SelectedMI.addImm(Src1Mods);
1668 SelectedMI.addReg(Src1Reg);
1669 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1670 SelectedMI.addImm(0); // clamp
1671 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1672 SelectedMI.addImm(0); // op_sel
1673
1674 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1675 constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI);
1676
1677 I.eraseFromParent();
1678 return true;
1679}
1680
1681// Ballot has to zero bits in input lane-mask that are zero in current exec,
1682// Done as AND with exec. For inputs that are results of instruction that
1683// implicitly use same exec, for example compares in same basic block or SCC to
1684// VCC copy, use copy.
1687 MachineInstr *MI = MRI.getVRegDef(Reg);
1688 if (MI->getParent() != MBB)
1689 return false;
1690
1691 // Lane mask generated by SCC to VCC copy.
1692 if (MI->getOpcode() == AMDGPU::COPY) {
1693 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1694 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1695 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1696 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1697 return true;
1698 }
1699
1700 // Lane mask generated by SCC to VCC copy
1701 if (MI->getOpcode() == AMDGPU::G_AMDGPU_COPY_VCC_SCC)
1702 return true;
1703
1704 // Lane mask generated using compare with same exec.
1705 if (isa<GAnyCmp>(MI))
1706 return true;
1707
1708 Register LHS, RHS;
1709 // Look through AND.
1710 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1711 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1713
1714 return false;
1715}
1716
1717bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1718 MachineBasicBlock *BB = I.getParent();
1719 const DebugLoc &DL = I.getDebugLoc();
1720 Register DstReg = I.getOperand(0).getReg();
1721 Register SrcReg = I.getOperand(2).getReg();
1722 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1723 const unsigned WaveSize = STI.getWavefrontSize();
1724
1725 // In the common case, the return type matches the wave size.
1726 // However we also support emitting i64 ballots in wave32 mode.
1727 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1728 return false;
1729
1730 std::optional<ValueAndVReg> Arg =
1732
1733 Register Dst = DstReg;
1734 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1735 if (BallotSize != WaveSize) {
1736 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1737 }
1738
1739 if (Arg) {
1740 const int64_t Value = Arg->Value.getZExtValue();
1741 if (Value == 0) {
1742 // Dst = S_MOV 0
1743 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1744 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1745 } else {
1746 // Dst = COPY EXEC
1747 assert(Value == 1);
1748 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1749 }
1750 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1751 return false;
1752 } else {
1753 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1754 // Dst = COPY SrcReg
1755 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1756 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1757 return false;
1758 } else {
1759 // Dst = S_AND SrcReg, EXEC
1760 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1761 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1762 .addReg(SrcReg)
1763 .addReg(TRI.getExec())
1764 .setOperandDead(3); // Dead scc
1765 constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
1766 }
1767 }
1768
1769 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1770 if (BallotSize != WaveSize) {
1771 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1772 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1773 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1774 .addReg(Dst)
1775 .addImm(AMDGPU::sub0)
1776 .addReg(HiReg)
1777 .addImm(AMDGPU::sub1);
1778 }
1779
1780 I.eraseFromParent();
1781 return true;
1782}
1783
1784bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1785 Register DstReg = I.getOperand(0).getReg();
1786 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1787 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1788 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1789 return false;
1790
1791 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1792
1793 Module *M = MF->getFunction().getParent();
1794 const MDNode *Metadata = I.getOperand(2).getMetadata();
1795 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1796 auto *RelocSymbol = cast<GlobalVariable>(
1797 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1798
1799 MachineBasicBlock *BB = I.getParent();
1800 BuildMI(*BB, &I, I.getDebugLoc(),
1801 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1803
1804 I.eraseFromParent();
1805 return true;
1806}
1807
1808bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1809 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1810
1811 Register DstReg = I.getOperand(0).getReg();
1812 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1813 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1814 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1815
1816 MachineBasicBlock *MBB = I.getParent();
1817 const DebugLoc &DL = I.getDebugLoc();
1818
1819 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1820
1821 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1822 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1823 MIB.addImm(MFI->getLDSSize());
1824 } else {
1825 Module *M = MF->getFunction().getParent();
1826 const GlobalValue *GV =
1827 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1829 }
1830
1831 I.eraseFromParent();
1832 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1833 return true;
1834}
1835
1836bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1837 MachineBasicBlock *MBB = I.getParent();
1838 MachineFunction &MF = *MBB->getParent();
1839 const DebugLoc &DL = I.getDebugLoc();
1840
1841 MachineOperand &Dst = I.getOperand(0);
1842 Register DstReg = Dst.getReg();
1843 unsigned Depth = I.getOperand(2).getImm();
1844
1845 const TargetRegisterClass *RC
1846 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1847 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1848 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1849 return false;
1850
1851 // Check for kernel and shader functions
1852 if (Depth != 0 ||
1853 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1854 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1855 .addImm(0);
1856 I.eraseFromParent();
1857 return true;
1858 }
1859
1860 MachineFrameInfo &MFI = MF.getFrameInfo();
1861 // There is a call to @llvm.returnaddress in this function
1862 MFI.setReturnAddressIsTaken(true);
1863
1864 // Get the return address reg and mark it as an implicit live-in
1865 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1866 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1867 AMDGPU::SReg_64RegClass, DL);
1868 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1869 .addReg(LiveIn);
1870 I.eraseFromParent();
1871 return true;
1872}
1873
1874bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1875 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1876 // SelectionDAG uses for wave32 vs wave64.
1877 MachineBasicBlock *BB = MI.getParent();
1878 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1879 .add(MI.getOperand(1));
1880
1881 Register Reg = MI.getOperand(1).getReg();
1882 MI.eraseFromParent();
1883
1884 if (!MRI->getRegClassOrNull(Reg))
1885 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1886 return true;
1887}
1888
1889bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1890 MachineInstr &MI, Intrinsic::ID IntrID) const {
1891 MachineBasicBlock *MBB = MI.getParent();
1892 MachineFunction *MF = MBB->getParent();
1893 const DebugLoc &DL = MI.getDebugLoc();
1894
1895 unsigned IndexOperand = MI.getOperand(7).getImm();
1896 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1897 bool WaveDone = MI.getOperand(9).getImm() != 0;
1898
1899 if (WaveDone && !WaveRelease) {
1900 // TODO: Move this to IR verifier
1901 const Function &Fn = MF->getFunction();
1902 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1903 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1904 }
1905
1906 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1907 IndexOperand &= ~0x3f;
1908 unsigned CountDw = 0;
1909
1910 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1911 CountDw = (IndexOperand >> 24) & 0xf;
1912 IndexOperand &= ~(0xf << 24);
1913
1914 if (CountDw < 1 || CountDw > 4) {
1915 const Function &Fn = MF->getFunction();
1916 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1917 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1918 CountDw = 1;
1919 }
1920 }
1921
1922 if (IndexOperand) {
1923 const Function &Fn = MF->getFunction();
1924 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1925 Fn, "ds_ordered_count: bad index operand", DL));
1926 }
1927
1928 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1929 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1930
1931 unsigned Offset0 = OrderedCountIndex << 2;
1932 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1933
1934 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1935 Offset1 |= (CountDw - 1) << 6;
1936
1937 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1938 Offset1 |= ShaderType << 2;
1939
1940 unsigned Offset = Offset0 | (Offset1 << 8);
1941
1942 Register M0Val = MI.getOperand(2).getReg();
1943 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1944 .addReg(M0Val);
1945
1946 Register DstReg = MI.getOperand(0).getReg();
1947 Register ValReg = MI.getOperand(3).getReg();
1948 MachineInstrBuilder DS =
1949 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1950 .addReg(ValReg)
1951 .addImm(Offset)
1952 .cloneMemRefs(MI);
1953
1954 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1955 return false;
1956
1957 constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1958 MI.eraseFromParent();
1959 return true;
1960}
1961
1962static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1963 switch (IntrID) {
1964 case Intrinsic::amdgcn_ds_gws_init:
1965 return AMDGPU::DS_GWS_INIT;
1966 case Intrinsic::amdgcn_ds_gws_barrier:
1967 return AMDGPU::DS_GWS_BARRIER;
1968 case Intrinsic::amdgcn_ds_gws_sema_v:
1969 return AMDGPU::DS_GWS_SEMA_V;
1970 case Intrinsic::amdgcn_ds_gws_sema_br:
1971 return AMDGPU::DS_GWS_SEMA_BR;
1972 case Intrinsic::amdgcn_ds_gws_sema_p:
1973 return AMDGPU::DS_GWS_SEMA_P;
1974 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1975 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1976 default:
1977 llvm_unreachable("not a gws intrinsic");
1978 }
1979}
1980
1981bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1982 Intrinsic::ID IID) const {
1983 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1984 !STI.hasGWSSemaReleaseAll()))
1985 return false;
1986
1987 // intrinsic ID, vsrc, offset
1988 const bool HasVSrc = MI.getNumOperands() == 3;
1989 assert(HasVSrc || MI.getNumOperands() == 2);
1990
1991 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1992 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1993 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1994 return false;
1995
1996 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1997 unsigned ImmOffset;
1998
1999 MachineBasicBlock *MBB = MI.getParent();
2000 const DebugLoc &DL = MI.getDebugLoc();
2001
2002 MachineInstr *Readfirstlane = nullptr;
2003
2004 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
2005 // incoming offset, in case there's an add of a constant. We'll have to put it
2006 // back later.
2007 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
2008 Readfirstlane = OffsetDef;
2009 BaseOffset = OffsetDef->getOperand(1).getReg();
2010 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
2011 }
2012
2013 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
2014 // If we have a constant offset, try to use the 0 in m0 as the base.
2015 // TODO: Look into changing the default m0 initialization value. If the
2016 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2017 // the immediate offset.
2018
2019 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
2020 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2021 .addImm(0);
2022 } else {
2023 std::tie(BaseOffset, ImmOffset) =
2024 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
2025
2026 if (Readfirstlane) {
2027 // We have the constant offset now, so put the readfirstlane back on the
2028 // variable component.
2029 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
2030 return false;
2031
2032 Readfirstlane->getOperand(1).setReg(BaseOffset);
2033 BaseOffset = Readfirstlane->getOperand(0).getReg();
2034 } else {
2035 if (!RBI.constrainGenericRegister(BaseOffset,
2036 AMDGPU::SReg_32RegClass, *MRI))
2037 return false;
2038 }
2039
2040 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2041 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
2042 .addReg(BaseOffset)
2043 .addImm(16)
2044 .setOperandDead(3); // Dead scc
2045
2046 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2047 .addReg(M0Base);
2048 }
2049
2050 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2051 // offset field) % 64. Some versions of the programming guide omit the m0
2052 // part, or claim it's from offset 0.
2053
2054 unsigned Opc = gwsIntrinToOpcode(IID);
2055 const MCInstrDesc &InstrDesc = TII.get(Opc);
2056
2057 if (HasVSrc) {
2058 Register VSrc = MI.getOperand(1).getReg();
2059
2060 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
2061 const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
2062 const TargetRegisterClass *SubRC =
2063 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
2064
2065 if (!SubRC) {
2066 // 32-bit normal case.
2067 if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
2068 return false;
2069
2070 BuildMI(*MBB, &MI, DL, InstrDesc)
2071 .addReg(VSrc)
2072 .addImm(ImmOffset)
2073 .cloneMemRefs(MI);
2074 } else {
2075 // Requires even register alignment, so create 64-bit value and pad the
2076 // top half with undef.
2077 Register DataReg = MRI->createVirtualRegister(DataRC);
2078 if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
2079 return false;
2080
2081 Register UndefReg = MRI->createVirtualRegister(SubRC);
2082 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2083 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
2084 .addReg(VSrc)
2085 .addImm(AMDGPU::sub0)
2086 .addReg(UndefReg)
2087 .addImm(AMDGPU::sub1);
2088
2089 BuildMI(*MBB, &MI, DL, InstrDesc)
2090 .addReg(DataReg)
2091 .addImm(ImmOffset)
2092 .cloneMemRefs(MI);
2093 }
2094 } else {
2095 BuildMI(*MBB, &MI, DL, InstrDesc)
2096 .addImm(ImmOffset)
2097 .cloneMemRefs(MI);
2098 }
2099
2100 MI.eraseFromParent();
2101 return true;
2102}
2103
2104bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2105 bool IsAppend) const {
2106 Register PtrBase = MI.getOperand(2).getReg();
2107 LLT PtrTy = MRI->getType(PtrBase);
2108 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2109
2110 unsigned Offset;
2111 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
2112
2113 // TODO: Should this try to look through readfirstlane like GWS?
2114 if (!isDSOffsetLegal(PtrBase, Offset)) {
2115 PtrBase = MI.getOperand(2).getReg();
2116 Offset = 0;
2117 }
2118
2119 MachineBasicBlock *MBB = MI.getParent();
2120 const DebugLoc &DL = MI.getDebugLoc();
2121 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2122
2123 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2124 .addReg(PtrBase);
2125 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2126 return false;
2127
2128 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
2129 .addImm(Offset)
2130 .addImm(IsGDS ? -1 : 0)
2131 .cloneMemRefs(MI);
2132 MI.eraseFromParent();
2133 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2134 return true;
2135}
2136
2137bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2138 MachineFunction *MF = MI.getMF();
2139 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2140
2141 MFInfo->setInitWholeWave();
2142 return selectImpl(MI, *CoverageInfo);
2143}
2144
/// Decode a texfailctrl immediate.
///
/// \param TexFailCtrl raw control value; bit 0 is TFE and bit 1 is LWE.
/// \param [out] TFE set from bit 0.
/// \param [out] LWE set from bit 1.
/// \param [in,out] IsTexFail set to true when \p TexFailCtrl is non-zero;
///        left untouched otherwise.
/// \returns true if no bits other than TFE and LWE are set.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  constexpr uint64_t TFEBit = 0x1;
  constexpr uint64_t LWEBit = 0x2;

  if (TexFailCtrl != 0)
    IsTexFail = true;

  TFE = (TexFailCtrl & TFEBit) != 0;
  LWE = (TexFailCtrl & LWEBit) != 0;

  const uint64_t UnknownBits = TexFailCtrl & ~(TFEBit | LWEBit);
  return UnknownBits == 0;
}
2157
2158bool AMDGPUInstructionSelector::selectImageIntrinsic(
2159 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2160 MachineBasicBlock *MBB = MI.getParent();
2161 const DebugLoc &DL = MI.getDebugLoc();
2162 unsigned IntrOpcode = Intr->BaseOpcode;
2163
2164 // For image atomic: use no-return opcode if result is unused.
2165 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2166 Register ResultDef = MI.getOperand(0).getReg();
2167 if (MRI->use_nodbg_empty(ResultDef))
2168 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2169 }
2170
2171 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2173
2174 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2175 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2176 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2177 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2178 const bool IsGFX13Plus = AMDGPU::isGFX13Plus(STI);
2179
2180 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2181
2182 Register VDataIn = AMDGPU::NoRegister;
2183 Register VDataOut = AMDGPU::NoRegister;
2184 LLT VDataTy;
2185 int NumVDataDwords = -1;
2186 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2187 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2188
2189 bool Unorm;
2190 if (!BaseOpcode->Sampler)
2191 Unorm = true;
2192 else
2193 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2194
2195 bool TFE;
2196 bool LWE;
2197 bool IsTexFail = false;
2198 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2199 TFE, LWE, IsTexFail))
2200 return false;
2201
2202 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2203 const bool IsA16 = (Flags & 1) != 0;
2204 const bool IsG16 = (Flags & 2) != 0;
2205
2206 // A16 implies 16 bit gradients if subtarget doesn't support G16
2207 if (IsA16 && !STI.hasG16() && !IsG16)
2208 return false;
2209
2210 unsigned DMask = 0;
2211 unsigned DMaskLanes = 0;
2212
2213 if (BaseOpcode->Atomic) {
2214 if (!BaseOpcode->NoReturn)
2215 VDataOut = MI.getOperand(0).getReg();
2216 VDataIn = MI.getOperand(2).getReg();
2217 LLT Ty = MRI->getType(VDataIn);
2218
2219 // Be careful to allow atomic swap on 16-bit element vectors.
2220 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2221 Ty.getSizeInBits() == 128 :
2222 Ty.getSizeInBits() == 64;
2223
2224 if (BaseOpcode->AtomicX2) {
2225 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2226
2227 DMask = Is64Bit ? 0xf : 0x3;
2228 NumVDataDwords = Is64Bit ? 4 : 2;
2229 } else {
2230 DMask = Is64Bit ? 0x3 : 0x1;
2231 NumVDataDwords = Is64Bit ? 2 : 1;
2232 }
2233 } else {
2234 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2235 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2236
2237 if (BaseOpcode->Store) {
2238 VDataIn = MI.getOperand(1).getReg();
2239 VDataTy = MRI->getType(VDataIn);
2240 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2241 } else if (BaseOpcode->NoReturn) {
2242 NumVDataDwords = 0;
2243 } else {
2244 VDataOut = MI.getOperand(0).getReg();
2245 VDataTy = MRI->getType(VDataOut);
2246 NumVDataDwords = DMaskLanes;
2247
2248 if (IsD16 && !STI.hasUnpackedD16VMem())
2249 NumVDataDwords = (DMaskLanes + 1) / 2;
2250 }
2251 }
2252
2253 // Set G16 opcode
2254 if (Subtarget->hasG16() && IsG16) {
2255 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2257 assert(G16MappingInfo);
2258 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2259 }
2260
2261 // TODO: Check this in verifier.
2262 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2263
2264 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2265 // Keep GLC only when the atomic's result is actually used.
2266 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2268 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2270 return false;
2271
2272 int NumVAddrRegs = 0;
2273 int NumVAddrDwords = 0;
2274 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2275 // Skip the $noregs and 0s inserted during legalization.
2276 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2277 if (!AddrOp.isReg())
2278 continue; // XXX - Break?
2279
2280 Register Addr = AddrOp.getReg();
2281 if (!Addr)
2282 break;
2283
2284 ++NumVAddrRegs;
2285 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2286 }
2287
2288 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2289 // NSA, these should have been packed into a single value in the first
2290 // address register
2291 const bool UseNSA =
2292 NumVAddrRegs != 1 &&
2293 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2294 : NumVAddrDwords == NumVAddrRegs);
2295 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2296 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2297 return false;
2298 }
2299
2300 if (IsTexFail)
2301 ++NumVDataDwords;
2302
2303 int Opcode = -1;
2304 if (IsGFX13Plus) {
2305 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx13,
2306 NumVDataDwords, NumVAddrDwords);
2307 } else if (IsGFX12Plus) {
2308 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2309 NumVDataDwords, NumVAddrDwords);
2310 } else if (IsGFX11Plus) {
2311 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2312 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2313 : AMDGPU::MIMGEncGfx11Default,
2314 NumVDataDwords, NumVAddrDwords);
2315 } else if (IsGFX10Plus) {
2316 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2317 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2318 : AMDGPU::MIMGEncGfx10Default,
2319 NumVDataDwords, NumVAddrDwords);
2320 } else {
2321 if (Subtarget->hasGFX90AInsts()) {
2322 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2323 NumVDataDwords, NumVAddrDwords);
2324 if (Opcode == -1) {
2325 LLVM_DEBUG(
2326 dbgs()
2327 << "requested image instruction is not supported on this GPU\n");
2328 return false;
2329 }
2330 }
2331 if (Opcode == -1 &&
2332 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2333 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2334 NumVDataDwords, NumVAddrDwords);
2335 if (Opcode == -1)
2336 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2337 NumVDataDwords, NumVAddrDwords);
2338 }
2339 if (Opcode == -1)
2340 return false;
2341
2342 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2343 .cloneMemRefs(MI);
2344
2345 if (VDataOut) {
2346 if (BaseOpcode->AtomicX2) {
2347 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2348
2349 Register TmpReg = MRI->createVirtualRegister(
2350 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2351 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2352
2353 MIB.addDef(TmpReg);
2354 if (!MRI->use_empty(VDataOut)) {
2355 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2356 .addReg(TmpReg, RegState::Kill, SubReg);
2357 }
2358
2359 } else {
2360 MIB.addDef(VDataOut); // vdata output
2361 }
2362 }
2363
2364 if (VDataIn)
2365 MIB.addReg(VDataIn); // vdata input
2366
2367 for (int I = 0; I != NumVAddrRegs; ++I) {
2368 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2369 if (SrcOp.isReg()) {
2370 assert(SrcOp.getReg() != 0);
2371 MIB.addReg(SrcOp.getReg());
2372 }
2373 }
2374
2375 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2376 if (BaseOpcode->Sampler)
2377 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2378
2379 MIB.addImm(DMask); // dmask
2380
2381 if (IsGFX10Plus)
2382 MIB.addImm(DimInfo->Encoding);
2383 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2384 MIB.addImm(Unorm);
2385
2386 MIB.addImm(CPol);
2387 MIB.addImm(IsA16 && // a16 or r128
2388 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2389 if (IsGFX10Plus)
2390 MIB.addImm(IsA16 ? -1 : 0);
2391
2392 if (!Subtarget->hasGFX90AInsts()) {
2393 MIB.addImm(TFE); // tfe
2394 } else if (TFE) {
2395 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2396 return false;
2397 }
2398
2399 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2400 MIB.addImm(LWE); // lwe
2401 if (!IsGFX10Plus)
2402 MIB.addImm(DimInfo->DA ? -1 : 0);
2403 if (BaseOpcode->HasD16)
2404 MIB.addImm(IsD16 ? -1 : 0);
2405
2406 MI.eraseFromParent();
2407 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2408 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2409 return true;
2410}
2411
2412// We need to handle this here because tablegen doesn't support matching
2413// instructions with multiple outputs.
2414bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2415 MachineInstr &MI) const {
2416 Register Dst0 = MI.getOperand(0).getReg();
2417 Register Dst1 = MI.getOperand(1).getReg();
2418
2419 const DebugLoc &DL = MI.getDebugLoc();
2420 MachineBasicBlock *MBB = MI.getParent();
2421
2422 Register Addr = MI.getOperand(3).getReg();
2423 Register Data0 = MI.getOperand(4).getReg();
2424 Register Data1 = MI.getOperand(5).getReg();
2425 unsigned Offset = MI.getOperand(6).getImm();
2426
2427 unsigned Opc;
2428 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2429 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2430 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2431 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2432 break;
2433 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2434 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2435 break;
2436 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2437 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2438 break;
2439 }
2440
2441 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2442 .addDef(Dst1)
2443 .addUse(Addr)
2444 .addUse(Data0)
2445 .addUse(Data1)
2446 .addImm(Offset)
2447 .cloneMemRefs(MI);
2448
2449 MI.eraseFromParent();
2450 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2451 return true;
2452}
2453
2454bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2455 MachineInstr &I) const {
2456 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2457 switch (IntrinsicID) {
2458 case Intrinsic::amdgcn_end_cf:
2459 return selectEndCfIntrinsic(I);
2460 case Intrinsic::amdgcn_ds_ordered_add:
2461 case Intrinsic::amdgcn_ds_ordered_swap:
2462 return selectDSOrderedIntrinsic(I, IntrinsicID);
2463 case Intrinsic::amdgcn_ds_gws_init:
2464 case Intrinsic::amdgcn_ds_gws_barrier:
2465 case Intrinsic::amdgcn_ds_gws_sema_v:
2466 case Intrinsic::amdgcn_ds_gws_sema_br:
2467 case Intrinsic::amdgcn_ds_gws_sema_p:
2468 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2469 return selectDSGWSIntrinsic(I, IntrinsicID);
2470 case Intrinsic::amdgcn_ds_append:
2471 return selectDSAppendConsume(I, true);
2472 case Intrinsic::amdgcn_ds_consume:
2473 return selectDSAppendConsume(I, false);
2474 case Intrinsic::amdgcn_init_whole_wave:
2475 return selectInitWholeWave(I);
2476 case Intrinsic::amdgcn_raw_buffer_load_lds:
2477 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
2478 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2479 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
2480 case Intrinsic::amdgcn_struct_buffer_load_lds:
2481 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
2482 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2483 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
2484 return selectBufferLoadLds(I);
2485 // Until we can store both the address space of the global and the LDS
2486 // arguments by having tto MachineMemOperands on an intrinsic, we just trust
2487 // that the argument is a global pointer (buffer pointers have been handled by
2488 // a LLVM IR-level lowering).
2489 case Intrinsic::amdgcn_load_to_lds:
2490 case Intrinsic::amdgcn_load_async_to_lds:
2491 case Intrinsic::amdgcn_global_load_lds:
2492 case Intrinsic::amdgcn_global_load_async_lds:
2493 return selectGlobalLoadLds(I);
2494 case Intrinsic::amdgcn_tensor_load_to_lds:
2495 case Intrinsic::amdgcn_tensor_store_from_lds:
2496 return selectTensorLoadStore(I, IntrinsicID);
2497 case Intrinsic::amdgcn_asyncmark:
2498 case Intrinsic::amdgcn_wait_asyncmark:
2499 if (!Subtarget->hasAsyncMark())
2500 return false;
2501 break;
2502 case Intrinsic::amdgcn_exp_compr:
2503 if (!STI.hasCompressedExport()) {
2505 return false;
2506 }
2507 break;
2508 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2509 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2510 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2511 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2512 return selectDSBvhStackIntrinsic(I);
2513 case Intrinsic::amdgcn_s_alloc_vgpr: {
2514 // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets
2515 // SCC. We then need to COPY it into the result vreg.
2516 MachineBasicBlock *MBB = I.getParent();
2517 const DebugLoc &DL = I.getDebugLoc();
2518
2519 Register ResReg = I.getOperand(0).getReg();
2520
2521 MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
2522 .add(I.getOperand(2));
2523 (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
2524 .addReg(AMDGPU::SCC);
2525 I.eraseFromParent();
2526 constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI);
2527 return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
2528 }
2529 case Intrinsic::amdgcn_s_barrier_init:
2530 case Intrinsic::amdgcn_s_barrier_signal_var:
2531 return selectNamedBarrierInit(I, IntrinsicID);
2532 case Intrinsic::amdgcn_s_wakeup_barrier: {
2533 if (!STI.hasSWakeupBarrier()) {
2535 return false;
2536 }
2537 return selectNamedBarrierInst(I, IntrinsicID);
2538 }
2539 case Intrinsic::amdgcn_s_barrier_join:
2540 case Intrinsic::amdgcn_s_get_named_barrier_state:
2541 return selectNamedBarrierInst(I, IntrinsicID);
2542 case Intrinsic::amdgcn_s_get_barrier_state:
2543 return selectSGetBarrierState(I, IntrinsicID);
2544 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2545 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2546 }
2547 return selectImpl(I, *CoverageInfo);
2548}
2549
2550bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2551 if (selectImpl(I, *CoverageInfo))
2552 return true;
2553
2554 MachineBasicBlock *BB = I.getParent();
2555 const DebugLoc &DL = I.getDebugLoc();
2556
2557 Register DstReg = I.getOperand(0).getReg();
2558 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2559 assert(Size <= 32 || Size == 64);
2560 const MachineOperand &CCOp = I.getOperand(1);
2561 Register CCReg = CCOp.getReg();
2562 if (!isVCC(CCReg, *MRI)) {
2563 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2564 AMDGPU::S_CSELECT_B32;
2565 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2566 .addReg(CCReg);
2567
2568 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2569 // bank, because it does not cover the register class that we used to represent
2570 // for it. So we need to manually set the register class here.
2571 if (!MRI->getRegClassOrNull(CCReg))
2572 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2573 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2574 .add(I.getOperand(2))
2575 .add(I.getOperand(3));
2576
2578 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2579 I.eraseFromParent();
2580 return true;
2581 }
2582
2583 // Wide VGPR select should have been split in RegBankSelect.
2584 if (Size > 32)
2585 return false;
2586
2587 MachineInstr *Select =
2588 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2589 .addImm(0)
2590 .add(I.getOperand(3))
2591 .addImm(0)
2592 .add(I.getOperand(2))
2593 .add(I.getOperand(1));
2594
2596 I.eraseFromParent();
2597 return true;
2598}
2599
2600bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2601 Register DstReg = I.getOperand(0).getReg();
2602 Register SrcReg = I.getOperand(1).getReg();
2603 const LLT DstTy = MRI->getType(DstReg);
2604 const LLT SrcTy = MRI->getType(SrcReg);
2605 const LLT S1 = LLT::scalar(1);
2606
2607 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2608 const RegisterBank *DstRB;
2609 if (DstTy == S1) {
2610 // This is a special case. We don't treat s1 for legalization artifacts as
2611 // vcc booleans.
2612 DstRB = SrcRB;
2613 } else {
2614 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2615 if (SrcRB != DstRB)
2616 return false;
2617 }
2618
2619 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2620
2621 unsigned DstSize = DstTy.getSizeInBits();
2622 unsigned SrcSize = SrcTy.getSizeInBits();
2623
2624 const TargetRegisterClass *SrcRC =
2625 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2626 const TargetRegisterClass *DstRC =
2627 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2628 if (!SrcRC || !DstRC)
2629 return false;
2630
2631 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2632 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2633 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2634 return false;
2635 }
2636
2637 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2638 assert(STI.useRealTrue16Insts());
2639 const DebugLoc &DL = I.getDebugLoc();
2640 MachineBasicBlock *MBB = I.getParent();
2641 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2642 .addReg(SrcReg, {}, AMDGPU::lo16);
2643 I.eraseFromParent();
2644 return true;
2645 }
2646
2647 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2648 MachineBasicBlock *MBB = I.getParent();
2649 const DebugLoc &DL = I.getDebugLoc();
2650
2651 Register LoReg = MRI->createVirtualRegister(DstRC);
2652 Register HiReg = MRI->createVirtualRegister(DstRC);
2653 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2654 .addReg(SrcReg, {}, AMDGPU::sub0);
2655 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2656 .addReg(SrcReg, {}, AMDGPU::sub1);
2657
2658 if (IsVALU && STI.hasSDWA()) {
2659 // Write the low 16-bits of the high element into the high 16-bits of the
2660 // low element.
2661 MachineInstr *MovSDWA =
2662 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2663 .addImm(0) // $src0_modifiers
2664 .addReg(HiReg) // $src0
2665 .addImm(0) // $clamp
2666 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2667 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2668 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2669 .addReg(LoReg, RegState::Implicit);
2670 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2671 } else {
2672 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2673 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2674 Register ImmReg = MRI->createVirtualRegister(DstRC);
2675 if (IsVALU) {
2676 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2677 .addImm(16)
2678 .addReg(HiReg);
2679 } else {
2680 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2681 .addReg(HiReg)
2682 .addImm(16)
2683 .setOperandDead(3); // Dead scc
2684 }
2685
2686 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2687 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2688 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2689
2690 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2691 .addImm(0xffff);
2692 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2693 .addReg(LoReg)
2694 .addReg(ImmReg);
2695 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2696 .addReg(TmpReg0)
2697 .addReg(TmpReg1);
2698
2699 if (!IsVALU) {
2700 And.setOperandDead(3); // Dead scc
2701 Or.setOperandDead(3); // Dead scc
2702 }
2703 }
2704
2705 I.eraseFromParent();
2706 return true;
2707 }
2708
2709 if (!DstTy.isScalar())
2710 return false;
2711
2712 if (SrcSize > 32) {
2713 unsigned SubRegIdx = DstSize < 32
2714 ? static_cast<unsigned>(AMDGPU::sub0)
2715 : TRI.getSubRegFromChannel(0, DstSize / 32);
2716 if (SubRegIdx == AMDGPU::NoSubRegister)
2717 return false;
2718
2719 // Deal with weird cases where the class only partially supports the subreg
2720 // index.
2721 const TargetRegisterClass *SrcWithSubRC
2722 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2723 if (!SrcWithSubRC)
2724 return false;
2725
2726 if (SrcWithSubRC != SrcRC) {
2727 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2728 return false;
2729 }
2730
2731 I.getOperand(1).setSubReg(SubRegIdx);
2732 }
2733
2734 I.setDesc(TII.get(TargetOpcode::COPY));
2735 return true;
2736}
2737
/// \returns true if a bitmask for \p Size bits will be an inline immediate.
///
/// \p Mask is always written: it receives the value whose low \p Size bits are
/// set (all ones when \p Size is 32 or more, zero when \p Size is 0). Callers
/// use it as the AND operand when this function returns true.
static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
  // Build the trailing-ones mask without shifting by an out-of-range amount.
  if (Size == 0)
    Mask = 0;
  else if (Size >= 32)
    Mask = ~0u;
  else
    Mask = (1u << Size) - 1;

  // Compare as a signed value so that an all-ones mask is treated as -1.
  int SignedMask = static_cast<int>(Mask);
  return SignedMask >= -16 && SignedMask <= 64;
}
2744
2745// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2746const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2747 Register Reg, const MachineRegisterInfo &MRI,
2748 const TargetRegisterInfo &TRI) const {
2749 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2750 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2751 return RB;
2752
2753 // Ignore the type, since we don't use vcc in artifacts.
2754 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2755 return &RBI.getRegBankFromRegClass(*RC, LLT());
2756 return nullptr;
2757}
2758
/// Select an integer extension instruction.
///
/// G_SEXT, G_SEXT_INREG and G_ANYEXT are distinguished explicitly; any other
/// opcode that reaches this function is treated as an unsigned extension.
/// Only scalar destinations are handled. Returns true when \p I was replaced
/// (or rewritten to a COPY), false when no selection was possible.
2759bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2760 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2761 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2762 const DebugLoc &DL = I.getDebugLoc();
2763 MachineBasicBlock &MBB = *I.getParent();
2764 const Register DstReg = I.getOperand(0).getReg();
2765 const Register SrcReg = I.getOperand(1).getReg();
2766
2767 const LLT DstTy = MRI->getType(DstReg);
2768 const LLT SrcTy = MRI->getType(SrcReg);
 // For G_SEXT_INREG the width to extend from is the immediate in operand 2,
 // not the width of the source register.
2769 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2770 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2771 const unsigned DstSize = DstTy.getSizeInBits();
2772 if (!DstTy.isScalar())
2773 return false;
2774
2775 // Artifact casts should never use vcc.
 // NOTE(review): getArtifactRegBank returns nullptr when the register has
 // neither a bank nor a class; SrcBank is dereferenced below without a check.
2776 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2777
2778 // FIXME: This should probably be illegal and split earlier.
2779 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2780 if (DstSize <= 32)
2781 return selectCOPY(I);
2782
2783 const TargetRegisterClass *SrcRC =
2784 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2785 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2786 const TargetRegisterClass *DstRC =
2787 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2788
 // Wide any-extend: source goes in sub0, sub1 is left undefined.
2789 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2790 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2791 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2792 .addReg(SrcReg)
2793 .addImm(AMDGPU::sub0)
2794 .addReg(UndefReg)
2795 .addImm(AMDGPU::sub1);
2796 I.eraseFromParent();
2797
2798 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2799 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2800 }
2801
 // VGPR source, result of at most 32 bits: V_AND with a mask or V_BFE.
2802 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2803 // 64-bit should have been split up in RegBankSelect
2804
2805 // Try to use an and with a mask if it will save code size.
2806 unsigned Mask;
2807 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2808 MachineInstr *ExtI =
2809 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2810 .addImm(Mask)
2811 .addReg(SrcReg);
2812 I.eraseFromParent();
2813 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2814 return true;
2815 }
2816
2817 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2818 MachineInstr *ExtI =
2819 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2820 .addReg(SrcReg)
2821 .addImm(0) // Offset
2822 .addImm(SrcSize); // Width
2823 I.eraseFromParent();
2824 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2825 return true;
2826 }
2827
 // SGPR source, result of at most 64 bits.
2828 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
 // Only a G_SEXT_INREG with a result wider than 32 bits has a 64-bit source.
2829 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2830 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2831 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2832 return false;
2833
 // Dedicated sign-extend instructions exist for 8- and 16-bit sources.
2834 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2835 const unsigned SextOpc = SrcSize == 8 ?
2836 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2837 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2838 .addReg(SrcReg);
2839 I.eraseFromParent();
2840 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2841 }
2842
2843 // Using a single 32-bit SALU to calculate the high half is smaller than
2844 // S_BFE with a literal constant operand.
2845 if (DstSize > 32 && SrcSize == 32) {
2846 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
 // For G_SEXT_INREG the source is 64 bits wide, so read its low half.
2847 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2848 if (Signed) {
 // High half is the sign bit of the low half replicated.
2849 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2850 .addReg(SrcReg, {}, SubReg)
2851 .addImm(31)
2852 .setOperandDead(3); // Dead scc
2853 } else {
2854 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2855 .addImm(0);
2856 }
2857 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2858 .addReg(SrcReg, {}, SubReg)
2859 .addImm(AMDGPU::sub0)
2860 .addReg(HiReg)
2861 .addImm(AMDGPU::sub1);
2862 I.eraseFromParent();
2863 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2864 *MRI);
2865 }
2866
2867 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2868 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2869
2870 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2871 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2872 // We need a 64-bit register source, but the high bits don't matter.
2873 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2874 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2875 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2876
2877 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2878 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2879 .addReg(SrcReg, {}, SubReg)
2880 .addImm(AMDGPU::sub0)
2881 .addReg(UndefReg)
2882 .addImm(AMDGPU::sub1);
2883
 // Offset 0, width SrcSize (width lives in bits [22:16] of the immediate).
2884 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2885 .addReg(ExtReg)
2886 .addImm(SrcSize << 16);
2887
2888 I.eraseFromParent();
2889 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2890 }
2891
 // Remaining 32-bit result: prefer an AND with a small mask for unsigned
 // extensions, otherwise fall back to a 32-bit scalar BFE.
2892 unsigned Mask;
2893 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2894 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2895 .addReg(SrcReg)
2896 .addImm(Mask)
2897 .setOperandDead(3); // Dead scc
2898 } else {
2899 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2900 .addReg(SrcReg)
2901 .addImm(SrcSize << 16);
2902 }
2903
2904 I.eraseFromParent();
2905 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2906 }
2907
 // Any other bank / size combination is not handled here.
2908 return false;
2909}
2910
2914
// NOTE(review): the signature line of this helper appears to be missing from
// this listing. A later call `stripBitCast(LShlSrc, MRI)` suggests it takes
// (Register Reg, MRI) and returns a Register — confirm against the real file.
//
// If Reg is defined by a G_BITCAST, return the bitcast's source register;
// otherwise return Reg unchanged.
2916 Register BitcastSrc;
2917 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2918 Reg = BitcastSrc;
2919 return Reg;
2920}
2921
// NOTE(review): the first line of this signature appears to be missing from
// this listing. selectG_FPEXT calls it as `isExtractHiElt(*MRI, Src, Src)`, so
// the leading parameters are presumably (MRI, Register In, ...) — confirm.
//
// Returns true and sets Out when In matches one of three patterns:
//  1. In is the second def of a two-result unmerge with 16-bit results;
//     Out is the unmerge source.
//  2. In is G_TRUNC(G_LSHR(X, 16)); Out is X with any G_BITCAST stripped.
//  3. In is G_TRUNC of a G_SHUFFLE_VECTOR result whose mask has Mask[0] == 1
//     and Mask[1] <= 1; Out is operand 0 of the shuffle.
// Out is left untouched when false is returned.
2923 Register &Out) {
2924 // When unmerging a register that is composed of 2 x 16-bit values allow to
2925 // use an extract hi instruction for the upper 16 bits. We only need to check
2926 // the size of `In` as all defs are guaranteed to be the same type for
2927 // GUnmerge.
2928 if (auto *Unmerge = dyn_cast<GUnmerge>(MRI.getVRegDef(In))) {
 // Operand 1 is the second def, i.e. the upper half of the unmerged value.
2929 if (Unmerge->getNumDefs() == 2 && Unmerge->getOperand(1).getReg() == In &&
2930 MRI.getType(In).getSizeInBits() == 16) {
2931 Out = Unmerge->getSourceReg();
2932 return true;
2933 }
2934 }
2935
2936 Register Trunc;
2937 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2938 return false;
2939
 // Despite its name, LShlSrc receives the source of a logical shift RIGHT.
2940 Register LShlSrc;
2941 Register Cst;
2942 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
 // Look through copies before testing for the constant shift amount 16.
2943 Cst = stripCopy(Cst, MRI);
2944 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2945 Out = stripBitCast(LShlSrc, MRI);
2946 return true;
2947 }
2948 }
2949
2950 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2951 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2952 return false;
2953
2954 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2955 LLT::fixed_vector(2, 16));
2956
2957 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2958 assert(Mask.size() == 2);
2959
2960 if (Mask[0] == 1 && Mask[1] <= 1) {
2961 Out = Shuffle->getOperand(0).getReg();
2962 return true;
2963 }
2964
2965 return false;
2966}
2967
2968bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2969 if (!Subtarget->hasSALUFloatInsts())
2970 return false;
2971
2972 Register Dst = I.getOperand(0).getReg();
2973 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2974 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2975 return false;
2976
2977 Register Src = I.getOperand(1).getReg();
2978
2979 if (MRI->getType(Dst) == LLT::scalar(32) &&
2980 MRI->getType(Src) == LLT::scalar(16)) {
2981 if (isExtractHiElt(*MRI, Src, Src)) {
2982 MachineBasicBlock *BB = I.getParent();
2983 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2984 .addUse(Src);
2985 I.eraseFromParent();
2986 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2987 }
2988 }
2989
2990 return false;
2991}
2992
2993bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2994 // Only manually handle the f64 SGPR case.
2995 //
2996 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2997 // the bit ops theoretically have a second result due to the implicit def of
2998 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2999 // that is easy by disabling the check. The result works, but uses a
3000 // nonsensical sreg32orlds_and_sreg_1 regclass.
3001 //
3002 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
3003 // the variadic REG_SEQUENCE operands.
3004
3005 Register Dst = MI.getOperand(0).getReg();
3006 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
3007 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
3008 MRI->getType(Dst) != LLT::scalar(64))
3009 return false;
3010
3011 Register Src = MI.getOperand(1).getReg();
3012 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
3013 if (Fabs)
3014 Src = Fabs->getOperand(1).getReg();
3015
3016 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
3017 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
3018 return false;
3019
3020 MachineBasicBlock *BB = MI.getParent();
3021 const DebugLoc &DL = MI.getDebugLoc();
3022 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3023 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3024 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3025 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3026
3027 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
3028 .addReg(Src, {}, AMDGPU::sub0);
3029 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
3030 .addReg(Src, {}, AMDGPU::sub1);
3031 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
3032 .addImm(0x80000000);
3033
3034 // Set or toggle sign bit.
3035 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
3036 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
3037 .addReg(HiReg)
3038 .addReg(ConstReg)
3039 .setOperandDead(3); // Dead scc
3040 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
3041 .addReg(LoReg)
3042 .addImm(AMDGPU::sub0)
3043 .addReg(OpReg)
3044 .addImm(AMDGPU::sub1);
3045 MI.eraseFromParent();
3046 return true;
3047}
3048
3049// FIXME: This is a workaround for the same tablegen problems as G_FNEG
3050bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
3051 Register Dst = MI.getOperand(0).getReg();
3052 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
3053 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
3054 MRI->getType(Dst) != LLT::scalar(64))
3055 return false;
3056
3057 Register Src = MI.getOperand(1).getReg();
3058 MachineBasicBlock *BB = MI.getParent();
3059 const DebugLoc &DL = MI.getDebugLoc();
3060 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3061 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3062 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3063 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3064
3065 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
3066 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
3067 return false;
3068
3069 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
3070 .addReg(Src, {}, AMDGPU::sub0);
3071 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
3072 .addReg(Src, {}, AMDGPU::sub1);
3073 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
3074 .addImm(0x7fffffff);
3075
3076 // Clear sign bit.
3077 // TODO: Should this used S_BITSET0_*?
3078 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
3079 .addReg(HiReg)
3080 .addReg(ConstReg)
3081 .setOperandDead(3); // Dead scc
3082 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
3083 .addReg(LoReg)
3084 .addImm(AMDGPU::sub0)
3085 .addReg(OpReg)
3086 .addImm(AMDGPU::sub1);
3087
3088 MI.eraseFromParent();
3089 return true;
3090}
3091
3092static bool isConstant(const MachineInstr &MI) {
3093 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
3094}
3095
3096void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
3097 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
3098
3099 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
3100 const MachineInstr *PtrMI =
3101 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
3102
3103 assert(PtrMI);
3104
3105 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
3106 return;
3107
3108 GEPInfo GEPInfo;
3109
3110 for (unsigned i = 1; i != 3; ++i) {
3111 const MachineOperand &GEPOp = PtrMI->getOperand(i);
3112 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
3113 assert(OpDef);
3114 if (i == 2 && isConstant(*OpDef)) {
3115 // TODO: Could handle constant base + variable offset, but a combine
3116 // probably should have commuted it.
3117 assert(GEPInfo.Imm == 0);
3118 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
3119 continue;
3120 }
3121 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
3122 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
3123 GEPInfo.SgprParts.push_back(GEPOp.getReg());
3124 else
3125 GEPInfo.VgprParts.push_back(GEPOp.getReg());
3126 }
3127
3128 AddrInfo.push_back(GEPInfo);
3129 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
3130}
3131
3132bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
3133 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
3134}
3135
/// Decide whether the memory access performed by \p MI is uniform, based on
/// its single memory operand. Instructions without exactly one memory operand
/// are never reported as uniform.
3136bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
3137 if (!MI.hasOneMemOperand())
3138 return false;
3139
3140 const MachineMemOperand *MMO = *MI.memoperands_begin();
3141 const Value *Ptr = MMO->getValue();
3142
3143 // UndefValue means this is a load of a kernel input. These are uniform.
3144 // Sometimes LDS instructions have constant pointers.
3145 // If Ptr is null, then that means this mem operand contains a
3146 // PseudoSourceValue like GOT.
 // NOTE(review): the `if (...)` condition guarding the next `return true`
 // appears to be missing from this listing; per the comment above it
 // presumably tests Ptr for null / undef / constant values — confirm.
3148 return true;
3149
 // NOTE(review): a second `if (...)` condition line also appears to be
 // missing here; nothing in this listing shows what it tests.
3151 return true;
3152
 // A prefetch is uniform when its address operand (operand 0) is an SGPR.
3153 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3154 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
3155 AMDGPU::SGPRRegBankID;
3156
 // Otherwise rely on "amdgpu.uniform" metadata on the IR instruction that
 // produced the pointer.
3157 const Instruction *I = dyn_cast<Instruction>(Ptr);
3158 return I && I->getMetadata("amdgpu.uniform");
3159}
3160
3161bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3162 for (const GEPInfo &GEPInfo : AddrInfo) {
3163 if (!GEPInfo.VgprParts.empty())
3164 return true;
3165 }
3166 return false;
3167}
3168
/// Emit `S_MOV_B32 m0, -1` immediately before \p I when the guarded condition
/// below holds. The address space is read from the pointer in operand 1.
3169void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3170 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
3171 unsigned AS = PtrTy.getAddressSpace();
 // NOTE(review): the first line of this `if` condition appears to be missing
 // from this listing; it presumably tests AS against specific address spaces
 // before the subtarget query on the next line — confirm.
3173 STI.ldsRequiresM0Init()) {
3174 MachineBasicBlock *BB = I.getParent();
3175
3176 // If DS instructions require M0 initialization, insert it before selecting.
3177 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3178 .addImm(-1);
3179 }
3180}
3181
3182bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3183 MachineInstr &I) const {
3184 initM0(I);
3185 return selectImpl(I, *CoverageInfo);
3186}
3187
// NOTE(review): the signature line of this helper and the line defining `MI`
// appear to be missing from this listing. The recursive calls below show it is
// invoked as isVCmpResult(Register, MRI); `MI` is presumably the instruction
// defining Reg — confirm against the real file.
//
// Returns true when Reg is produced by a compare-like instruction: G_ICMP,
// G_FCMP, the amdgcn_class intrinsic, a COPY of such a value, or a
// G_AND/G_OR/G_XOR whose two inputs both qualify. Physical registers never do.
3189 if (Reg.isPhysical())
3190 return false;
3191
3193 const unsigned Opcode = MI.getOpcode();
3194
 // Look through copies.
3195 if (Opcode == AMDGPU::COPY)
3196 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3197
 // A bitwise combination qualifies only if both inputs do.
3198 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3199 Opcode == AMDGPU::G_XOR)
3200 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3201 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3202
 // Among intrinsics, only amdgcn_class is accepted.
3203 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3204 return GI->is(Intrinsic::amdgcn_class);
3205
3206 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3207}
3208
/// Select G_BRCOND. The condition (operand 0) is copied into a physical
/// condition register and a conditional branch to the block in operand 1 is
/// emitted: SCC + S_CBRANCH_SCC1 for a non-vcc condition, or the VCC register
/// + S_CBRANCH_VCCNZ for a vcc condition. Returns false only for a non-vcc
/// condition whose type is not s32.
3209bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3210 MachineBasicBlock *BB = I.getParent();
3211 MachineOperand &CondOp = I.getOperand(0);
3212 Register CondReg = CondOp.getReg();
3213 const DebugLoc &DL = I.getDebugLoc();
3214
3215 unsigned BrOpcode;
3216 Register CondPhysReg;
3217 const TargetRegisterClass *ConstrainRC;
3218
3219 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3220 // whether the branch is uniform when selecting the instruction. In
3221 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3222 // RegBankSelect knows what it's doing if the branch condition is scc, even
3223 // though it currently does not.
3224 if (!isVCC(CondReg, *MRI)) {
3225 if (MRI->getType(CondReg) != LLT::scalar(32))
3226 return false;
3227
3228 CondPhysReg = AMDGPU::SCC;
3229 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3230 ConstrainRC = &AMDGPU::SReg_32RegClass;
3231 } else {
3232 // FIXME: Should scc->vcc copies and with exec?
3233
3234 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3235 // need to insert an and with exec.
3236 if (!isVCmpResult(CondReg, *MRI)) {
 // Pick the 64- or 32-bit AND and exec register to match the wave size.
3237 const bool Is64 = STI.isWave64();
3238 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3239 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3240
3241 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3242 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3243 .addReg(CondReg)
3244 .addReg(Exec)
3245 .setOperandDead(3); // Dead scc
 // Branch on the masked value from here on.
3246 CondReg = TmpReg;
3247 }
3248
3249 CondPhysReg = TRI.getVCC();
3250 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3251 ConstrainRC = TRI.getBoolRC();
3252 }
3253
 // Only assign a register class if the condition does not already have one.
3254 if (!MRI->getRegClassOrNull(CondReg))
3255 MRI->setRegClass(CondReg, ConstrainRC);
3256
3257 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3258 .addReg(CondReg);
3259 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3260 .addMBB(I.getOperand(1).getMBB());
3261
3262 I.eraseFromParent();
3263 return true;
3264}
3265
3266bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3267 MachineInstr &I) const {
3268 Register DstReg = I.getOperand(0).getReg();
3269 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3270 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3271 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3272 if (IsVGPR)
3273 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3274
3275 return RBI.constrainGenericRegister(
3276 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3277}
3278
3279bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3280 Register DstReg = I.getOperand(0).getReg();
3281 Register SrcReg = I.getOperand(1).getReg();
3282 Register MaskReg = I.getOperand(2).getReg();
3283 LLT Ty = MRI->getType(DstReg);
3284 LLT MaskTy = MRI->getType(MaskReg);
3285 MachineBasicBlock *BB = I.getParent();
3286 const DebugLoc &DL = I.getDebugLoc();
3287
3288 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3289 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3290 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3291 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3292 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3293 return false;
3294
3295 // Try to avoid emitting a bit operation when we only need to touch half of
3296 // the 64-bit pointer.
3297 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3298 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3299 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3300
3301 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3302 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3303
3304 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3305 !CanCopyLow32 && !CanCopyHi32) {
3306 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3307 .addReg(SrcReg)
3308 .addReg(MaskReg)
3309 .setOperandDead(3); // Dead scc
3310 I.eraseFromParent();
3311 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3312 return true;
3313 }
3314
3315 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3316 const TargetRegisterClass &RegRC
3317 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3318
3319 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3320 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3321 const TargetRegisterClass *MaskRC =
3322 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3323
3324 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3325 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3326 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3327 return false;
3328
3329 if (Ty.getSizeInBits() == 32) {
3330 assert(MaskTy.getSizeInBits() == 32 &&
3331 "ptrmask should have been narrowed during legalize");
3332
3333 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3334 .addReg(SrcReg)
3335 .addReg(MaskReg);
3336
3337 if (!IsVGPR)
3338 NewOp.setOperandDead(3); // Dead scc
3339 I.eraseFromParent();
3340 return true;
3341 }
3342
3343 Register HiReg = MRI->createVirtualRegister(&RegRC);
3344 Register LoReg = MRI->createVirtualRegister(&RegRC);
3345
3346 // Extract the subregisters from the source pointer.
3347 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3348 .addReg(SrcReg, {}, AMDGPU::sub0);
3349 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3350 .addReg(SrcReg, {}, AMDGPU::sub1);
3351
3352 Register MaskedLo, MaskedHi;
3353
3354 if (CanCopyLow32) {
3355 // If all the bits in the low half are 1, we only need a copy for it.
3356 MaskedLo = LoReg;
3357 } else {
3358 // Extract the mask subregister and apply the and.
3359 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3360 MaskedLo = MRI->createVirtualRegister(&RegRC);
3361
3362 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3363 .addReg(MaskReg, {}, AMDGPU::sub0);
3364 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3365 .addReg(LoReg)
3366 .addReg(MaskLo);
3367 }
3368
3369 if (CanCopyHi32) {
3370 // If all the bits in the high half are 1, we only need a copy for it.
3371 MaskedHi = HiReg;
3372 } else {
3373 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3374 MaskedHi = MRI->createVirtualRegister(&RegRC);
3375
3376 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3377 .addReg(MaskReg, {}, AMDGPU::sub1);
3378 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3379 .addReg(HiReg)
3380 .addReg(MaskHi);
3381 }
3382
3383 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3384 .addReg(MaskedLo)
3385 .addImm(AMDGPU::sub0)
3386 .addReg(MaskedHi)
3387 .addImm(AMDGPU::sub1);
3388 I.eraseFromParent();
3389 return true;
3390}
3391
3392/// Return the register to use for the index value, and the subregister to use
3393/// for the indirectly accessed register.
///
/// The index is split into a base register plus a constant offset. When the
/// offset selects a valid \p EltSize piece of \p SuperRC, the base register
/// and that piece's subregister index are returned; otherwise the original
/// \p IdxReg is returned together with the first piece.
///
/// NOTE(review): the signature line carrying the function name and its first
/// parameters appears to be missing from this listing. Callers invoke it as
/// computeIndirectRegIndex(*MRI, TRI, RC, IdxReg, SizeInBits / 8, *VT), so
/// \p EltSize is presumably in bytes — confirm.
3394static std::pair<Register, unsigned>
3396 const TargetRegisterClass *SuperRC, Register IdxReg,
3397 unsigned EltSize, GISelValueTracking &ValueTracking) {
3398 Register IdxBaseReg;
3399 int Offset;
3400
3401 std::tie(IdxBaseReg, Offset) =
3402 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3403 if (IdxBaseReg == AMDGPU::NoRegister) {
3404 // This will happen if the index is a known constant. This should ordinarily
3405 // be legalized out, but handle it as a register just in case.
3406 assert(Offset == 0);
3407 IdxBaseReg = IdxReg;
3408 }
3409
3410 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3411
3412 // Skip out of bounds offsets, or else we would end up using an undefined
3413 // register.
 // The unsigned cast makes a negative Offset compare as out of bounds too.
3414 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3415 return std::pair(IdxReg, SubRegs[0]);
3416 return std::pair(IdxBaseReg, SubRegs[Offset]);
3417}
3418
/// Select G_EXTRACT_VECTOR_ELT with a dynamic SGPR index.
///
/// SGPR vectors use S_MOVRELS_B32/B64 with the index copied to M0. VGPR
/// vectors (32-bit elements only) use V_MOVRELS_B32 with M0, or the indirect
/// GPR-index pseudo when the subtarget uses VGPR index mode. Returns false for
/// a non-SGPR index, an unsupported element size or bank, or when a register
/// cannot be constrained.
3419bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3420 MachineInstr &MI) const {
3421 Register DstReg = MI.getOperand(0).getReg();
3422 Register SrcReg = MI.getOperand(1).getReg();
3423 Register IdxReg = MI.getOperand(2).getReg();
3424
3425 LLT DstTy = MRI->getType(DstReg);
3426 LLT SrcTy = MRI->getType(SrcReg);
3427
3428 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3429 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3430 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3431
3432 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3433 // into a waterfall loop.
3434 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3435 return false;
3436
3437 const TargetRegisterClass *SrcRC =
3438 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3439 const TargetRegisterClass *DstRC =
3440 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
 // Bail out if no register class exists for the type on its bank.
3441 if (!SrcRC || !DstRC)
3442 return false;
3443 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3444 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3445 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3446 return false;
3447
3448 MachineBasicBlock *BB = MI.getParent();
3449 const DebugLoc &DL = MI.getDebugLoc();
3450 const bool Is64 = DstTy.getSizeInBits() == 64;
3451
 // Fold a constant part of the index into the subregister; IdxReg is
 // replaced by the register to use as the dynamic index.
3452 unsigned SubReg;
3453 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3454 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3455
3456 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
 // Only 32- and 64-bit elements are handled for SGPR vectors.
3457 if (DstTy.getSizeInBits() != 32 && !Is64)
3458 return false;
3459
3460 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3461 .addReg(IdxReg);
3462
3463 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
 // The whole vector is added as an implicit use next to the subregister.
3464 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3465 .addReg(SrcReg, {}, SubReg)
3466 .addReg(SrcReg, RegState::Implicit);
3467 MI.eraseFromParent();
3468 return true;
3469 }
3470
3471 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3472 return false;
3473
3474 if (!STI.useVGPRIndexMode()) {
3475 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3476 .addReg(IdxReg);
3477 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3478 .addReg(SrcReg, {}, SubReg)
3479 .addReg(SrcReg, RegState::Implicit);
3480 MI.eraseFromParent();
3481 return true;
3482 }
3483
 // VGPR index mode: the pseudo takes the vector, the index register and the
 // subregister index as an immediate.
3484 const MCInstrDesc &GPRIDXDesc =
3485 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3486 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3487 .addReg(SrcReg)
3488 .addReg(IdxReg)
3489 .addImm(SubReg);
3490
3491 MI.eraseFromParent();
3492 return true;
3493}
3494
3495// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3496bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3497 MachineInstr &MI) const {
3498 Register DstReg = MI.getOperand(0).getReg();
3499 Register VecReg = MI.getOperand(1).getReg();
3500 Register ValReg = MI.getOperand(2).getReg();
3501 Register IdxReg = MI.getOperand(3).getReg();
3502
3503 LLT VecTy = MRI->getType(DstReg);
3504 LLT ValTy = MRI->getType(ValReg);
3505 unsigned VecSize = VecTy.getSizeInBits();
3506 unsigned ValSize = ValTy.getSizeInBits();
3507
3508 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3509 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3510 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3511
3512 assert(VecTy.getElementType() == ValTy);
3513
3514 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3515 // into a waterfall loop.
3516 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3517 return false;
3518
3519 const TargetRegisterClass *VecRC =
3520 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3521 const TargetRegisterClass *ValRC =
3522 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3523
3524 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3525 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3526 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3527 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3528 return false;
3529
3530 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3531 return false;
3532
3533 unsigned SubReg;
3534 std::tie(IdxReg, SubReg) =
3535 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3536
3537 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3538 STI.useVGPRIndexMode();
3539
3540 MachineBasicBlock *BB = MI.getParent();
3541 const DebugLoc &DL = MI.getDebugLoc();
3542
3543 if (!IndexMode) {
3544 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3545 .addReg(IdxReg);
3546
3547 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3548 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3549 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3550 .addReg(VecReg)
3551 .addReg(ValReg)
3552 .addImm(SubReg);
3553 MI.eraseFromParent();
3554 return true;
3555 }
3556
3557 const MCInstrDesc &GPRIDXDesc =
3558 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3559 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3560 .addReg(VecReg)
3561 .addReg(ValReg)
3562 .addReg(IdxReg)
3563 .addImm(SubReg);
3564
3565 MI.eraseFromParent();
3566 return true;
3567}
3568
3569static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
3570 switch (Intr) {
3571 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3572 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
3573 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3574 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
3575 case Intrinsic::amdgcn_load_async_to_lds:
3576 case Intrinsic::amdgcn_global_load_async_lds:
3577 return true;
3578 }
3579 return false;
3580}
3581
/// Select a buffer load-to-LDS intrinsic into one of the
/// BUFFER_LOAD_*_LDS_{BOTHEN,IDXEN,OFFEN,OFFSET} machine instructions.
///
/// The opcode is chosen from the byte size held in operand 3 (1, 2, 4, 12 or
/// 16) and from whether a vindex operand and/or a voffset that is not a known
/// constant zero is present. Operand 2 is copied into M0 before the load is
/// emitted, and the new instruction receives one load and one store memory
/// operand.
///
/// \returns false when the subtarget has no VMEM-to-LDS loads, when the size
/// is not one of the handled values, or when a 12/16 byte load is requested
/// without hasLDSLoadB96_B128(); true once \p MI has been replaced and erased.
///
/// NOTE(review): this listing appears to be missing several source lines (the
/// initializer of MaybeVOffset, the remainder of the LoadPtrI.V assignment and
/// the mask applied to LoadMMO->getFlags()); confirm against the full file.
3582bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3583 if (!Subtarget->hasVMemToLDSLoad())
3584 return false;
3585 unsigned Opc;
3586 unsigned Size = MI.getOperand(3).getImm();
3587 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3588
3589 // The struct intrinsic variants add one additional operand over raw.
3590 const bool HasVIndex = MI.getNumOperands() == 9;
3591 Register VIndex;
3592 int OpOffset = 0;
3593 if (HasVIndex) {
3594 VIndex = MI.getOperand(4).getReg();
3595 OpOffset = 1;
3596 }
3597
  // A voffset is treated as present unless it is known to be the constant 0.
3598 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3599 std::optional<ValueAndVReg> MaybeVOffset =
3601 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3602
  // Pick the opcode: the load width comes from Size, the addressing variant
  // from the HasVIndex / HasVOffset combination.
3603 switch (Size) {
3604 default:
3605 return false;
3606 case 1:
3607 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3608 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3609 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3610 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3611 break;
3612 case 2:
3613 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3614 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3615 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3616 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3617 break;
3618 case 4:
3619 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3620 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3621 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3622 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3623 break;
3624 case 12:
3625 if (!Subtarget->hasLDSLoadB96_B128())
3626 return false;
3627
3628 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3629 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3630 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3631 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3632 break;
3633 case 16:
3634 if (!Subtarget->hasLDSLoadB96_B128())
3635 return false;
3636
3637 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3638 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3639 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3640 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3641 break;
3642 }
3643
  // Operand 2 is copied into M0 ahead of the load (presumably the LDS
  // destination base -- confirm against the intrinsic definition).
3644 MachineBasicBlock *MBB = MI.getParent();
3645 const DebugLoc &DL = MI.getDebugLoc();
3646 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3647 .add(MI.getOperand(2));
3648
3649 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3650
  // With both vindex and voffset, the two are packed into one 64-bit VGPR
  // (vindex in sub0, voffset in sub1); otherwise whichever exists is added.
3651 if (HasVIndex && HasVOffset) {
3652 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3653 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3654 .addReg(VIndex)
3655 .addImm(AMDGPU::sub0)
3656 .addReg(VOffset)
3657 .addImm(AMDGPU::sub1);
3658
3659 MIB.addReg(IdxReg);
3660 } else if (HasVIndex) {
3661 MIB.addReg(VIndex);
3662 } else if (HasVOffset) {
3663 MIB.addReg(VOffset);
3664 }
3665
3666 MIB.add(MI.getOperand(1)); // rsrc
3667 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3668 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
  // The aux immediate is split into the cache-policy bits and the swizzle
  // bit; both masks differ between GFX12+ and earlier targets.
3669 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3670 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3671 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3672 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3673 MIB.addImm(
3674 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3675 ? 1
3676 : 0); // swz
  // Final immediate: 1 when the intrinsic is one of the async LDS DMA variants.
3677 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3678
3679 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3680 // Don't set the offset value here because the pointer points to the base of
3681 // the buffer.
3682 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3683
3684 MachinePointerInfo StorePtrI = LoadPtrI;
3685 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3689
3690 auto F = LoadMMO->getFlags() &
3692 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3693 Size, LoadMMO->getBaseAlign());
3694
  // The store memory operand is created with a fixed 4-byte size.
3695 MachineMemOperand *StoreMMO =
3696 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3697 sizeof(int32_t), LoadMMO->getBaseAlign());
3698
3699 MIB.setMemRefs({LoadMMO, StoreMMO});
3700
3701 MI.eraseFromParent();
3702 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3703 return true;
3704}
3705
3706/// Match a zero extend from a 32-bit value to 64-bits.
3707Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3708 Register ZExtSrc;
3709 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3710 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3711
3712 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3713 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3714 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3715 return Register();
3716
3717 assert(Def->getNumOperands() == 3 &&
3718 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3719 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3720 return Def->getOperand(1).getReg();
3721 }
3722
3723 return Register();
3724}
3725
3726/// Match a sign extend from a 32-bit value to 64-bits.
3727Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3728 Register SExtSrc;
3729 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3730 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3731
3732 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), G_ASHR((S32 %x, 31))
3733 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3734 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3735 return Register();
3736
3737 assert(Def->getNumOperands() == 3 &&
3738 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3739 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3740 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3741 m_SpecificICst(31))))
3742 return Def->getOperand(1).getReg();
3743
3744 if (VT->signBitIsZero(Reg))
3745 return matchZeroExtendFromS32(Reg);
3746
3747 return Register();
3748}
3749
3750/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3751/// is 32-bit.
///
/// \returns \p Reg unchanged when its type is s32; otherwise whatever
/// matchZeroExtendFromS32(\p Reg) returns.
///
/// NOTE(review): the line carrying the return type appears to be missing from
/// this listing; confirm against the full file.
3753AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3754 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3755 : matchZeroExtendFromS32(Reg);
3756}
3757
3758/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3759/// is 32-bit.
///
/// \returns \p Reg unchanged when its type is s32; otherwise whatever
/// matchSignExtendFromS32(\p Reg) returns.
///
/// NOTE(review): the line carrying the return type appears to be missing from
/// this listing; confirm against the full file.
3761AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3762 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3763 : matchSignExtendFromS32(Reg);
3764}
3765
/// Dispatch to the signed or unsigned "extend from s32, or s32 itself"
/// matcher depending on \p IsSigned.
///
/// \returns the result of matchSignExtendFromS32OrS32(\p Reg) when
/// \p IsSigned is true, otherwise of matchZeroExtendFromS32OrS32(\p Reg).
///
/// NOTE(review): the line carrying the return type appears to be missing from
/// this listing; confirm against the full file.
3767AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3768 bool IsSigned) const {
3769 if (IsSigned)
3770 return matchSignExtendFromS32OrS32(Reg);
3771
3772 return matchZeroExtendFromS32OrS32(Reg);
3773}
3774
3775Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3776 Register AnyExtSrc;
3777 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3778 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3779
3780 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3781 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3782 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3783 return Register();
3784
3785 assert(Def->getNumOperands() == 3 &&
3786 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3787
3788 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3789 return Def->getOperand(1).getReg();
3790
3791 return Register();
3792}
3793
/// Select a global load-to-LDS intrinsic into one of the GLOBAL_LOAD_LDS_*
/// machine instructions.
///
/// The opcode is chosen from the byte size in operand 3 (1, 2, 4, 12 or 16).
/// Operand 2 is copied into M0, and the address in operand 1 is split into an
/// SGPR base plus a 32-bit VGPR offset when that is possible. The new
/// instruction receives one load and one store memory operand.
///
/// \returns false when the subtarget has no VMEM-to-LDS loads, when the size
/// is not one of the handled values, or when a 12/16 byte load is requested
/// without hasLDSLoadB96_B128(); true once \p MI has been replaced and erased.
///
/// NOTE(review): this listing appears to be missing several source lines (one
/// at the top of the `if (isSGPR(Addr))` block, the remainder of the
/// LoadPtrI.V assignment and the mask applied to LoadMMO->getFlags());
/// confirm against the full file.
3794bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3795 if (!Subtarget->hasVMemToLDSLoad())
3796 return false;
3797
3798 unsigned Opc;
3799 unsigned Size = MI.getOperand(3).getImm();
3800 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3801
  // Map the load width in bytes to the machine opcode.
3802 switch (Size) {
3803 default:
3804 return false;
3805 case 1:
3806 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3807 break;
3808 case 2:
3809 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3810 break;
3811 case 4:
3812 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3813 break;
3814 case 12:
3815 if (!Subtarget->hasLDSLoadB96_B128())
3816 return false;
3817 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3818 break;
3819 case 16:
3820 if (!Subtarget->hasLDSLoadB96_B128())
3821 return false;
3822 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3823 break;
3824 }
3825
  // Operand 2 is copied into M0 ahead of the load (presumably the LDS
  // destination base -- confirm against the intrinsic definition).
3826 MachineBasicBlock *MBB = MI.getParent();
3827 const DebugLoc &DL = MI.getDebugLoc();
3828 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3829 .add(MI.getOperand(2));
3830
3831 Register Addr = MI.getOperand(1).getReg();
3832 Register VOffset;
3833 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3834 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3835 if (!isSGPR(Addr)) {
3836 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3837 if (isSGPR(AddrDef->Reg)) {
3838 Addr = AddrDef->Reg;
3839 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
  // Accept (G_PTR_ADD sgpr_base, zext(s32 offset)): the base becomes the
  // scalar address and the 32-bit offset becomes VOffset.
3840 Register SAddr =
3841 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3842 if (isSGPR(SAddr)) {
3843 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3844 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3845 Addr = SAddr;
3846 VOffset = Off;
3847 }
3848 }
3849 }
3850 }
3851
  // With a scalar address a VGPR offset operand is always supplied; a zero is
  // materialized when none was matched above.
3852 if (isSGPR(Addr)) {
3854 if (!VOffset) {
3855 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3856 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3857 .addImm(0);
3858 }
3859 }
3860
3861 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3862 .addReg(Addr);
3863
3864 if (isSGPR(Addr))
3865 MIB.addReg(VOffset);
3866
3867 MIB.add(MI.getOperand(4)); // offset
3868
3869 unsigned Aux = MI.getOperand(5).getImm();
3870 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
  // Final immediate: 1 when the intrinsic is one of the async LDS DMA variants.
3871 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3872
  // Build separate load and store memory operands from the original one; both
  // pointer infos carry the immediate offset from operand 4.
3873 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3874 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3875 LoadPtrI.Offset = MI.getOperand(4).getImm();
3876 MachinePointerInfo StorePtrI = LoadPtrI;
3877 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3881 auto F = LoadMMO->getFlags() &
3883 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3884 Size, LoadMMO->getBaseAlign());
3885 MachineMemOperand *StoreMMO =
3886 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3887 sizeof(int32_t), Align(4));
3888
3889 MIB.setMemRefs({LoadMMO, StoreMMO});
3890
3891 MI.eraseFromParent();
3892 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3893 return true;
3894}
3895
3896bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3897 Intrinsic::ID IID) const {
3898 bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3899 unsigned Opc =
3900 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3901 int NumGroups = 4;
3902
3903 // A lamda function to check whether an operand is a vector of all 0s.
3904 const auto isAllZeros = [&](MachineOperand &Opnd) {
3905 const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
3906 if (!DefMI)
3907 return false;
3908 return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
3909 };
3910
3911 // Use _D2 version if both group 2 and 3 are zero-initialized.
3912 if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
3913 NumGroups = 2;
3914 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3915 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3916 }
3917
3918 // TODO: Handle the fifth group: MI.getOpetand(5), which is silently ignored
3919 // for now because all existing targets only support up to 4 groups.
3920 MachineBasicBlock *MBB = MI.getParent();
3921 auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
3922 .add(MI.getOperand(1)) // D# group 0
3923 .add(MI.getOperand(2)); // D# group 1
3924
3925 if (NumGroups >= 4) { // Has at least 4 groups
3926 MIB.add(MI.getOperand(3)) // D# group 2
3927 .add(MI.getOperand(4)); // D# group 3
3928 }
3929
3930 MIB.addImm(0) // r128
3931 .add(MI.getOperand(6)); // cpol
3932
3933 MI.eraseFromParent();
3934 return true;
3935}
3936
3937bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3938 MachineInstr &MI) const {
3939 unsigned OpcodeOpIdx =
3940 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3941 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3942 MI.removeOperand(OpcodeOpIdx);
3943 MI.addImplicitDefUseOperands(*MI.getMF());
3944 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3945 return true;
3946}
3947
3948// FIXME: This should be removed and let the patterns select. We just need the
3949// AGPR/VGPR combination versions.
3950bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3951 unsigned Opc;
3952 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3953 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3954 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3955 break;
3956 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3957 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3958 break;
3959 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3960 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3961 break;
3962 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3963 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3964 break;
3965 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3966 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3967 break;
3968 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3969 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3970 break;
3971 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3972 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3973 break;
3974 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3975 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3976 break;
3977 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3978 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3979 break;
3980 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3981 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3982 break;
3983 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3984 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3985 break;
3986 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3987 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3988 break;
3989 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3990 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3991 break;
3992 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3993 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3994 break;
3995 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3996 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3997 break;
3998 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3999 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
4000 break;
4001 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
4002 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
4003 break;
4004 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
4005 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
4006 break;
4007 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
4008 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
4009 break;
4010 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
4011 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
4012 break;
4013 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
4014 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
4015 break;
4016 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
4017 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
4018 break;
4019 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
4020 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
4021 break;
4022 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
4023 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
4024 break;
4025 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
4026 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
4027 break;
4028 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
4029 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
4030 break;
4031 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
4032 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
4033 break;
4034 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
4035 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
4036 break;
4037 default:
4038 llvm_unreachable("unhandled smfmac intrinsic");
4039 }
4040
4041 auto VDst_In = MI.getOperand(4);
4042
4043 MI.setDesc(TII.get(Opc));
4044 MI.removeOperand(4); // VDst_In
4045 MI.removeOperand(1); // Intrinsic ID
4046 MI.addOperand(VDst_In); // Readd VDst_In to the end
4047 MI.addImplicitDefUseOperands(*MI.getMF());
4048 const MCInstrDesc &MCID = MI.getDesc();
4049 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
4050 MI.getOperand(0).setIsEarlyClobber(true);
4051 }
4052 return true;
4053}
4054
/// Rewrite a permlane16/permlane32 swap intrinsic in place into
/// V_PERMLANE16_SWAP_B32_e64 or V_PERMLANE32_SWAP_B32_e64.
///
/// \p IntrID selects the variant: amdgcn_permlane16_swap maps to the
/// permlane16 opcode, anything else to the permlane32 opcode.
///
/// \returns false when the requested variant is not supported by the
/// subtarget, true otherwise.
///
/// NOTE(review): a source line following the declaration of FI appears to be
/// missing from this listing, so FI looks unused here; confirm against the
/// full file.
4055bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
4056 MachineInstr &MI, Intrinsic::ID IntrID) const {
4057 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
4058 !Subtarget->hasPermlane16Swap())
4059 return false;
4060 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
4061 !Subtarget->hasPermlane32Swap())
4062 return false;
4063
4064 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
4065 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
4066 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
4067
  // Remove operand 2, switch to the machine opcode, then append EXEC as an
  // implicit use.
4068 MI.removeOperand(2);
4069 MI.setDesc(TII.get(Opcode));
4070 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
4071
4072 MachineOperand &FI = MI.getOperand(4);
4074
4075 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
4076 return true;
4077}
4078
4079bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
4080 Register DstReg = MI.getOperand(0).getReg();
4081 Register SrcReg = MI.getOperand(1).getReg();
4082 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4083 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4084 MachineBasicBlock *MBB = MI.getParent();
4085 const DebugLoc &DL = MI.getDebugLoc();
4086
4087 if (IsVALU) {
4088 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
4089 .addImm(Subtarget->getWavefrontSizeLog2())
4090 .addReg(SrcReg);
4091 } else {
4092 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
4093 .addReg(SrcReg)
4094 .addImm(Subtarget->getWavefrontSizeLog2())
4095 .setOperandDead(3); // Dead scc
4096 }
4097
4098 const TargetRegisterClass &RC =
4099 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
4100 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
4101 return false;
4102
4103 MI.eraseFromParent();
4104 return true;
4105}
4106
4107bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
4108 MachineInstr &MI) const {
4109 assert(MI.getNumOperands() == 4);
4110 MachineBasicBlock *MBB = MI.getParent();
4111 const DebugLoc &DL = MI.getDebugLoc();
4112
4113 Register DstReg = MI.getOperand(0).getReg();
4114 Register ValReg = MI.getOperand(2).getReg();
4115 Register IdxReg = MI.getOperand(3).getReg();
4116
4117 const LLT DstTy = MRI->getType(DstReg);
4118 unsigned DstSize = DstTy.getSizeInBits();
4119 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4120 const TargetRegisterClass *DstRC =
4121 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
4122
4123 if (DstTy != LLT::scalar(32))
4124 return false;
4125
4126 if (!Subtarget->supportsBPermute())
4127 return false;
4128
4129 // If we can bpermute across the whole wave, then just do that
4130 if (Subtarget->supportsWaveWideBPermute()) {
4131 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4132 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4133 .addImm(2)
4134 .addReg(IdxReg);
4135
4136 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
4137 .addReg(ShiftIdxReg)
4138 .addReg(ValReg)
4139 .addImm(0);
4140 } else {
4141 // Otherwise, we need to make use of whole wave mode
4142 assert(Subtarget->isWave64());
4143
4144 // Set inactive lanes to poison
4145 Register UndefValReg =
4146 MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
4147 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
4148
4149 Register UndefExecReg = MRI->createVirtualRegister(
4150 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4151 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
4152
4153 Register PoisonValReg = MRI->createVirtualRegister(DstRC);
4154 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
4155 .addImm(0)
4156 .addReg(ValReg)
4157 .addImm(0)
4158 .addReg(UndefValReg)
4159 .addReg(UndefExecReg);
4160
4161 // ds_bpermute requires index to be multiplied by 4
4162 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4163 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4164 .addImm(2)
4165 .addReg(IdxReg);
4166
4167 Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
4168 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
4169 .addImm(0)
4170 .addReg(ShiftIdxReg)
4171 .addImm(0)
4172 .addReg(UndefValReg)
4173 .addReg(UndefExecReg);
4174
4175 // Get permutation of each half, then we'll select which one to use
4176 Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
4177 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
4178 .addReg(PoisonIdxReg)
4179 .addReg(PoisonValReg)
4180 .addImm(0);
4181
4182 Register SwappedValReg = MRI->createVirtualRegister(DstRC);
4183 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
4184 .addReg(PoisonValReg);
4185
4186 Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
4187 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
4188 .addReg(PoisonIdxReg)
4189 .addReg(SwappedValReg)
4190 .addImm(0);
4191
4192 Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
4193 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
4194 .addReg(OppSidePermReg);
4195
4196 // Select which side to take the permute from
4197 // We can get away with only using mbcnt_lo here since we're only
4198 // trying to detect which side of 32 each lane is on, and mbcnt_lo
4199 // returns 32 for lanes 32-63.
4200 Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
4201 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
4202 .addImm(-1)
4203 .addImm(0);
4204
4205 Register XORReg = MRI->createVirtualRegister(DstRC);
4206 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
4207 .addReg(ThreadIDReg)
4208 .addReg(PoisonIdxReg);
4209
4210 Register ANDReg = MRI->createVirtualRegister(DstRC);
4211 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
4212 .addReg(XORReg)
4213 .addImm(32);
4214
4215 Register CompareReg = MRI->createVirtualRegister(
4216 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4217 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
4218 .addReg(ANDReg)
4219 .addImm(0);
4220
4221 // Finally do the selection
4222 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
4223 .addImm(0)
4224 .addReg(WWMSwapPermReg)
4225 .addImm(0)
4226 .addReg(SameSidePermReg)
4227 .addReg(CompareReg);
4228 }
4229
4230 MI.eraseFromParent();
4231 return true;
4232}
4233
4234// Match BITOP3 operation and return a number of matched instructions plus
4235// truth table.
//
// Starting at R, a tree of G_AND / G_OR / G_XOR instructions is walked. Leaf
// operands are collected into Src (at most three entries) and the 8-bit truth
// table of the whole expression over those sources is computed.
//
// Returns {0, 0} when R is not defined by G_AND/G_OR/G_XOR, or when one of the
// two operands cannot be expressed with the available source slots; in the
// latter case Src is restored to its contents on entry. Otherwise returns
// {number of matched instructions, truth table}.
//
// NOTE(review): the parameter line declaring Src and one line inside the
// m_Not branch of getOperandBits appear to be missing from this listing;
// confirm against the full file.
4236static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
4238 const MachineRegisterInfo &MRI) {
4239 unsigned NumOpcodes = 0;
4240 uint8_t LHSBits, RHSBits;
4241
  // getOperandBits: compute the truth-table column for operand Op, reusing an
  // existing Src slot, taking over the slot currently held by R, or appending
  // a new slot. Returns false when no slot can represent Op.
4242 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
4243 // Define truth table given Src0, Src1, Src2 bits permutations:
4244 // 0 0 0
4245 // 0 0 1
4246 // 0 1 0
4247 // 0 1 1
4248 // 1 0 0
4249 // 1 0 1
4250 // 1 1 0
4251 // 1 1 1
4252 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4253
  // Constant all-ones / zero operands need no source slot.
4254 if (mi_match(Op, MRI, m_AllOnesInt())) {
4255 Bits = 0xff;
4256 return true;
4257 }
4258 if (mi_match(Op, MRI, m_ZeroInt())) {
4259 Bits = 0;
4260 return true;
4261 }
4262
4263 for (unsigned I = 0; I < Src.size(); ++I) {
4264 // Try to find existing reused operand
4265 if (Src[I] == Op) {
4266 Bits = SrcBits[I];
4267 return true;
4268 }
4269 // Try to replace parent operator
4270 if (Src[I] == R) {
4271 Bits = SrcBits[I];
4272 Src[I] = Op;
4273 return true;
4274 }
4275 }
4276
4277 if (Src.size() == 3) {
4278 // No room left for operands. Try one last time, there can be a 'not' of
4279 // one of our source operands. In this case we can compute the bits
4280 // without growing Src vector.
4281 Register LHS;
4282 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
4284 for (unsigned I = 0; I < Src.size(); ++I) {
4285 if (Src[I] == LHS) {
4286 Bits = ~SrcBits[I];
4287 return true;
4288 }
4289 }
4290 }
4291
4292 return false;
4293 }
4294
  // A free slot is still available: claim it for Op.
4295 Bits = SrcBits[Src.size()];
4296 Src.push_back(Op);
4297 return true;
4298 };
4299
4300 MachineInstr *MI = MRI.getVRegDef(R);
4301 switch (MI->getOpcode()) {
4302 case TargetOpcode::G_AND:
4303 case TargetOpcode::G_OR:
4304 case TargetOpcode::G_XOR: {
4305 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
4306 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
4307
  // Keep a copy of Src so a failed match leaves the caller's vector untouched.
4308 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
4309 if (!getOperandBits(LHS, LHSBits) ||
4310 !getOperandBits(RHS, RHSBits)) {
4311 Src = std::move(Backup);
4312 return std::make_pair(0, 0);
4313 }
4314
4315 // Recursion is naturally limited by the size of the operand vector.
4316 //
4317 // When LHS and RHS share a common sub-expression, one side's recursion
4318 // may decompose that sub-expression and replace the Src slot the other
4319 // side occupies with sub-operands via the "replace parent" path in
4320 // getOperandBits. The other side's cached bit-pattern then refers to a
4321 // slot whose contents changed, producing a wrong truth table.
4322 //
4323 // We detect this in three ways:
4324 // (A) If LHS recursed, its truth table is valid against the Src state
4325 // when LHS recursion completed (SrcAfterLHS). If RHS recursion
4326 // then mutates a Src slot that LHSBits depends on, LHSBits is
4327 // stale.
4328 // (B) If RHS did not recurse, RHSBits came from getOperandBits and
4329 // refers to a specific Src slot. If that slot's contents changed
4330 // (by either recursion), RHSBits is stale.
4331 // (C) Symmetrically for LHS if it did not recurse.
4332 SmallVector<Register, 3> SrcBeforeRecurse(Src.begin(), Src.end());
4333 uint8_t LHSBitsOrig = LHSBits;
4334 uint8_t RHSBitsOrig = RHSBits;
4335
  // Recurse into each operand; a non-zero count means the operand is itself a
  // matched logic op and its truth table replaces the plain slot pattern.
4336 auto LHSOp = BitOp3_Op(LHS, Src, MRI);
4337 if (LHSOp.first) {
4338 NumOpcodes += LHSOp.first;
4339 LHSBits = LHSOp.second;
4340 }
4341
4342 SmallVector<Register, 3> SrcAfterLHS(Src.begin(), Src.end());
4343
4344 auto RHSOp = BitOp3_Op(RHS, Src, MRI);
4345 if (RHSOp.first) {
4346 NumOpcodes += RHSOp.first;
4347 RHSBits = RHSOp.second;
4348 }
4349
4350 // dependsOnSlot: true iff the truth table TT varies with slot Slot.
4351 auto dependsOnSlot = [](uint8_t TT, int Slot) -> bool {
4352 if (Slot < 0 || Slot > 2)
4353 return false;
4354 const uint8_t Masks[3] = {0x0f, 0x33, 0x55};
4355 const int Shifts[3] = {4, 2, 1};
4356 return ((TT ^ (TT >> Shifts[Slot])) & Masks[Slot]) != 0;
4357 };
4358
4359 // findSlot: locate the Src slot a getOperandBits result depends on,
4360 // including negated (NOT) patterns that getOperandBits resolves via
4361 // the ~SrcBits[I] shortcut.
4362 const uint8_t SrcBitsConst[3] = {0xf0, 0xcc, 0xaa};
4363 auto findSlot = [&](uint8_t Bits, Register Op,
4364 const SmallVectorImpl<Register> &S) -> int {
4365 Register NegatedInner;
4366 bool IsNegationOp = mi_match(Op, MRI, m_Not(m_Reg(NegatedInner)));
4367 if (IsNegationOp)
4368 NegatedInner = getSrcRegIgnoringCopies(NegatedInner, MRI);
4369 for (int I = 0; I < (int)S.size(); I++) {
4370 if (Bits == SrcBitsConst[I] && S[I] == Op)
4371 return I;
4372 if (IsNegationOp && Bits == (uint8_t)~SrcBitsConst[I] &&
4373 S[I] == NegatedInner)
4374 return I;
4375 }
4376 return -1;
4377 };
4378
4379 bool Stale = false;
4380
4381 // (A) LHS recursed: its truth table is against SrcAfterLHS.
4382 // Check if RHS recursion mutated a slot that LHSBits uses.
4383 if (LHSOp.first) {
4384 for (int I = 0; I < (int)SrcAfterLHS.size() && I < 3; I++) {
4385 if (I < (int)Src.size() && Src[I] != SrcAfterLHS[I] &&
4386 dependsOnSlot(LHSBits, I)) {
4387 Stale = true;
4388 break;
4389 }
4390 }
4391 }
4392
4393 // (B) RHS did not recurse: RHSBits from getOperandBits is against
4394 // SrcBeforeRecurse. Check if that slot was mutated since then.
4395 if (!Stale && !RHSOp.first) {
4396 int Slot = findSlot(RHSBitsOrig, RHS, SrcBeforeRecurse);
4397 if (Slot >= 0 &&
4398 (Slot >= (int)Src.size() || Src[Slot] != SrcBeforeRecurse[Slot]))
4399 Stale = true;
4400 }
4401
4402 // (C) LHS did not recurse: LHSBits from getOperandBits is against
4403 // SrcBeforeRecurse. Check if that slot was mutated since then.
4404 if (!Stale && !LHSOp.first) {
4405 int Slot = findSlot(LHSBitsOrig, LHS, SrcBeforeRecurse);
4406 if (Slot >= 0 &&
4407 (Slot >= (int)Src.size() || Src[Slot] != SrcBeforeRecurse[Slot]))
4408 Stale = true;
4409 }
4410
  // On a stale result, discard both recursions and treat LHS and RHS as plain
  // leaves: only this instruction is then counted as matched.
4411 if (Stale) {
4412 Src = std::move(SrcBeforeRecurse);
4413 LHSBits = LHSBitsOrig;
4414 RHSBits = RHSBitsOrig;
4415 NumOpcodes = 0;
4416 }
4417 break;
4418 }
4419 default:
4420 return std::make_pair(0, 0);
4421 }
4422
  // Combine the operand truth tables with this instruction's own operation.
4423 uint8_t TTbl;
4424 switch (MI->getOpcode()) {
4425 case TargetOpcode::G_AND:
4426 TTbl = LHSBits & RHSBits;
4427 break;
4428 case TargetOpcode::G_OR:
4429 TTbl = LHSBits | RHSBits;
4430 break;
4431 case TargetOpcode::G_XOR:
4432 TTbl = LHSBits ^ RHSBits;
4433 break;
4434 default:
4435 break;
4436 }
4437
4438 return std::make_pair(NumOpcodes + 1, TTbl);
4439}
4440
/// Try to select a tree of G_AND / G_OR / G_XOR rooted at \p MI as a single
/// V_BITOP3 instruction.
///
/// BitOp3_Op() supplies the source registers, the number of matched
/// instructions and the truth table. The match is rejected when the subtarget
/// has no BITOP3 instructions, when the result is not in the VGPR bank, when
/// fewer than two instructions were matched, for the 32-bit two-instruction
/// XOR3 / OR3 / AND_OR shapes, and for other matches of fewer than four
/// instructions.
///
/// \returns true when \p MI was replaced by a BITOP3 instruction and erased,
/// false otherwise.
///
/// NOTE(review): the line declaring Src appears to be missing from this
/// listing; confirm against the full file.
4441bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4442 if (!Subtarget->hasBitOp3Insts())
4443 return false;
4444
4445 Register DstReg = MI.getOperand(0).getReg();
4446 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4447 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4448 if (!IsVALU)
4449 return false;
4450
4452 uint8_t TTbl;
4453 unsigned NumOpcodes;
4454
4455 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
4456
4457 // Src.empty() case can happen if all operands are all zero or all ones.
4458 // Normally it shall be optimized out before reaching this.
4459 if (NumOpcodes < 2 || Src.empty())
4460 return false;
4461
4462 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
4463 if (NumOpcodes == 2 && IsB32) {
4464 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4465 // asm more readable. This cannot be modeled with AddedComplexity because
4466 // selector does not know how many operations did we match.
4467 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
4468 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4469 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4470 return false;
4471 } else if (NumOpcodes < 4) {
4472 // For a uniform case threshold should be higher to account for moves
4473 // between VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be
4474 // in SGPRs and a readfirstlane after.
4475 return false;
4476 }
4477
  // Pick the opcode: 32-bit, or one of the 16-bit variants depending on the
  // subtarget's true16 support.
4478 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4479 if (!IsB32 && STI.hasTrue16BitInsts())
4480 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4481 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4482 unsigned CBL = STI.getConstantBusLimit(Opc);
4483 MachineBasicBlock *MBB = MI.getParent();
4484 const DebugLoc &DL = MI.getDebugLoc();
4485
  // SGPR sources are kept as-is while the constant bus limit allows; any
  // further SGPR source is copied into a fresh VGPR.
4486 for (unsigned I = 0; I < Src.size(); ++I) {
4487 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4488 if (RB->getID() != AMDGPU::SGPRRegBankID)
4489 continue;
4490 if (CBL > 0) {
4491 --CBL;
4492 continue;
4493 }
4494 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4495 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4496 .addReg(Src[I]);
4497 Src[I] = NewReg;
4498 }
4499
4500 // Last operand can be ignored, turning a ternary operation into a binary.
4501 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4502 // 'c' with 'a' here without changing the answer. In some pathological
4503 // cases it should be possible to get an operation with a single operand
4504 // too if optimizer would not catch it.
4505 while (Src.size() < 3)
4506 Src.push_back(Src[0]);
4507
  // The 16-bit forms take a source-modifier immediate before each source and
  // an op_sel immediate after the truth table; the 32-bit form takes neither.
4508 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4509 if (!IsB32)
4510 MIB.addImm(0); // src_mod0
4511 MIB.addReg(Src[0]);
4512 if (!IsB32)
4513 MIB.addImm(0); // src_mod1
4514 MIB.addReg(Src[1]);
4515 if (!IsB32)
4516 MIB.addImm(0); // src_mod2
4517 MIB.addReg(Src[2])
4518 .addImm(TTbl);
4519 if (!IsB32)
4520 MIB.addImm(0); // op_sel
4521
4522 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4523 MI.eraseFromParent();
4524
4525 return true;
4526}
4527
4528bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4529 Register SrcReg = MI.getOperand(0).getReg();
4530 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4531 return false;
4532
4533 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4534 Register SP =
4535 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4536 Register WaveAddr = getWaveAddress(DefMI);
4537 MachineBasicBlock *MBB = MI.getParent();
4538 const DebugLoc &DL = MI.getDebugLoc();
4539
4540 if (!WaveAddr) {
4541 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4542 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4543 .addReg(SrcReg)
4544 .addImm(Subtarget->getWavefrontSizeLog2())
4545 .setOperandDead(3); // Dead scc
4546 }
4547
4548 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4549 .addReg(WaveAddr);
4550
4551 MI.eraseFromParent();
4552 return true;
4553}
4554
4556
// Dispatch on the opcode of I to the matching hand-written select* routine.
// Depending on the opcode, the pattern-based selectImpl() is tried before the
// manual routine, after it, or is the only selector used. The result of the
// selector that ran is returned.
//
// Anything that is not a pre-isel opcode is accepted unchanged, except COPY,
// which is handed to selectCOPY().
4557 if (!I.isPreISelOpcode()) {
4558 if (I.isCopy())
4559 return selectCOPY(I);
4560 return true;
4561 }
4562
4563 switch (I.getOpcode()) {
4564 case TargetOpcode::G_AND:
4565 case TargetOpcode::G_OR:
4566 case TargetOpcode::G_XOR:
// Order of preference: BITOP3, then imported patterns, then manual selection.
4567 if (selectBITOP3(I))
4568 return true;
4569 if (selectImpl(I, *CoverageInfo))
4570 return true;
4571 return selectG_AND_OR_XOR(I);
4572 case TargetOpcode::G_ADD:
4573 case TargetOpcode::G_SUB:
4574 case TargetOpcode::G_PTR_ADD:
4575 if (selectImpl(I, *CoverageInfo))
4576 return true;
4577 return selectG_ADD_SUB(I);
4578 case TargetOpcode::G_UADDO:
4579 case TargetOpcode::G_USUBO:
4580 case TargetOpcode::G_UADDE:
4581 case TargetOpcode::G_USUBE:
4582 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4583 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4584 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4585 return selectG_AMDGPU_MAD_64_32(I);
// These four opcodes are all selected through selectCOPY().
4586 case TargetOpcode::G_INTTOPTR:
4587 case TargetOpcode::G_BITCAST:
4588 case TargetOpcode::G_PTRTOINT:
4589 case TargetOpcode::G_FREEZE:
4590 return selectCOPY(I);
4591 case TargetOpcode::G_FNEG:
4592 if (selectImpl(I, *CoverageInfo))
4593 return true;
4594 return selectG_FNEG(I);
4595 case TargetOpcode::G_FABS:
4596 if (selectImpl(I, *CoverageInfo))
4597 return true;
4598 return selectG_FABS(I);
4599 case TargetOpcode::G_EXTRACT:
4600 return selectG_EXTRACT(I);
4601 case TargetOpcode::G_MERGE_VALUES:
4602 case TargetOpcode::G_CONCAT_VECTORS:
4603 return selectG_MERGE_VALUES(I);
4604 case TargetOpcode::G_UNMERGE_VALUES:
4605 return selectG_UNMERGE_VALUES(I);
4606 case TargetOpcode::G_BUILD_VECTOR:
4607 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4608 return selectG_BUILD_VECTOR(I);
4609 case TargetOpcode::G_IMPLICIT_DEF:
4610 return selectG_IMPLICIT_DEF(I);
4611 case TargetOpcode::G_INSERT:
4612 return selectG_INSERT(I);
4613 case TargetOpcode::G_INTRINSIC:
4614 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4615 return selectG_INTRINSIC(I);
4616 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4617 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4618 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4619 case TargetOpcode::G_ICMP:
4620 case TargetOpcode::G_FCMP:
// Manual selection is tried first here; the patterns are the fallback.
4621 if (selectG_ICMP_or_FCMP(I))
4622 return true;
4623 return selectImpl(I, *CoverageInfo);
// All memory accesses and atomic read-modify-write operations share one
// selection routine.
4624 case TargetOpcode::G_LOAD:
4625 case TargetOpcode::G_ZEXTLOAD:
4626 case TargetOpcode::G_SEXTLOAD:
4627 case TargetOpcode::G_STORE:
4628 case TargetOpcode::G_ATOMIC_CMPXCHG:
4629 case TargetOpcode::G_ATOMICRMW_XCHG:
4630 case TargetOpcode::G_ATOMICRMW_ADD:
4631 case TargetOpcode::G_ATOMICRMW_SUB:
4632 case TargetOpcode::G_ATOMICRMW_AND:
4633 case TargetOpcode::G_ATOMICRMW_OR:
4634 case TargetOpcode::G_ATOMICRMW_XOR:
4635 case TargetOpcode::G_ATOMICRMW_MIN:
4636 case TargetOpcode::G_ATOMICRMW_MAX:
4637 case TargetOpcode::G_ATOMICRMW_UMIN:
4638 case TargetOpcode::G_ATOMICRMW_UMAX:
4639 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4640 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4641 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4642 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4643 case TargetOpcode::G_ATOMICRMW_FADD:
4644 case TargetOpcode::G_ATOMICRMW_FMIN:
4645 case TargetOpcode::G_ATOMICRMW_FMAX:
4646 return selectG_LOAD_STORE_ATOMICRMW(I);
4647 case TargetOpcode::G_SELECT:
4648 return selectG_SELECT(I);
4649 case TargetOpcode::G_TRUNC:
4650 return selectG_TRUNC(I);
4651 case TargetOpcode::G_SEXT:
4652 case TargetOpcode::G_ZEXT:
4653 case TargetOpcode::G_ANYEXT:
4654 case TargetOpcode::G_SEXT_INREG:
4655 // This is a workaround. For extension from type i1, `selectImpl()` uses
4656 // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
4657 // i1 can only be held in an SGPR class.
4658 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4659 selectImpl(I, *CoverageInfo))
4660 return true;
4661 return selectG_SZA_EXT(I);
4662 case TargetOpcode::G_FPEXT:
4663 if (selectG_FPEXT(I))
4664 return true;
4665 return selectImpl(I, *CoverageInfo);
4666 case TargetOpcode::G_BRCOND:
4667 return selectG_BRCOND(I);
4668 case TargetOpcode::G_GLOBAL_VALUE:
4669 return selectG_GLOBAL_VALUE(I);
4670 case TargetOpcode::G_PTRMASK:
4671 return selectG_PTRMASK(I);
4672 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4673 return selectG_EXTRACT_VECTOR_ELT(I);
4674 case TargetOpcode::G_INSERT_VECTOR_ELT:
4675 return selectG_INSERT_VECTOR_ELT(I);
4676 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4677 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4678 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4679 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4680 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
// The image-dimension info record for I is asserted to be non-null before
// being handed to selectImageIntrinsic().
4681 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4683 assert(Intr && "not an image intrinsic with image pseudo");
4684 return selectImageIntrinsic(I, Intr);
4685 }
4686 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4687 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4688 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4689 return selectBVHIntersectRayIntrinsic(I);
4690 case AMDGPU::G_SBFX:
4691 case AMDGPU::G_UBFX:
4692 return selectG_SBFX_UBFX(I);
4693 case AMDGPU::G_SI_CALL:
// Only the instruction descriptor is replaced; the operands are left as-is.
4694 I.setDesc(TII.get(AMDGPU::SI_CALL));
4695 return true;
4696 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4697 return selectWaveAddress(I);
4698 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
// Same as G_SI_CALL: a descriptor swap only.
4699 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4700 return true;
4701 }
4702 case AMDGPU::G_STACKRESTORE:
4703 return selectStackRestore(I);
4704 case AMDGPU::G_PHI:
4705 return selectPHI(I);
4706 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4707 return selectCOPY_SCC_VCC(I);
4708 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4709 return selectCOPY_VCC_SCC(I);
4710 case AMDGPU::G_AMDGPU_READANYLANE:
4711 return selectReadAnyLane(I);
// Constants and every opcode not listed above rely on the patterns alone.
4712 case TargetOpcode::G_CONSTANT:
4713 case TargetOpcode::G_FCONSTANT:
4714 default:
4715 return selectImpl(I, *CoverageInfo);
4716 }
// Not reached: every path through the switch above returns.
4717 return false;
4718}
4719
/// Accept \p Root as-is: the result holds a single renderer that adds the
/// root operand to the instruction being built.
4721AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4722 return {{
4723 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4724 }};
4725
4726}
4727
4728std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4729 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4730 unsigned Mods = 0;
4731 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4732
4733 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4734 Src = MI->getOperand(1).getReg();
4735 Mods |= SISrcMods::NEG;
4736 MI = getDefIgnoringCopies(Src, *MRI);
4737 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4738 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4739 // denormal mode, but we're implicitly canonicalizing in a source operand.
4740 const ConstantFP *LHS =
4741 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4742 if (LHS && LHS->isZero()) {
4743 Mods |= SISrcMods::NEG;
4744 Src = MI->getOperand(2).getReg();
4745 }
4746 }
4747
4748 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4749 Src = MI->getOperand(1).getReg();
4750 Mods |= SISrcMods::ABS;
4751 }
4752
4753 if (OpSel)
4754 Mods |= SISrcMods::OP_SEL_0;
4755
4756 return std::pair(Src, Mods);
4757}
4758
4759std::pair<Register, unsigned>
4760AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const {
4761 unsigned Mods;
4762 std::tie(Src, Mods) = selectVOP3ModsImpl(Src);
4763 Mods |= SISrcMods::OP_SEL_1;
4764 return std::pair(Src, Mods);
4765}
4766
4767Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4768 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4769 bool ForceVGPR) const {
4770 if ((Mods != 0 || ForceVGPR) &&
4771 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4772
4773 // If we looked through copies to find source modifiers on an SGPR operand,
4774 // we now have an SGPR register source. To avoid potentially violating the
4775 // constant bus restriction, we need to insert a copy to a VGPR.
4776 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4777 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4778 TII.get(AMDGPU::COPY), VGPRSrc)
4779 .addReg(Src);
4780 Src = VGPRSrc;
4781 }
4782
4783 return Src;
4784}
4785
4786///
4787/// This will select either an SGPR or VGPR operand and will save us from
4788/// having to write an extra tablegen pattern.
4790AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
// Single renderer: add the root operand to the new instruction unchanged.
4791 return {{
4792 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4793 }};
4794}
4795
/// Render a source with modifiers followed by clamp and omod.
///
/// Modifiers are folded from the definition of \p Root's register with
/// selectVOP3ModsImpl() and its default options. Four renderers are
/// returned: the source register, the modifier bits, and zero for both clamp
/// and omod.
4797AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4798 Register Src;
4799 unsigned Mods;
4800 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4801
4802 return {{
4803 [=](MachineInstrBuilder &MIB) {
// The instruction under construction is the insertion point for any copy.
4804 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4805 },
4806 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4807 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4808 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4809 }};
4810}
4811
/// Like the source + clamp + omod rendering with four renderers, but calls
/// selectVOP3ModsImpl() with AllowAbs=false so that a G_FABS definition is
/// never folded into the modifier bits.
4813AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4814 Register Src;
4815 unsigned Mods;
4816 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4817 /*IsCanonicalizing=*/true,
4818 /*AllowAbs=*/false);
4819
4820 return {{
4821 [=](MachineInstrBuilder &MIB) {
// The instruction under construction is the insertion point for any copy.
4822 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4823 },
4824 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4825 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4826 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4827 }};
4828}
4829
/// Render \p Root unchanged, followed by zero immediates for clamp and omod.
/// No source modifiers are looked for.
4831AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4832 return {{
4833 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4834 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4835 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4836 }};
4837}
4838
/// Render a source register and its modifier bits.
///
/// Modifiers are folded from the definition of \p Root's register with
/// selectVOP3ModsImpl() and its default options.
4840AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4841 Register Src;
4842 unsigned Mods;
4843 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4844
4845 return {{
4846 [=](MachineInstrBuilder &MIB) {
// The instruction under construction is the insertion point for any copy.
4847 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4848 },
4849 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4850 }};
4851}
4852
/// Render a source register and its modifier bits, calling
/// selectVOP3ModsImpl() with IsCanonicalizing=false so that a G_FSUB from
/// zero is not treated as a negate.
4854AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4855 MachineOperand &Root) const {
4856 Register Src;
4857 unsigned Mods;
4858 std::tie(Src, Mods) =
4859 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4860
4861 return {{
4862 [=](MachineInstrBuilder &MIB) {
// The instruction under construction is the insertion point for any copy.
4863 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4864 },
4865 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4866 }};
4867}
4868
/// Render a source register and its modifier bits, calling
/// selectVOP3ModsImpl() with AllowAbs=false so that a G_FABS definition is
/// never folded.
4870AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4871 Register Src;
4872 unsigned Mods;
4873 std::tie(Src, Mods) =
4874 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4875 /*AllowAbs=*/false);
4876
4877 return {{
4878 [=](MachineInstrBuilder &MIB) {
// The instruction under construction is the insertion point for any copy.
4879 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4880 },
4881 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4882 }};
4883}
4884
/// Match only sources that carry no foldable modifier.
///
/// Returns an empty result when the definition of \p Root's register
/// (looking through copies) is G_FNEG or G_FABS; otherwise returns a single
/// renderer that adds the register.
4886AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4887 Register Reg = Root.getReg();
4888 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4889 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4890 return {};
4891 return {{
4892 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4893 }};
4894}
4895
// Status values used below to describe how a tracked source value relates to
// the register currently being examined.
4896enum class SrcStatus {
4901 // This means current op = [op_upper, op_lower] and src = -op_lower.
4904 // This means current op = [op_upper, op_lower] and src = [op_upper,
4905 // -op_lower].
4913};
4914/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4915static bool isTruncHalf(const MachineInstr *MI,
4916 const MachineRegisterInfo &MRI) {
4917 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4918 return false;
4919
4920 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4921 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4922 return DstSize * 2 == SrcSize;
4923}
4924
4925/// Test if the MI is logic shift right with half bits,
4926/// such as `%reg0:2n =G_LSHR %reg1:2n, CONST(n)`
4927static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4928 if (MI->getOpcode() != AMDGPU::G_LSHR)
4929 return false;
4930
4931 Register ShiftSrc;
4932 std::optional<ValueAndVReg> ShiftAmt;
4933 if (mi_match(MI->getOperand(0).getReg(), MRI,
4934 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4935 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4936 unsigned Shift = ShiftAmt->Value.getZExtValue();
4937 return Shift * 2 == SrcSize;
4938 }
4939 return false;
4940}
4941
4942/// Test if the MI is shift left with half bits,
4943/// such as `%reg0:2n =G_SHL %reg1:2n, CONST(n)`
4944static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4945 if (MI->getOpcode() != AMDGPU::G_SHL)
4946 return false;
4947
4948 Register ShiftSrc;
4949 std::optional<ValueAndVReg> ShiftAmt;
4950 if (mi_match(MI->getOperand(0).getReg(), MRI,
4951 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4952 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4953 unsigned Shift = ShiftAmt->Value.getZExtValue();
4954 return Shift * 2 == SrcSize;
4955 }
4956 return false;
4957}
4958
4959/// Test function, if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4960static bool isUnmergeHalf(const MachineInstr *MI,
4961 const MachineRegisterInfo &MRI) {
4962 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4963 return false;
4964 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4965 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4966}
4967
4969
// Classify the type of Reg: a scalar type yields TypeClass::SCALAR, and a
// vector type with exactly two elements is recognised separately.
4971 const MachineRegisterInfo &MRI) {
4972 LLT OpTy = MRI.getType(Reg);
4973 if (OpTy.isScalar())
4974 return TypeClass::SCALAR;
4975 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4978}
4979
// Compute the status that results from looking through a negation.
//
// S relates the tracked source to the value in Reg; the returned status
// relates it to the operand being negated. The derivation for each
// combination of S and the type class of Reg is spelled out in the comments
// of each case. SrcStatus::INVALID is returned when Reg is neither a scalar
// nor a two-element vector.
4981 const MachineRegisterInfo &MRI) {
4982 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4983 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4984 return SrcStatus::INVALID;
4985
4986 switch (S) {
4987 case SrcStatus::IS_SAME:
4988 if (NegType == TypeClass::VECTOR_OF_TWO) {
4989 // Vector of 2:
4990 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4991 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4992 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4993 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4995 }
4996 if (NegType == TypeClass::SCALAR) {
4997 // Scalar:
4998 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4999 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
5000 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
5001 // [SrcHi, SrcLo] = [-OpHi, OpLo]
5002 return SrcStatus::IS_HI_NEG;
5003 }
5004 break;
5006 if (NegType == TypeClass::VECTOR_OF_TWO) {
5007 // Vector of 2:
5008 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5009 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
5010 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
5011 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
5012 return SrcStatus::IS_LO_NEG;
5013 }
5014 if (NegType == TypeClass::SCALAR) {
5015 // Scalar:
5016 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5017 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
5018 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
5019 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
5020 return SrcStatus::IS_SAME;
5021 }
5022 break;
5024 if (NegType == TypeClass::VECTOR_OF_TWO) {
5025 // Vector of 2:
5026 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
5027 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
5028 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
5029 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
5030 return SrcStatus::IS_HI_NEG;
5031 }
5032 if (NegType == TypeClass::SCALAR) {
5033 // Scalar:
5034 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
5035 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
5036 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
5037 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
5039 }
5040 break;
5042 if (NegType == TypeClass::VECTOR_OF_TWO) {
5043 // Vector of 2:
5044 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
5045 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
5046 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
5047 // [SrcHi, SrcLo] = [OpHi, OpLo]
5048 return SrcStatus::IS_SAME;
5049 }
5050 if (NegType == TypeClass::SCALAR) {
5051 // Scalar:
5052 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
5053 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
5054 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
5055 // [SrcHi, SrcLo] = [OpHi, -OpLo]
5056 return SrcStatus::IS_LO_NEG;
5057 }
5058 break;
5060 // Vector of 2:
5061 // Src = CurrUpper
5062 // Curr = [CurrUpper, CurrLower]
5063 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5064 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5065 // Src = -OpUpper
5066 //
5067 // Scalar:
5068 // Src = CurrUpper
5069 // Curr = [CurrUpper, CurrLower]
5070 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5071 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5072 // Src = -OpUpper
5075 if (NegType == TypeClass::VECTOR_OF_TWO) {
5076 // Vector of 2:
5077 // Src = CurrLower
5078 // Curr = [CurrUpper, CurrLower]
5079 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5080 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5081 // Src = -OpLower
5083 }
5084 if (NegType == TypeClass::SCALAR) {
5085 // Scalar:
5086 // Src = CurrLower
5087 // Curr = [CurrUpper, CurrLower]
5088 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5089 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5090 // Src = OpLower
5092 }
5093 break;
5095 // Vector of 2:
5096 // Src = -CurrUpper
5097 // Curr = [CurrUpper, CurrLower]
5098 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5099 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5100 // Src = -(-OpUpper) = OpUpper
5101 //
5102 // Scalar:
5103 // Src = -CurrUpper
5104 // Curr = [CurrUpper, CurrLower]
5105 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5106 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5107 // Src = -(-OpUpper) = OpUpper
5110 if (NegType == TypeClass::VECTOR_OF_TWO) {
5111 // Vector of 2:
5112 // Src = -CurrLower
5113 // Curr = [CurrUpper, CurrLower]
5114 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5115 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5116 // Src = -(-OpLower) = OpLower
5118 }
5119 if (NegType == TypeClass::SCALAR) {
5120 // Scalar:
5121 // Src = -CurrLower
5122 // Curr = [CurrUpper, CurrLower]
5123 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5124 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5125 // Src = -OpLower
5127 }
5128 break;
5129 default:
5130 break;
5131 }
// Reached only when a case above breaks out of the switch without returning.
5132 llvm_unreachable("unexpected SrcStatus & NegType combination");
5133}
5134
/// Take one step up the definition chain of \p Curr.
///
/// \p Curr pairs a register with the status relating the tracked source to
/// it. The defining instruction of that register is inspected and, when it
/// can be looked through, the pair for its input operand is returned with the
/// status updated accordingly. Returns std::nullopt when no step is possible.
5135static std::optional<std::pair<Register, SrcStatus>>
5136calcNextStatus(std::pair<Register, SrcStatus> Curr,
5137 const MachineRegisterInfo &MRI) {
5138 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
5139
5140 unsigned Opc = MI->getOpcode();
5141
5142 // Handle general Opc cases.
5143 switch (Opc) {
5144 case AMDGPU::G_BITCAST:
// A bitcast keeps the status unchanged.
5145 return std::optional<std::pair<Register, SrcStatus>>(
5146 {MI->getOperand(1).getReg(), Curr.second});
5147 case AMDGPU::COPY:
// Copies from physical registers end the walk.
5148 if (MI->getOperand(1).getReg().isPhysical())
5149 return std::nullopt;
5150 return std::optional<std::pair<Register, SrcStatus>>(
5151 {MI->getOperand(1).getReg(), Curr.second});
5152 case AMDGPU::G_FNEG: {
5153 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
5154 if (Stat == SrcStatus::INVALID)
5155 return std::nullopt;
5156 return std::optional<std::pair<Register, SrcStatus>>(
5157 {MI->getOperand(1).getReg(), Stat});
5158 }
5159 default:
5160 break;
5161 }
5162
5163 // Calc next Stat from current Stat.
5164 switch (Curr.second) {
5165 case SrcStatus::IS_SAME:
5166 if (isTruncHalf(MI, MRI))
5167 return std::optional<std::pair<Register, SrcStatus>>(
5168 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
5169 else if (isUnmergeHalf(MI, MRI)) {
// The first result of the unmerge is the lower half, the second the upper.
5170 if (Curr.first == MI->getOperand(0).getReg())
5171 return std::optional<std::pair<Register, SrcStatus>>(
5172 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
5173 return std::optional<std::pair<Register, SrcStatus>>(
5174 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
5175 }
5176 break;
5178 if (isTruncHalf(MI, MRI)) {
5179 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5180 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
5181 // = [OpLowerHi, OpLowerLo]
5182 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5183 // = [-OpLowerHi, OpLowerLo]
5184 // = -OpLower
5185 return std::optional<std::pair<Register, SrcStatus>>(
5186 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5187 }
5188 if (isUnmergeHalf(MI, MRI)) {
5189 if (Curr.first == MI->getOperand(0).getReg())
5190 return std::optional<std::pair<Register, SrcStatus>>(
5191 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5192 return std::optional<std::pair<Register, SrcStatus>>(
5193 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5194 }
5195 break;
5197 if (isShlHalf(MI, MRI))
5198 return std::optional<std::pair<Register, SrcStatus>>(
5199 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
5200 break;
5202 if (isLshrHalf(MI, MRI))
5203 return std::optional<std::pair<Register, SrcStatus>>(
5204 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
5205 break;
5207 if (isShlHalf(MI, MRI))
5208 return std::optional<std::pair<Register, SrcStatus>>(
5209 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5210 break;
5212 if (isLshrHalf(MI, MRI))
5213 return std::optional<std::pair<Register, SrcStatus>>(
5214 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5215 break;
5216 default:
5217 break;
5218 }
5219 return std::nullopt;
5220}
5221
5222/// This is used to control valid status that current MI supports. For example,
5223/// non floating point intrinsic such as @llvm.amdgcn.sdot2 does not support NEG
5224/// bit on VOP3P.
5225/// The class can be further extended to recognize support on SEL, NEG, ABS bit
5226/// for different MI on different arch
5228private:
// Whether statuses in the NEG range are accepted by checkOptions().
5229 bool HasNeg = false;
5230 // Assume all complex pattern of VOP3P have opsel.
5231 bool HasOpsel = true;
5232
5233public:
5235 const MachineInstr *MI = MRI.getVRegDef(Reg);
5236 unsigned Opc = MI->getOpcode();
5237
5238 if (Opc == TargetOpcode::G_INTRINSIC) {
5239 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
5240 // Only floating point intrinsics have neg & neg_hi bits.
5241 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
5242 HasNeg = true;
5244 // Keep same for generic op.
5245 HasNeg = true;
5246 }
5247 }
// Return false when Stat needs a capability that is disabled: a status in
// [NEG_START, NEG_END] without HasNeg, or a status in [HALF_START, HALF_END]
// without HasOpsel.
5248 bool checkOptions(SrcStatus Stat) const {
5249 if (!HasNeg &&
5250 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
5251 return false;
5252 }
5253 if (!HasOpsel &&
5254 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
5255 return false;
5256 }
5257 return true;
5258 }
5259};
5260
// Walk up the definition chain of Reg with calcNextStatus(), starting from
// status IS_SAME, and collect every (register, status) pair that
// SO.checkOptions() accepts. The walk stops when no further step is possible
// or once Depth exceeds MaxDepth. Pairs are stored in the order visited, so
// the last entry is the furthest from Reg.
5263 int MaxDepth = 3) {
5264 int Depth = 0;
5265 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
5267
5268 while (Depth <= MaxDepth && Curr.has_value()) {
5269 Depth++;
5270 if (SO.checkOptions(Curr.value().second))
5271 Statlist.push_back(Curr.value());
// The walk continues even through a status that was rejected above.
5272 Curr = calcNextStatus(Curr.value(), MRI);
5273 }
5274
5275 return Statlist;
5276}
5277
// Walk up the definition chain of Reg with calcNextStatus() and return the
// furthest (register, status) pair that passes SO.checkOptions() and whose
// status satisfies the condition below (IS_SAME, IS_HI_NEG, ...). If no step
// qualifies, {Reg, IS_SAME} is returned. The walk stops when no further step
// is possible or once Depth exceeds MaxDepth.
5278static std::pair<Register, SrcStatus>
5280 int MaxDepth = 3) {
5281 int Depth = 0;
5282 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
5283 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
5284
5285 while (Depth <= MaxDepth && Curr.has_value()) {
5286 Depth++;
5287 SrcStatus Stat = Curr.value().second;
5288 if (SO.checkOptions(Stat)) {
5289 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
5291 LastSameOrNeg = Curr.value();
5292 }
5293 Curr = calcNextStatus(Curr.value(), MRI);
5294 }
5295
5296 return LastSameOrNeg;
5297}
5298
5299static bool isSameBitWidth(Register Reg1, Register Reg2,
5300 const MachineRegisterInfo &MRI) {
5301 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
5302 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
5303 return Width1 == Width2;
5304}
5305
5306static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5307 // SrcStatus::IS_LOWER_HALF remain 0.
5308 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5309 Mods ^= SISrcMods::NEG_HI;
5310 Mods |= SISrcMods::OP_SEL_1;
5311 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5312 Mods |= SISrcMods::OP_SEL_1;
5313 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5314 Mods ^= SISrcMods::NEG_HI;
5315 else if (HiStat == SrcStatus::IS_HI_NEG)
5316 Mods ^= SISrcMods::NEG_HI;
5317
5318 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5319 Mods ^= SISrcMods::NEG;
5320 Mods |= SISrcMods::OP_SEL_0;
5321 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5322 Mods |= SISrcMods::OP_SEL_0;
5323 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5324 Mods |= SISrcMods::NEG;
5325 else if (LoStat == SrcStatus::IS_HI_NEG)
5326 Mods ^= SISrcMods::NEG;
5327
5328 return Mods;
5329}
5330
// Return true when the high and low elements can both be taken from NewReg:
// NewReg must have the same bit width as RootReg and both statuses must
// satisfy IsHalfState. TII is not referenced by the code visible here.
5331static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5332 Register RootReg, const SIInstrInfo &TII,
5333 const MachineRegisterInfo &MRI) {
5334 auto IsHalfState = [](SrcStatus S) {
5337 };
5338 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
5339 IsHalfState(HiStat);
5340}
5341
/// Find the register and modifier bits to use for a packed source operand.
///
/// \param RootReg register of the operand being matched.
/// \param MRI     register info used to inspect types and definitions.
/// \param IsDOT   when set, op_sel folding from a G_BUILD_VECTOR is skipped
///                on subtargets for which hasDOTOpSelHazard() is true.
/// \returns the register to use and the modifier bits. When no op_sel
///          folding is done, OP_SEL_1 is set as the default.
5342std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5343 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5344 unsigned Mods = 0;
5345 // No modification if Root type is not form of <2 x Type>.
5346 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5347 Mods |= SISrcMods::OP_SEL_1;
5348 return {RootReg, Mods};
5349 }
5350
5351 SearchOptions SO(RootReg, MRI);
5352
// First look through negations of the whole vector.
5353 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
5354
5355 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5357 else if (Stat.second == SrcStatus::IS_HI_NEG)
5358 Mods ^= SISrcMods::NEG_HI;
5359 else if (Stat.second == SrcStatus::IS_LO_NEG)
5360 Mods ^= SISrcMods::NEG;
5361
5362 // 64-bit VOP3P instructions do not have OPSEL or ABS. Bail on v2f64 or v2i64.
5363 // TODO: Select NEG_LO and NEG_HI modifiers from BUILD_VECTOR.
5364 if (MRI.getType(RootReg).getSizeInBits() == 128) {
5365 Mods |= SISrcMods::OP_SEL_1; // Just the default, OPSEL unsupported.
5366 return {Stat.first, Mods};
5367 }
5368
5369 MachineInstr *MI = MRI.getVRegDef(Stat.first);
5370
// Element-wise folding needs a two-source G_BUILD_VECTOR.
5371 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5372 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5373 Mods |= SISrcMods::OP_SEL_1;
5374 return {Stat.first, Mods};
5375 }
5376
// Candidate statuses for the high element (operand 2 of the build vector).
5378 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
5379
5380 if (StatlistHi.empty()) {
5381 Mods |= SISrcMods::OP_SEL_1;
5382 return {Stat.first, Mods};
5383 }
5384
// Candidate statuses for the low element (operand 1 of the build vector).
5386 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
5387
5388 if (StatlistLo.empty()) {
5389 Mods |= SISrcMods::OP_SEL_1;
5390 return {Stat.first, Mods};
5391 }
5392
// Search from the last entries backwards for a register shared by both lists
// that is valid to pack.
5393 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5394 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5395 if (StatlistHi[I].first == StatlistLo[J].first &&
5396 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
5397 StatlistHi[I].first, RootReg, TII, MRI))
5398 return {StatlistHi[I].first,
5399 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
5400 }
5401 }
5402 // Packed instructions do not have abs modifiers.
5403 Mods |= SISrcMods::OP_SEL_1;
5404
5405 return {Stat.first, Mods};
5406}
5407
5408// Removed unused function `getAllKindImm` to eliminate dead code.
5409
5410static bool checkRB(Register Reg, unsigned int RBNo,
5411 const AMDGPURegisterBankInfo &RBI,
5412 const MachineRegisterInfo &MRI,
5413 const TargetRegisterInfo &TRI) {
5414 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5415 return RB->getID() == RBNo;
5416}
5417
5418// This function is used to get the correct register bank for returned reg.
5419// Assume:
5420// 1. VOP3P is always legal for VGPR.
5421// 2. RootOp's regbank is legal.
5422// Thus
5423// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5424// 2. If RootOp is VGPR, then NewOp must be VGPR.
//
// Returns NewReg when it can be used directly, RootReg when RootReg is
// already a COPY of NewReg, and otherwise the destination of a new COPY of
// NewReg inserted before the definition of RootReg.
5426 const AMDGPURegisterBankInfo &RBI,
5428 const TargetRegisterInfo &TRI,
5429 const SIInstrInfo &TII) {
5430 // RootOp can only be VGPR or SGPR (some hand written cases such as.
5431 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
5432 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5433 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5434 return NewReg;
5435
5436 MachineInstr *MI = MRI.getVRegDef(RootReg);
5437 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
5438 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5439 return RootReg;
5440 }
5441
5442 MachineBasicBlock *BB = MI->getParent();
// The copy destination is cloned from RootReg.
5443 Register DstReg = MRI.cloneVirtualRegister(RootReg);
5444
5446 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
5447 .addReg(NewReg);
5448
5449 // Only accept VGPR.
5450 return MIB->getOperand(0).getReg();
5451}
5452
/// Shared implementation for the packed source-with-modifiers matchers.
///
/// Runs selectVOP3PModsImpl() on \p Root's register, passes the result
/// through getLegalRegBank(), and returns two renderers: the register and the
/// modifier bits. \p IsDOT is forwarded to selectVOP3PModsImpl().
5454AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5455 bool IsDOT) const {
5456 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5457 Register Reg;
5458 unsigned Mods;
5459 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
5460
5461 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
5462 return {{
5463 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5464 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5465 }};
5466}
5467
/// Packed source with modifiers; forwards to selectVOP3PRetHelper() with its
/// default IsDOT argument.
5469AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5470
5471 return selectVOP3PRetHelper(Root);
5472}
5473
/// Packed source with modifiers for DOT instructions; forwards to
/// selectVOP3PRetHelper() with IsDOT set.
5475AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5476
5477 return selectVOP3PRetHelper(Root, true);
5478}
5479
/// Match a DOT source only when no modifier would be folded.
///
/// Returns an empty result unless selectVOP3PModsImpl() yields exactly the
/// default OP_SEL_1; otherwise returns a single renderer for the register.
5481AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const {
5482 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5483 Register Src;
5484 unsigned Mods;
5485 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true /*IsDOT*/);
5486 if (Mods != SISrcMods::OP_SEL_1)
5487 return {};
5488
5489 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5490}
5491
/// Render the register and modifier bits computed by
/// selectVOP3PModsF32Impl() for \p Root's register.
5493AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const {
5494 Register Src;
5495 unsigned Mods;
5496 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5497
5498 return {{
5499 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5500 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5501 }};
5502}
5503
/// Match only when selectVOP3PModsF32Impl() folds nothing, i.e. the modifier
/// bits are exactly OP_SEL_1. Returns an empty result otherwise, and a single
/// renderer for the register on success.
5505AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const {
5506 Register Src;
5507 unsigned Mods;
5508 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5509 if (Mods != SISrcMods::OP_SEL_1)
5510 return {};
5511
5512 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5513}
5514
/// Turn an immediate i1 operand into modifier bits.
///
/// \p Root must be an immediate equal to 0 or -1 (asserted). The single
/// renderer emits OP_SEL_1, with OP_SEL_0 added when the immediate is
/// non-zero.
5516AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5517 MachineOperand &Root) const {
5518 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5519 "expected i1 value");
5520 unsigned Mods = SISrcMods::OP_SEL_1;
5521 if (Root.getImm() != 0)
5522 Mods |= SISrcMods::OP_SEL_0;
5523
5524 return {{
5525 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5526 }};
5527}
5528
// Build a REG_SEQUENCE from the registers in Elts, positioned at InsertPt,
// and return the newly created result register. The result register class is
// chosen from the element count: 8 -> VReg_256, 4 -> VReg_128, 2 -> VReg_64.
// Any other count is treated as unreachable.
5530 MachineInstr *InsertPt,
5531 MachineRegisterInfo &MRI) {
5532 const TargetRegisterClass *DstRegClass;
5533 switch (Elts.size()) {
5534 case 8:
5535 DstRegClass = &AMDGPU::VReg_256RegClass;
5536 break;
5537 case 4:
5538 DstRegClass = &AMDGPU::VReg_128RegClass;
5539 break;
5540 case 2:
5541 DstRegClass = &AMDGPU::VReg_64RegClass;
5542 break;
5543 default:
5544 llvm_unreachable("unhandled Reg sequence size");
5545 }
5546
5547 MachineIRBuilder B(*InsertPt);
5548 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5549 .addDef(MRI.createVirtualRegister(DstRegClass));
5550 for (unsigned i = 0; i < Elts.size(); ++i) {
5551 MIB.addReg(Elts[i]);
5553 }
5554 return MIB->getOperand(0).getReg();
5555}
5556
// Folds a per-element fneg/fabs into source modifiers for a WMMA operand.
//
// ModOpcode is the modifier already stripped from every element of Elts by
// the caller (G_FNEG or G_FABS; anything else trips the assert). Mods is
// updated in place and Src is overwritten with a new REG_SEQUENCE built at
// InsertPt:
//   - G_FNEG, and every element is additionally a G_FABS: NEG | NEG_HI, and
//     the sequence is built from the fabs sources.
//   - G_FNEG otherwise: NEG, sequence built from Elts.
//   - G_FABS: NEG_HI, sequence built from Elts.
// NOTE(review): one parameter line (listing line 5558) is absent from this
// listing; callers pass (ModOpcode, Mods, Elts, Src, InsertPt, MRI).
5557static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5559 MachineInstr *InsertPt,
5560 MachineRegisterInfo &MRI) {
5561 if (ModOpcode == TargetOpcode::G_FNEG) {
5562 Mods |= SISrcMods::NEG;
5563 // Check if all elements also have abs modifier
5564 SmallVector<Register, 8> NegAbsElts;
5565 for (auto El : Elts) {
5566 Register FabsSrc;
5567 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5568 break;
5569 NegAbsElts.push_back(FabsSrc);
5570 }
// The loop above stops at the first non-fabs element, so the sizes only
// match when every element was a fabs.
5571 if (Elts.size() != NegAbsElts.size()) {
5572 // Neg
5573 Src = buildRegSequence(Elts, InsertPt, MRI);
5574 } else {
5575 // Neg and Abs
5576 Mods |= SISrcMods::NEG_HI;
5577 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5578 }
5579 } else {
5580 assert(ModOpcode == TargetOpcode::G_FABS);
5581 // Abs
5582 Mods |= SISrcMods::NEG_HI;
5583 Src = buildRegSequence(Elts, InsertPt, MRI);
5584 }
5585}
5586
// Selects a WMMA source operand built from a G_BUILD_VECTOR, folding a
// uniform fneg or fabs on all elements into the source modifiers.
//
// The modifier to look for is decided by the first element (G_FNEG if it is
// one, otherwise G_FABS). Only when every element is defined by that opcode
// is selectWMMAModsNegAbs called to rewrite Src and Mods; otherwise the
// original register is rendered with Mods = OP_SEL_1.
// NOTE(review): the declaration of EltsF32 (listing line 5591) is absent from
// this listing.
5588AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5589 Register Src = Root.getReg();
5590 unsigned Mods = SISrcMods::OP_SEL_1;
5592
5593 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5594 assert(BV->getNumSources() > 0);
5595 // Based on first element decide which mod we match, neg or abs
5596 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5597 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5598 ? AMDGPU::G_FNEG
5599 : AMDGPU::G_FABS;
// Collect the modifier's input for each element; stop at the first element
// that is not defined by ModOpcode.
5600 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5601 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5602 if (ElF32->getOpcode() != ModOpcode)
5603 break;
5604 EltsF32.push_back(ElF32->getOperand(1).getReg());
5605 }
5606
5607 // All elements had ModOpcode modifier
5608 if (BV->getNumSources() == EltsF32.size()) {
5609 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5610 *MRI);
5611 }
5612 }
5613
5614 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5615 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5616}
5617
// Selects a WMMA source operand built from a G_CONCAT_VECTORS, folding an
// fneg applied to every concatenated piece into the source modifiers.
//
// If each source of the concat is a G_FNEG, Src is replaced by a REG_SEQUENCE
// of the un-negated inputs and both NEG and NEG_HI are set in Mods.
// Otherwise the original register is rendered with Mods = OP_SEL_1.
5619AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5620 Register Src = Root.getReg();
5621 unsigned Mods = SISrcMods::OP_SEL_1;
5622 SmallVector<Register, 8> EltsV2F16;
5623
5624 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
// Stop collecting at the first piece that is not an fneg.
5625 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5626 Register FNegSrc;
5627 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5628 break;
5629 EltsV2F16.push_back(FNegSrc);
5630 }
5631
5632 // All elements had ModOpcode modifier
5633 if (CV->getNumSources() == EltsV2F16.size()) {
5634 Mods |= SISrcMods::NEG;
5635 Mods |= SISrcMods::NEG_HI;
5636 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5637 }
5638 }
5639
5640 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5641 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5642}
5643
5645AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5646 Register Src = Root.getReg();
5647 unsigned Mods = SISrcMods::OP_SEL_1;
5648 SmallVector<Register, 8> EltsV2F16;
5649
5650 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5651 assert(CV->getNumSources() > 0);
5652 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5653 // Based on first element decide which mod we match, neg or abs
5654 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5655 ? AMDGPU::G_FNEG
5656 : AMDGPU::G_FABS;
5657
5658 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5659 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5660 if (ElV2F16->getOpcode() != ModOpcode)
5661 break;
5662 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5663 }
5664
5665 // All elements had ModOpcode modifier
5666 if (CV->getNumSources() == EltsV2F16.size()) {
5667 MachineIRBuilder B(*Root.getParent());
5668 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5669 *MRI);
5670 }
5671 }
5672
5673 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5674 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5675}
5676
// Matches a constant (or constant splat) source that can be encoded as an
// inline constant and renders it as an immediate.
//
// Floating-point constants are tried first: an inlineable one is rendered as
// the sign-extended value of its bit pattern; a non-inlineable one rejects the
// match outright. Otherwise an inlineable integer constant/splat is rendered
// as its sign-extended value. Anything else yields an empty result.
5678AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5679 std::optional<FPValueAndVReg> FPValReg;
5680 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5681 if (TII.isInlineConstant(FPValReg->Value)) {
5682 return {{[=](MachineInstrBuilder &MIB) {
5683 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5684 }}};
5685 }
5686 // Non-inlineable splat floats should not fall-through for integer immediate
5687 // checks.
5688 return {};
5689 }
5690
5691 APInt ICst;
5692 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5693 if (TII.isInlineConstant(ICst)) {
5694 return {
5695 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5696 }
5697 }
5698
5699 return {};
5700}
5701
// Selects the index operand and index_key for the 8-bit-index SWMMAC form.
//
// By default the (copy-stripped) root register is used with key 0. If that
// register is a logical right shift of a 32-bit value by a constant multiple
// of 8, the shift is folded away: the unshifted value becomes the source and
// the key becomes shift / 8.
5703AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5704 Register Src =
5705 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5706 unsigned Key = 0;
5707
5708 Register ShiftSrc;
5709 std::optional<ValueAndVReg> ShiftAmt;
5710 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5711 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5712 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5713 Key = ShiftAmt->Value.getZExtValue() / 8;
5714 Src = ShiftSrc;
5715 }
5716
5717 return {{
5718 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5719 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5720 }};
5721}
5722
// Selects the index operand and index_key for the 16-bit-index SWMMAC form.
//
// By default the (copy-stripped) root register is used with key 0. If that
// register is a logical right shift of a 32-bit value by exactly 16, the
// unshifted value becomes the source and the key becomes 1.
5724AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5725
5726 Register Src =
5727 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5728 unsigned Key = 0;
5729
5730 Register ShiftSrc;
5731 std::optional<ValueAndVReg> ShiftAmt;
5732 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5733 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5734 ShiftAmt->Value.getZExtValue() == 16) {
5735 Src = ShiftSrc;
5736 Key = 1;
5737 }
5738
5739 return {{
5740 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5741 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5742 }};
5743}
5744
// Selects the index operand and index_key for the 32-bit-index SWMMAC form.
//
// By default the (copy-stripped) root register is used with key 0. If the
// register is a zero- or any-extension of a 32-bit value that comes from a
// two-result G_UNMERGE_VALUES, and that value is the unmerge's second result
// (operand 1, directly or through a copy), the unmerge's input (operand 2)
// becomes the source and the key becomes 1.
5746AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5747 Register Src =
5748 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5749 unsigned Key = 0;
5750
5751 Register S32 = matchZeroExtendFromS32(Src);
5752 if (!S32)
5753 S32 = matchAnyExtendFromS32(Src);
5754
5755 if (S32) {
5756 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5757 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
// Two defs plus one source operand.
5758 assert(Def->getNumOperands() == 3);
5759 Register DstReg1 = Def->getOperand(1).getReg();
5760 if (mi_match(S32, *MRI,
5761 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5762 Src = Def->getOperand(2).getReg();
5763 Key = 1;
5764 }
5765 }
5766 }
5767
5768 return {{
5769 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5770 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5771 }};
5772}
5773
// Renders a source register and its modifier word as computed by
// selectVOP3ModsImpl with default arguments. No op_sel handling is done yet
// (see the FIXME below), so this currently behaves as a plain VOP3 modifier
// match.
5775AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5776 Register Src;
5777 unsigned Mods;
5778 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5779
5780 // FIXME: Handle op_sel
5781 return {{
5782 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5783 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5784 }};
5785}
5786
5787// FIXME-TRUE16 remove when fake16 is removed
// Source-modifier match for VINTERP operands: calls selectVOP3ModsImpl with
// IsCanonicalizing = true, AllowAbs = false and OpSel = false. The source is
// rendered through copyToVGPRIfSrcFolded with ForceVGPR = true, followed by
// the modifier immediate.
5789AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5790 Register Src;
5791 unsigned Mods;
5792 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5793 /*IsCanonicalizing=*/true,
5794 /*AllowAbs=*/false,
5795 /*OpSel=*/false);
5796
5797 return {{
5798 [=](MachineInstrBuilder &MIB) {
5799 MIB.addReg(
5800 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5801 },
5802 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5803 }};
5804}
5805
// Same as selectVINTERPMods except that selectVOP3ModsImpl is called with
// OpSel = true.
5807AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5808 Register Src;
5809 unsigned Mods;
5810 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5811 /*IsCanonicalizing=*/true,
5812 /*AllowAbs=*/false,
5813 /*OpSel=*/true);
5814
5815 return {{
5816 [=](MachineInstrBuilder &MIB) {
5817 MIB.addReg(
5818 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5819 },
5820 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5821 }};
5822}
5823
5824// Given \p Offset and load specified by the \p Root operand check if \p Offset
5825// is a multiple of the load byte size. If it is update \p Offset to a
5826// pre-scaled value and return true.
// Returns false without touching Offset when the subtarget has no scale-offset
// support or the memory operand has no known size. IsSigned selects between
// the signed and unsigned variants of the extension and multiply patterns.
// NOTE(review): several lines of this definition are absent from this listing
// (listing lines 5828, 5854-5855, 5857 and 5871), including the declaration of
// the Offset parameter and parts of the pattern expression below.
5827bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5829 bool IsSigned) const {
5830 if (!Subtarget->hasScaleOffset())
5831 return false;
5832
5833 const MachineInstr &MI = *Root.getParent();
5834 MachineMemOperand *MMO = *MI.memoperands_begin();
5835
5836 if (!MMO->getSize().hasValue())
5837 return false;
5838
5839 uint64_t Size = MMO->getSize().getValue();
5840
// Look through an extension from 32 bits (if any) and through copies to find
// the instruction that actually computes the offset.
5841 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5842 if (!OffsetReg)
5843 OffsetReg = Offset;
5844
5845 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5846 OffsetReg = Def->Reg;
5847
// The offset is scalable when it is the unscaled value Op0 multiplied by the
// access size, expressed as a shift, a multiply, or a MAD with a zero addend.
5848 Register Op0;
5849 MachineInstr *Mul;
5850 bool ScaleOffset =
5851 (isPowerOf2_64(Size) &&
5852 mi_match(OffsetReg, *MRI,
5853 m_GShl(m_Reg(Op0),
5856 mi_match(OffsetReg, *MRI,
5858 m_Copy(m_SpecificICst(Size))))) ||
5859 mi_match(
5860 OffsetReg, *MRI,
5861 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5862 m_Reg(Op0), m_SpecificICst(Size))) ||
5863 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5864 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5865 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5866 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5867 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5868 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5869 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5870 mi_match(Mul->getOperand(3).getReg(), *MRI,
5872 m_Copy(m_SpecificICst(Size))))) &&
5873 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5874
// On a match, hand the unscaled operand back to the caller.
5875 if (ScaleOffset)
5876 Offset = Op0;
5877
5878 return ScaleOffset;
5879}
5880
5881bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5882 Register &Base,
5883 Register *SOffset,
5884 int64_t *Offset,
5885 bool *ScaleOffset) const {
5886 MachineInstr *MI = Root.getParent();
5887 MachineBasicBlock *MBB = MI->getParent();
5888
5889 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5890 // then we can select all ptr + 32-bit offsets.
5891 SmallVector<GEPInfo, 4> AddrInfo;
5892 getAddrModeInfo(*MI, *MRI, AddrInfo);
5893
5894 if (AddrInfo.empty())
5895 return false;
5896
5897 const GEPInfo &GEPI = AddrInfo[0];
5898 std::optional<int64_t> EncodedImm;
5899
5900 if (ScaleOffset)
5901 *ScaleOffset = false;
5902
5903 if (SOffset && Offset) {
5904 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5905 /*HasSOffset=*/true);
5906 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5907 AddrInfo.size() > 1) {
5908 const GEPInfo &GEPI2 = AddrInfo[1];
5909 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5910 Register OffsetReg = GEPI2.SgprParts[1];
5911 if (ScaleOffset)
5912 *ScaleOffset =
5913 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5914 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5915 if (OffsetReg) {
5916 Base = GEPI2.SgprParts[0];
5917 *SOffset = OffsetReg;
5918 *Offset = *EncodedImm;
5919 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5920 return true;
5921
5922 // For unbuffered smem loads, it is illegal for the Immediate Offset
5923 // to be negative if the resulting (Offset + (M0 or SOffset or zero)
5924 // is negative. Handle the case where the Immediate Offset + SOffset
5925 // is negative.
5926 auto SKnown = VT->getKnownBits(*SOffset);
5927 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5928 return false;
5929
5930 return true;
5931 }
5932 }
5933 }
5934 return false;
5935 }
5936
5937 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5938 /*HasSOffset=*/false);
5939 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5940 Base = GEPI.SgprParts[0];
5941 *Offset = *EncodedImm;
5942 return true;
5943 }
5944
5945 // SGPR offset is unsigned.
5946 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5947 GEPI.Imm != 0) {
5948 // If we make it this far we have a load with an 32-bit immediate offset.
5949 // It is OK to select this using a sgpr offset, because we have already
5950 // failed trying to select this load into one of the _IMM variants since
5951 // the _IMM Patterns are considered before the _SGPR patterns.
5952 Base = GEPI.SgprParts[0];
5953 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5954 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5955 .addImm(GEPI.Imm);
5956 return true;
5957 }
5958
5959 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5960 Register OffsetReg = GEPI.SgprParts[1];
5961 if (ScaleOffset)
5962 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5963 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5964 if (OffsetReg) {
5965 Base = GEPI.SgprParts[0];
5966 *SOffset = OffsetReg;
5967 return true;
5968 }
5969 }
5970
5971 return false;
5972}
5973
// SMRD "base + immediate" addressing mode. Asks selectSmrdOffset for an
// immediate offset only (no SGPR offset, no scale flag) and, on success,
// renders the base register followed by the encoded immediate.
5975AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5976 Register Base;
5977 int64_t Offset;
5978 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5979 /* ScaleOffset */ nullptr))
5980 return std::nullopt;
5981
5982 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5983 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5984}
5985
// SMRD "base + 32-bit literal" addressing mode. Requires the outermost address
// computation to have exactly one SGPR part and an immediate that
// AMDGPU::getSMRDEncodedLiteralOffset32 can encode; otherwise the match is
// rejected. Renders the base register followed by the encoded literal.
5987AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5988 SmallVector<GEPInfo, 4> AddrInfo;
5989 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5990
5991 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5992 return std::nullopt;
5993
5994 const GEPInfo &GEPInfo = AddrInfo[0];
5995 Register PtrReg = GEPInfo.SgprParts[0];
5996 std::optional<int64_t> EncodedImm =
5997 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5998 if (!EncodedImm)
5999 return std::nullopt;
6000
6001 return {{
6002 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
6003 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
6004 }};
6005}
6006
// SMRD "base + SGPR offset" addressing mode. Asks selectSmrdOffset for an SGPR
// offset only (no immediate) and renders base, soffset and a cache-policy
// immediate that is CPol::SCAL when the offset was matched as scaled, else 0.
6008AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
6009 Register Base, SOffset;
6010 bool ScaleOffset;
6011 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
6012 &ScaleOffset))
6013 return std::nullopt;
6014
6015 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
6016 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
6017 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6018 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
6019}
6020
// SMRD "base + SGPR offset + immediate" addressing mode. Asks selectSmrdOffset
// for both an SGPR offset and an immediate, then renders base, soffset,
// immediate offset and a cache-policy immediate (CPol::SCAL when the offset
// was matched as scaled, else 0).
6022AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
6023 Register Base, SOffset;
6024 int64_t Offset;
6025 bool ScaleOffset;
6026 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
6027 return std::nullopt;
6028
6029 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
6030 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
6031 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6032 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6033 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
6034}
6035
6036std::pair<Register, int> AMDGPUInstructionSelector::selectFlatOffsetImpl(
6037 MachineOperand &Root, AMDGPU::FlatAddrSpace FlatVariant) const {
6038 MachineInstr *MI = Root.getParent();
6039
6040 auto Default = std::pair(Root.getReg(), 0);
6041
6042 if (!STI.hasFlatInstOffsets())
6043 return Default;
6044
6045 Register PtrBase;
6046 int64_t ConstOffset;
6047 bool IsInBounds;
6048 std::tie(PtrBase, ConstOffset, IsInBounds) =
6049 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6050
6051 // Adding the offset to the base address with an immediate in a FLAT
6052 // instruction must not change the memory aperture in which the address falls.
6053 // Therefore we can only fold offsets from inbounds GEPs into FLAT
6054 // instructions.
6055 if (ConstOffset == 0 ||
6056 (FlatVariant == AMDGPU::FlatAddrSpace::FlatScratch &&
6057 !isFlatScratchBaseLegal(Root.getReg())) ||
6058 (FlatVariant == AMDGPU::FlatAddrSpace::FLAT && !IsInBounds))
6059 return Default;
6060
6061 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
6062 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
6063 return Default;
6064
6065 return std::pair(PtrBase, ConstOffset);
6066}
6067
// FLAT addressing: renders the (base, immediate offset) pair computed by
// selectFlatOffsetImpl for the FLAT variant. Always produces a result.
6069AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
6070 auto PtrWithOffset = selectFlatOffsetImpl(Root, AMDGPU::FlatAddrSpace::FLAT);
6071
6072 return {{
6073 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
6074 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
6075 }};
6076}
6077
// Global addressing: renders the (base, immediate offset) pair computed by
// selectFlatOffsetImpl for the FlatGlobal variant. Always produces a result.
6079AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
6080 auto PtrWithOffset =
6081 selectFlatOffsetImpl(Root, AMDGPU::FlatAddrSpace::FlatGlobal);
6082
6083 return {{
6084 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
6085 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
6086 }};
6087}
6088
// Scratch addressing: renders the (base, immediate offset) pair computed by
// selectFlatOffsetImpl for the FlatScratch variant. Always produces a result.
6090AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
6091 auto PtrWithOffset =
6092 selectFlatOffsetImpl(Root, AMDGPU::FlatAddrSpace::FlatScratch);
6093
6094 return {{
6095 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
6096 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
6097 }};
6098}
6099
6100// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
// CPolBits is OR-ed into the rendered cache-policy immediate. When NeedIOffset
// is false no immediate-offset operand is rendered, so every result has three
// renderers (saddr, voffset, cpol) instead of four.
//
// The match proceeds in three stages:
//   1. Peel a constant offset off the address and either fold it as the
//      immediate, split it between a VGPR and the immediate, or give up.
//   2. Match "SGPR base + variable offset" from a G_PTR_ADD.
//   3. Fall back to "SGPR base + zero voffset" when the address itself is an
//      SGPR.
// NOTE(review): listing lines 6118 and 6134 are absent from this listing; they
// are presumably the trailing arguments of the two TII calls they follow.
6102AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
6103 unsigned CPolBits,
6104 bool NeedIOffset) const {
6105 Register Addr = Root.getReg();
6106 Register PtrBase;
6107 int64_t ConstOffset;
6108 int64_t ImmOffset = 0;
6109
6110 // Match the immediate offset first, which canonically is moved as low as
6111 // possible.
6112 std::tie(PtrBase, ConstOffset, std::ignore) =
6113 getPtrBaseWithConstantOffset(Addr, *MRI);
6114
6115 if (ConstOffset != 0) {
6116 if (NeedIOffset &&
6117 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
6119 Addr = PtrBase;
6120 ImmOffset = ConstOffset;
6121 } else {
6122 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
6123 if (isSGPR(PtrBaseDef->Reg)) {
6124 if (ConstOffset > 0) {
6125 // Offset is too large.
6126 //
6127 // saddr + large_offset -> saddr +
6128 // (voffset = large_offset & ~MaxOffset) +
6129 // (large_offset & MaxOffset);
6130 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
6131 if (NeedIOffset) {
6132 std::tie(SplitImmOffset, RemainderOffset) =
6133 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
6135 }
6136
// The remainder goes into a 32-bit voffset, so it must fit the signed or
// unsigned 32-bit range depending on the subtarget.
6137 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
6138 : isUInt<32>(RemainderOffset)) {
6139 MachineInstr *MI = Root.getParent();
6140 MachineBasicBlock *MBB = MI->getParent();
6141 Register HighBits =
6142 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6143
6144 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6145 HighBits)
6146 .addImm(RemainderOffset);
6147
6148 if (NeedIOffset)
6149 return {{
6150 [=](MachineInstrBuilder &MIB) {
6151 MIB.addReg(PtrBase);
6152 }, // saddr
6153 [=](MachineInstrBuilder &MIB) {
6154 MIB.addReg(HighBits);
6155 }, // voffset
6156 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
6157 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
6158 }};
6159 return {{
6160 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
6161 [=](MachineInstrBuilder &MIB) {
6162 MIB.addReg(HighBits);
6163 }, // voffset
6164 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
6165 }};
6166 }
6167 }
6168
6169 // We are adding a 64 bit SGPR and a constant. If constant bus limit
6170 // is 1 we would need to perform 1 or 2 extra moves for each half of
6171 // the constant and it is better to do a scalar add and then issue a
6172 // single VALU instruction to materialize zero. Otherwise it is less
6173 // instructions to perform VALU adds with immediates or inline literals.
6174 unsigned NumLiterals =
6175 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
6176 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
6177 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
6178 return std::nullopt;
6179 }
6180 }
6181 }
6182
6183 // Match the variable offset.
6184 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6185 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6186 // Look through the SGPR->VGPR copy.
6187 Register SAddr =
6188 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
6189
6190 if (isSGPR(SAddr)) {
6191 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
6192
6193 // It's possible voffset is an SGPR here, but the copy to VGPR will be
6194 // inserted later.
6195 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
6196 Subtarget->hasSignedGVSOffset());
6197 if (Register VOffset = matchExtendFromS32OrS32(
6198 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
6199 if (NeedIOffset)
6200 return {{[=](MachineInstrBuilder &MIB) { // saddr
6201 MIB.addReg(SAddr);
6202 },
6203 [=](MachineInstrBuilder &MIB) { // voffset
6204 MIB.addReg(VOffset);
6205 },
6206 [=](MachineInstrBuilder &MIB) { // offset
6207 MIB.addImm(ImmOffset);
6208 },
6209 [=](MachineInstrBuilder &MIB) { // cpol
6210 MIB.addImm(CPolBits |
6211 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6212 }}};
6213 return {{[=](MachineInstrBuilder &MIB) { // saddr
6214 MIB.addReg(SAddr);
6215 },
6216 [=](MachineInstrBuilder &MIB) { // voffset
6217 MIB.addReg(VOffset);
6218 },
6219 [=](MachineInstrBuilder &MIB) { // cpol
6220 MIB.addImm(CPolBits |
6221 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6222 }}};
6223 }
6224 }
6225 }
6226
6227 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
6228 // drop this.
6229 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
6230 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
6231 return std::nullopt;
6232
6233 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
6234 // moves required to copy a 64-bit SGPR to VGPR.
6235 MachineInstr *MI = Root.getParent();
6236 MachineBasicBlock *MBB = MI->getParent();
6237 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6238
6239 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
6240 .addImm(0);
6241
6242 if (NeedIOffset)
6243 return {{
6244 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6245 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6246 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6247 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6248 }};
6249 return {{
6250 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6251 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6252 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6253 }};
6254}
6255
// Single-argument entry point: global SAddr match with no extra cache-policy
// bits. The third parameter of the overload called here is left to its default
// (declared outside this listing).
6257AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
6258 return selectGlobalSAddr(Root, 0);
6259}
6260
// Global SAddr match that forwards the instruction's own cache-policy operand.
// The value is read from the last operand and the CPol::SCAL bit is cleared
// before it is passed on.
6262AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
6263 const MachineInstr &I = *Root.getParent();
6264
6265 // We are assuming CPol is always the last operand of the intrinsic.
6266 auto PassedCPol =
6267 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6268 return selectGlobalSAddr(Root, PassedCPol);
6269}
6270
// Same as selectGlobalSAddrCPol, but the cache-policy value is read from the
// second-to-last operand of the instruction.
6272AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
6273 const MachineInstr &I = *Root.getParent();
6274
6275 // We are assuming CPol is second from last operand of the intrinsic.
6276 auto PassedCPol =
6277 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6278 return selectGlobalSAddr(Root, PassedCPol);
6279}
6280
// Global SAddr match with the CPol::GLC bit set in the rendered cache policy.
6282AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
6283 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
6284}
6285
// Global SAddr match without an immediate-offset operand (NeedIOffset =
// false). The cache-policy value is read from the last operand with the
// CPol::SCAL bit cleared.
6287AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
6288 MachineOperand &Root) const {
6289 const MachineInstr &I = *Root.getParent();
6290
6291 // We are assuming CPol is always the last operand of the intrinsic.
6292 auto PassedCPol =
6293 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6294 return selectGlobalSAddr(Root, PassedCPol, false);
6295}
6296
// Same as selectGlobalSAddrNoIOffset, but the cache-policy value is read from
// the second-to-last operand of the instruction.
6298AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
6299 MachineOperand &Root) const {
6300 const MachineInstr &I = *Root.getParent();
6301
6302 // We are assuming CPol is second from last operand of the intrinsic.
6303 auto PassedCPol =
6304 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6305 return selectGlobalSAddr(Root, PassedCPol, false);
6306}
6307
// Scratch "saddr + immediate" addressing mode. Renders two operands: saddr
// (a frame index or an SGPR) and the immediate offset.
//
// A constant offset is folded into the immediate only when the flat-scratch
// base is legal and the offset is encodable. A G_FRAME_INDEX address is
// rendered directly as a frame index; "frame index + SGPR" is combined with an
// S_ADD_I32 into a new SGPR. The match is rejected when the resulting saddr is
// not an SGPR.
// NOTE(review): listing line 6322 (the last argument of the isLegalFLATOffset
// call) is absent from this listing.
6309AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
6310 Register Addr = Root.getReg();
6311 Register PtrBase;
6312 int64_t ConstOffset;
6313 int64_t ImmOffset = 0;
6314
6315 // Match the immediate offset first, which canonically is moved as low as
6316 // possible.
6317 std::tie(PtrBase, ConstOffset, std::ignore) =
6318 getPtrBaseWithConstantOffset(Addr, *MRI);
6319
6320 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
6321 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6323 Addr = PtrBase;
6324 ImmOffset = ConstOffset;
6325 }
6326
6327 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6328 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6329 int FI = AddrDef->MI->getOperand(1).getIndex();
6330 return {{
6331 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6332 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6333 }};
6334 }
6335
6336 Register SAddr = AddrDef->Reg;
6337
6338 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6339 Register LHS = AddrDef->MI->getOperand(1).getReg();
6340 Register RHS = AddrDef->MI->getOperand(2).getReg();
6341 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6342 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
6343
// frame index + SGPR: compute the sum with a scalar add and use it as saddr.
6344 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
6345 isSGPR(RHSDef->Reg)) {
6346 int FI = LHSDef->MI->getOperand(1).getIndex();
6347 MachineInstr &I = *Root.getParent();
6348 MachineBasicBlock *BB = I.getParent();
6349 const DebugLoc &DL = I.getDebugLoc();
6350 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6351
6352 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
6353 .addFrameIndex(FI)
6354 .addReg(RHSDef->Reg)
6355 .setOperandDead(3); // Dead scc
6356 }
6357 }
6358
6359 if (!isSGPR(SAddr))
6360 return std::nullopt;
6361
6362 return {{
6363 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
6364 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6365 }};
6366}
6367
6368// Check whether the flat scratch SVS swizzle bug affects this access.
6369bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6370 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6371 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6372 return false;
6373
6374 // The bug affects the swizzling of SVS accesses if there is any carry out
6375 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6376 // voffset to (soffset + inst_offset).
6377 auto VKnown = VT->getKnownBits(VAddr);
6378 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
6379 KnownBits::makeConstant(APInt(32, ImmOffset)));
6380 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6381 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
6382 return (VMax & 3) + (SMax & 3) >= 4;
6383}
6384
// Scratch "vaddr + saddr + immediate" addressing mode. Renders four operands:
// vaddr, saddr (frame index or SGPR), immediate offset and cache policy.
//
// The address (after optionally peeling a legal constant offset) must be a
// G_PTR_ADD whose right-hand operand is in the VGPR bank. The match is
// rejected when the flat-scratch base legality check fails, when the SVS
// swizzle bug check fires, or when the left-hand operand is neither a frame
// index nor an SGPR.
// NOTE(review): listing lines 6400 and 6428 are absent from this listing (the
// last argument of isLegalFLATOffset and the true-branch of the CPol ternary).
6386AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
6387 Register Addr = Root.getReg();
6388 Register PtrBase;
6389 int64_t ConstOffset;
6390 int64_t ImmOffset = 0;
6391
6392 // Match the immediate offset first, which canonically is moved as low as
6393 // possible.
6394 std::tie(PtrBase, ConstOffset, std::ignore) =
6395 getPtrBaseWithConstantOffset(Addr, *MRI);
6396
6397 Register OrigAddr = Addr;
6398 if (ConstOffset != 0 &&
6399 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6401 Addr = PtrBase;
6402 ImmOffset = ConstOffset;
6403 }
6404
6405 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6406 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6407 return std::nullopt;
6408
6409 Register RHS = AddrDef->MI->getOperand(2).getReg();
6410 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6411 return std::nullopt;
6412
6413 Register LHS = AddrDef->MI->getOperand(1).getReg();
6414 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6415
// OrigAddr != Addr means a constant offset was folded above, so the legality
// check that accounts for the immediate is used.
6416 if (OrigAddr != Addr) {
6417 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
6418 return std::nullopt;
6419 } else {
6420 if (!isFlatScratchBaseLegalSV(OrigAddr))
6421 return std::nullopt;
6422 }
6423
6424 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
6425 return std::nullopt;
6426
6427 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
6429 : 0;
6430
6431 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6432 int FI = LHSDef->MI->getOperand(1).getIndex();
6433 return {{
6434 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6435 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6436 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6437 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6438 }};
6439 }
6440
// If LHS is not itself an SGPR, try the register it was copied from.
6441 if (!isSGPR(LHS))
6442 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
6443 LHS = Def->Reg;
6444
6445 if (!isSGPR(LHS))
6446 return std::nullopt;
6447
6448 return {{
6449 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6450 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
6451 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6452 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6453 }};
6454}
6455
// MUBUF scratch addressing with a VGPR address ("offen"). Renders four
// operands: rsrc (the function's scratch resource register), vaddr, soffset
// (always the constant 0) and the immediate offset.
//
// Two shapes are produced:
//   - Constant address: the bits above the maximum MUBUF immediate are moved
//     into a new VGPR with V_MOV_B32 and the remaining low bits become the
//     immediate.
//   - Otherwise: vaddr is a frame index or a register, with a constant offset
//     folded into the immediate when it is a legal MUBUF immediate and the
//     base passes the range-check condition below.
// NOTE(review): listing line 6465 (the rest of the first condition) is absent
// from this listing.
6457AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
6458 MachineInstr *MI = Root.getParent();
6459 MachineBasicBlock *MBB = MI->getParent();
6460 MachineFunction *MF = MBB->getParent();
6461 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6462
6463 int64_t Offset = 0;
6464 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
6466 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6467
6468 // TODO: Should this be inside the render function? The iterator seems to
6469 // move.
6470 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
6471 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6472 HighBits)
6473 .addImm(Offset & ~MaxOffset);
6474
6475 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6476 MIB.addReg(Info->getScratchRSrcReg());
6477 },
6478 [=](MachineInstrBuilder &MIB) { // vaddr
6479 MIB.addReg(HighBits);
6480 },
6481 [=](MachineInstrBuilder &MIB) { // soffset
6482 // Use constant zero for soffset and rely on eliminateFrameIndex
6483 // to choose the appropriate frame register if need be.
6484 MIB.addImm(0);
6485 },
6486 [=](MachineInstrBuilder &MIB) { // offset
6487 MIB.addImm(Offset & MaxOffset);
6488 }}};
6489 }
6490
6491 assert(Offset == 0 || Offset == -1);
6492
6493 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6494 // offsets.
6495 std::optional<int> FI;
6496 Register VAddr = Root.getReg();
6497
6498 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6499 Register PtrBase;
6500 int64_t ConstOffset;
6501 std::tie(PtrBase, ConstOffset, std::ignore) =
6502 getPtrBaseWithConstantOffset(VAddr, *MRI);
6503 if (ConstOffset != 0) {
// Fold the constant only if it is encodable and, on subtargets where private
// memory is range checked, the base is known to be non-negative.
6504 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6505 (!STI.privateMemoryResourceIsRangeChecked() ||
6506 VT->signBitIsZero(PtrBase))) {
6507 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6508 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6509 FI = PtrBaseDef->getOperand(1).getIndex();
6510 else
6511 VAddr = PtrBase;
6512 Offset = ConstOffset;
6513 }
6514 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6515 FI = RootDef->getOperand(1).getIndex();
6516 }
6517
6518 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6519 MIB.addReg(Info->getScratchRSrcReg());
6520 },
6521 [=](MachineInstrBuilder &MIB) { // vaddr
6522 if (FI)
6523 MIB.addFrameIndex(*FI);
6524 else
6525 MIB.addReg(VAddr);
6526 },
6527 [=](MachineInstrBuilder &MIB) { // soffset
6528 // Use constant zero for soffset and rely on eliminateFrameIndex
6529 // to choose the appropriate frame register if need be.
6530 MIB.addImm(0);
6531 },
6532 [=](MachineInstrBuilder &MIB) { // offset
6533 MIB.addImm(Offset);
6534 }}};
6535}
6536
6537bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6538 int64_t Offset) const {
6539 if (!isUInt<16>(Offset))
6540 return false;
6541
6542 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6543 return true;
6544
6545 // On Southern Islands instruction with a negative base value and an offset
6546 // don't seem to work.
6547 return VT->signBitIsZero(Base);
6548}
6549
6550bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6551 int64_t Offset1,
6552 unsigned Size) const {
6553 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6554 return false;
6555 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6556 return false;
6557
6558 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6559 return true;
6560
6561 // On Southern Islands instruction with a negative base value and an offset
6562 // don't seem to work.
6563 return VT->signBitIsZero(Base);
6564}
6565
6566// Return whether the operation has NoUnsignedWrap property.
// A G_OR is always accepted. A G_PTR_ADD is accepted only when a further
// condition holds.
// NOTE(review): the line carrying that further G_PTR_ADD condition is absent
// from this listing (the parenthesised expression below is left unbalanced) --
// presumably a check of the instruction's no-unsigned-wrap flag; confirm
// against the complete source.
6567static bool isNoUnsignedWrap(MachineInstr *Addr) {
6568 return Addr->getOpcode() == TargetOpcode::G_OR ||
6569 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6571}
6572
6573// Check that the base address of flat scratch load/store in the form of `base +
6574// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6575// requirement). We always treat the first operand as the base address here.
//
// The checks are tried in this order: the defining instruction (looking
// through copies) satisfies isNoUnsignedWrap; the subtarget has signed scratch
// offsets; the definition is a G_PTR_ADD with a small negative constant
// right-hand side; and finally the sign bit of operand 1 is known zero.
6576bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6577 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6578
6579 if (isNoUnsignedWrap(AddrMI))
6580 return true;
6581
6582 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6583 // values.
6584 if (STI.hasSignedScratchOffsets())
6585 return true;
6586
// Operands 1 and 2 of the defining instruction are read without checking its
// opcode first; operand 1 is treated as the base.
6587 Register LHS = AddrMI->getOperand(1).getReg();
6588 Register RHS = AddrMI->getOperand(2).getReg();
6589
6590 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
// NOTE(review): the initializer of RhsValReg is absent from this listing --
// presumably a constant lookup on RHS; confirm against the complete source.
6591 std::optional<ValueAndVReg> RhsValReg =
6593 // If the immediate offset is negative and within certain range, the base
6594 // address cannot also be negative. If the base is also negative, the sum
6595 // would be either negative or much larger than the valid range of scratch
6596 // memory a thread can access.
6597 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6598 RhsValReg->Value.getSExtValue() > -0x40000000)
6599 return true;
6600 }
6601
// Otherwise the base is legal only if its sign bit is known to be zero.
6602 return VT->signBitIsZero(LHS);
6603}
6604
6605// Check address value in SGPR/VGPR are legal for flat scratch in the form
6606// of: SGPR + VGPR.
6607bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6608 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6609
6610 if (isNoUnsignedWrap(AddrMI))
6611 return true;
6612
6613 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6614 // values.
6615 if (STI.hasSignedScratchOffsets())
6616 return true;
6617
6618 Register LHS = AddrMI->getOperand(1).getReg();
6619 Register RHS = AddrMI->getOperand(2).getReg();
6620 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6621}
6622
6623// Check address value in SGPR/VGPR are legal for flat scratch in the form
6624// of: SGPR + VGPR + Imm.
//
// Addr's definition supplies the immediate (RHSOffset) and a base register;
// the base's own definition (BaseDef) supplies the two registers whose sign
// bits are checked at the end.
6625bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6626 Register Addr) const {
6627 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6628 // values.
6629 if (STI.hasSignedScratchOffsets())
6630 return true;
6631
6632 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6633 Register Base = AddrMI->getOperand(1).getReg();
// NOTE(review): the initializers of BaseDef and RHSOffset are absent from this
// listing -- presumably a definition lookup on Base and a constant lookup on
// AddrMI's operand 2; confirm against the complete source.
6634 std::optional<DefinitionAndSourceRegister> BaseDef =
6636 std::optional<ValueAndVReg> RHSOffset =
// RHSOffset is dereferenced below, so the caller must only pass addresses
// whose offset is a known constant.
6638 assert(RHSOffset);
6639
6640 // If the immediate offset is negative and within certain range, the base
6641 // address cannot also be negative. If the base is also negative, the sum
6642 // would be either negative or much larger than the valid range of scratch
6643 // memory a thread can access.
6644 if (isNoUnsignedWrap(BaseDef->MI) &&
6645 (isNoUnsignedWrap(AddrMI) ||
6646 (RHSOffset->Value.getSExtValue() < 0 &&
6647 RHSOffset->Value.getSExtValue() > -0x40000000)))
6648 return true;
6649
// Fall back to requiring known-zero sign bits on both sources of the inner
// (base) definition.
6650 Register LHS = BaseDef->MI->getOperand(1).getReg();
6651 Register RHS = BaseDef->MI->getOperand(2).getReg();
6652 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6653}
6654
6655bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6656 unsigned ShAmtBits) const {
6657 assert(MI.getOpcode() == TargetOpcode::G_AND);
6658
6659 std::optional<APInt> RHS =
6660 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6661 if (!RHS)
6662 return false;
6663
6664 if (RHS->countr_one() >= ShAmtBits)
6665 return true;
6666
6667 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6668 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6669}
6670
6672AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6673 MachineOperand &Root) const {
6674 Register Reg = Root.getReg();
6675 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6676
6677 std::optional<DefinitionAndSourceRegister> Def =
6679 assert(Def && "this shouldn't be an optional result");
6680 Reg = Def->Reg;
6681
6682 if (Register WaveBase = getWaveAddress(Def->MI)) {
6683 return {{
6684 [=](MachineInstrBuilder &MIB) { // rsrc
6685 MIB.addReg(Info->getScratchRSrcReg());
6686 },
6687 [=](MachineInstrBuilder &MIB) { // soffset
6688 MIB.addReg(WaveBase);
6689 },
6690 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6691 }};
6692 }
6693
6694 int64_t Offset = 0;
6695
6696 // FIXME: Copy check is a hack
6698 if (mi_match(Reg, *MRI,
6699 m_GPtrAdd(m_Reg(BasePtr),
6701 if (!TII.isLegalMUBUFImmOffset(Offset))
6702 return {};
6703 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6704 Register WaveBase = getWaveAddress(BasePtrDef);
6705 if (!WaveBase)
6706 return {};
6707
6708 return {{
6709 [=](MachineInstrBuilder &MIB) { // rsrc
6710 MIB.addReg(Info->getScratchRSrcReg());
6711 },
6712 [=](MachineInstrBuilder &MIB) { // soffset
6713 MIB.addReg(WaveBase);
6714 },
6715 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6716 }};
6717 }
6718
6719 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6720 !TII.isLegalMUBUFImmOffset(Offset))
6721 return {};
6722
6723 return {{
6724 [=](MachineInstrBuilder &MIB) { // rsrc
6725 MIB.addReg(Info->getScratchRSrcReg());
6726 },
6727 [=](MachineInstrBuilder &MIB) { // soffset
6728 MIB.addImm(0);
6729 },
6730 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6731 }};
6732}
6733
6734std::pair<Register, unsigned>
6735AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6736 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6737 int64_t ConstAddr = 0;
6738
6739 Register PtrBase;
6740 int64_t Offset;
6741 std::tie(PtrBase, Offset, std::ignore) =
6742 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6743
6744 if (Offset) {
6745 if (isDSOffsetLegal(PtrBase, Offset)) {
6746 // (add n0, c0)
6747 return std::pair(PtrBase, Offset);
6748 }
6749 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6750 // TODO
6751
6752
6753 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6754 // TODO
6755
6756 }
6757
6758 return std::pair(Root.getReg(), 0);
6759}
6760
6762AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6763 Register Reg;
6764 unsigned Offset;
6765 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6766 return {{
6767 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6768 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6769 }};
6770}
6771
6773AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6774 return selectDSReadWrite2(Root, 4);
6775}
6776
6778AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6779 return selectDSReadWrite2(Root, 8);
6780}
6781
6783AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6784 unsigned Size) const {
6785 Register Reg;
6786 unsigned Offset;
6787 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6788 return {{
6789 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6790 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6791 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6792 }};
6793}
6794
6795std::pair<Register, unsigned>
6796AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6797 unsigned Size) const {
6798 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6799 int64_t ConstAddr = 0;
6800
6801 Register PtrBase;
6802 int64_t Offset;
6803 std::tie(PtrBase, Offset, std::ignore) =
6804 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6805
6806 if (Offset) {
6807 int64_t OffsetValue0 = Offset;
6808 int64_t OffsetValue1 = Offset + Size;
6809 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6810 // (add n0, c0)
6811 return std::pair(PtrBase, OffsetValue0 / Size);
6812 }
6813 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6814 // TODO
6815
6816 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6817 // TODO
6818
6819 }
6820
6821 return std::pair(Root.getReg(), 0);
6822}
6823
6824/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6825/// the base value with the constant offset, and if the offset computation is
6826/// known to be inbounds. There may be intervening copies between \p Root and
6827/// the identified constant. Returns \p Root, 0, false if this does not match
6828/// the pattern.
6829std::tuple<Register, int64_t, bool>
6830AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6831 Register Root, const MachineRegisterInfo &MRI) const {
// Look through copies to the instruction that really defines Root.
6832 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6833 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6834 return {Root, 0, false};
6835
6836 MachineOperand &RHS = RootI->getOperand(2);
// NOTE(review): the initializer of MaybeOffset is absent from this listing --
// presumably a constant lookup on RHS that looks through copies; confirm
// against the complete source.
6837 std::optional<ValueAndVReg> MaybeOffset =
6839 if (!MaybeOffset)
6840 return {Root, 0, false};
// The offset is returned sign-extended; the inbounds bit comes from the
// G_PTR_ADD's own InBounds flag.
6841 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6842 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6843 IsInBounds};
6844}
6845
6847 MIB.addImm(0);
6848}
6849
6850/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6851/// BasePtr is not valid, a null base pointer will be used.
///
/// The returned 128-bit register is assembled as {BasePtr (sub0_sub1),
/// FormatLo (sub2), FormatHi (sub3)}.
/// NOTE(review): the first line of the signature (function name and leading
/// parameters) is absent from this listing; the body uses a builder `B` and a
/// register-info object `MRI` that are presumably declared there.
6853 uint32_t FormatLo, uint32_t FormatHi,
6854 Register BasePtr) {
6855 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6856 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6857 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6858 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6859
// Materialize the two 32-bit format words.
6860 B.buildInstr(AMDGPU::S_MOV_B32)
6861 .addDef(RSrc2)
6862 .addImm(FormatLo);
6863 B.buildInstr(AMDGPU::S_MOV_B32)
6864 .addDef(RSrc3)
6865 .addImm(FormatHi);
6866
6867 // Build the half of the subregister with the constants before building the
6868 // full 128-bit register. If we are building multiple resource descriptors,
6869 // this will allow CSEing of the 2-component register.
6870 B.buildInstr(AMDGPU::REG_SEQUENCE)
6871 .addDef(RSrcHi)
6872 .addReg(RSrc2)
6873 .addImm(AMDGPU::sub0)
6874 .addReg(RSrc3)
6875 .addImm(AMDGPU::sub1);
6876
// Use the caller's base pointer for the low 64 bits, or a zero constant when
// no valid register was supplied.
6877 Register RSrcLo = BasePtr;
6878 if (!BasePtr) {
6879 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6880 B.buildInstr(AMDGPU::S_MOV_B64)
6881 .addDef(RSrcLo)
6882 .addImm(0);
6883 }
6884
// Combine the pointer half and the format half into the full descriptor.
6885 B.buildInstr(AMDGPU::REG_SEQUENCE)
6886 .addDef(RSrc)
6887 .addReg(RSrcLo)
6888 .addImm(AMDGPU::sub0_sub1)
6889 .addReg(RSrcHi)
6890 .addImm(AMDGPU::sub2_sub3);
6891
6892 return RSrc;
6893}
6894
6896 const SIInstrInfo &TII, Register BasePtr) {
6897 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6898
6899 // FIXME: Why are half the "default" bits ignored based on the addressing
6900 // mode?
6901 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6902}
6903
6905 const SIInstrInfo &TII, Register BasePtr) {
6906 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6907
6908 // FIXME: Why are half the "default" bits ignored based on the addressing
6909 // mode?
6910 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6911}
6912
6913AMDGPUInstructionSelector::MUBUFAddressData
6914AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6915 MUBUFAddressData Data;
6916 Data.N0 = Src;
6917
6918 Register PtrBase;
6919 int64_t Offset;
6920
6921 std::tie(PtrBase, Offset, std::ignore) =
6922 getPtrBaseWithConstantOffset(Src, *MRI);
6923 if (isUInt<32>(Offset)) {
6924 Data.N0 = PtrBase;
6925 Data.Offset = Offset;
6926 }
6927
6928 if (MachineInstr *InputAdd
6929 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6930 Data.N2 = InputAdd->getOperand(1).getReg();
6931 Data.N3 = InputAdd->getOperand(2).getReg();
6932
6933 // FIXME: Need to fix extra SGPR->VGPRcopies inserted
6934 // FIXME: Don't know this was defined by operand 0
6935 //
6936 // TODO: Remove this when we have copy folding optimizations after
6937 // RegBankSelect.
6938 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6939 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6940 }
6941
6942 return Data;
6943}
6944
6945/// Return if the addr64 mubuf mode should be used for the given address.
6946bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6947 // (ptr_add N2, N3) -> addr64, or
6948 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6949 if (Addr.N2)
6950 return true;
6951
6952 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6953 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6954}
6955
6956/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6957/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6958/// component.
6959void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6960 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6961 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6962 return;
6963
6964 // Illegal offset, store it in soffset.
6965 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6966 B.buildInstr(AMDGPU::S_MOV_B32)
6967 .addDef(SOffset)
6968 .addImm(ImmOffset);
6969 ImmOffset = 0;
6970}
6971
6972bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6973 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6974 Register &SOffset, int64_t &Offset) const {
6975 // FIXME: Predicates should stop this from reaching here.
6976 // addr64 bit was removed for volcanic islands.
6977 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6978 return false;
6979
6980 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6981 if (!shouldUseAddr64(AddrData))
6982 return false;
6983
6984 Register N0 = AddrData.N0;
6985 Register N2 = AddrData.N2;
6986 Register N3 = AddrData.N3;
6987 Offset = AddrData.Offset;
6988
6989 // Base pointer for the SRD.
6990 Register SRDPtr;
6991
6992 if (N2) {
6993 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6994 assert(N3);
6995 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6996 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6997 // addr64, and construct the default resource from a 0 address.
6998 VAddr = N0;
6999 } else {
7000 SRDPtr = N3;
7001 VAddr = N2;
7002 }
7003 } else {
7004 // N2 is not divergent.
7005 SRDPtr = N2;
7006 VAddr = N3;
7007 }
7008 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
7009 // Use the default null pointer in the resource
7010 VAddr = N0;
7011 } else {
7012 // N0 -> offset, or
7013 // (N0 + C1) -> offset
7014 SRDPtr = N0;
7015 }
7016
7017 MachineIRBuilder B(*Root.getParent());
7018 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
7019 splitIllegalMUBUFOffset(B, SOffset, Offset);
7020 return true;
7021}
7022
7023bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
7024 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
7025 int64_t &Offset) const {
7026
7027 // FIXME: Pattern should not reach here.
7028 if (STI.useFlatForGlobal())
7029 return false;
7030
7031 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
7032 if (shouldUseAddr64(AddrData))
7033 return false;
7034
7035 // N0 -> offset, or
7036 // (N0 + C1) -> offset
7037 Register SRDPtr = AddrData.N0;
7038 Offset = AddrData.Offset;
7039
7040 // TODO: Look through extensions for 32-bit soffset.
7041 MachineIRBuilder B(*Root.getParent());
7042
7043 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
7044 splitIllegalMUBUFOffset(B, SOffset, Offset);
7045 return true;
7046}
7047
7049AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
7050 Register VAddr;
7051 Register RSrcReg;
7052 Register SOffset;
7053 int64_t Offset = 0;
7054
7055 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
7056 return {};
7057
7058 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
7059 // pattern.
7060 return {{
7061 [=](MachineInstrBuilder &MIB) { // rsrc
7062 MIB.addReg(RSrcReg);
7063 },
7064 [=](MachineInstrBuilder &MIB) { // vaddr
7065 MIB.addReg(VAddr);
7066 },
7067 [=](MachineInstrBuilder &MIB) { // soffset
7068 if (SOffset)
7069 MIB.addReg(SOffset);
7070 else if (STI.hasRestrictedSOffset())
7071 MIB.addReg(AMDGPU::SGPR_NULL);
7072 else
7073 MIB.addImm(0);
7074 },
7075 [=](MachineInstrBuilder &MIB) { // offset
7076 MIB.addImm(Offset);
7077 },
7078 addZeroImm, // cpol
7079 addZeroImm, // tfe
7080 addZeroImm // swz
7081 }};
7082}
7083
7085AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
7086 Register RSrcReg;
7087 Register SOffset;
7088 int64_t Offset = 0;
7089
7090 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
7091 return {};
7092
7093 return {{
7094 [=](MachineInstrBuilder &MIB) { // rsrc
7095 MIB.addReg(RSrcReg);
7096 },
7097 [=](MachineInstrBuilder &MIB) { // soffset
7098 if (SOffset)
7099 MIB.addReg(SOffset);
7100 else if (STI.hasRestrictedSOffset())
7101 MIB.addReg(AMDGPU::SGPR_NULL);
7102 else
7103 MIB.addImm(0);
7104 },
7105 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
7106 addZeroImm, // cpol
7107 addZeroImm, // tfe
7108 addZeroImm, // swz
7109 }};
7110}
7111
7113AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
7114
7115 Register SOffset = Root.getReg();
7116
7117 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
7118 SOffset = AMDGPU::SGPR_NULL;
7119
7120 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
7121}
7122
7123/// Get an immediate that must be 32-bits, and treated as zero extended.
///
/// \returns std::nullopt when the register is not a known constant or its
/// sign-extended value does not fit in a signed 32-bit integer; otherwise the
/// low 32 bits of the value.
/// NOTE(review): the line carrying the function name and parameter list is
/// absent from this listing; the body uses `Reg` and `MRI`, presumably its
/// parameters.
7124static std::optional<uint64_t>
7126 // getIConstantVRegVal sexts any values, so see if that matters.
7127 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
7128 if (!OffsetVal || !isInt<32>(*OffsetVal))
7129 return std::nullopt;
// Lo_32 drops the sign-extension bits, giving the zero-extended 32-bit value.
7130 return Lo_32(*OffsetVal);
7131}
7132
7134AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
7135 std::optional<uint64_t> OffsetVal =
7136 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
7137 if (!OffsetVal)
7138 return {};
7139
7140 std::optional<int64_t> EncodedImm =
7141 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
7142 if (!EncodedImm)
7143 return {};
7144
7145 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
7146}
7147
7149AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
7150 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
7151
7152 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
7153 if (!OffsetVal)
7154 return {};
7155
7156 std::optional<int64_t> EncodedImm =
7158 if (!EncodedImm)
7159 return {};
7160
7161 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
7162}
7163
7165AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
7166 // Match the (soffset + offset) pair as a 32-bit register base and
7167 // an immediate offset.
7168 Register SOffset;
7169 unsigned Offset;
7170 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
7171 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
7172 if (!SOffset)
7173 return std::nullopt;
7174
7175 std::optional<int64_t> EncodedOffset =
7176 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
7177 if (!EncodedOffset)
7178 return std::nullopt;
7179
7180 assert(MRI->getType(SOffset) == LLT::scalar(32));
7181 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
7182 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
7183}
7184
7185std::pair<Register, unsigned>
7186AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
7187 bool &Matched) const {
7188 Matched = false;
7189
7190 Register Src;
7191 unsigned Mods;
7192 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
7193
7194 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
7195 assert(MRI->getType(Src) == LLT::scalar(16));
7196
7197 // Only change Src if src modifier could be gained. In such cases new Src
7198 // could be sgpr but this does not violate constant bus restriction for
7199 // instruction that is being selected.
7200 Src = stripBitCast(Src, *MRI);
7201
7202 const auto CheckAbsNeg = [&]() {
7203 // Be careful about folding modifiers if we already have an abs. fneg is
7204 // applied last, so we don't want to apply an earlier fneg.
7205 if ((Mods & SISrcMods::ABS) == 0) {
7206 unsigned ModsTmp;
7207 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
7208
7209 if ((ModsTmp & SISrcMods::NEG) != 0)
7210 Mods ^= SISrcMods::NEG;
7211
7212 if ((ModsTmp & SISrcMods::ABS) != 0)
7213 Mods |= SISrcMods::ABS;
7214 }
7215 };
7216
7217 CheckAbsNeg();
7218
7219 // op_sel/op_sel_hi decide the source type and source.
7220 // If the source's op_sel_hi is set, it indicates to do a conversion from
7221 // fp16. If the sources's op_sel is set, it picks the high half of the
7222 // source register.
7223
7224 Mods |= SISrcMods::OP_SEL_1;
7225
7226 if (isExtractHiElt(*MRI, Src, Src)) {
7227 Mods |= SISrcMods::OP_SEL_0;
7228 CheckAbsNeg();
7229 }
7230
7231 Matched = true;
7232 }
7233
7234 return {Src, Mods};
7235}
7236
7238AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
7239 MachineOperand &Root) const {
7240 Register Src;
7241 unsigned Mods;
7242 bool Matched;
7243 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7244 if (!Matched)
7245 return {};
7246
7247 return {{
7248 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7249 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7250 }};
7251}
7252
7254AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
7255 Register Src;
7256 unsigned Mods;
7257 bool Matched;
7258 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7259
7260 return {{
7261 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7262 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7263 }};
7264}
7265
7266bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
7267 MachineInstr &I, Intrinsic::ID IntrID) const {
7268 MachineBasicBlock *MBB = I.getParent();
7269 const DebugLoc &DL = I.getDebugLoc();
7270 Register CCReg = I.getOperand(0).getReg();
7271
7272 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
7273 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
7274
7275 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
7276 .addImm(I.getOperand(2).getImm());
7277
7278 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
7279
7280 I.eraseFromParent();
7281 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
7282 *MRI);
7283}
7284
7285bool AMDGPUInstructionSelector::selectSGetBarrierState(
7286 MachineInstr &I, Intrinsic::ID IntrID) const {
7287 MachineBasicBlock *MBB = I.getParent();
7288 const DebugLoc &DL = I.getDebugLoc();
7289 const MachineOperand &BarOp = I.getOperand(2);
7290 std::optional<int64_t> BarValImm =
7291 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7292
7293 if (!BarValImm) {
7294 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7295 .addReg(BarOp.getReg());
7296 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7297 }
7298 MachineInstrBuilder MIB;
7299 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
7300 : AMDGPU::S_GET_BARRIER_STATE_M0;
7301 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7302
7303 auto DstReg = I.getOperand(0).getReg();
7304 const TargetRegisterClass *DstRC =
7305 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7306 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7307 return false;
7308 MIB.addDef(DstReg);
7309 if (BarValImm) {
7310 MIB.addImm(*BarValImm);
7311 }
7312 I.eraseFromParent();
7313 return true;
7314}
7315
7316unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
7317 if (HasInlineConst) {
7318 switch (IntrID) {
7319 default:
7320 llvm_unreachable("not a named barrier op");
7321 case Intrinsic::amdgcn_s_barrier_join:
7322 return AMDGPU::S_BARRIER_JOIN_IMM;
7323 case Intrinsic::amdgcn_s_wakeup_barrier:
7324 return AMDGPU::S_WAKEUP_BARRIER_IMM;
7325 case Intrinsic::amdgcn_s_get_named_barrier_state:
7326 return AMDGPU::S_GET_BARRIER_STATE_IMM;
7327 };
7328 } else {
7329 switch (IntrID) {
7330 default:
7331 llvm_unreachable("not a named barrier op");
7332 case Intrinsic::amdgcn_s_barrier_join:
7333 return AMDGPU::S_BARRIER_JOIN_M0;
7334 case Intrinsic::amdgcn_s_wakeup_barrier:
7335 return AMDGPU::S_WAKEUP_BARRIER_M0;
7336 case Intrinsic::amdgcn_s_get_named_barrier_state:
7337 return AMDGPU::S_GET_BARRIER_STATE_M0;
7338 };
7339 }
7340}
7341
7342bool AMDGPUInstructionSelector::selectNamedBarrierInit(
7343 MachineInstr &I, Intrinsic::ID IntrID) const {
7344 MachineBasicBlock *MBB = I.getParent();
7345 const DebugLoc &DL = I.getDebugLoc();
7346 const MachineOperand &BarOp = I.getOperand(1);
7347 const MachineOperand &CntOp = I.getOperand(2);
7348
7349 // A member count of 0 means "keep existing member count". That plus a known
7350 // constant value for the barrier ID lets us use the immarg form.
7351 if (IntrID == Intrinsic::amdgcn_s_barrier_signal_var) {
7352 std::optional<int64_t> CntImm =
7353 getIConstantVRegSExtVal(CntOp.getReg(), *MRI);
7354 if (CntImm && *CntImm == 0) {
7355 std::optional<int64_t> BarValImm =
7356 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7357 if (BarValImm) {
7358 auto BarID = ((*BarValImm) >> 4) & 0x3F;
7359 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
7360 .addImm(BarID);
7361 I.eraseFromParent();
7362 return true;
7363 }
7364 }
7365 }
7366
7367 // BarID = (BarOp >> 4) & 0x3F
7368 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7369 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7370 .add(BarOp)
7371 .addImm(4u)
7372 .setOperandDead(3); // Dead scc
7373
7374 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7375 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7376 .addReg(TmpReg0)
7377 .addImm(0x3F)
7378 .setOperandDead(3); // Dead scc
7379
7380 // MO = ((CntOp & 0x3F) << shAmt) | BarID
7381 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7382 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
7383 .add(CntOp)
7384 .addImm(0x3F)
7385 .setOperandDead(3); // Dead scc
7386
7387 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7388 constexpr unsigned ShAmt = 16;
7389 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
7390 .addReg(TmpReg2)
7391 .addImm(ShAmt)
7392 .setOperandDead(3); // Dead scc
7393
7394 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7395 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
7396 .addReg(TmpReg1)
7397 .addReg(TmpReg3)
7398 .setOperandDead(3); // Dead scc;
7399
7400 auto CopyMIB =
7401 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
7402 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7403
7404 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7405 ? AMDGPU::S_BARRIER_INIT_M0
7406 : AMDGPU::S_BARRIER_SIGNAL_M0;
7407 MachineInstrBuilder MIB;
7408 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7409
7410 I.eraseFromParent();
7411 return true;
7412}
7413
7414bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7415 MachineInstr &I, Intrinsic::ID IntrID) const {
7416 MachineBasicBlock *MBB = I.getParent();
7417 const DebugLoc &DL = I.getDebugLoc();
7418 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7419 ? I.getOperand(2)
7420 : I.getOperand(1);
7421 std::optional<int64_t> BarValImm =
7422 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7423
7424 if (!BarValImm) {
7425 // BarID = (BarOp >> 4) & 0x3F
7426 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7427 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7428 .addReg(BarOp.getReg())
7429 .addImm(4u)
7430 .setOperandDead(3); // Dead scc;
7431
7432 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7433 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7434 .addReg(TmpReg0)
7435 .addImm(0x3F)
7436 .setOperandDead(3); // Dead scc;
7437
7438 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7439 .addReg(TmpReg1);
7440 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7441 }
7442
7443 MachineInstrBuilder MIB;
7444 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
7445 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7446
7447 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7448 auto DstReg = I.getOperand(0).getReg();
7449 const TargetRegisterClass *DstRC =
7450 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7451 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7452 return false;
7453 MIB.addDef(DstReg);
7454 }
7455
7456 if (BarValImm) {
7457 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7458 MIB.addImm(BarId);
7459 }
7460
7461 I.eraseFromParent();
7462 return true;
7463}
7464
7465void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7466 const MachineInstr &MI,
7467 int OpIdx) const {
7468 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7469 "Expected G_CONSTANT");
7470 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
7471}
7472
7473void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7474 const MachineInstr &MI,
7475 int OpIdx) const {
7476 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7477 "Expected G_CONSTANT");
7478 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
7479}
7480
7481void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7482 const MachineInstr &MI,
7483 int OpIdx) const {
7484 const MachineOperand &Op = MI.getOperand(1);
7485 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7486 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7487}
7488
7489void AMDGPUInstructionSelector::renderCountTrailingOnesImm(
7490 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7491 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7492 "Expected G_CONSTANT");
7493 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countTrailingOnes());
7494}
7495
7496/// This only really exists to satisfy DAG type checking machinery, so is a
7497/// no-op here.
7498void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7499 const MachineInstr &MI,
7500 int OpIdx) const {
7501 const MachineOperand &Op = MI.getOperand(OpIdx);
7502 int64_t Imm;
7503 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
7504 MIB.addImm(Imm);
7505 else
7506 MIB.addImm(Op.getImm());
7507}
7508
7509void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7510 const MachineInstr &MI,
7511 int OpIdx) const {
7512 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7513}
7514
7515void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7516 const MachineInstr &MI,
7517 int OpIdx) const {
7518 assert(OpIdx >= 0 && "expected to match an immediate operand");
7519 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7520}
7521
7522void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7523 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7524 assert(OpIdx >= 0 && "expected to match an immediate operand");
7525 MIB.addImm(
7526 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7527}
7528
7529void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7530 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7531 assert(OpIdx >= 0 && "expected to match an immediate operand");
7532 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
7534 : (int64_t)SISrcMods::DST_OP_SEL);
7535}
7536
7537void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7538 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7539 assert(OpIdx >= 0 && "expected to match an immediate operand");
7540 MIB.addImm(
7541 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7542}
7543
7544void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7545 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7546 assert(OpIdx >= 0 && "expected to match an immediate operand");
7547 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7548 ? (int64_t)(SISrcMods::OP_SEL_0)
7549 : 0);
7550}
7551
7552void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7553 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7554 assert(OpIdx >= 0 && "expected to match an immediate operand");
7555 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7556 : 0);
7557}
7558
7559void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7560 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7561 assert(OpIdx >= 0 && "expected to match an immediate operand");
7562 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7563 : 0);
7564}
7565
7566void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7567 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7568 assert(OpIdx >= 0 && "expected to match an immediate operand");
7569 MIB.addImm(
7570 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7571}
7572
7573void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7574 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7575 assert(OpIdx >= 0 && "expected to match an immediate operand");
7576 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7577 ? (int64_t)SISrcMods::DST_OP_SEL
7578 : 0);
7579}
7580
7581void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7582 const MachineInstr &MI,
7583 int OpIdx) const {
7584 assert(OpIdx >= 0 && "expected to match an immediate operand");
7585 MIB.addImm(MI.getOperand(OpIdx).getImm() &
7588}
7589
7590void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7591 const MachineInstr &MI,
7592 int OpIdx) const {
7593 assert(OpIdx >= 0 && "expected to match an immediate operand");
7594 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7597 MIB.addImm(Swizzle);
7598}
7599
7600void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7601 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7602 assert(OpIdx >= 0 && "expected to match an immediate operand");
7603 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7606 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7607}
7608
7609void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7610 const MachineInstr &MI,
7611 int OpIdx) const {
7612 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7613}
7614
7615void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7616 const MachineInstr &MI,
7617 int OpIdx) const {
7618 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7619 int ExpVal = APF.getExactLog2Abs();
7620 assert(ExpVal != INT_MIN);
7621 MIB.addImm(ExpVal);
7622}
7623
7624void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7625 const MachineInstr &MI,
7626 int OpIdx) const {
7627 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7628 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7629 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7630 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7631 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7632}
7633
7634void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7635 const MachineInstr &MI,
7636 int OpIdx) const {
7637 unsigned Mods = SISrcMods::OP_SEL_1;
7638 if (MI.getOperand(OpIdx).getImm())
7639 Mods ^= SISrcMods::NEG;
7640 MIB.addImm((int64_t)Mods);
7641}
7642
7643void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7644 const MachineInstr &MI,
7645 int OpIdx) const {
7646 unsigned Mods = SISrcMods::OP_SEL_1;
7647 if (MI.getOperand(OpIdx).getImm())
7649 MIB.addImm((int64_t)Mods);
7650}
7651
7652void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7653 const MachineInstr &MI,
7654 int OpIdx) const {
7655 unsigned Val = MI.getOperand(OpIdx).getImm();
7656 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7657 if (Val == 1) // neg
7658 Mods ^= SISrcMods::NEG;
7659 if (Val == 2) // abs
7660 Mods ^= SISrcMods::ABS;
7661 if (Val == 3) // neg and abs
7662 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7663 MIB.addImm((int64_t)Mods);
7664}
7665
7666void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7667 const MachineInstr &MI,
7668 int OpIdx) const {
7669 uint32_t V = MI.getOperand(2).getImm();
7672 if (!Subtarget->hasSafeCUPrefetch())
7673 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7674 MIB.addImm(V);
7675}
7676
7677/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7678void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7679 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7680 unsigned Val = MI.getOperand(OpIdx).getImm();
7681 unsigned New = 0;
7682 if (Val & 0x1)
7684 if (Val & 0x2)
7686 MIB.addImm(New);
7687}
7688
7689bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7690 return TII.isInlineConstant(Imm);
7691}
7692
7693bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7694 return TII.isInlineConstant(Imm);
7695}
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
#define GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static Register getLegalRegBank(Register NewReg, Register RootReg, const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const SIInstrInfo &TII)
static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is shift left with half bits, such as reg0:2n =G_SHL reg1:2n, CONST(n)
static bool isNoUnsignedWrap(MachineInstr *Addr)
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
static bool checkRB(Register Reg, unsigned int RBNo, const AMDGPURegisterBankInfo &RBI, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI)
static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods)
static bool isTruncHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is truncating to half, such as reg0:n = G_TRUNC reg1:2n
static Register getWaveAddress(const MachineInstr *Def)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static TypeClass isVectorOfTwoOrScalar(Register Reg, const MachineRegisterInfo &MRI)
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void addZeroImm(MachineInstrBuilder &MIB)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static bool isConstant(const MachineInstr &MI)
static bool isSameBitWidth(Register Reg1, Register Reg2, const MachineRegisterInfo &MRI)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void diagnoseUnsupportedIntrinsic(const MachineInstr &I)
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelValueTracking &ValueTracking)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
static std::pair< Register, SrcStatus > getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
static std::optional< std::pair< Register, SrcStatus > > calcNextStatus(std::pair< Register, SrcStatus > Curr, const MachineRegisterInfo &MRI)
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg, Register RootReg, const SIInstrInfo &TII, const MachineRegisterInfo &MRI)
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
static SmallVector< std::pair< Register, SrcStatus > > getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static bool isUnmergeHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test function, if the MI is reg0:n, reg1:n = G_UNMERGE_VALUES reg2:2n
static SrcStatus getNegStatus(Register Reg, SrcStatus S, const MachineRegisterInfo &MRI)
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is logic shift right with half bits, such as reg0:2n =G_LSHR reg1:2n,...
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
This file declares the targeting of the InstructionSelector class for AMDGPU.
constexpr LLT S1
constexpr LLT S32
AMDGPU Register Bank Select
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool isAllZeros(StringRef Arr)
Return true if the array is empty or all zeros.
dxil translate DXIL Translate Metadata
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
#define P(N)
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
#define LLVM_DEBUG(...)
Definition Debug.h:119
Value * RHS
Value * LHS
This is used to control valid status that current MI supports.
bool checkOptions(SrcStatus Stat) const
SearchOptions(Register Reg, const MachineRegisterInfo &MRI)
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM)
static const char * getName()
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
void setupMF(MachineFunction &MF, GISelValueTracking *VT, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1600
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:755
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:745
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:754
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:752
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:756
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:753
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:742
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
bool isFPPredicate() const
Definition InstrTypes.h:845
bool isIntPredicate() const
Definition InstrTypes.h:846
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
DILocation * get() const
Get the underlying DILocation.
Definition DebugLoc.h:218
Diagnostic information for unsupported feature in backend.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
virtual void setupMF(MachineFunction &mf, GISelValueTracking *vt, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool hasValue() const
TypeSize getValue() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void setReturnAddressIsTaken(bool s)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Helper class to build MachineInstr.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Analysis providing profile information.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static bool isGenericOpcode(unsigned Opc)
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
bool isGFX13Plus(const MCSubtargetInfo &STI)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
IndexMode
ARM Index Modes.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
operand_type_match m_Reg()
SpecificConstantMatch m_SpecificICst(const APInt &RequestedValue)
Matches a constant equal to RequestedValue.
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_SEXT > m_GSExt(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
ImplicitDefMatch m_GImplicitDef()
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
BinaryOp_match< LHS, RHS, TargetOpcode::G_ASHR, false > m_GAShr(const LHS &L, const RHS &R)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
SpecificRegisterMatch m_SpecificReg(Register RequestedReg)
Matches a register only if it is equal to RequestedReg.
BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_ANYEXT > m_GAnyExt(const SrcTy &Src)
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, TargetOpcode::G_MUL, true > m_GMul(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:861
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI bool isBuildVectorAllZeros(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndef=false)
Return true if the specified instruction is a G_BUILD_VECTOR or G_BUILD_VECTOR_TRUNC where all of the...
Definition Utils.cpp:1447
LLVM_ABI Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition Utils.cpp:60
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:656
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:464
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:297
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:159
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition Utils.cpp:497
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
Definition Utils.cpp:317
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition Utils.cpp:442
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:436
LLVM_ABI std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, and underlying value Register folding away any copies.
Definition Utils.cpp:472
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition Utils.cpp:504
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
constexpr RegState getUndefRegState(bool B)
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.