//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

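// Pull in the TableGen-generated matcher implementation. The generated code
// names the subtarget AMDGPUSubtarget, so alias it to GCNSubtarget for the
// duration of the include.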
#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }


// Return the wave level SGPR base address if this is a wave address.
static Register getWaveAddress(const MachineInstr *Def) {
  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
             ? Def->getOperand(1).getReg()
             : Register();
}

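// Return true if Reg holds a wave-wide lane mask (one bit per lane, as
// produced by vector compares) rather than a normal 32-bit scalar bool.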
bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
      return false;
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
    return false;
  const MCInstrDesc &MCID = MI.getDesc();
  if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
    MI.getOperand(0).setIsEarlyClobber(true);
  }
  return true;
}

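// Select a generic COPY. The interesting case is a copy of a 32-bit bool into
// a vcc lane mask: the source's high bits cannot be trusted, so it is masked
// to bit 0 and compared against zero to broadcast the value to every lane.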
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
          assert(Subtarget->useRealTrue16Insts());
          const int64_t NoMods = 0;
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
              .addImm(NoMods)
              .addImm(1)
              .addImm(NoMods)
              .addReg(SrcReg)
              .addImm(NoMods);
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
              .addImm(NoMods)
              .addImm(0)
              .addImm(NoMods)
              .addReg(MaskedReg)
              .addImm(NoMods);
        } else {
          bool IsSGPR = TRI.isSGPRClass(SrcRC);
          unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
          auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
                         .addImm(1)
                         .addReg(SrcReg);
          if (IsSGPR)
            And.setOperandDead(3); // Dead scc

          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
              .addImm(0)
              .addReg(MaskedReg);
        }
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

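// Lower a copy from a vcc lane mask to an SCC-backed 32-bit bool: compare the
// mask against zero so that SCC is set iff any lane is active, then copy SCC
// into the destination.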
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  Register VCCReg = I.getOperand(1).getReg();
  MachineInstr *Cmp;

  // Set SCC as a side effect with S_CMP or S_OR.
  if (STI.hasScalarCompareEq64()) {
    unsigned CmpOpc =
        STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
    Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
  } else {
    Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
    Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
              .addReg(VCCReg)
              .addReg(VCCReg);
  }

  constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);

  Register DstReg = I.getOperand(0).getReg();
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}

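// Lower a copy from an SCC-backed bool to a vcc lane mask: move the bool into
// SCC and S_CSELECT between EXEC and 0 so all active lanes see the value.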
bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  std::optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);

  if (Arg) {
    const int64_t Value = Arg->Value.getZExtValue();
    if (Value == 0) {
      unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else {
      assert(Value == 1);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
    }
    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
  }

  // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);

  unsigned SelectOpcode =
      STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
                             .addReg(TRI.getExec())
                             .addImm(0);

  I.eraseFromParent();
  constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
  return true;
}

bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
                 .addReg(SrcReg);

  I.eraseFromParent();
  constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // S1 G_PHIs should not be selected in instruction-select, instead:
  // - divergent S1 G_PHI should go through lane mask merging algorithm
  //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
  // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
  if (DefTy == LLT::scalar(1))
    return false;

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // If inputs have register bank, assign corresponding reg class.
  // Note: registers don't need to have the same reg bank.
  for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
    const Register SrcReg = I.getOperand(i).getReg();

    const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
    if (RB) {
      const LLT SrcTy = MRI->getType(SrcReg);
      const TargetRegisterClass *SrcRC =
          TRI.getRegClassForTypeOnBank(SrcTy, *RB);
      if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
        return false;
    }
  }

  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

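// Extract the 32-bit low (sub0) or high (sub1) half of a 64-bit register or
// immediate operand, materializing a subregister copy for register operands.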
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
        .addReg(Reg, {}, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true,  // isImp
                                         false, // isKill
                                         true)); // isDead
  constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  return true;
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
          BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
              .add(I.getOperand(1))
              .add(I.getOperand(2))
              .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
      return true;
    }

    if (STI.hasAddNoCarryInsts()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      constrainSelectedInstRegOperands(I, TII, TRI, RBI);
      return true;
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
            .addDef(UnusedCarry, RegState::Dead)
            .add(I.getOperand(1))
            .add(I.getOperand(2))
            .addImm(0);
    I.eraseFromParent();
    constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    return true;
  }

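  // 64-bit case: split the operands into 32-bit halves, add them with an
  // explicit carry chain, and reassemble the result with a REG_SEQUENCE.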
  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
        .add(Lo1)
        .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
        .add(Hi1)
        .add(Hi2)
        .setOperandDead(3); // Dead scc
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
        .addDef(CarryReg)
        .add(Lo1)
        .add(Lo2)
        .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
        .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
        .add(Hi1)
        .add(Hi2)
        .addReg(CarryReg, RegState::Kill)
        .addImm(0);

    constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI);
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(DstLo)
      .addImm(AMDGPU::sub0)
      .addReg(DstHi)
      .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    return true;
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
        .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  auto CarryInst =
      BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
          .add(I.getOperand(2))
          .add(I.getOperand(3));

  if (MRI->use_nodbg_empty(Dst1Reg)) {
    CarryInst.setOperandDead(3); // Dead scc
  } else {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
        .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

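// Select the 32x32->64 multiply-add pseudos. If the carry-out is unused and
// the subtarget has no-carry MAD variants, the carry def is dropped entirely.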
bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
  bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
                    MRI->use_nodbg_empty(I.getOperand(1).getReg());

  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else if (UseNoCarry)
    Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
                     : AMDGPU::V_MAD_NC_I64_I32_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;

  if (UseNoCarry)
    I.removeOperand(1);

  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  I.getOperand(0).setIsEarlyClobber(true);
  constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  return true;
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
      .addReg(SrcReg, {}, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
      BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    // hi16:sreg_32 is not allowed so explicitly shift upper 16-bits.
    if (SrcBank->getID() == AMDGPU::SGPRRegBankID &&
        SubRegs[I] == AMDGPU::hi16) {
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst.getReg())
          .addReg(SrcReg)
          .addImm(16);
    } else {
      BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
          .addReg(SrcReg, {}, SubRegs[I]);
    }

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16)
                     .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
      return true;
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
  return true;
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
      .addReg(Src0Reg)
      .addReg(Src1Reg)
      .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
                 .addReg(SrcReg)
                 .addReg(OffsetReg)
                 .addReg(WidthReg);
  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  return true;
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
      .addImm(2)
      .addImm(MI.getOperand(4).getImm())  // $attr
      .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
      .addImm(0)                          // $src0_modifiers
      .addReg(Src0)                       // $src0
      .addImm(MI.getOperand(4).getImm())  // $attr
      .addImm(MI.getOperand(3).getImm())  // $attrchan
      .addImm(0)                          // $src2_modifiers
      .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
      .addImm(MI.getOperand(5).getImm())  // $high
      .addImm(0)                          // $clamp
      .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  std::optional<ValueAndVReg> ConstSelect =
      getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    std::optional<ValueAndVReg> ConstVal =
        getIConstantVRegValWithLookThrough(Val, *MRI);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
          .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  return true;
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
                 .addDef(Dst1)
                 .addImm(0)     // $src0_modifiers
                 .addUse(Src0)  // $src0
                 .addImm(0)     // $src1_modifiers
                 .addUse(Denom) // $src1
                 .addImm(0)     // $src2_modifiers
                 .addUse(Numer) // $src2
                 .addImm(0)     // $clamp
                 .addImm(0);    // $omod

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
        .add(I.getOperand(0))
        .add(I.getOperand(2))
        .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    return selectSMFMACIntrin(I);
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap:
    return selectPermlaneSwapIntrin(I, IntrinsicID);
  case Intrinsic::amdgcn_wave_shuffle:
    return selectWaveShuffleIntrin(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

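// Map a predicate and operand size to the VALU compare (V_CMP_*) opcode that
// produces a lane-mask result, or -1 if none exists. For 16-bit compares this
// chooses between the legacy, true16, and fake16 encodings.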
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
                          const GCNSubtarget &ST) {
  if (Size != 16 && Size != 32 && Size != 64)
    return -1;

  if (Size == 16 && !ST.has16BitInsts())
    return -1;

  const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
                          unsigned FakeS16Opc, unsigned S32Opc,
                          unsigned S64Opc) {
    if (Size == 16)
      return ST.hasTrue16BitInsts()
                 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
                 : S16Opc;
    if (Size == 32)
      return S32Opc;
    return S64Opc;
  };

  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
                  AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
                  AMDGPU::V_CMP_NE_U64_e64);
  case CmpInst::ICMP_EQ:
    return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
                  AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
                  AMDGPU::V_CMP_EQ_U64_e64);
  case CmpInst::ICMP_SGT:
    return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
                  AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
                  AMDGPU::V_CMP_GT_I64_e64);
  case CmpInst::ICMP_SGE:
    return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
                  AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
                  AMDGPU::V_CMP_GE_I64_e64);
  case CmpInst::ICMP_SLT:
    return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
                  AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
                  AMDGPU::V_CMP_LT_I64_e64);
  case CmpInst::ICMP_SLE:
    return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
                  AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
                  AMDGPU::V_CMP_LE_I64_e64);
  case CmpInst::ICMP_UGT:
    return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
                  AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
                  AMDGPU::V_CMP_GT_U64_e64);
  case CmpInst::ICMP_UGE:
    return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
                  AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
                  AMDGPU::V_CMP_GE_U64_e64);
  case CmpInst::ICMP_ULT:
    return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
                  AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
                  AMDGPU::V_CMP_LT_U64_e64);
  case CmpInst::ICMP_ULE:
    return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
                  AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
                  AMDGPU::V_CMP_LE_U64_e64);

  case CmpInst::FCMP_OEQ:
    return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
                  AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
                  AMDGPU::V_CMP_EQ_F64_e64);
  case CmpInst::FCMP_OGT:
    return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
                  AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
                  AMDGPU::V_CMP_GT_F64_e64);
  case CmpInst::FCMP_OGE:
    return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
                  AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
                  AMDGPU::V_CMP_GE_F64_e64);
  case CmpInst::FCMP_OLT:
    return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
                  AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
                  AMDGPU::V_CMP_LT_F64_e64);
  case CmpInst::FCMP_OLE:
    return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
                  AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
                  AMDGPU::V_CMP_LE_F64_e64);
  case CmpInst::FCMP_ONE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
                  AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_ORD:
    return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
                  AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
                  AMDGPU::V_CMP_O_F64_e64);
  case CmpInst::FCMP_UNO:
    return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
                  AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
                  AMDGPU::V_CMP_U_F64_e64);
  case CmpInst::FCMP_UEQ:
    return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
                  AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
                  AMDGPU::V_CMP_NLG_F64_e64);
  case CmpInst::FCMP_UGT:
    return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
                  AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
                  AMDGPU::V_CMP_NLE_F64_e64);
  case CmpInst::FCMP_UGE:
    return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
                  AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
                  AMDGPU::V_CMP_NLT_F64_e64);
  case CmpInst::FCMP_ULT:
    return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
                  AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
                  AMDGPU::V_CMP_NGE_F64_e64);
  case CmpInst::FCMP_ULE:
    return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
                  AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
                  AMDGPU::V_CMP_NGT_F64_e64);
  case CmpInst::FCMP_UNE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
                  AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_TRUE:
    return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
                  AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
                  AMDGPU::V_CMP_TRU_F64_e64);
  case CmpInst::FCMP_FALSE:
    return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
                  AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
                  AMDGPU::V_CMP_F_F64_e64);
  }
}

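// Map a predicate and operand size to the scalar (S_CMP_*) opcode that writes
// SCC, or -1 when no scalar form exists for this predicate/size combination.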
int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size == 32) {
    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U32;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U32;
    case CmpInst::ICMP_SGT:
      return AMDGPU::S_CMP_GT_I32;
    case CmpInst::ICMP_SGE:
      return AMDGPU::S_CMP_GE_I32;
    case CmpInst::ICMP_SLT:
      return AMDGPU::S_CMP_LT_I32;
    case CmpInst::ICMP_SLE:
      return AMDGPU::S_CMP_LE_I32;
    case CmpInst::ICMP_UGT:
      return AMDGPU::S_CMP_GT_U32;
    case CmpInst::ICMP_UGE:
      return AMDGPU::S_CMP_GE_U32;
    case CmpInst::ICMP_ULT:
      return AMDGPU::S_CMP_LT_U32;
    case CmpInst::ICMP_ULE:
      return AMDGPU::S_CMP_LE_U32;
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F32;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F32;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F32;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F32;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F32;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F32;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F32;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F32;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F32;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F32;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F32;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F32;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F32;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F32;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  if (Size == 16) {
    if (!STI.hasSALUFloatInsts())
      return -1;

    switch (P) {
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F16;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F16;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F16;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F16;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F16;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F16;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F16;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F16;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F16;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F16;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F16;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F16;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F16;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F16;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  return -1;
}

bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
                             .add(I.getOperand(2))
                             .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
        .addReg(AMDGPU::SCC);
    constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
    bool Ret =
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  if (I.getOpcode() == AMDGPU::G_FCMP)
    return false;

  int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstrBuilder ICmp;
  // t16 instructions
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
    ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
               .addImm(0)
               .add(I.getOperand(2))
               .addImm(0)
               .add(I.getOperand(3))
               .addImm(0); // op_sel
  } else {
    ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
               .add(I.getOperand(2))
               .add(I.getOperand(3));
  }

  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  LLT DstTy = MRI->getType(Dst);
  if (DstTy.getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  // i1 inputs are not supported in GlobalISel.
  if (Size == 1)
    return false;

  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
  if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
    I.eraseFromParent();
    return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  }

  const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstrBuilder SelectedMI;
  MachineOperand &LHS = I.getOperand(2);
  MachineOperand &RHS = I.getOperand(3);
  auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
  auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
  Register Src0Reg =
      copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
  Register Src1Reg =
      copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
  SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
    SelectedMI.addImm(Src0Mods);
  SelectedMI.addReg(Src0Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
    SelectedMI.addImm(Src1Mods);
  SelectedMI.addReg(Src1Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
    SelectedMI.addImm(0); // clamp
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
    SelectedMI.addImm(0); // op_sel

  RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI);

  I.eraseFromParent();
  return true;
}

1599// Ballot has to zero bits in input lane-mask that are zero in current exec,
1600// Done as AND with exec. For inputs that are results of instruction that
1601// implicitly use same exec, for example compares in same basic block or SCC to
1602// VCC copy, use copy.
1605 MachineInstr *MI = MRI.getVRegDef(Reg);
1606 if (MI->getParent() != MBB)
1607 return false;
1608
1609 // Lane mask generated by SCC to VCC copy.
1610 if (MI->getOpcode() == AMDGPU::COPY) {
1611 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1612 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1613 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1614 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1615 return true;
1616 }
1617
1618 // Lane mask generated using compare with same exec.
1619 if (isa<GAnyCmp>(MI))
1620 return true;
1621
1622 Register LHS, RHS;
1623 // Look through AND.
1624 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1625 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1626 isLaneMaskFromSameBlock(RHS, MRI, MBB);
1627
1628 return false;
1629}
1630
1631bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1632 MachineBasicBlock *BB = I.getParent();
1633 const DebugLoc &DL = I.getDebugLoc();
1634 Register DstReg = I.getOperand(0).getReg();
1635 Register SrcReg = I.getOperand(2).getReg();
1636 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1637 const unsigned WaveSize = STI.getWavefrontSize();
1638
1639 // In the common case, the return type matches the wave size.
1640 // However, we also support emitting i64 ballots in wave32 mode.
1641 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1642 return false;
1643
1644 std::optional<ValueAndVReg> Arg =
1645 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1646
1647 Register Dst = DstReg;
1648 // For an i64 ballot on Wave32, build the wave-size (i32) ballot into a new Dst first.
1649 if (BallotSize != WaveSize) {
1650 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1651 }
1652
1653 if (Arg) {
1654 const int64_t Value = Arg->Value.getZExtValue();
1655 if (Value == 0) {
1656 // Dst = S_MOV 0
1657 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1658 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1659 } else {
1660 // Dst = COPY EXEC
1661 assert(Value == 1);
1662 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1663 }
1664 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1665 return false;
1666 } else {
1667 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1668 // Dst = COPY SrcReg
1669 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1670 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1671 return false;
1672 } else {
1673 // Dst = S_AND SrcReg, EXEC
1674 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1675 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1676 .addReg(SrcReg)
1677 .addReg(TRI.getExec())
1678 .setOperandDead(3); // Dead scc
1679 constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
1680 }
1681 }
1682
1683 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1684 if (BallotSize != WaveSize) {
1685 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1686 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1687 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1688 .addReg(Dst)
1689 .addImm(AMDGPU::sub0)
1690 .addReg(HiReg)
1691 .addImm(AMDGPU::sub1);
1692 }
1693
1694 I.eraseFromParent();
1695 return true;
1696}
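// Roughly, the selected MIR for the paths above (assuming wave64) is:
//   %dst:sreg_64 = COPY %cmp                                      ; same-block lane mask
//   %dst:sreg_64 = S_AND_B64 %src, $exec, implicit-def dead $scc  ; generic input
//   %dst:sreg_64 = S_MOV_B64 0 / COPY $exec                       ; constant 0 / 1 input
// with an extra REG_SEQUENCE zero-extension for i64 ballots on wave32.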
1697
1698bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1699 Register DstReg = I.getOperand(0).getReg();
1700 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1701 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1702 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1703 return false;
1704
1705 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1706
1707 Module *M = MF->getFunction().getParent();
1708 const MDNode *Metadata = I.getOperand(2).getMetadata();
1709 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1710 auto *RelocSymbol = cast<GlobalVariable>(
1711 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1712
1713 MachineBasicBlock *BB = I.getParent();
1714 BuildMI(*BB, &I, I.getDebugLoc(),
1715 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1716 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1717
1718 I.eraseFromParent();
1719 return true;
1720}
1721
1722bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1723 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1724
1725 Register DstReg = I.getOperand(0).getReg();
1726 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1727 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1728 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1729
1730 MachineBasicBlock *MBB = I.getParent();
1731 const DebugLoc &DL = I.getDebugLoc();
1732
1733 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1734
1735 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1736 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1737 MIB.addImm(MFI->getLDSSize());
1738 } else {
1739 Module *M = MF->getFunction().getParent();
1740 const GlobalValue *GV =
1741 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1742 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1743 }
1744
1745 I.eraseFromParent();
1746 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1747 return true;
1748}
1749
1750bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1751 MachineBasicBlock *MBB = I.getParent();
1752 MachineFunction &MF = *MBB->getParent();
1753 const DebugLoc &DL = I.getDebugLoc();
1754
1755 MachineOperand &Dst = I.getOperand(0);
1756 Register DstReg = Dst.getReg();
1757 unsigned Depth = I.getOperand(2).getImm();
1758
1759 const TargetRegisterClass *RC
1760 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1761 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1762 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1763 return false;
1764
1765 // Check for kernel and shader functions
1766 if (Depth != 0 ||
1767 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1768 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1769 .addImm(0);
1770 I.eraseFromParent();
1771 return true;
1772 }
1773
1774 MachineFrameInfo &MFI = MF.getFrameInfo();
1775 // There is a call to @llvm.returnaddress in this function
1776 MFI.setReturnAddressIsTaken(true);
1777
1778 // Get the return address reg and mark it as an implicit live-in
1779 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1780 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1781 AMDGPU::SReg_64RegClass, DL);
1782 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1783 .addReg(LiveIn);
1784 I.eraseFromParent();
1785 return true;
1786}
1787
1788bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1789 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1790 // SelectionDAG uses for wave32 vs wave64.
1791 MachineBasicBlock *BB = MI.getParent();
1792 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1793 .add(MI.getOperand(1));
1794
1795 Register Reg = MI.getOperand(1).getReg();
1796 MI.eraseFromParent();
1797
1798 if (!MRI->getRegClassOrNull(Reg))
1799 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1800 return true;
1801}
1802
1803bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1804 MachineInstr &MI, Intrinsic::ID IntrID) const {
1805 MachineBasicBlock *MBB = MI.getParent();
1806 MachineFunction *MF = MBB->getParent();
1807 const DebugLoc &DL = MI.getDebugLoc();
1808
1809 unsigned IndexOperand = MI.getOperand(7).getImm();
1810 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1811 bool WaveDone = MI.getOperand(9).getImm() != 0;
1812
1813 if (WaveDone && !WaveRelease) {
1814 // TODO: Move this to IR verifier
1815 const Function &Fn = MF->getFunction();
1816 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1817 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1818 }
1819
1820 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1821 IndexOperand &= ~0x3f;
1822 unsigned CountDw = 0;
1823
1824 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1825 CountDw = (IndexOperand >> 24) & 0xf;
1826 IndexOperand &= ~(0xf << 24);
1827
1828 if (CountDw < 1 || CountDw > 4) {
1829 const Function &Fn = MF->getFunction();
1830 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1831 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1832 CountDw = 1;
1833 }
1834 }
1835
1836 if (IndexOperand) {
1837 const Function &Fn = MF->getFunction();
1838 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1839 Fn, "ds_ordered_count: bad index operand", DL));
1840 }
1841
1842 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1843 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1844
1845 unsigned Offset0 = OrderedCountIndex << 2;
1846 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1847
1848 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1849 Offset1 |= (CountDw - 1) << 6;
1850
1851 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1852 Offset1 |= ShaderType << 2;
1853
1854 unsigned Offset = Offset0 | (Offset1 << 8);
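// Sketch of the packed DS_ORDERED_COUNT offset implied by the shifts above:
//   bits [7:2]   OrderedCountIndex
//   bit  8       WaveRelease
//   bit  9       WaveDone
//   bits [11:10] ShaderType (pre-GFX11 only)
//   bit  12      Instruction (0 = ordered add, 1 = ordered swap)
//   bits [15:14] CountDw - 1 (GFX10+ only)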
1855
1856 Register M0Val = MI.getOperand(2).getReg();
1857 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1858 .addReg(M0Val);
1859
1860 Register DstReg = MI.getOperand(0).getReg();
1861 Register ValReg = MI.getOperand(3).getReg();
1862 MachineInstrBuilder DS =
1863 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1864 .addReg(ValReg)
1865 .addImm(Offset)
1866 .cloneMemRefs(MI);
1867
1868 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1869 return false;
1870
1871 constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1872 MI.eraseFromParent();
1873 return true;
1874}
1875
1876static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1877 switch (IntrID) {
1878 case Intrinsic::amdgcn_ds_gws_init:
1879 return AMDGPU::DS_GWS_INIT;
1880 case Intrinsic::amdgcn_ds_gws_barrier:
1881 return AMDGPU::DS_GWS_BARRIER;
1882 case Intrinsic::amdgcn_ds_gws_sema_v:
1883 return AMDGPU::DS_GWS_SEMA_V;
1884 case Intrinsic::amdgcn_ds_gws_sema_br:
1885 return AMDGPU::DS_GWS_SEMA_BR;
1886 case Intrinsic::amdgcn_ds_gws_sema_p:
1887 return AMDGPU::DS_GWS_SEMA_P;
1888 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1889 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1890 default:
1891 llvm_unreachable("not a gws intrinsic");
1892 }
1893}
1894
1895bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1896 Intrinsic::ID IID) const {
1897 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1898 !STI.hasGWSSemaReleaseAll()))
1899 return false;
1900
1901 // intrinsic ID, vsrc, offset
1902 const bool HasVSrc = MI.getNumOperands() == 3;
1903 assert(HasVSrc || MI.getNumOperands() == 2);
1904
1905 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1906 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1907 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1908 return false;
1909
1910 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1911 unsigned ImmOffset;
1912
1913 MachineBasicBlock *MBB = MI.getParent();
1914 const DebugLoc &DL = MI.getDebugLoc();
1915
1916 MachineInstr *Readfirstlane = nullptr;
1917
1918 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1919 // incoming offset, in case there's an add of a constant. We'll have to put it
1920 // back later.
1921 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1922 Readfirstlane = OffsetDef;
1923 BaseOffset = OffsetDef->getOperand(1).getReg();
1924 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1925 }
1926
1927 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1928 // If we have a constant offset, try to use the 0 in m0 as the base.
1929 // TODO: Look into changing the default m0 initialization value. If the
1930 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1931 // the immediate offset.
1932
1933 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1934 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1935 .addImm(0);
1936 } else {
1937 std::tie(BaseOffset, ImmOffset) =
1938 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1939
1940 if (Readfirstlane) {
1941 // We have the constant offset now, so put the readfirstlane back on the
1942 // variable component.
1943 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1944 return false;
1945
1946 Readfirstlane->getOperand(1).setReg(BaseOffset);
1947 BaseOffset = Readfirstlane->getOperand(0).getReg();
1948 } else {
1949 if (!RBI.constrainGenericRegister(BaseOffset,
1950 AMDGPU::SReg_32RegClass, *MRI))
1951 return false;
1952 }
1953
1954 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1955 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1956 .addReg(BaseOffset)
1957 .addImm(16)
1958 .setOperandDead(3); // Dead scc
1959
1960 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1961 .addReg(M0Base);
1962 }
1963
1964 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1965 // offset field) % 64. Some versions of the programming guide omit the m0
1966 // part, or claim it's from offset 0.
1967
1968 unsigned Opc = gwsIntrinToOpcode(IID);
1969 const MCInstrDesc &InstrDesc = TII.get(Opc);
1970
1971 if (HasVSrc) {
1972 Register VSrc = MI.getOperand(1).getReg();
1973
1974 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
1975 const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
1976 const TargetRegisterClass *SubRC =
1977 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
1978
1979 if (!SubRC) {
1980 // 32-bit normal case.
1981 if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
1982 return false;
1983
1984 BuildMI(*MBB, &MI, DL, InstrDesc)
1985 .addReg(VSrc)
1986 .addImm(ImmOffset)
1987 .cloneMemRefs(MI);
1988 } else {
1989 // Requires even register alignment, so create 64-bit value and pad the
1990 // top half with undef.
1991 Register DataReg = MRI->createVirtualRegister(DataRC);
1992 if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
1993 return false;
1994
1995 Register UndefReg = MRI->createVirtualRegister(SubRC);
1996 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1997 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
1998 .addReg(VSrc)
1999 .addImm(AMDGPU::sub0)
2000 .addReg(UndefReg)
2001 .addImm(AMDGPU::sub1);
2002
2003 BuildMI(*MBB, &MI, DL, InstrDesc)
2004 .addReg(DataReg)
2005 .addImm(ImmOffset)
2006 .cloneMemRefs(MI);
2007 }
2008 } else {
2009 BuildMI(*MBB, &MI, DL, InstrDesc)
2010 .addImm(ImmOffset)
2011 .cloneMemRefs(MI);
2012 }
2013
2014 MI.eraseFromParent();
2015 return true;
2016}
2017
2018bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2019 bool IsAppend) const {
2020 Register PtrBase = MI.getOperand(2).getReg();
2021 LLT PtrTy = MRI->getType(PtrBase);
2022 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2023
2024 unsigned Offset;
2025 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
2026
2027 // TODO: Should this try to look through readfirstlane like GWS?
2028 if (!isDSOffsetLegal(PtrBase, Offset)) {
2029 PtrBase = MI.getOperand(2).getReg();
2030 Offset = 0;
2031 }
2032
2033 MachineBasicBlock *MBB = MI.getParent();
2034 const DebugLoc &DL = MI.getDebugLoc();
2035 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2036
2037 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2038 .addReg(PtrBase);
2039 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2040 return false;
2041
2042 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
2043 .addImm(Offset)
2044 .addImm(IsGDS ? -1 : 0)
2045 .cloneMemRefs(MI);
2046 MI.eraseFromParent();
2047 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2048 return true;
2049}
2050
2051bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2052 MachineFunction *MF = MI.getMF();
2053 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2054
2055 MFInfo->setInitWholeWave();
2056 return selectImpl(MI, *CoverageInfo);
2057}
2058
2059static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2060 bool &IsTexFail) {
2061 if (TexFailCtrl)
2062 IsTexFail = true;
2063
2064 TFE = TexFailCtrl & 0x1;
2065 TexFailCtrl &= ~(uint64_t)0x1;
2066 LWE = TexFailCtrl & 0x2;
2067 TexFailCtrl &= ~(uint64_t)0x2;
2068
2069 return TexFailCtrl == 0;
2070}
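// For example, TexFailCtrl == 0x3 yields TFE = true, LWE = true and
// IsTexFail = true; any set bits beyond [1:0] make this return false, which
// the caller below treats as a selection failure.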
2071
2072bool AMDGPUInstructionSelector::selectImageIntrinsic(
2073 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2074 MachineBasicBlock *MBB = MI.getParent();
2075 const DebugLoc &DL = MI.getDebugLoc();
2076 unsigned IntrOpcode = Intr->BaseOpcode;
2077
2078 // For image atomic: use no-return opcode if result is unused.
2079 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2080 Register ResultDef = MI.getOperand(0).getReg();
2081 if (MRI->use_nodbg_empty(ResultDef))
2082 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2083 }
2084
2085 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2086 AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
2087
2088 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2089 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2090 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2091 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2092
2093 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2094
2095 Register VDataIn = AMDGPU::NoRegister;
2096 Register VDataOut = AMDGPU::NoRegister;
2097 LLT VDataTy;
2098 int NumVDataDwords = -1;
2099 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2100 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2101
2102 bool Unorm;
2103 if (!BaseOpcode->Sampler)
2104 Unorm = true;
2105 else
2106 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2107
2108 bool TFE;
2109 bool LWE;
2110 bool IsTexFail = false;
2111 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2112 TFE, LWE, IsTexFail))
2113 return false;
2114
2115 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2116 const bool IsA16 = (Flags & 1) != 0;
2117 const bool IsG16 = (Flags & 2) != 0;
2118
2119 // A16 implies 16 bit gradients if subtarget doesn't support G16
2120 if (IsA16 && !STI.hasG16() && !IsG16)
2121 return false;
2122
2123 unsigned DMask = 0;
2124 unsigned DMaskLanes = 0;
2125
2126 if (BaseOpcode->Atomic) {
2127 if (!BaseOpcode->NoReturn)
2128 VDataOut = MI.getOperand(0).getReg();
2129 VDataIn = MI.getOperand(2).getReg();
2130 LLT Ty = MRI->getType(VDataIn);
2131
2132 // Be careful to allow atomic swap on 16-bit element vectors.
2133 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2134 Ty.getSizeInBits() == 128 :
2135 Ty.getSizeInBits() == 64;
2136
2137 if (BaseOpcode->AtomicX2) {
2138 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2139
2140 DMask = Is64Bit ? 0xf : 0x3;
2141 NumVDataDwords = Is64Bit ? 4 : 2;
2142 } else {
2143 DMask = Is64Bit ? 0x3 : 0x1;
2144 NumVDataDwords = Is64Bit ? 2 : 1;
2145 }
2146 } else {
2147 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2148 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2149
2150 if (BaseOpcode->Store) {
2151 VDataIn = MI.getOperand(1).getReg();
2152 VDataTy = MRI->getType(VDataIn);
2153 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2154 } else if (BaseOpcode->NoReturn) {
2155 NumVDataDwords = 0;
2156 } else {
2157 VDataOut = MI.getOperand(0).getReg();
2158 VDataTy = MRI->getType(VDataOut);
2159 NumVDataDwords = DMaskLanes;
2160
2161 if (IsD16 && !STI.hasUnpackedD16VMem())
2162 NumVDataDwords = (DMaskLanes + 1) / 2;
2163 }
2164 }
2165
2166 // Set G16 opcode
2167 if (Subtarget->hasG16() && IsG16) {
2168 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2169 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2170 assert(G16MappingInfo);
2171 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2172 }
2173
2174 // TODO: Check this in verifier.
2175 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2176
2177 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2178 // Keep GLC only when the atomic's result is actually used.
2179 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2180 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
2181 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2182 AMDGPU::CPol::VOLATILE))
2183 return false;
2184
2185 int NumVAddrRegs = 0;
2186 int NumVAddrDwords = 0;
2187 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2188 // Skip the $noregs and 0s inserted during legalization.
2189 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2190 if (!AddrOp.isReg())
2191 continue; // XXX - Break?
2192
2193 Register Addr = AddrOp.getReg();
2194 if (!Addr)
2195 break;
2196
2197 ++NumVAddrRegs;
2198 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2199 }
2200
2201 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2202 // NSA, these should have been packed into a single value in the first
2203 // address register.
2204 const bool UseNSA =
2205 NumVAddrRegs != 1 &&
2206 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2207 : NumVAddrDwords == NumVAddrRegs);
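// Roughly: three VGPRs each holding one address dword can use NSA, while a
// single wide register packing every dword never does (NumVAddrRegs == 1);
// on partial-NSA subtargets some registers may pack multiple dwords, hence
// the >= comparison.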
2208 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2209 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2210 return false;
2211 }
2212
2213 if (IsTexFail)
2214 ++NumVDataDwords;
2215
2216 int Opcode = -1;
2217 if (IsGFX12Plus) {
2218 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2219 NumVDataDwords, NumVAddrDwords);
2220 } else if (IsGFX11Plus) {
2221 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2222 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2223 : AMDGPU::MIMGEncGfx11Default,
2224 NumVDataDwords, NumVAddrDwords);
2225 } else if (IsGFX10Plus) {
2226 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2227 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2228 : AMDGPU::MIMGEncGfx10Default,
2229 NumVDataDwords, NumVAddrDwords);
2230 } else {
2231 if (Subtarget->hasGFX90AInsts()) {
2232 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2233 NumVDataDwords, NumVAddrDwords);
2234 if (Opcode == -1) {
2235 LLVM_DEBUG(
2236 dbgs()
2237 << "requested image instruction is not supported on this GPU\n");
2238 return false;
2239 }
2240 }
2241 if (Opcode == -1 &&
2242 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2243 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2244 NumVDataDwords, NumVAddrDwords);
2245 if (Opcode == -1)
2246 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2247 NumVDataDwords, NumVAddrDwords);
2248 }
2249 if (Opcode == -1)
2250 return false;
2251
2252 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2253 .cloneMemRefs(MI);
2254
2255 if (VDataOut) {
2256 if (BaseOpcode->AtomicX2) {
2257 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2258
2259 Register TmpReg = MRI->createVirtualRegister(
2260 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2261 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2262
2263 MIB.addDef(TmpReg);
2264 if (!MRI->use_empty(VDataOut)) {
2265 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2266 .addReg(TmpReg, RegState::Kill, SubReg);
2267 }
2268
2269 } else {
2270 MIB.addDef(VDataOut); // vdata output
2271 }
2272 }
2273
2274 if (VDataIn)
2275 MIB.addReg(VDataIn); // vdata input
2276
2277 for (int I = 0; I != NumVAddrRegs; ++I) {
2278 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2279 if (SrcOp.isReg()) {
2280 assert(SrcOp.getReg() != 0);
2281 MIB.addReg(SrcOp.getReg());
2282 }
2283 }
2284
2285 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2286 if (BaseOpcode->Sampler)
2287 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2288
2289 MIB.addImm(DMask); // dmask
2290
2291 if (IsGFX10Plus)
2292 MIB.addImm(DimInfo->Encoding);
2293 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2294 MIB.addImm(Unorm);
2295
2296 MIB.addImm(CPol);
2297 MIB.addImm(IsA16 && // a16 or r128
2298 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2299 if (IsGFX10Plus)
2300 MIB.addImm(IsA16 ? -1 : 0);
2301
2302 if (!Subtarget->hasGFX90AInsts()) {
2303 MIB.addImm(TFE); // tfe
2304 } else if (TFE) {
2305 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2306 return false;
2307 }
2308
2309 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2310 MIB.addImm(LWE); // lwe
2311 if (!IsGFX10Plus)
2312 MIB.addImm(DimInfo->DA ? -1 : 0);
2313 if (BaseOpcode->HasD16)
2314 MIB.addImm(IsD16 ? -1 : 0);
2315
2316 MI.eraseFromParent();
2317 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2318 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2319 return true;
2320}
2321
2322// We need to handle this here because tablegen doesn't support matching
2323// instructions with multiple outputs.
2324bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2325 MachineInstr &MI) const {
2326 Register Dst0 = MI.getOperand(0).getReg();
2327 Register Dst1 = MI.getOperand(1).getReg();
2328
2329 const DebugLoc &DL = MI.getDebugLoc();
2330 MachineBasicBlock *MBB = MI.getParent();
2331
2332 Register Addr = MI.getOperand(3).getReg();
2333 Register Data0 = MI.getOperand(4).getReg();
2334 Register Data1 = MI.getOperand(5).getReg();
2335 unsigned Offset = MI.getOperand(6).getImm();
2336
2337 unsigned Opc;
2338 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2339 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2340 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2341 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2342 break;
2343 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2344 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2345 break;
2346 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2347 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2348 break;
2349 }
2350
2351 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2352 .addDef(Dst1)
2353 .addUse(Addr)
2354 .addUse(Data0)
2355 .addUse(Data1)
2356 .addImm(Offset)
2357 .cloneMemRefs(MI);
2358
2359 MI.eraseFromParent();
2360 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2361 return true;
2362}
2363
2364bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2365 MachineInstr &I) const {
2366 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2367 switch (IntrinsicID) {
2368 case Intrinsic::amdgcn_end_cf:
2369 return selectEndCfIntrinsic(I);
2370 case Intrinsic::amdgcn_ds_ordered_add:
2371 case Intrinsic::amdgcn_ds_ordered_swap:
2372 return selectDSOrderedIntrinsic(I, IntrinsicID);
2373 case Intrinsic::amdgcn_ds_gws_init:
2374 case Intrinsic::amdgcn_ds_gws_barrier:
2375 case Intrinsic::amdgcn_ds_gws_sema_v:
2376 case Intrinsic::amdgcn_ds_gws_sema_br:
2377 case Intrinsic::amdgcn_ds_gws_sema_p:
2378 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2379 return selectDSGWSIntrinsic(I, IntrinsicID);
2380 case Intrinsic::amdgcn_ds_append:
2381 return selectDSAppendConsume(I, true);
2382 case Intrinsic::amdgcn_ds_consume:
2383 return selectDSAppendConsume(I, false);
2384 case Intrinsic::amdgcn_init_whole_wave:
2385 return selectInitWholeWave(I);
2386 case Intrinsic::amdgcn_raw_buffer_load_lds:
2387 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
2388 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2389 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
2390 case Intrinsic::amdgcn_struct_buffer_load_lds:
2391 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
2392 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2393 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
2394 return selectBufferLoadLds(I);
2395 // Until we can store both the address space of the global and the LDS
2396 // arguments by having two MachineMemOperands on an intrinsic, we just trust
2397 // that the argument is a global pointer (buffer pointers have been handled by
2398 // an LLVM IR-level lowering).
2399 case Intrinsic::amdgcn_load_to_lds:
2400 case Intrinsic::amdgcn_load_async_to_lds:
2401 case Intrinsic::amdgcn_global_load_lds:
2402 case Intrinsic::amdgcn_global_load_async_lds:
2403 return selectGlobalLoadLds(I);
2404 case Intrinsic::amdgcn_tensor_load_to_lds:
2405 case Intrinsic::amdgcn_tensor_store_from_lds:
2406 return selectTensorLoadStore(I, IntrinsicID);
2407 case Intrinsic::amdgcn_asyncmark:
2408 case Intrinsic::amdgcn_wait_asyncmark:
2409 if (!Subtarget->hasAsyncMark())
2410 return false;
2411 break;
2412 case Intrinsic::amdgcn_exp_compr:
2413 if (!STI.hasCompressedExport()) {
2414 Function &F = I.getMF()->getFunction();
2415 F.getContext().diagnose(
2416 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2417 I.getDebugLoc(), DS_Error));
2418 return false;
2419 }
2420 break;
2421 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2422 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2423 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2424 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2425 return selectDSBvhStackIntrinsic(I);
2426 case Intrinsic::amdgcn_s_alloc_vgpr: {
2427 // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets
2428 // SCC. We then need to COPY it into the result vreg.
2429 MachineBasicBlock *MBB = I.getParent();
2430 const DebugLoc &DL = I.getDebugLoc();
2431
2432 Register ResReg = I.getOperand(0).getReg();
2433
2434 MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
2435 .add(I.getOperand(2));
2436 (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
2437 .addReg(AMDGPU::SCC);
2438 I.eraseFromParent();
2439 constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI);
2440 return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
2441 }
2442 case Intrinsic::amdgcn_s_barrier_init:
2443 case Intrinsic::amdgcn_s_barrier_signal_var:
2444 return selectNamedBarrierInit(I, IntrinsicID);
2445 case Intrinsic::amdgcn_s_wakeup_barrier: {
2446 if (!STI.hasSWakeupBarrier()) {
2447 Function &F = I.getMF()->getFunction();
2448 F.getContext().diagnose(
2449 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2450 I.getDebugLoc(), DS_Error));
2451 return false;
2452 }
2453 return selectNamedBarrierInst(I, IntrinsicID);
2454 }
2455 case Intrinsic::amdgcn_s_barrier_join:
2456 case Intrinsic::amdgcn_s_get_named_barrier_state:
2457 return selectNamedBarrierInst(I, IntrinsicID);
2458 case Intrinsic::amdgcn_s_get_barrier_state:
2459 return selectSGetBarrierState(I, IntrinsicID);
2460 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2461 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2462 }
2463 return selectImpl(I, *CoverageInfo);
2464}
2465
2466bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2467 if (selectImpl(I, *CoverageInfo))
2468 return true;
2469
2470 MachineBasicBlock *BB = I.getParent();
2471 const DebugLoc &DL = I.getDebugLoc();
2472
2473 Register DstReg = I.getOperand(0).getReg();
2474 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2475 assert(Size <= 32 || Size == 64);
2476 const MachineOperand &CCOp = I.getOperand(1);
2477 Register CCReg = CCOp.getReg();
2478 if (!isVCC(CCReg, *MRI)) {
2479 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2480 AMDGPU::S_CSELECT_B32;
2481 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2482 .addReg(CCReg);
2483
2484 // The generic constrainSelectedInstRegOperands doesn't work for the scc
2485 // register bank, because it does not cover the register class we use to
2486 // represent it. So we need to set the register class manually here.
2487 if (!MRI->getRegClassOrNull(CCReg))
2488 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2489 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2490 .add(I.getOperand(2))
2491 .add(I.getOperand(3));
2492
2493 constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2494 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2495 I.eraseFromParent();
2496 return true;
2497 }
2498
2499 // Wide VGPR select should have been split in RegBankSelect.
2500 if (Size > 32)
2501 return false;
2502
2503 MachineInstr *Select =
2504 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2505 .addImm(0)
2506 .add(I.getOperand(3))
2507 .addImm(0)
2508 .add(I.getOperand(2))
2509 .add(I.getOperand(1));
2510
2511 constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2512 I.eraseFromParent();
2513 return true;
2514}
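// A rough sketch of the two paths above: a uniform (non-vcc) condition goes
// through SCC,
//   $scc = COPY %cc
//   %dst = S_CSELECT_B32 %tval, %fval, implicit $scc
// while a divergent (vcc) condition selects directly to
//   %dst = V_CNDMASK_B32_e64 0, %fval, 0, %tval, %cc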
2515
2516bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2517 Register DstReg = I.getOperand(0).getReg();
2518 Register SrcReg = I.getOperand(1).getReg();
2519 const LLT DstTy = MRI->getType(DstReg);
2520 const LLT SrcTy = MRI->getType(SrcReg);
2521 const LLT S1 = LLT::scalar(1);
2522
2523 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2524 const RegisterBank *DstRB;
2525 if (DstTy == S1) {
2526 // This is a special case. We don't treat s1 values used as legalization
2527 // artifacts as vcc booleans.
2528 DstRB = SrcRB;
2529 } else {
2530 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2531 if (SrcRB != DstRB)
2532 return false;
2533 }
2534
2535 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2536
2537 unsigned DstSize = DstTy.getSizeInBits();
2538 unsigned SrcSize = SrcTy.getSizeInBits();
2539
2540 const TargetRegisterClass *SrcRC =
2541 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2542 const TargetRegisterClass *DstRC =
2543 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2544 if (!SrcRC || !DstRC)
2545 return false;
2546
2547 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2548 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2549 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2550 return false;
2551 }
2552
2553 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2554 assert(STI.useRealTrue16Insts());
2555 const DebugLoc &DL = I.getDebugLoc();
2556 MachineBasicBlock *MBB = I.getParent();
2557 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2558 .addReg(SrcReg, {}, AMDGPU::lo16);
2559 I.eraseFromParent();
2560 return true;
2561 }
2562
2563 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2564 MachineBasicBlock *MBB = I.getParent();
2565 const DebugLoc &DL = I.getDebugLoc();
2566
2567 Register LoReg = MRI->createVirtualRegister(DstRC);
2568 Register HiReg = MRI->createVirtualRegister(DstRC);
2569 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2570 .addReg(SrcReg, {}, AMDGPU::sub0);
2571 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2572 .addReg(SrcReg, {}, AMDGPU::sub1);
2573
2574 if (IsVALU && STI.hasSDWA()) {
2575 // Write the low 16-bits of the high element into the high 16-bits of the
2576 // low element.
2577 MachineInstr *MovSDWA =
2578 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2579 .addImm(0) // $src0_modifiers
2580 .addReg(HiReg) // $src0
2581 .addImm(0) // $clamp
2582 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2583 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2584 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2585 .addReg(LoReg, RegState::Implicit);
2586 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2587 } else {
2588 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2589 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2590 Register ImmReg = MRI->createVirtualRegister(DstRC);
2591 if (IsVALU) {
2592 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2593 .addImm(16)
2594 .addReg(HiReg);
2595 } else {
2596 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2597 .addReg(HiReg)
2598 .addImm(16)
2599 .setOperandDead(3); // Dead scc
2600 }
2601
2602 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2603 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2604 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2605
2606 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2607 .addImm(0xffff);
2608 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2609 .addReg(LoReg)
2610 .addReg(ImmReg);
2611 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2612 .addReg(TmpReg0)
2613 .addReg(TmpReg1);
2614
2615 if (!IsVALU) {
2616 And.setOperandDead(3); // Dead scc
2617 Or.setOperandDead(3); // Dead scc
2618 }
2619 }
2620
2621 I.eraseFromParent();
2622 return true;
2623 }
2624
2625 if (!DstTy.isScalar())
2626 return false;
2627
2628 if (SrcSize > 32) {
2629 unsigned SubRegIdx = DstSize < 32
2630 ? static_cast<unsigned>(AMDGPU::sub0)
2631 : TRI.getSubRegFromChannel(0, DstSize / 32);
2632 if (SubRegIdx == AMDGPU::NoSubRegister)
2633 return false;
2634
2635 // Deal with weird cases where the class only partially supports the subreg
2636 // index.
2637 const TargetRegisterClass *SrcWithSubRC
2638 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2639 if (!SrcWithSubRC)
2640 return false;
2641
2642 if (SrcWithSubRC != SrcRC) {
2643 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2644 return false;
2645 }
2646
2647 I.getOperand(1).setSubReg(SubRegIdx);
2648 }
2649
2650 I.setDesc(TII.get(TargetOpcode::COPY));
2651 return true;
2652}
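// For instance, a scalar s64 -> s32 G_TRUNC ends up as roughly
//   %dst:sreg_32 = COPY %src.sub0
// once the subregister index computed above is folded into the COPY source.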
2653
2654/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2655static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2656 Mask = maskTrailingOnes<unsigned>(Size);
2657 int SignedMask = static_cast<int>(Mask);
2658 return SignedMask >= -16 && SignedMask <= 64;
2659}
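// E.g. Size == 4 gives Mask == 0xf (inline immediate 15) and Size == 32
// gives Mask == 0xffffffff (inline immediate -1), whereas Size == 8 gives
// Mask == 0xff == 255, outside the inline-immediate range [-16, 64].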
2660
2661// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2662const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2663 Register Reg, const MachineRegisterInfo &MRI,
2664 const TargetRegisterInfo &TRI) const {
2665 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2666 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2667 return RB;
2668
2669 // Ignore the type, since we don't use vcc in artifacts.
2670 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2671 return &RBI.getRegBankFromRegClass(*RC, LLT());
2672 return nullptr;
2673}
2674
2675bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2676 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2677 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2678 const DebugLoc &DL = I.getDebugLoc();
2679 MachineBasicBlock &MBB = *I.getParent();
2680 const Register DstReg = I.getOperand(0).getReg();
2681 const Register SrcReg = I.getOperand(1).getReg();
2682
2683 const LLT DstTy = MRI->getType(DstReg);
2684 const LLT SrcTy = MRI->getType(SrcReg);
2685 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2686 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2687 const unsigned DstSize = DstTy.getSizeInBits();
2688 if (!DstTy.isScalar())
2689 return false;
2690
2691 // Artifact casts should never use vcc.
2692 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2693
2694 // FIXME: This should probably be illegal and split earlier.
2695 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2696 if (DstSize <= 32)
2697 return selectCOPY(I);
2698
2699 const TargetRegisterClass *SrcRC =
2700 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2701 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2702 const TargetRegisterClass *DstRC =
2703 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2704
2705 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2706 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2707 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2708 .addReg(SrcReg)
2709 .addImm(AMDGPU::sub0)
2710 .addReg(UndefReg)
2711 .addImm(AMDGPU::sub1);
2712 I.eraseFromParent();
2713
2714 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2715 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2716 }
2717
2718 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2719 // 64-bit should have been split up in RegBankSelect
2720
2721 // Try to use an and with a mask if it will save code size.
2722 unsigned Mask;
2723 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2724 MachineInstr *ExtI =
2725 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2726 .addImm(Mask)
2727 .addReg(SrcReg);
2728 I.eraseFromParent();
2729 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2730 return true;
2731 }
2732
2733 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2734 MachineInstr *ExtI =
2735 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2736 .addReg(SrcReg)
2737 .addImm(0) // Offset
2738 .addImm(SrcSize); // Width
2739 I.eraseFromParent();
2740 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2741 return true;
2742 }
2743
2744 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2745 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2746 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2747 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2748 return false;
2749
2750 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2751 const unsigned SextOpc = SrcSize == 8 ?
2752 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2753 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2754 .addReg(SrcReg);
2755 I.eraseFromParent();
2756 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2757 }
2758
2759 // Using a single 32-bit SALU to calculate the high half is smaller than
2760 // S_BFE with a literal constant operand.
2761 if (DstSize > 32 && SrcSize == 32) {
2762 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2763 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2764 if (Signed) {
2765 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2766 .addReg(SrcReg, {}, SubReg)
2767 .addImm(31)
2768 .setOperandDead(3); // Dead scc
2769 } else {
2770 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2771 .addImm(0);
2772 }
2773 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2774 .addReg(SrcReg, {}, SubReg)
2775 .addImm(AMDGPU::sub0)
2776 .addReg(HiReg)
2777 .addImm(AMDGPU::sub1);
2778 I.eraseFromParent();
2779 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2780 *MRI);
2781 }
2782
2783 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2784 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2785
2786 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2787 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2788 // We need a 64-bit register source, but the high bits don't matter.
2789 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2790 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2791 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2792
2793 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2794 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2795 .addReg(SrcReg, {}, SubReg)
2796 .addImm(AMDGPU::sub0)
2797 .addReg(UndefReg)
2798 .addImm(AMDGPU::sub1);
2799
2800 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2801 .addReg(ExtReg)
2802 .addImm(SrcSize << 16);
2803
2804 I.eraseFromParent();
2805 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2806 }
2807
2808 unsigned Mask;
2809 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2810 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2811 .addReg(SrcReg)
2812 .addImm(Mask)
2813 .setOperandDead(3); // Dead scc
2814 } else {
2815 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2816 .addReg(SrcReg)
2817 .addImm(SrcSize << 16);
2818 }
2819
2820 I.eraseFromParent();
2821 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2822 }
2823
2824 return false;
2825}
2826
2826
2827static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2828 return getDefIgnoringCopies(Reg, MRI)->getOperand(0).getReg();
2829}
2830
2831static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2832 Register BitcastSrc;
2833 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2834 Reg = BitcastSrc;
2835 return Reg;
2836}
2837
2838static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2839 Register &Out) {
2840 Register Trunc;
2841 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2842 return false;
2843
2844 Register LShlSrc;
2845 Register Cst;
2846 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2847 Cst = stripCopy(Cst, MRI);
2848 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2849 Out = stripBitCast(LShlSrc, MRI);
2850 return true;
2851 }
2852 }
2853
2854 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2855 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2856 return false;
2857
2858 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2859 LLT::fixed_vector(2, 16));
2860
2861 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2862 assert(Mask.size() == 2);
2863
2864 if (Mask[0] == 1 && Mask[1] <= 1) {
2865 Out = Shuffle->getOperand(0).getReg();
2866 return true;
2867 }
2868
2869 return false;
2870}
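// In effect this matches either trunc(lshr %in, 16), looking through copies
// and bitcasts, or a trunc of a <2 x s16> shuffle whose low result lane is
// the input's high element, returning the wide source in Out.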
2871
2872bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2873 if (!Subtarget->hasSALUFloatInsts())
2874 return false;
2875
2876 Register Dst = I.getOperand(0).getReg();
2877 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2878 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2879 return false;
2880
2881 Register Src = I.getOperand(1).getReg();
2882
2883 if (MRI->getType(Dst) == LLT::scalar(32) &&
2884 MRI->getType(Src) == LLT::scalar(16)) {
2885 if (isExtractHiElt(*MRI, Src, Src)) {
2886 MachineBasicBlock *BB = I.getParent();
2887 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2888 .addUse(Src);
2889 I.eraseFromParent();
2890 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2891 }
2892 }
2893
2894 return false;
2895}
2896
2897bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2898 // Only manually handle the f64 SGPR case.
2899 //
2900 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2901 // the bit ops theoretically have a second result due to the implicit def of
2902 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2903 // that is easy by disabling the check. The result works, but uses a
2904 // nonsensical sreg32orlds_and_sreg_1 regclass.
2905 //
2906 // The DAG emitter is more problematic, and incorrectly adds both results of
2907 // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2908
2909 Register Dst = MI.getOperand(0).getReg();
2910 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2911 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2912 MRI->getType(Dst) != LLT::scalar(64))
2913 return false;
2914
2915 Register Src = MI.getOperand(1).getReg();
2916 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2917 if (Fabs)
2918 Src = Fabs->getOperand(1).getReg();
2919
2920 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2921 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2922 return false;
2923
2924 MachineBasicBlock *BB = MI.getParent();
2925 const DebugLoc &DL = MI.getDebugLoc();
2926 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2927 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2928 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2929 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2930
2931 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2932 .addReg(Src, {}, AMDGPU::sub0);
2933 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2934 .addReg(Src, {}, AMDGPU::sub1);
2935 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2936 .addImm(0x80000000);
2937
2938 // Set or toggle sign bit.
2939 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2940 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2941 .addReg(HiReg)
2942 .addReg(ConstReg)
2943 .setOperandDead(3); // Dead scc
2944 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2945 .addReg(LoReg)
2946 .addImm(AMDGPU::sub0)
2947 .addReg(OpReg)
2948 .addImm(AMDGPU::sub1);
2949 MI.eraseFromParent();
2950 return true;
2951}
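// The net effect for an SGPR s64 fneg (or fneg(fabs)) is roughly:
//   %lo  = COPY %src.sub0
//   %hi  = COPY %src.sub1
//   %k   = S_MOV_B32 0x80000000
//   %op  = S_XOR_B32 %hi, %k        ; S_OR_B32 when a fabs was folded in
//   %dst = REG_SEQUENCE %lo, sub0, %op, sub1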
2952
2953// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2954bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2955 Register Dst = MI.getOperand(0).getReg();
2956 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2957 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2958 MRI->getType(Dst) != LLT::scalar(64))
2959 return false;
2960
2961 Register Src = MI.getOperand(1).getReg();
2962 MachineBasicBlock *BB = MI.getParent();
2963 const DebugLoc &DL = MI.getDebugLoc();
2964 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2965 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2966 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2967 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2968
2969 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2970 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2971 return false;
2972
2973 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2974 .addReg(Src, {}, AMDGPU::sub0);
2975 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2976 .addReg(Src, {}, AMDGPU::sub1);
2977 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2978 .addImm(0x7fffffff);
2979
2980 // Clear sign bit.
2981 // TODO: Should this use S_BITSET0_*?
2982 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2983 .addReg(HiReg)
2984 .addReg(ConstReg)
2985 .setOperandDead(3); // Dead scc
2986 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2987 .addReg(LoReg)
2988 .addImm(AMDGPU::sub0)
2989 .addReg(OpReg)
2990 .addImm(AMDGPU::sub1);
2991
2992 MI.eraseFromParent();
2993 return true;
2994}
2995
2996static bool isConstant(const MachineInstr &MI) {
2997 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2998}
2999
3000void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
3001 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
3002
3003 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
3004 const MachineInstr *PtrMI =
3005 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
3006
3007 assert(PtrMI);
3008
3009 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
3010 return;
3011
3012 GEPInfo GEPInfo;
3013
3014 for (unsigned i = 1; i != 3; ++i) {
3015 const MachineOperand &GEPOp = PtrMI->getOperand(i);
3016 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
3017 assert(OpDef);
3018 if (i == 2 && isConstant(*OpDef)) {
3019 // TODO: Could handle constant base + variable offset, but a combine
3020 // probably should have commuted it.
3021 assert(GEPInfo.Imm == 0);
3022 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
3023 continue;
3024 }
3025 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
3026 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
3027 GEPInfo.SgprParts.push_back(GEPOp.getReg());
3028 else
3029 GEPInfo.VgprParts.push_back(GEPOp.getReg());
3030 }
3031
3032 AddrInfo.push_back(GEPInfo);
3033 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
3034}
3035
3036bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
3037 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
3038}
3039
3040bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
3041 if (!MI.hasOneMemOperand())
3042 return false;
3043
3044 const MachineMemOperand *MMO = *MI.memoperands_begin();
3045 const Value *Ptr = MMO->getValue();
3046
3047 // UndefValue means this is a load of a kernel input. These are uniform.
3048 // Sometimes LDS instructions have constant pointers.
3049 // If Ptr is null, then that means this mem operand contains a
3050 // PseudoSourceValue like GOT.
3051 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
3052 return true;
3053
3054 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
3055 return true;
3056
3057 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3058 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
3059 AMDGPU::SGPRRegBankID;
3060
3061 const Instruction *I = dyn_cast<Instruction>(Ptr);
3062 return I && I->getMetadata("amdgpu.uniform");
3063}
3064
3065bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3066 for (const GEPInfo &GEPInfo : AddrInfo) {
3067 if (!GEPInfo.VgprParts.empty())
3068 return true;
3069 }
3070 return false;
3071}
3072
3073void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3074 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
3075 unsigned AS = PtrTy.getAddressSpace();
3075 if ((AS == AMDGPUAS::LOCAL_ADDRESS ||
3076 AS == AMDGPUAS::REGION_ADDRESS) &&
3077 STI.ldsRequiresM0Init()) {
3078 MachineBasicBlock *BB = I.getParent();
3079
3080 // If DS instructions require M0 initialization, insert it before selecting.
3081 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3082 .addImm(-1);
3083 }
3084}
3085
3086bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3087 MachineInstr &I) const {
3088 initM0(I);
3089 return selectImpl(I, *CoverageInfo);
3090}
3091
3092static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3093 if (Reg.isPhysical())
3094 return false;
3095
3096 MachineInstr &MI = *MRI.getVRegDef(Reg);
3097 const unsigned Opcode = MI.getOpcode();
3098
3099 if (Opcode == AMDGPU::COPY)
3100 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3101
3102 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3103 Opcode == AMDGPU::G_XOR)
3104 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3105 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3106
3107 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3108 return GI->is(Intrinsic::amdgcn_class);
3109
3110 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3111}
3112
3113bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3114 MachineBasicBlock *BB = I.getParent();
3115 MachineOperand &CondOp = I.getOperand(0);
3116 Register CondReg = CondOp.getReg();
3117 const DebugLoc &DL = I.getDebugLoc();
3118
3119 unsigned BrOpcode;
3120 Register CondPhysReg;
3121 const TargetRegisterClass *ConstrainRC;
3122
3123 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3124 // whether the branch is uniform when selecting the instruction. In
3125 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3126 // RegBankSelect knows what it's doing if the branch condition is scc, even
3127 // though it currently does not.
3128 if (!isVCC(CondReg, *MRI)) {
3129 if (MRI->getType(CondReg) != LLT::scalar(32))
3130 return false;
3131
3132 CondPhysReg = AMDGPU::SCC;
3133 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3134 ConstrainRC = &AMDGPU::SReg_32RegClass;
3135 } else {
3136 // FIXME: Should scc->vcc copies and with exec?
3137
3138 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3139 // need to insert an and with exec.
3140 if (!isVCmpResult(CondReg, *MRI)) {
3141 const bool Is64 = STI.isWave64();
3142 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3143 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3144
3145 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3146 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3147 .addReg(CondReg)
3148 .addReg(Exec)
3149 .setOperandDead(3); // Dead scc
3150 CondReg = TmpReg;
3151 }
3152
3153 CondPhysReg = TRI.getVCC();
3154 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3155 ConstrainRC = TRI.getBoolRC();
3156 }
3157
3158 if (!MRI->getRegClassOrNull(CondReg))
3159 MRI->setRegClass(CondReg, ConstrainRC);
3160
3161 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3162 .addReg(CondReg);
3163 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3164 .addMBB(I.getOperand(1).getMBB());
3165
3166 I.eraseFromParent();
3167 return true;
3168}
3169
3170bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3171 MachineInstr &I) const {
3172 Register DstReg = I.getOperand(0).getReg();
3173 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3174 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3175 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3176 if (IsVGPR)
3177 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3178
3179 return RBI.constrainGenericRegister(
3180 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3181}
3182
3183bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3184 Register DstReg = I.getOperand(0).getReg();
3185 Register SrcReg = I.getOperand(1).getReg();
3186 Register MaskReg = I.getOperand(2).getReg();
3187 LLT Ty = MRI->getType(DstReg);
3188 LLT MaskTy = MRI->getType(MaskReg);
3189 MachineBasicBlock *BB = I.getParent();
3190 const DebugLoc &DL = I.getDebugLoc();
3191
3192 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3193 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3194 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3195 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3196 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3197 return false;
3198
3199 // Try to avoid emitting a bit operation when we only need to touch half of
3200 // the 64-bit pointer.
3201 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3202 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3203 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3204
3205 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3206 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3207
3208 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3209 !CanCopyLow32 && !CanCopyHi32) {
3210 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3211 .addReg(SrcReg)
3212 .addReg(MaskReg)
3213 .setOperandDead(3); // Dead scc
3214 I.eraseFromParent();
3215 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3216 return true;
3217 }
3218
3219 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3220 const TargetRegisterClass &RegRC
3221 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3222
3223 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3224 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3225 const TargetRegisterClass *MaskRC =
3226 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3227
3228 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3229 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3230 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3231 return false;
3232
3233 if (Ty.getSizeInBits() == 32) {
3234 assert(MaskTy.getSizeInBits() == 32 &&
3235 "ptrmask should have been narrowed during legalize");
3236
3237 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3238 .addReg(SrcReg)
3239 .addReg(MaskReg);
3240
3241 if (!IsVGPR)
3242 NewOp.setOperandDead(3); // Dead scc
3243 I.eraseFromParent();
3244 return true;
3245 }
3246
3247 Register HiReg = MRI->createVirtualRegister(&RegRC);
3248 Register LoReg = MRI->createVirtualRegister(&RegRC);
3249
3250 // Extract the subregisters from the source pointer.
3251 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3252 .addReg(SrcReg, {}, AMDGPU::sub0);
3253 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3254 .addReg(SrcReg, {}, AMDGPU::sub1);
3255
3256 Register MaskedLo, MaskedHi;
3257
3258 if (CanCopyLow32) {
3259 // If all the bits in the low half are 1, we only need a copy for it.
3260 MaskedLo = LoReg;
3261 } else {
3262 // Extract the mask subregister and apply the and.
3263 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3264 MaskedLo = MRI->createVirtualRegister(&RegRC);
3265
3266 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3267 .addReg(MaskReg, {}, AMDGPU::sub0);
3268 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3269 .addReg(LoReg)
3270 .addReg(MaskLo);
3271 }
3272
3273 if (CanCopyHi32) {
3274 // If all the bits in the high half are 1, we only need a copy for it.
3275 MaskedHi = HiReg;
3276 } else {
3277 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3278 MaskedHi = MRI->createVirtualRegister(&RegRC);
3279
3280 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3281 .addReg(MaskReg, {}, AMDGPU::sub1);
3282 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3283 .addReg(HiReg)
3284 .addReg(MaskHi);
3285 }
3286
3287 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3288 .addReg(MaskedLo)
3289 .addImm(AMDGPU::sub0)
3290 .addReg(MaskedHi)
3291 .addImm(AMDGPU::sub1);
3292 I.eraseFromParent();
3293 return true;
3294}
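// For illustration, clearing only low bits of a 64-bit SGPR pointer (e.g.
// aligning it down with mask 0xfffffffffffff000) leaves all ones in the high
// half, so CanCopyHi32 holds and only the low half is ANDed, roughly:
//   %lo = COPY %src.sub0, %hi = COPY %src.sub1, %mask_lo = COPY %mask.sub0
//   %masked_lo = S_AND_B32 %lo, %mask_lo
//   %dst = REG_SEQUENCE %masked_lo, sub0, %hi, sub1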
3295
3296/// Return the register to use for the index value, and the subregister to use
3297/// for the indirectly accessed register.
3298static std::pair<Register, unsigned>
3299computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3300 const TargetRegisterClass *SuperRC, Register IdxReg,
3301 unsigned EltSize, GISelValueTracking &ValueTracking) {
3302 Register IdxBaseReg;
3303 int Offset;
3304
3305 std::tie(IdxBaseReg, Offset) =
3306 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3307 if (IdxBaseReg == AMDGPU::NoRegister) {
3308 // This will happen if the index is a known constant. This should ordinarily
3309 // be legalized out, but handle it as a register just in case.
3310 assert(Offset == 0);
3311 IdxBaseReg = IdxReg;
3312 }
3313
3314 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3315
3316 // Skip out of bounds offsets, or else we would end up using an undefined
3317 // register.
3318 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3319 return std::pair(IdxReg, SubRegs[0]);
3320 return std::pair(IdxBaseReg, SubRegs[Offset]);
3321}
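// For example, with 32-bit elements (EltSize == 4) and %idx = %base + 3, the
// constant part of the index is folded into the subregister: the result is
// {%base, sub3}, and only %base still has to be programmed into M0 (or the
// index mode). A known out-of-bounds offset conservatively yields {%idx, sub0}.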
3322
3323bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3324 MachineInstr &MI) const {
3325 Register DstReg = MI.getOperand(0).getReg();
3326 Register SrcReg = MI.getOperand(1).getReg();
3327 Register IdxReg = MI.getOperand(2).getReg();
3328
3329 LLT DstTy = MRI->getType(DstReg);
3330 LLT SrcTy = MRI->getType(SrcReg);
3331
3332 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3333 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3334 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3335
3336 // The index must be scalar. If it wasn't, RegBankSelect should have moved
3337 // this into a waterfall loop.
3338 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3339 return false;
3340
3341 const TargetRegisterClass *SrcRC =
3342 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3343 const TargetRegisterClass *DstRC =
3344 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3345 if (!SrcRC || !DstRC)
3346 return false;
3347 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3348 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3349 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3350 return false;
3351
3352 MachineBasicBlock *BB = MI.getParent();
3353 const DebugLoc &DL = MI.getDebugLoc();
3354 const bool Is64 = DstTy.getSizeInBits() == 64;
3355
3356 unsigned SubReg;
3357 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3358 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3359
3360 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3361 if (DstTy.getSizeInBits() != 32 && !Is64)
3362 return false;
3363
3364 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3365 .addReg(IdxReg);
3366
3367 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3368 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3369 .addReg(SrcReg, {}, SubReg)
3370 .addReg(SrcReg, RegState::Implicit);
3371 MI.eraseFromParent();
3372 return true;
3373 }
3374
3375 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3376 return false;
3377
3378 if (!STI.useVGPRIndexMode()) {
3379 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3380 .addReg(IdxReg);
3381 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3382 .addReg(SrcReg, {}, SubReg)
3383 .addReg(SrcReg, RegState::Implicit);
3384 MI.eraseFromParent();
3385 return true;
3386 }
3387
3388 const MCInstrDesc &GPRIDXDesc =
3389 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3390 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3391 .addReg(SrcReg)
3392 .addReg(IdxReg)
3393 .addImm(SubReg);
3394
3395 MI.eraseFromParent();
3396 return true;
3397}
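// For illustration, a dynamically indexed extract of an s32 element from an
// SGPR vector selects to roughly:
//   $m0 = COPY %idx
//   %dst = S_MOVRELS_B32 %vec.sub0, implicit %vec
// and the VGPR case uses V_MOVRELS_B32_e32 or, on subtargets with VGPR index
// mode, the GPRIDX pseudo built above.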
3398
3399// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3400bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3401 MachineInstr &MI) const {
3402 Register DstReg = MI.getOperand(0).getReg();
3403 Register VecReg = MI.getOperand(1).getReg();
3404 Register ValReg = MI.getOperand(2).getReg();
3405 Register IdxReg = MI.getOperand(3).getReg();
3406
3407 LLT VecTy = MRI->getType(DstReg);
3408 LLT ValTy = MRI->getType(ValReg);
3409 unsigned VecSize = VecTy.getSizeInBits();
3410 unsigned ValSize = ValTy.getSizeInBits();
3411
3412 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3413 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3414 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3415
3416 assert(VecTy.getElementType() == ValTy);
3417
3418 // The index must be scalar. If it wasn't, RegBankSelect should have moved
3419 // this into a waterfall loop.
3420 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3421 return false;
3422
3423 const TargetRegisterClass *VecRC =
3424 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3425 const TargetRegisterClass *ValRC =
3426 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3427
3428 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3429 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3430 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3431 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3432 return false;
3433
3434 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3435 return false;
3436
3437 unsigned SubReg;
3438 std::tie(IdxReg, SubReg) =
3439 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3440
3441 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3442 STI.useVGPRIndexMode();
3443
3444 MachineBasicBlock *BB = MI.getParent();
3445 const DebugLoc &DL = MI.getDebugLoc();
3446
3447 if (!IndexMode) {
3448 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3449 .addReg(IdxReg);
3450
3451 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3452 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3453 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3454 .addReg(VecReg)
3455 .addReg(ValReg)
3456 .addImm(SubReg);
3457 MI.eraseFromParent();
3458 return true;
3459 }
3460
3461 const MCInstrDesc &GPRIDXDesc =
3462 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3463 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3464 .addReg(VecReg)
3465 .addReg(ValReg)
3466 .addReg(IdxReg)
3467 .addImm(SubReg);
3468
3469 MI.eraseFromParent();
3470 return true;
3471}
3472
3473static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
3474 switch (Intr) {
3475 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3476 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
3477 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3478 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
3479 case Intrinsic::amdgcn_load_async_to_lds:
3480 case Intrinsic::amdgcn_global_load_async_lds:
3481 return true;
3482 }
3483 return false;
3484}
3485
3486bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3487 if (!Subtarget->hasVMemToLDSLoad())
3488 return false;
3489 unsigned Opc;
3490 unsigned Size = MI.getOperand(3).getImm();
3491 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3492
3493 // The struct intrinsic variants add one additional operand over raw.
3494 const bool HasVIndex = MI.getNumOperands() == 9;
3495 Register VIndex;
3496 int OpOffset = 0;
3497 if (HasVIndex) {
3498 VIndex = MI.getOperand(4).getReg();
3499 OpOffset = 1;
3500 }
3501
3502 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3503 std::optional<ValueAndVReg> MaybeVOffset =
3504 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3505 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3506
3507 switch (Size) {
3508 default:
3509 return false;
3510 case 1:
3511 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3512 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3513 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3514 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3515 break;
3516 case 2:
3517 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3518 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3519 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3520 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3521 break;
3522 case 4:
3523 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3524 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3525 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3526 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3527 break;
3528 case 12:
3529 if (!Subtarget->hasLDSLoadB96_B128())
3530 return false;
3531
3532 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3533 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3534 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3535 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3536 break;
3537 case 16:
3538 if (!Subtarget->hasLDSLoadB96_B128())
3539 return false;
3540
3541 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3542 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3543 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3544 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3545 break;
3546 }
3547
3548 MachineBasicBlock *MBB = MI.getParent();
3549 const DebugLoc &DL = MI.getDebugLoc();
3550 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3551 .add(MI.getOperand(2));
3552
3553 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3554
3555 if (HasVIndex && HasVOffset) {
3556 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3557 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3558 .addReg(VIndex)
3559 .addImm(AMDGPU::sub0)
3560 .addReg(VOffset)
3561 .addImm(AMDGPU::sub1);
3562
3563 MIB.addReg(IdxReg);
3564 } else if (HasVIndex) {
3565 MIB.addReg(VIndex);
3566 } else if (HasVOffset) {
3567 MIB.addReg(VOffset);
3568 }
3569
3570 MIB.add(MI.getOperand(1)); // rsrc
3571 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3572 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3573 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3574 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3575 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3576 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3577 MIB.addImm(
3578 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3579 ? 1
3580 : 0); // swz
3581 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3582
3583 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3584 // Don't set the offset value here because the pointer points to the base of
3585 // the buffer.
3586 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3587
3588 MachinePointerInfo StorePtrI = LoadPtrI;
3589 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3590 AMDGPUAS::BUFFER_RESOURCE));
3591 StorePtrI.V = nullptr;
3592 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3593
3594 auto F = LoadMMO->getFlags() &
3595 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3596 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3597 Size, LoadMMO->getBaseAlign());
3598
3599 MachineMemOperand *StoreMMO =
3600 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3601 sizeof(int32_t), LoadMMO->getBaseAlign());
3602
3603 MIB.setMemRefs({LoadMMO, StoreMMO});
3604
3605 MI.eraseFromParent();
3606 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3607 return true;
3608}
3609
3610/// Match a zero extend from a 32-bit value to 64-bits.
3611Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3612 Register ZExtSrc;
3613 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3614 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3615
3616 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3617 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3618 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3619 return Register();
3620
3621 assert(Def->getNumOperands() == 3 &&
3622 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3623 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3624 return Def->getOperand(1).getReg();
3625 }
3626
3627 return Register();
3628}
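// Both of these forms match and return %x:
//   %reg:_(s64) = G_ZEXT %x:_(s32)
// and the legalized equivalent:
//   %zero:_(s32) = G_CONSTANT i32 0
//   %reg:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32)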
3629
3630/// Match a sign extend from a 32-bit value to 64-bits.
3631Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3632 Register SExtSrc;
3633 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3634 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3635
3636 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (s32 G_ASHR %x, 31)
3637 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3638 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3639 return Register();
3640
3641 assert(Def->getNumOperands() == 3 &&
3642 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3643 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3644 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3645 m_SpecificICst(31))))
3646 return Def->getOperand(1).getReg();
3647
3648 if (VT->signBitIsZero(Reg))
3649 return matchZeroExtendFromS32(Reg);
3650
3651 return Register();
3652}
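// The legalized form matched above looks like:
//   %hi:_(s32) = G_ASHR %x:_(s32), 31
//   %reg:_(s64) = G_MERGE_VALUES %x:_(s32), %hi:_(s32)
// If the sign bit is known zero, sign and zero extension agree, so the
// zero-extend matcher is tried as a fallback.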
3653
3654/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3655/// is 32-bit.
3656Register
3657AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3658 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3659 : matchZeroExtendFromS32(Reg);
3660}
3661
3662/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3663/// is 32-bit.
3664Register
3665AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3666 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3667 : matchSignExtendFromS32(Reg);
3668}
3669
3670Register
3671AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3672 bool IsSigned) const {
3673 if (IsSigned)
3674 return matchSignExtendFromS32OrS32(Reg);
3675
3676 return matchZeroExtendFromS32OrS32(Reg);
3677}
3678
3679Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3680 Register AnyExtSrc;
3681 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3682 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3683
3684 // Match legalized form %anyext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3685 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3686 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3687 return Register();
3688
3689 assert(Def->getNumOperands() == 3 &&
3690 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3691
3692 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3693 return Def->getOperand(1).getReg();
3694
3695 return Register();
3696}
3697
3698bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3699 if (!Subtarget->hasVMemToLDSLoad())
3700 return false;
3701
3702 unsigned Opc;
3703 unsigned Size = MI.getOperand(3).getImm();
3704 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3705
3706 switch (Size) {
3707 default:
3708 return false;
3709 case 1:
3710 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3711 break;
3712 case 2:
3713 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3714 break;
3715 case 4:
3716 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3717 break;
3718 case 12:
3719 if (!Subtarget->hasLDSLoadB96_B128())
3720 return false;
3721 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3722 break;
3723 case 16:
3724 if (!Subtarget->hasLDSLoadB96_B128())
3725 return false;
3726 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3727 break;
3728 }
3729
3730 MachineBasicBlock *MBB = MI.getParent();
3731 const DebugLoc &DL = MI.getDebugLoc();
3732 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3733 .add(MI.getOperand(2));
3734
3735 Register Addr = MI.getOperand(1).getReg();
3736 Register VOffset;
3737 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3738 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3739 if (!isSGPR(Addr)) {
3740 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3741 if (isSGPR(AddrDef->Reg)) {
3742 Addr = AddrDef->Reg;
3743 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3744 Register SAddr =
3745 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3746 if (isSGPR(SAddr)) {
3747 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3748 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3749 Addr = SAddr;
3750 VOffset = Off;
3751 }
3752 }
3753 }
3754 }
3755
3756 if (isSGPR(Addr)) {
3757 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3758 if (!VOffset) {
3759 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3760 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3761 .addImm(0);
3762 }
3763 }
3764
3765 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3766 .addReg(Addr);
3767
3768 if (isSGPR(Addr))
3769 MIB.addReg(VOffset);
3770
3771 MIB.add(MI.getOperand(4)); // offset
3772
3773 unsigned Aux = MI.getOperand(5).getImm();
3774 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3775 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3776
3777 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3778 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3779 LoadPtrI.Offset = MI.getOperand(4).getImm();
3780 MachinePointerInfo StorePtrI = LoadPtrI;
3781 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3782 AMDGPUAS::GLOBAL_ADDRESS));
3783 StorePtrI.V = nullptr;
3784 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3785 auto F = LoadMMO->getFlags() &
3786 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3787 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3788 Size, LoadMMO->getBaseAlign());
3789 MachineMemOperand *StoreMMO =
3790 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3791 sizeof(int32_t), Align(4));
3792
3793 MIB.setMemRefs({LoadMMO, StoreMMO});
3794
3795 MI.eraseFromParent();
3796 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3797 return true;
3798}
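// For illustration, a global-to-LDS dword load whose address splits into an
// SGPR base plus a zero-extended 32-bit VGPR offset selects to roughly:
//   $m0 = COPY %lds_base
//   GLOBAL_LOAD_LDS_DWORD_SADDR %sgpr_base, %vgpr_off, <imm offset>, <cpol>
// A fully scalar address materializes a zero VOffset with V_MOV_B32 first.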
3799
3800bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3801 Intrinsic::ID IID) const {
3802 bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3803 unsigned Opc =
3804 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3805 int NumGroups = 4;
3806
3807 // A lambda to check whether an operand is a vector of all 0s.
3808 const auto isAllZeros = [&](MachineOperand &Opnd) {
3809 const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
3810 if (!DefMI)
3811 return false;
3812 return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
3813 };
3814
3815 // Use _D2 version if both group 2 and 3 are zero-initialized.
3816 if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
3817 NumGroups = 2;
3818 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3819 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3820 }
3821
3822 // TODO: Handle the fifth group: MI.getOperand(5), which is silently ignored
3823 // for now because all existing targets only support up to 4 groups.
3824 MachineBasicBlock *MBB = MI.getParent();
3825 auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
3826 .add(MI.getOperand(1)) // D# group 0
3827 .add(MI.getOperand(2)); // D# group 1
3828
3829 if (NumGroups >= 4) { // Has at least 4 groups
3830 MIB.add(MI.getOperand(3)) // D# group 2
3831 .add(MI.getOperand(4)); // D# group 3
3832 }
3833
3834 MIB.addImm(0) // r128
3835 .add(MI.getOperand(6)); // cpol
3836
3837 MI.eraseFromParent();
3838 return true;
3839}
3840
3841bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3842 MachineInstr &MI) const {
3843 unsigned OpcodeOpIdx =
3844 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3845 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3846 MI.removeOperand(OpcodeOpIdx);
3847 MI.addImplicitDefUseOperands(*MI.getMF());
3848 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3849 return true;
3850}
3851
3852// FIXME: This should be removed once the patterns can select these; we just
3853// need the AGPR/VGPR combination versions.
3854bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3855 unsigned Opc;
3856 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3857 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3858 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3859 break;
3860 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3861 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3862 break;
3863 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3864 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3865 break;
3866 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3867 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3868 break;
3869 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3870 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3871 break;
3872 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3873 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3874 break;
3875 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3876 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3877 break;
3878 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3879 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3880 break;
3881 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3882 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3883 break;
3884 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3885 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3886 break;
3887 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3888 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3889 break;
3890 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3891 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3892 break;
3893 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3894 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3895 break;
3896 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3897 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3898 break;
3899 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3900 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3901 break;
3902 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3903 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3904 break;
3905 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3906 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3907 break;
3908 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3909 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3910 break;
3911 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3912 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3913 break;
3914 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3915 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3916 break;
3917 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3918 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3919 break;
3920 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3921 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3922 break;
3923 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3924 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3925 break;
3926 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3927 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3928 break;
3929 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3930 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3931 break;
3932 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3933 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3934 break;
3935 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3936 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3937 break;
3938 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3939 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3940 break;
3941 default:
3942 llvm_unreachable("unhandled smfmac intrinsic");
3943 }
3944
3945 auto VDst_In = MI.getOperand(4);
3946
3947 MI.setDesc(TII.get(Opc));
3948 MI.removeOperand(4); // VDst_In
3949 MI.removeOperand(1); // Intrinsic ID
3950 MI.addOperand(VDst_In); // Readd VDst_In to the end
3951 MI.addImplicitDefUseOperands(*MI.getMF());
3952 const MCInstrDesc &MCID = MI.getDesc();
3953 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
3954 MI.getOperand(0).setIsEarlyClobber(true);
3955 }
3956 return true;
3957}
3958
3959bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3960 MachineInstr &MI, Intrinsic::ID IntrID) const {
3961 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3962 !Subtarget->hasPermlane16Swap())
3963 return false;
3964 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3965 !Subtarget->hasPermlane32Swap())
3966 return false;
3967
3968 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3969 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3970 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3971
3972 MI.removeOperand(2);
3973 MI.setDesc(TII.get(Opcode));
3974 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3975
3976 MachineOperand &FI = MI.getOperand(4);
3977 FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
3978
3979 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3980 return true;
3981}
3982
3983bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3984 Register DstReg = MI.getOperand(0).getReg();
3985 Register SrcReg = MI.getOperand(1).getReg();
3986 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3987 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3988 MachineBasicBlock *MBB = MI.getParent();
3989 const DebugLoc &DL = MI.getDebugLoc();
3990
3991 if (IsVALU) {
3992 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3993 .addImm(Subtarget->getWavefrontSizeLog2())
3994 .addReg(SrcReg);
3995 } else {
3996 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3997 .addReg(SrcReg)
3998 .addImm(Subtarget->getWavefrontSizeLog2())
3999 .setOperandDead(3); // Dead scc
4000 }
4001
4002 const TargetRegisterClass &RC =
4003 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
4004 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
4005 return false;
4006
4007 MI.eraseFromParent();
4008 return true;
4009}
4010
4011bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
4012 MachineInstr &MI) const {
4013 assert(MI.getNumOperands() == 4);
4014 MachineBasicBlock *MBB = MI.getParent();
4015 const DebugLoc &DL = MI.getDebugLoc();
4016
4017 Register DstReg = MI.getOperand(0).getReg();
4018 Register ValReg = MI.getOperand(2).getReg();
4019 Register IdxReg = MI.getOperand(3).getReg();
4020
4021 const LLT DstTy = MRI->getType(DstReg);
4022 unsigned DstSize = DstTy.getSizeInBits();
4023 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4024 const TargetRegisterClass *DstRC =
4025 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
4026
4027 if (DstTy != LLT::scalar(32))
4028 return false;
4029
4030 if (!Subtarget->supportsBPermute())
4031 return false;
4032
4033 // If we can bpermute across the whole wave, then just do that
4034 if (Subtarget->supportsWaveWideBPermute()) {
4035 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4036 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4037 .addImm(2)
4038 .addReg(IdxReg);
4039
4040 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
4041 .addReg(ShiftIdxReg)
4042 .addReg(ValReg)
4043 .addImm(0);
4044 } else {
4045 // Otherwise, we need to make use of whole wave mode
4046 assert(Subtarget->isWave64());
4047
4048 // Set inactive lanes to poison
4049 Register UndefValReg =
4050 MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
4051 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
4052
4053 Register UndefExecReg = MRI->createVirtualRegister(
4054 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4055 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
4056
4057 Register PoisonValReg = MRI->createVirtualRegister(DstRC);
4058 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
4059 .addImm(0)
4060 .addReg(ValReg)
4061 .addImm(0)
4062 .addReg(UndefValReg)
4063 .addReg(UndefExecReg);
4064
4065 // ds_bpermute requires index to be multiplied by 4
4066 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4067 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4068 .addImm(2)
4069 .addReg(IdxReg);
4070
4071 Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
4072 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
4073 .addImm(0)
4074 .addReg(ShiftIdxReg)
4075 .addImm(0)
4076 .addReg(UndefValReg)
4077 .addReg(UndefExecReg);
4078
4079 // Get permutation of each half, then we'll select which one to use
4080 Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
4081 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
4082 .addReg(PoisonIdxReg)
4083 .addReg(PoisonValReg)
4084 .addImm(0);
4085
4086 Register SwappedValReg = MRI->createVirtualRegister(DstRC);
4087 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
4088 .addReg(PoisonValReg);
4089
4090 Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
4091 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
4092 .addReg(PoisonIdxReg)
4093 .addReg(SwappedValReg)
4094 .addImm(0);
4095
4096 Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
4097 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
4098 .addReg(OppSidePermReg);
4099
4100 // Select which side to take the permute from
4101 // We can get away with only using mbcnt_lo here since we're only
4102 // trying to detect which side of 32 each lane is on, and mbcnt_lo
4103 // returns 32 for lanes 32-63.
4104 Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
4105 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
4106 .addImm(-1)
4107 .addImm(0);
4108
4109 Register XORReg = MRI->createVirtualRegister(DstRC);
4110 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
4111 .addReg(ThreadIDReg)
4112 .addReg(PoisonIdxReg);
4113
4114 Register ANDReg = MRI->createVirtualRegister(DstRC);
4115 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
4116 .addReg(XORReg)
4117 .addImm(32);
4118
4119 Register CompareReg = MRI->createVirtualRegister(
4120 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4121 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
4122 .addReg(ANDReg)
4123 .addImm(0);
4124
4125 // Finally do the selection
4126 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
4127 .addImm(0)
4128 .addReg(WWMSwapPermReg)
4129 .addImm(0)
4130 .addReg(SameSidePermReg)
4131 .addReg(CompareReg);
4132 }
4133
4134 MI.eraseFromParent();
4135 return true;
4136}
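// Summary of the wave64 fallback above: ds_bpermute only shuffles within each
// 32-lane half, so the code computes both the same-half permute and, after a
// V_PERMLANE64_B32 half swap, the cross-half permute in whole wave mode, then
// uses the mbcnt/xor/and compare to pick, per lane, whichever result came
// from the requested source half.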
4137
4138// Match a BITOP3 operation and return the number of matched instructions
4139// plus the truth table.
4140static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
4141 SmallVectorImpl<Register> &Src,
4142 const MachineRegisterInfo &MRI) {
4143 unsigned NumOpcodes = 0;
4144 uint8_t LHSBits, RHSBits;
4145
4146 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
4147 // Define truth table given Src0, Src1, Src2 bits permutations:
4148 // 0 0 0
4149 // 0 0 1
4150 // 0 1 0
4151 // 0 1 1
4152 // 1 0 0
4153 // 1 0 1
4154 // 1 1 0
4155 // 1 1 1
4156 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4157
4158 if (mi_match(Op, MRI, m_AllOnesInt())) {
4159 Bits = 0xff;
4160 return true;
4161 }
4162 if (mi_match(Op, MRI, m_ZeroInt())) {
4163 Bits = 0;
4164 return true;
4165 }
4166
4167 for (unsigned I = 0; I < Src.size(); ++I) {
4168 // Try to find existing reused operand
4169 if (Src[I] == Op) {
4170 Bits = SrcBits[I];
4171 return true;
4172 }
4173 // Try to replace parent operator
4174 if (Src[I] == R) {
4175 Bits = SrcBits[I];
4176 Src[I] = Op;
4177 return true;
4178 }
4179 }
4180
4181 if (Src.size() == 3) {
4182 // No room left for operands. Try one last time; there can be a 'not' of
4183 // one of our source operands. In this case we can compute the bits
4184 // without growing the Src vector.
4185 Register LHS;
4186 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
4187 LHS = getSrcRegIgnoringCopies(LHS, MRI);
4188 for (unsigned I = 0; I < Src.size(); ++I) {
4189 if (Src[I] == LHS) {
4190 Bits = ~SrcBits[I];
4191 return true;
4192 }
4193 }
4194 }
4195
4196 return false;
4197 }
4198
4199 Bits = SrcBits[Src.size()];
4200 Src.push_back(Op);
4201 return true;
4202 };
4203
4204 MachineInstr *MI = MRI.getVRegDef(R);
4205 switch (MI->getOpcode()) {
4206 case TargetOpcode::G_AND:
4207 case TargetOpcode::G_OR:
4208 case TargetOpcode::G_XOR: {
4209 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
4210 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
4211
4212 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
4213 if (!getOperandBits(LHS, LHSBits) ||
4214 !getOperandBits(RHS, RHSBits)) {
4215 Src = std::move(Backup);
4216 return std::make_pair(0, 0);
4217 }
4218
4219 // Recursion is naturally limited by the size of the operand vector.
4220 auto Op = BitOp3_Op(LHS, Src, MRI);
4221 if (Op.first) {
4222 NumOpcodes += Op.first;
4223 LHSBits = Op.second;
4224 }
4225
4226 Op = BitOp3_Op(RHS, Src, MRI);
4227 if (Op.first) {
4228 NumOpcodes += Op.first;
4229 RHSBits = Op.second;
4230 }
4231 break;
4232 }
4233 default:
4234 return std::make_pair(0, 0);
4235 }
4236
4237 uint8_t TTbl;
4238 switch (MI->getOpcode()) {
4239 case TargetOpcode::G_AND:
4240 TTbl = LHSBits & RHSBits;
4241 break;
4242 case TargetOpcode::G_OR:
4243 TTbl = LHSBits | RHSBits;
4244 break;
4245 case TargetOpcode::G_XOR:
4246 TTbl = LHSBits ^ RHSBits;
4247 break;
4248 default:
4249 break;
4250 }
4251
4252 return std::make_pair(NumOpcodes + 1, TTbl);
4253}
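// Worked example: matching ((a & b) | c) first reserves truth-table columns
// for the OR's operands and then refines the left side, so the final Src
// vector is [a, c, b] with columns 0xf0, 0xcc, 0xaa; the result is
// {2, (0xf0 & 0xaa) | 0xcc} = {2, 0xec}. A 'not' of an already matched
// operand simply inverts its column (e.g. ~a -> 0x0f) without using a slot.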
4254
4255bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4256 if (!Subtarget->hasBitOp3Insts())
4257 return false;
4258
4259 Register DstReg = MI.getOperand(0).getReg();
4260 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4261 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4262 if (!IsVALU)
4263 return false;
4264
4265 SmallVector<Register, 3> Src;
4266 uint8_t TTbl;
4267 unsigned NumOpcodes;
4268
4269 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
4270
4271 // The Src.empty() case can happen if all operands are all zeros or all ones.
4272 // Normally it should have been optimized out before reaching this point.
4273 if (NumOpcodes < 2 || Src.empty())
4274 return false;
4275
4276 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
4277 if (NumOpcodes == 2 && IsB32) {
4278 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster, but it
4279 // makes the asm more readable. This cannot be modeled with AddedComplexity
4280 // because the selector does not know how many operations we matched.
4281 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
4282 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4283 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4284 return false;
4285 } else if (NumOpcodes < 4) {
4286 // For the uniform case the threshold should be higher to account for moves
4287 // between VGPRs and SGPRs: one operand must be in a VGPR, while the other
4288 // two can stay in SGPRs, with a readfirstlane of the result afterwards.
4289 return false;
4290 }
4291
4292 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4293 if (!IsB32 && STI.hasTrue16BitInsts())
4294 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4295 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4296 unsigned CBL = STI.getConstantBusLimit(Opc);
4297 MachineBasicBlock *MBB = MI.getParent();
4298 const DebugLoc &DL = MI.getDebugLoc();
4299
4300 for (unsigned I = 0; I < Src.size(); ++I) {
4301 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4302 if (RB->getID() != AMDGPU::SGPRRegBankID)
4303 continue;
4304 if (CBL > 0) {
4305 --CBL;
4306 continue;
4307 }
4308 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4309 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4310 .addReg(Src[I]);
4311 Src[I] = NewReg;
4312 }
4313
4314 // The last operand can be ignored, turning a ternary operation into a
4315 // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
4316 // replace 'c' with 'a' here without changing the answer. In some
4317 // pathological cases it should even be possible to get an operation with a
4318 // single operand if the optimizer does not catch it.
4319 while (Src.size() < 3)
4320 Src.push_back(Src[0]);
4321
4322 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4323 if (!IsB32)
4324 MIB.addImm(0); // src_mod0
4325 MIB.addReg(Src[0]);
4326 if (!IsB32)
4327 MIB.addImm(0); // src_mod1
4328 MIB.addReg(Src[1]);
4329 if (!IsB32)
4330 MIB.addImm(0); // src_mod2
4331 MIB.addReg(Src[2])
4332 .addImm(TTbl);
4333 if (!IsB32)
4334 MIB.addImm(0); // op_sel
4335
4336 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4337 MI.eraseFromParent();
4338
4339 return true;
4340}
4341
4342bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4343 Register SrcReg = MI.getOperand(0).getReg();
4344 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4345 return false;
4346
4347 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4348 Register SP =
4349 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4350 Register WaveAddr = getWaveAddress(DefMI);
4351 MachineBasicBlock *MBB = MI.getParent();
4352 const DebugLoc &DL = MI.getDebugLoc();
4353
4354 if (!WaveAddr) {
4355 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4356 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4357 .addReg(SrcReg)
4358 .addImm(Subtarget->getWavefrontSizeLog2())
4359 .setOperandDead(3); // Dead scc
4360 }
4361
4362 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4363 .addReg(WaveAddr);
4364
4365 MI.eraseFromParent();
4366 return true;
4367}
4368
4370bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4371 if (!I.isPreISelOpcode()) {
4372 if (I.isCopy())
4373 return selectCOPY(I);
4374 return true;
4375 }
4376
4377 switch (I.getOpcode()) {
4378 case TargetOpcode::G_AND:
4379 case TargetOpcode::G_OR:
4380 case TargetOpcode::G_XOR:
4381 if (selectBITOP3(I))
4382 return true;
4383 if (selectImpl(I, *CoverageInfo))
4384 return true;
4385 return selectG_AND_OR_XOR(I);
4386 case TargetOpcode::G_ADD:
4387 case TargetOpcode::G_SUB:
4388 case TargetOpcode::G_PTR_ADD:
4389 if (selectImpl(I, *CoverageInfo))
4390 return true;
4391 return selectG_ADD_SUB(I);
4392 case TargetOpcode::G_UADDO:
4393 case TargetOpcode::G_USUBO:
4394 case TargetOpcode::G_UADDE:
4395 case TargetOpcode::G_USUBE:
4396 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4397 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4398 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4399 return selectG_AMDGPU_MAD_64_32(I);
4400 case TargetOpcode::G_INTTOPTR:
4401 case TargetOpcode::G_BITCAST:
4402 case TargetOpcode::G_PTRTOINT:
4403 case TargetOpcode::G_FREEZE:
4404 return selectCOPY(I);
4405 case TargetOpcode::G_FNEG:
4406 if (selectImpl(I, *CoverageInfo))
4407 return true;
4408 return selectG_FNEG(I);
4409 case TargetOpcode::G_FABS:
4410 if (selectImpl(I, *CoverageInfo))
4411 return true;
4412 return selectG_FABS(I);
4413 case TargetOpcode::G_EXTRACT:
4414 return selectG_EXTRACT(I);
4415 case TargetOpcode::G_MERGE_VALUES:
4416 case TargetOpcode::G_CONCAT_VECTORS:
4417 return selectG_MERGE_VALUES(I);
4418 case TargetOpcode::G_UNMERGE_VALUES:
4419 return selectG_UNMERGE_VALUES(I);
4420 case TargetOpcode::G_BUILD_VECTOR:
4421 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4422 return selectG_BUILD_VECTOR(I);
4423 case TargetOpcode::G_IMPLICIT_DEF:
4424 return selectG_IMPLICIT_DEF(I);
4425 case TargetOpcode::G_INSERT:
4426 return selectG_INSERT(I);
4427 case TargetOpcode::G_INTRINSIC:
4428 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4429 return selectG_INTRINSIC(I);
4430 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4431 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4432 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4433 case TargetOpcode::G_ICMP:
4434 case TargetOpcode::G_FCMP:
4435 if (selectG_ICMP_or_FCMP(I))
4436 return true;
4437 return selectImpl(I, *CoverageInfo);
4438 case TargetOpcode::G_LOAD:
4439 case TargetOpcode::G_ZEXTLOAD:
4440 case TargetOpcode::G_SEXTLOAD:
4441 case TargetOpcode::G_STORE:
4442 case TargetOpcode::G_ATOMIC_CMPXCHG:
4443 case TargetOpcode::G_ATOMICRMW_XCHG:
4444 case TargetOpcode::G_ATOMICRMW_ADD:
4445 case TargetOpcode::G_ATOMICRMW_SUB:
4446 case TargetOpcode::G_ATOMICRMW_AND:
4447 case TargetOpcode::G_ATOMICRMW_OR:
4448 case TargetOpcode::G_ATOMICRMW_XOR:
4449 case TargetOpcode::G_ATOMICRMW_MIN:
4450 case TargetOpcode::G_ATOMICRMW_MAX:
4451 case TargetOpcode::G_ATOMICRMW_UMIN:
4452 case TargetOpcode::G_ATOMICRMW_UMAX:
4453 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4454 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4455 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4456 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4457 case TargetOpcode::G_ATOMICRMW_FADD:
4458 case TargetOpcode::G_ATOMICRMW_FMIN:
4459 case TargetOpcode::G_ATOMICRMW_FMAX:
4460 return selectG_LOAD_STORE_ATOMICRMW(I);
4461 case TargetOpcode::G_SELECT:
4462 return selectG_SELECT(I);
4463 case TargetOpcode::G_TRUNC:
4464 return selectG_TRUNC(I);
4465 case TargetOpcode::G_SEXT:
4466 case TargetOpcode::G_ZEXT:
4467 case TargetOpcode::G_ANYEXT:
4468 case TargetOpcode::G_SEXT_INREG:
4469 // This is a workaround. For extension from type i1, `selectImpl()` uses
4470 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY,
4471 // since type i1 can only be held in an SGPR class.
4472 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4473 selectImpl(I, *CoverageInfo))
4474 return true;
4475 return selectG_SZA_EXT(I);
4476 case TargetOpcode::G_FPEXT:
4477 if (selectG_FPEXT(I))
4478 return true;
4479 return selectImpl(I, *CoverageInfo);
4480 case TargetOpcode::G_BRCOND:
4481 return selectG_BRCOND(I);
4482 case TargetOpcode::G_GLOBAL_VALUE:
4483 return selectG_GLOBAL_VALUE(I);
4484 case TargetOpcode::G_PTRMASK:
4485 return selectG_PTRMASK(I);
4486 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4487 return selectG_EXTRACT_VECTOR_ELT(I);
4488 case TargetOpcode::G_INSERT_VECTOR_ELT:
4489 return selectG_INSERT_VECTOR_ELT(I);
4490 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4491 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4492 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4493 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4494 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4495 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4496 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4497 assert(Intr && "not an image intrinsic with image pseudo");
4498 return selectImageIntrinsic(I, Intr);
4499 }
4500 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4501 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4502 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4503 return selectBVHIntersectRayIntrinsic(I);
4504 case AMDGPU::G_SBFX:
4505 case AMDGPU::G_UBFX:
4506 return selectG_SBFX_UBFX(I);
4507 case AMDGPU::G_SI_CALL:
4508 I.setDesc(TII.get(AMDGPU::SI_CALL));
4509 return true;
4510 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4511 return selectWaveAddress(I);
4512 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4513 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4514 return true;
4515 }
4516 case AMDGPU::G_STACKRESTORE:
4517 return selectStackRestore(I);
4518 case AMDGPU::G_PHI:
4519 return selectPHI(I);
4520 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4521 return selectCOPY_SCC_VCC(I);
4522 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4523 return selectCOPY_VCC_SCC(I);
4524 case AMDGPU::G_AMDGPU_READANYLANE:
4525 return selectReadAnyLane(I);
4526 case TargetOpcode::G_CONSTANT:
4527 case TargetOpcode::G_FCONSTANT:
4528 default:
4529 return selectImpl(I, *CoverageInfo);
4530 }
4531 return false;
4532}
4533
4534InstructionSelector::ComplexRendererFns
4535AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4536 return {{
4537 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4538 }};
4540}
4541
4542std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4543 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4544 unsigned Mods = 0;
4545 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4546
4547 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4548 Src = MI->getOperand(1).getReg();
4549 Mods |= SISrcMods::NEG;
4550 MI = getDefIgnoringCopies(Src, *MRI);
4551 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4552 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4553 // denormal mode, but we're implicitly canonicalizing in a source operand.
4554 const ConstantFP *LHS =
4555 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4556 if (LHS && LHS->isZero()) {
4557 Mods |= SISrcMods::NEG;
4558 Src = MI->getOperand(2).getReg();
4559 }
4560 }
4561
4562 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4563 Src = MI->getOperand(1).getReg();
4564 Mods |= SISrcMods::ABS;
4565 }
4566
4567 if (OpSel)
4568 Mods |= SISrcMods::OP_SEL_0;
4569
4570 return std::pair(Src, Mods);
4571}
4572
4573std::pair<Register, unsigned>
4574AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const {
4575 unsigned Mods;
4576 std::tie(Src, Mods) = selectVOP3ModsImpl(Src);
4577 Mods |= SISrcMods::OP_SEL_1;
4578 return std::pair(Src, Mods);
4579}
4580
4581Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4582 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4583 bool ForceVGPR) const {
4584 if ((Mods != 0 || ForceVGPR) &&
4585 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4586
4587 // If we looked through copies to find source modifiers on an SGPR operand,
4588 // we now have an SGPR register source. To avoid potentially violating the
4589 // constant bus restriction, we need to insert a copy to a VGPR.
4590 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4591 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4592 TII.get(AMDGPU::COPY), VGPRSrc)
4593 .addReg(Src);
4594 Src = VGPRSrc;
4595 }
4596
4597 return Src;
4598}
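// For instance, when an fneg is folded into the source modifiers and the
// stripped source turns out to be an SGPR, using it directly could place a
// second scalar operand on the constant bus; the COPY above keeps the
// modified operand in a VGPR instead.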
4599
4600///
4601/// This will select either an SGPR or VGPR operand and will save us from
4602/// having to write an extra tablegen pattern.
4603InstructionSelector::ComplexRendererFns
4604AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4605 return {{
4606 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4607 }};
4608}
4609
4610InstructionSelector::ComplexRendererFns
4611AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4612 Register Src;
4613 unsigned Mods;
4614 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4615
4616 return {{
4617 [=](MachineInstrBuilder &MIB) {
4618 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4619 },
4620 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4621 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4622 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4623 }};
4624}
4625
4626InstructionSelector::ComplexRendererFns
4627AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4628 Register Src;
4629 unsigned Mods;
4630 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4631 /*IsCanonicalizing=*/true,
4632 /*AllowAbs=*/false);
4633
4634 return {{
4635 [=](MachineInstrBuilder &MIB) {
4636 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4637 },
4638 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4639 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4640 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4641 }};
4642}
4643
4644InstructionSelector::ComplexRendererFns
4645AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4646 return {{
4647 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4648 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4649 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4650 }};
4651}
4652
4653InstructionSelector::ComplexRendererFns
4654AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4655 Register Src;
4656 unsigned Mods;
4657 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4658
4659 return {{
4660 [=](MachineInstrBuilder &MIB) {
4661 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4662 },
4663 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4664 }};
4665}
4666
4667InstructionSelector::ComplexRendererFns
4668AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4669 MachineOperand &Root) const {
4670 Register Src;
4671 unsigned Mods;
4672 std::tie(Src, Mods) =
4673 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4674
4675 return {{
4676 [=](MachineInstrBuilder &MIB) {
4677 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4678 },
4679 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4680 }};
4681}
4682
4683InstructionSelector::ComplexRendererFns
4684AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4685 Register Src;
4686 unsigned Mods;
4687 std::tie(Src, Mods) =
4688 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4689 /*AllowAbs=*/false);
4690
4691 return {{
4692 [=](MachineInstrBuilder &MIB) {
4693 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4694 },
4695 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4696 }};
4697}
4698
4699InstructionSelector::ComplexRendererFns
4700AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4701 Register Reg = Root.getReg();
4702 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4703 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4704 return {};
4705 return {{
4706 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4707 }};
4708}
4709
4710enum class SrcStatus {
4711 IS_SAME,
4712 IS_UPPER_HALF,
4713 IS_LOWER_HALF,
4714 IS_UPPER_HALF_NEG,
4715 // This means current op = [op_upper, op_lower] and src = -op_lower.
4716 IS_LOWER_HALF_NEG,
4717 IS_HI_NEG,
4718 // This means current op = [op_upper, op_lower] and src = [op_upper,
4719 // -op_lower].
4720 IS_LO_NEG,
4721 // This means current op = [op_upper, op_lower] and src = [-op_upper,
4722 // -op_lower].
4723 IS_BOTH_NEG,
4724 // Current op does not map onto any of the patterns above; stop
4725 // the walk.
4726 INVALID
4727};
4728/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4729static bool isTruncHalf(const MachineInstr *MI,
4730 const MachineRegisterInfo &MRI) {
4731 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4732 return false;
4733
4734 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4735 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4736 return DstSize * 2 == SrcSize;
4737}
4738
4739/// Test if the MI is a logical shift right by half the bit width,
4740/// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`
4741static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4742 if (MI->getOpcode() != AMDGPU::G_LSHR)
4743 return false;
4744
4745 Register ShiftSrc;
4746 std::optional<ValueAndVReg> ShiftAmt;
4747 if (mi_match(MI->getOperand(0).getReg(), MRI,
4748 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4749 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4750 unsigned Shift = ShiftAmt->Value.getZExtValue();
4751 return Shift * 2 == SrcSize;
4752 }
4753 return false;
4754}
4755
4756/// Test if the MI is a shift left by half the bit width,
4757/// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`
4758static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4759 if (MI->getOpcode() != AMDGPU::G_SHL)
4760 return false;
4761
4762 Register ShiftSrc;
4763 std::optional<ValueAndVReg> ShiftAmt;
4764 if (mi_match(MI->getOperand(0).getReg(), MRI,
4765 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4766 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4767 unsigned Shift = ShiftAmt->Value.getZExtValue();
4768 return Shift * 2 == SrcSize;
4769 }
4770 return false;
4771}
4772
4773/// Test if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4774static bool isUnmergeHalf(const MachineInstr *MI,
4775 const MachineRegisterInfo &MRI) {
4776 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4777 return false;
4778 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4779 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4780}
4781
4782enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED };
4783
4784static TypeClass isVectorOfTwoOrScalar(Register Reg,
4785 const MachineRegisterInfo &MRI) {
4786 LLT OpTy = MRI.getType(Reg);
4787 if (OpTy.isScalar())
4788 return TypeClass::SCALAR;
4789 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4792}
4793
4794static SrcStatus getNegStatus(Register Reg, SrcStatus S,
4795 const MachineRegisterInfo &MRI) {
4796 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4797 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4798 return SrcStatus::INVALID;
4799
4800 switch (S) {
4801 case SrcStatus::IS_SAME:
4802 if (NegType == TypeClass::VECTOR_OF_TWO) {
4803 // Vector of 2:
4804 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4805 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4806 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4807 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4808 return SrcStatus::IS_BOTH_NEG;
4809 }
4810 if (NegType == TypeClass::SCALAR) {
4811 // Scalar:
4812 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4813 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4814 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4815 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4816 return SrcStatus::IS_HI_NEG;
4817 }
4818 break;
4819 case SrcStatus::IS_HI_NEG:
4820 if (NegType == TypeClass::VECTOR_OF_TWO) {
4821 // Vector of 2:
4822 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4823 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4824 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4825 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4826 return SrcStatus::IS_LO_NEG;
4827 }
4828 if (NegType == TypeClass::SCALAR) {
4829 // Scalar:
4830 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4831 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4832 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4833 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4834 return SrcStatus::IS_SAME;
4835 }
4836 break;
4837 case SrcStatus::IS_LO_NEG:
4838 if (NegType == TypeClass::VECTOR_OF_TWO) {
4839 // Vector of 2:
4840 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4841 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4842 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4843 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4844 return SrcStatus::IS_HI_NEG;
4845 }
4846 if (NegType == TypeClass::SCALAR) {
4847 // Scalar:
4848 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4849 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4850 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4851 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4852 return SrcStatus::IS_BOTH_NEG;
4853 }
4854 break;
4855 case SrcStatus::IS_BOTH_NEG:
4856 if (NegType == TypeClass::VECTOR_OF_TWO) {
4857 // Vector of 2:
4858 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4859 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4860 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4861 // [SrcHi, SrcLo] = [OpHi, OpLo]
4862 return SrcStatus::IS_SAME;
4863 }
4864 if (NegType == TypeClass::SCALAR) {
4865 // Scalar:
4866 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4867 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4868 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4869 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4870 return SrcStatus::IS_LO_NEG;
4871 }
4872 break;
4873 case SrcStatus::IS_UPPER_HALF:
4874 // Vector of 2:
4875 // Src = CurrUpper
4876 // Curr = [CurrUpper, CurrLower]
4877 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4878 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4879 // Src = -OpUpper
4880 //
4881 // Scalar:
4882 // Src = CurrUpper
4883 // Curr = [CurrUpper, CurrLower]
4884 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4885 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4886 // Src = -OpUpper
4887 return SrcStatus::IS_UPPER_HALF_NEG;
4888 case SrcStatus::IS_LOWER_HALF:
4889 if (NegType == TypeClass::VECTOR_OF_TWO) {
4890 // Vector of 2:
4891 // Src = CurrLower
4892 // Curr = [CurrUpper, CurrLower]
4893 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4894 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4895 // Src = -OpLower
4896 return SrcStatus::IS_LOWER_HALF_NEG;
4897 }
4898 if (NegType == TypeClass::SCALAR) {
4899 // Scalar:
4900 // Src = CurrLower
4901 // Curr = [CurrUpper, CurrLower]
4902 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4903 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4904 // Src = OpLower
4905 return SrcStatus::IS_LOWER_HALF;
4906 }
4907 break;
4908 case SrcStatus::IS_UPPER_HALF_NEG:
4909 // Vector of 2:
4910 // Src = -CurrUpper
4911 // Curr = [CurrUpper, CurrLower]
4912 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4913 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4914 // Src = -(-OpUpper) = OpUpper
4915 //
4916 // Scalar:
4917 // Src = -CurrUpper
4918 // Curr = [CurrUpper, CurrLower]
4919 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4920 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4921 // Src = -(-OpUpper) = OpUpper
4922 return SrcStatus::IS_UPPER_HALF;
4923 case SrcStatus::IS_LOWER_HALF_NEG:
4924 if (NegType == TypeClass::VECTOR_OF_TWO) {
4925 // Vector of 2:
4926 // Src = -CurrLower
4927 // Curr = [CurrUpper, CurrLower]
4928 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4929 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4930 // Src = -(-OpLower) = OpLower
4931 return SrcStatus::IS_LOWER_HALF;
4932 }
4933 if (NegType == TypeClass::SCALAR) {
4934 // Scalar:
4935 // Src = -CurrLower
4936 // Curr = [CurrUpper, CurrLower]
4937 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4938 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4939 // Src = -OpLower
4940 return SrcStatus::IS_LOWER_HALF_NEG;
4941 }
4942 break;
4943 default:
4944 break;
4945 }
4946 llvm_unreachable("unexpected SrcStatus & NegType combination");
4947}
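// Summary of the derivations above: an fneg of a vector-of-2 negates both
// halves, while an fneg of a scalar only flips the sign bit, i.e. negates the
// high half:
//   S                  vector fneg          scalar fneg
//   IS_SAME            IS_BOTH_NEG          IS_HI_NEG
//   IS_HI_NEG          IS_LO_NEG            IS_SAME
//   IS_LO_NEG          IS_HI_NEG            IS_BOTH_NEG
//   IS_BOTH_NEG        IS_SAME              IS_LO_NEG
//   IS_UPPER_HALF      IS_UPPER_HALF_NEG    IS_UPPER_HALF_NEG
//   IS_LOWER_HALF      IS_LOWER_HALF_NEG    IS_LOWER_HALF
//   IS_UPPER_HALF_NEG  IS_UPPER_HALF        IS_UPPER_HALF
//   IS_LOWER_HALF_NEG  IS_LOWER_HALF        IS_LOWER_HALF_NEG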
4948
4949static std::optional<std::pair<Register, SrcStatus>>
4950calcNextStatus(std::pair<Register, SrcStatus> Curr,
4951 const MachineRegisterInfo &MRI) {
4952 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4953
4954 unsigned Opc = MI->getOpcode();
4955
4956 // Handle general Opc cases.
4957 switch (Opc) {
4958 case AMDGPU::G_BITCAST:
4959 return std::optional<std::pair<Register, SrcStatus>>(
4960 {MI->getOperand(1).getReg(), Curr.second});
4961 case AMDGPU::COPY:
4962 if (MI->getOperand(1).getReg().isPhysical())
4963 return std::nullopt;
4964 return std::optional<std::pair<Register, SrcStatus>>(
4965 {MI->getOperand(1).getReg(), Curr.second});
4966 case AMDGPU::G_FNEG: {
4967 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4968 if (Stat == SrcStatus::INVALID)
4969 return std::nullopt;
4970 return std::optional<std::pair<Register, SrcStatus>>(
4971 {MI->getOperand(1).getReg(), Stat});
4972 }
4973 default:
4974 break;
4975 }
4976
4977 // Compute the next status from the current one.
4978 switch (Curr.second) {
4979 case SrcStatus::IS_SAME:
4980 if (isTruncHalf(MI, MRI))
4981 return std::optional<std::pair<Register, SrcStatus>>(
4982 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4983 else if (isUnmergeHalf(MI, MRI)) {
4984 if (Curr.first == MI->getOperand(0).getReg())
4985 return std::optional<std::pair<Register, SrcStatus>>(
4986 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4987 return std::optional<std::pair<Register, SrcStatus>>(
4988 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4989 }
4990 break;
4991 case SrcStatus::IS_HI_NEG:
4992 if (isTruncHalf(MI, MRI)) {
4993 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4994 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4995 // = [OpLowerHi, OpLowerLo]
4996 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4997 // = [-OpLowerHi, OpLowerLo]
4998 // = -OpLower
4999 return std::optional<std::pair<Register, SrcStatus>>(
5000 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5001 }
5002 if (isUnmergeHalf(MI, MRI)) {
5003 if (Curr.first == MI->getOperand(0).getReg())
5004 return std::optional<std::pair<Register, SrcStatus>>(
5005 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5006 return std::optional<std::pair<Register, SrcStatus>>(
5007 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5008 }
5009 break;
5010 case SrcStatus::IS_UPPER_HALF:
5011 if (isShlHalf(MI, MRI))
5012 return std::optional<std::pair<Register, SrcStatus>>(
5013 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
5014 break;
5015 case SrcStatus::IS_LOWER_HALF:
5016 if (isLshrHalf(MI, MRI))
5017 return std::optional<std::pair<Register, SrcStatus>>(
5018 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
5019 break;
5020 case SrcStatus::IS_UPPER_HALF_NEG:
5021 if (isShlHalf(MI, MRI))
5022 return std::optional<std::pair<Register, SrcStatus>>(
5023 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5024 break;
5025 case SrcStatus::IS_LOWER_HALF_NEG:
5026 if (isLshrHalf(MI, MRI))
5027 return std::optional<std::pair<Register, SrcStatus>>(
5028 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5029 break;
5030 default:
5031 break;
5032 }
5033 return std::nullopt;
5034}
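// Illustrative walk (registers and types are made up for the example): given
//   %a:_(<2 x s16>) = ...
//   %b:_(<2 x s16>) = G_FNEG %a
//   %c:_(s32)       = G_BITCAST %b(<2 x s16>)
// calcNextStatus({%c, IS_SAME}) yields {%b, IS_SAME} since bitcasts are
// transparent, and calcNextStatus({%b, IS_SAME}) yields {%a, IS_BOTH_NEG}
// since a vector fneg negates both halves.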
5035
5036/// Controls which SrcStatus values the current MI supports. For example, a
5037/// non-floating-point intrinsic such as @llvm.amdgcn.sdot2 does not support
5038/// the NEG bits on VOP3P.
5039/// The class can be extended to recognize SEL, NEG, and ABS bit support for
5040/// different MIs on different architectures.
5041class SearchOptions {
5042private:
5043 bool HasNeg = false;
5044 // Assume all complex patterns of VOP3P have opsel.
5045 bool HasOpsel = true;
5046
5047public:
5048 SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
5049 const MachineInstr *MI = MRI.getVRegDef(Reg);
5050 unsigned Opc = MI->getOpcode();
5051
5052 if (Opc == TargetOpcode::G_INTRINSIC) {
5053 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
5054 // Only floating-point intrinsics have the neg & neg_hi bits.
5055 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
5056 HasNeg = true;
5057 } else {
5058 // Keep the default (neg supported) for generic ops.
5059 HasNeg = true;
5060 }
5061 }
5062 bool checkOptions(SrcStatus Stat) const {
5063 if (!HasNeg &&
5064 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
5065 return false;
5066 }
5067 if (!HasOpsel &&
5068 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
5069 return false;
5070 }
5071 return true;
5072 }
5073};
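// For instance, if Reg is defined by a non-floating-point intrinsic such as
// @llvm.amdgcn.sdot2, HasNeg stays false and checkOptions() rejects every
// status in the NEG_START..NEG_END range, so the searches below only keep
// plain and half-select (opsel) statuses.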
5074
5075static SmallVector<std::pair<Register, SrcStatus>>
5076getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
5077 int MaxDepth = 3) {
5078 int Depth = 0;
5079 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
5080 SmallVector<std::pair<Register, SrcStatus>> Statlist;
5081
5082 while (Depth <= MaxDepth && Curr.has_value()) {
5083 Depth++;
5084 if (SO.checkOptions(Curr.value().second))
5085 Statlist.push_back(Curr.value());
5086 Curr = calcNextStatus(Curr.value(), MRI);
5087 }
5088
5089 return Statlist;
5090}
5091
5092static std::pair<Register, SrcStatus>
5093getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
5094 int MaxDepth = 3) {
5095 int Depth = 0;
5096 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
5097 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
5098
5099 while (Depth <= MaxDepth && Curr.has_value()) {
5100 Depth++;
5101 SrcStatus Stat = Curr.value().second;
5102 if (SO.checkOptions(Stat)) {
5103 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
5104 Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
5105 LastSameOrNeg = Curr.value();
5106 }
5107 Curr = calcNextStatus(Curr.value(), MRI);
5108 }
5109
5110 return LastSameOrNeg;
5111}
5112
5113static bool isSameBitWidth(Register Reg1, Register Reg2,
5114 const MachineRegisterInfo &MRI) {
5115 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
5116 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
5117 return Width1 == Width2;
5118}
5119
5120static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5121 // SrcStatus::IS_LOWER_HALF remain 0.
5122 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5123 Mods ^= SISrcMods::NEG_HI;
5124 Mods |= SISrcMods::OP_SEL_1;
5125 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5126 Mods |= SISrcMods::OP_SEL_1;
5127 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5128 Mods ^= SISrcMods::NEG_HI;
5129 else if (HiStat == SrcStatus::IS_HI_NEG)
5130 Mods ^= SISrcMods::NEG_HI;
5131
5132 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5133 Mods ^= SISrcMods::NEG;
5134 Mods |= SISrcMods::OP_SEL_0;
5135 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5136 Mods |= SISrcMods::OP_SEL_0;
5137 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5138 Mods |= SISrcMods::NEG;
5139 else if (LoStat == SrcStatus::IS_HI_NEG)
5140 Mods ^= SISrcMods::NEG;
5141
5142 return Mods;
5143}
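// Worked example (illustrative): HiStat == IS_UPPER_HALF together with
// LoStat == IS_LOWER_HALF describes the standard layout, so only
// SISrcMods::OP_SEL_1 is set. HiStat == IS_LOWER_HALF_NEG instead selects the
// low half into the high lane (OP_SEL_1 stays clear) and toggles
// SISrcMods::NEG_HI.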
5144
5145static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5146 Register RootReg, const SIInstrInfo &TII,
5147 const MachineRegisterInfo &MRI) {
5148 auto IsHalfState = [](SrcStatus S) {
5150 return S >= SrcStatus::HALF_START && S <= SrcStatus::HALF_END;
5151 };
5152 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
5153 IsHalfState(HiStat);
5154}
5155
5156std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5157 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5158 unsigned Mods = 0;
5159 // No modifiers if the Root type is not of the form <2 x Type>.
5160 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5161 Mods |= SISrcMods::OP_SEL_1;
5162 return {RootReg, Mods};
5163 }
5164
5165 SearchOptions SO(RootReg, MRI);
5166
5167 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
5168
5169 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5170 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
5171 else if (Stat.second == SrcStatus::IS_HI_NEG)
5172 Mods ^= SISrcMods::NEG_HI;
5173 else if (Stat.second == SrcStatus::IS_LO_NEG)
5174 Mods ^= SISrcMods::NEG;
5175
5176 MachineInstr *MI = MRI.getVRegDef(Stat.first);
5177
5178 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5179 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5180 Mods |= SISrcMods::OP_SEL_1;
5181 return {Stat.first, Mods};
5182 }
5183
5185 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
5186
5187 if (StatlistHi.empty()) {
5188 Mods |= SISrcMods::OP_SEL_1;
5189 return {Stat.first, Mods};
5190 }
5191
5192 SmallVector<std::pair<Register, SrcStatus>> StatlistLo =
5193 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
5194
5195 if (StatlistLo.empty()) {
5196 Mods |= SISrcMods::OP_SEL_1;
5197 return {Stat.first, Mods};
5198 }
5199
5200 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5201 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5202 if (StatlistHi[I].first == StatlistLo[J].first &&
5203 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
5204 StatlistHi[I].first, RootReg, TII, MRI))
5205 return {StatlistHi[I].first,
5206 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
5207 }
5208 }
5209 // Packed instructions do not have abs modifiers.
5210 Mods |= SISrcMods::OP_SEL_1;
5211
5212 return {Stat.first, Mods};
5213}
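// Illustrative example: if RootReg is defined as
//   %lo:_(s16), %hi:_(s16) = G_UNMERGE_VALUES %w:_(s32)
//   %v:_(<2 x s16>)        = G_BUILD_VECTOR %lo, %hi
// then StatlistHi and StatlistLo both reach %w (as IS_UPPER_HALF and
// IS_LOWER_HALF respectively), isValidToPack succeeds, and the selector
// returns {%w, OP_SEL_1} instead of materializing the G_BUILD_VECTOR.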
5214
5216
5217static bool checkRB(Register Reg, unsigned int RBNo,
5218 const AMDGPURegisterBankInfo &RBI,
5219 const MachineRegisterInfo &MRI,
5220 const TargetRegisterInfo &TRI) {
5221 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5222 return RB->getID() == RBNo;
5223}
5224
5225// Pick the correct register bank for the register returned to the caller.
5226// Assume:
5227// 1. VOP3P is always legal for VGPR.
5228// 2. RootOp's regbank is legal.
5229// Thus
5230// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5231// 2. If RootOp is VGPR, then NewOp must be VGPR.
5232static Register getLegalRegBank(Register NewReg, Register RootReg,
5233 const AMDGPURegisterBankInfo &RBI,
5234 MachineRegisterInfo &MRI,
5235 const TargetRegisterInfo &TRI,
5236 const SIInstrInfo &TII) {
5237 // RootReg can only be VGPR or SGPR (some hand-written cases such as
5238 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
5239 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5240 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5241 return NewReg;
5242
5243 MachineInstr *MI = MRI.getVRegDef(RootReg);
5244 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
5245 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5246 return RootReg;
5247 }
5248
5249 MachineBasicBlock *BB = MI->getParent();
5250 Register DstReg = MRI.cloneVirtualRegister(RootReg);
5251
5253 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
5254 .addReg(NewReg);
5255
5256 // Only accept VGPR.
5257 return MIB->getOperand(0).getReg();
5258}
5259
5260InstructionSelector::ComplexRendererFns
5261AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5262 bool IsDOT) const {
5263 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5264 Register Reg;
5265 unsigned Mods;
5266 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
5267
5268 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
5269 return {{
5270 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5271 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5272 }};
5273}
5274
5275InstructionSelector::ComplexRendererFns
5276AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5277
5278 return selectVOP3PRetHelper(Root);
5279}
5280
5281InstructionSelector::ComplexRendererFns
5282AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5283
5284 return selectVOP3PRetHelper(Root, true);
5285}
5286
5287InstructionSelector::ComplexRendererFns
5288AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const {
5289 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5290 Register Src;
5291 unsigned Mods;
5292 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true /*IsDOT*/);
5293 if (Mods != SISrcMods::OP_SEL_1)
5294 return {};
5295
5296 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5297}
5298
5299InstructionSelector::ComplexRendererFns
5300AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const {
5301 Register Src;
5302 unsigned Mods;
5303 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5304
5305 return {{
5306 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5307 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5308 }};
5309}
5310
5311InstructionSelector::ComplexRendererFns
5312AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const {
5313 Register Src;
5314 unsigned Mods;
5315 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5316 if (Mods != SISrcMods::OP_SEL_1)
5317 return {};
5318
5319 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5320}
5321
5322InstructionSelector::ComplexRendererFns
5323AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5324 MachineOperand &Root) const {
5325 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5326 "expected i1 value");
5327 unsigned Mods = SISrcMods::OP_SEL_1;
5328 if (Root.getImm() != 0)
5329 Mods |= SISrcMods::OP_SEL_0;
5330
5331 return {{
5332 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5333 }};
5334}
5335
5336static Register buildRegSequence(SmallVector<Register, 8> &Elts,
5337 MachineInstr *InsertPt,
5338 MachineRegisterInfo &MRI) {
5339 const TargetRegisterClass *DstRegClass;
5340 switch (Elts.size()) {
5341 case 8:
5342 DstRegClass = &AMDGPU::VReg_256RegClass;
5343 break;
5344 case 4:
5345 DstRegClass = &AMDGPU::VReg_128RegClass;
5346 break;
5347 case 2:
5348 DstRegClass = &AMDGPU::VReg_64RegClass;
5349 break;
5350 default:
5351 llvm_unreachable("unhandled Reg sequence size");
5352 }
5353
5354 MachineIRBuilder B(*InsertPt);
5355 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5356 .addDef(MRI.createVirtualRegister(DstRegClass));
5357 for (unsigned i = 0; i < Elts.size(); ++i) {
5358 MIB.addReg(Elts[i]);
5359 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
5360 }
5361 return MIB->getOperand(0).getReg();
5362}
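// For instance, with four elements this builds (a sketch, assuming the
// per-channel subregister indices sub0..sub3):
//   %rs:vreg_128 = REG_SEQUENCE %e0, %subreg.sub0, %e1, %subreg.sub1,
//                               %e2, %subreg.sub2, %e3, %subreg.sub3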
5363
5364static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5365 SmallVector<Register, 8> &Elts, Register &Src,
5366 MachineInstr *InsertPt,
5367 MachineRegisterInfo &MRI) {
5368 if (ModOpcode == TargetOpcode::G_FNEG) {
5369 Mods |= SISrcMods::NEG;
5370 // Check if all elements also have abs modifier
5371 SmallVector<Register, 8> NegAbsElts;
5372 for (auto El : Elts) {
5373 Register FabsSrc;
5374 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5375 break;
5376 NegAbsElts.push_back(FabsSrc);
5377 }
5378 if (Elts.size() != NegAbsElts.size()) {
5379 // Neg
5380 Src = buildRegSequence(Elts, InsertPt, MRI);
5381 } else {
5382 // Neg and Abs
5383 Mods |= SISrcMods::NEG_HI;
5384 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5385 }
5386 } else {
5387 assert(ModOpcode == TargetOpcode::G_FABS);
5388 // Abs
5389 Mods |= SISrcMods::NEG_HI;
5390 Src = buildRegSequence(Elts, InsertPt, MRI);
5391 }
5392}
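// E.g. if every element is G_FNEG(G_FABS(x)), the element list is rewritten to
// the G_FABS sources and both SISrcMods::NEG and SISrcMods::NEG_HI are set;
// a pure G_FNEG sets only SISrcMods::NEG, and a pure G_FABS sets only
// SISrcMods::NEG_HI.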
5393
5394InstructionSelector::ComplexRendererFns
5395AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5396 Register Src = Root.getReg();
5397 unsigned Mods = SISrcMods::OP_SEL_1;
5398 SmallVector<Register, 8> EltsF32;
5399
5400 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5401 assert(BV->getNumSources() > 0);
5402 // Based on first element decide which mod we match, neg or abs
5403 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5404 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5405 ? AMDGPU::G_FNEG
5406 : AMDGPU::G_FABS;
5407 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5408 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5409 if (ElF32->getOpcode() != ModOpcode)
5410 break;
5411 EltsF32.push_back(ElF32->getOperand(1).getReg());
5412 }
5413
5414 // All elements had ModOpcode modifier
5415 if (BV->getNumSources() == EltsF32.size()) {
5416 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5417 *MRI);
5418 }
5419 }
5420
5421 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5422 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5423}
5424
5425InstructionSelector::ComplexRendererFns
5426AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5427 Register Src = Root.getReg();
5428 unsigned Mods = SISrcMods::OP_SEL_1;
5429 SmallVector<Register, 8> EltsV2F16;
5430
5431 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5432 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5433 Register FNegSrc;
5434 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5435 break;
5436 EltsV2F16.push_back(FNegSrc);
5437 }
5438
5439 // All elements had ModOpcode modifier
5440 if (CV->getNumSources() == EltsV2F16.size()) {
5441 Mods |= SISrcMods::NEG;
5442 Mods |= SISrcMods::NEG_HI;
5443 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5444 }
5445 }
5446
5447 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5448 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5449}
5450
5451InstructionSelector::ComplexRendererFns
5452AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5453 Register Src = Root.getReg();
5454 unsigned Mods = SISrcMods::OP_SEL_1;
5455 SmallVector<Register, 8> EltsV2F16;
5456
5457 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5458 assert(CV->getNumSources() > 0);
5459 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5460 // Based on first element decide which mod we match, neg or abs
5461 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5462 ? AMDGPU::G_FNEG
5463 : AMDGPU::G_FABS;
5464
5465 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5466 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5467 if (ElV2F16->getOpcode() != ModOpcode)
5468 break;
5469 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5470 }
5471
5472 // All elements had ModOpcode modifier
5473 if (CV->getNumSources() == EltsV2F16.size()) {
5474 MachineIRBuilder B(*Root.getParent());
5475 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5476 *MRI);
5477 }
5478 }
5479
5480 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5481 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5482}
5483
5484InstructionSelector::ComplexRendererFns
5485AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5486 std::optional<FPValueAndVReg> FPValReg;
5487 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5488 if (TII.isInlineConstant(FPValReg->Value)) {
5489 return {{[=](MachineInstrBuilder &MIB) {
5490 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5491 }}};
5492 }
5493 // Non-inlineable splat floats should not fall through to the integer
5494 // immediate checks.
5495 return {};
5496 }
5497
5498 APInt ICst;
5499 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5500 if (TII.isInlineConstant(ICst)) {
5501 return {
5502 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5503 }
5504 }
5505
5506 return {};
5507}
5508
5509InstructionSelector::ComplexRendererFns
5510AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5511 Register Src =
5512 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5513 unsigned Key = 0;
5514
5515 Register ShiftSrc;
5516 std::optional<ValueAndVReg> ShiftAmt;
5517 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5518 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5519 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5520 Key = ShiftAmt->Value.getZExtValue() / 8;
5521 Src = ShiftSrc;
5522 }
5523
5524 return {{
5525 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5526 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5527 }};
5528}
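// Example: for %src:_(s32) = G_LSHR %x, 16 the index key becomes 16 / 8 == 2
// and %x becomes the index source; without a byte-aligned shift the key
// defaults to 0.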
5529
5530InstructionSelector::ComplexRendererFns
5531AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5532
5533 Register Src =
5534 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5535 unsigned Key = 0;
5536
5537 Register ShiftSrc;
5538 std::optional<ValueAndVReg> ShiftAmt;
5539 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5540 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5541 ShiftAmt->Value.getZExtValue() == 16) {
5542 Src = ShiftSrc;
5543 Key = 1;
5544 }
5545
5546 return {{
5547 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5548 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5549 }};
5550}
5551
5552InstructionSelector::ComplexRendererFns
5553AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5554 Register Src =
5555 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5556 unsigned Key = 0;
5557
5558 Register S32 = matchZeroExtendFromS32(Src);
5559 if (!S32)
5560 S32 = matchAnyExtendFromS32(Src);
5561
5562 if (S32) {
5563 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5564 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5565 assert(Def->getNumOperands() == 3);
5566 Register DstReg1 = Def->getOperand(1).getReg();
5567 if (mi_match(S32, *MRI,
5568 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5569 Src = Def->getOperand(2).getReg();
5570 Key = 1;
5571 }
5572 }
5573 }
5574
5575 return {{
5576 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5577 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5578 }};
5579}
5580
5581InstructionSelector::ComplexRendererFns
5582AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5583 Register Src;
5584 unsigned Mods;
5585 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5586
5587 // FIXME: Handle op_sel
5588 return {{
5589 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5590 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5591 }};
5592}
5593
5594// FIXME-TRUE16 remove when fake16 is removed
5595InstructionSelector::ComplexRendererFns
5596AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5597 Register Src;
5598 unsigned Mods;
5599 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5600 /*IsCanonicalizing=*/true,
5601 /*AllowAbs=*/false,
5602 /*OpSel=*/false);
5603
5604 return {{
5605 [=](MachineInstrBuilder &MIB) {
5606 MIB.addReg(
5607 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5608 },
5609 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5610 }};
5611}
5612
5613InstructionSelector::ComplexRendererFns
5614AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5615 Register Src;
5616 unsigned Mods;
5617 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5618 /*IsCanonicalizing=*/true,
5619 /*AllowAbs=*/false,
5620 /*OpSel=*/true);
5621
5622 return {{
5623 [=](MachineInstrBuilder &MIB) {
5624 MIB.addReg(
5625 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5626 },
5627 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5628 }};
5629}
5630
5631// Given \p Offset and the load specified by the \p Root operand, check if
5632// \p Offset is a multiple of the load byte size. If it is, update \p Offset
5633// to a pre-scaled value and return true.
5634bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5635 Register &Offset,
5636 bool IsSigned) const {
5637 if (!Subtarget->hasScaleOffset())
5638 return false;
5639
5640 const MachineInstr &MI = *Root.getParent();
5641 MachineMemOperand *MMO = *MI.memoperands_begin();
5642
5643 if (!MMO->getSize().hasValue())
5644 return false;
5645
5646 uint64_t Size = MMO->getSize().getValue();
5647
5648 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5649 if (!OffsetReg)
5650 OffsetReg = Offset;
5651
5652 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5653 OffsetReg = Def->Reg;
5654
5655 Register Op0;
5656 MachineInstr *Mul;
5657 bool ScaleOffset =
5658 (isPowerOf2_64(Size) &&
5659 mi_match(OffsetReg, *MRI,
5660 m_GShl(m_Reg(Op0),
5661 m_any_of(m_SpecificICst(Log2_64(Size)),
5662 m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
5663 mi_match(OffsetReg, *MRI,
5664 m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
5665 m_Copy(m_SpecificICst(Size))))) ||
5666 mi_match(
5667 OffsetReg, *MRI,
5668 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5669 m_Reg(Op0), m_SpecificICst(Size))) ||
5670 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5671 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5672 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5673 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5674 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5675 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5676 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5677 mi_match(Mul->getOperand(3).getReg(), *MRI,
5679 m_Copy(m_SpecificICst(Size))))) &&
5680 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5681
5682 if (ScaleOffset)
5683 Offset = Op0;
5684
5685 return ScaleOffset;
5686}
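// Example: for a load of a 4-byte element with
//   %off:_(s32) = G_SHL %i, 2   (i.e. %i * 4)
// the offset operand is rewritten to %i and the caller sets
// AMDGPU::CPol::SCAL so the hardware scales the index by the element size.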
5687
5688bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5689 Register &Base,
5690 Register *SOffset,
5691 int64_t *Offset,
5692 bool *ScaleOffset) const {
5693 MachineInstr *MI = Root.getParent();
5694 MachineBasicBlock *MBB = MI->getParent();
5695
5696 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5697 // then we can select all ptr + 32-bit offsets.
5698 SmallVector<GEPInfo, 4> AddrInfo;
5699 getAddrModeInfo(*MI, *MRI, AddrInfo);
5700
5701 if (AddrInfo.empty())
5702 return false;
5703
5704 const GEPInfo &GEPI = AddrInfo[0];
5705 std::optional<int64_t> EncodedImm;
5706
5707 if (ScaleOffset)
5708 *ScaleOffset = false;
5709
5710 if (SOffset && Offset) {
5711 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5712 /*HasSOffset=*/true);
5713 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5714 AddrInfo.size() > 1) {
5715 const GEPInfo &GEPI2 = AddrInfo[1];
5716 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5717 Register OffsetReg = GEPI2.SgprParts[1];
5718 if (ScaleOffset)
5719 *ScaleOffset =
5720 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5721 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5722 if (OffsetReg) {
5723 Base = GEPI2.SgprParts[0];
5724 *SOffset = OffsetReg;
5725 *Offset = *EncodedImm;
5726 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5727 return true;
5728
5729 // For unbuffered smem loads, it is illegal for the Immediate Offset
5730 // to be negative if the resulting (Offset + (M0 or SOffset or zero)
5731 // is negative. Handle the case where the Immediate Offset + SOffset
5732 // is negative.
5733 auto SKnown = VT->getKnownBits(*SOffset);
5734 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5735 return false;
5736
5737 return true;
5738 }
5739 }
5740 }
5741 return false;
5742 }
5743
5744 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5745 /*HasSOffset=*/false);
5746 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5747 Base = GEPI.SgprParts[0];
5748 *Offset = *EncodedImm;
5749 return true;
5750 }
5751
5752 // SGPR offset is unsigned.
5753 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5754 GEPI.Imm != 0) {
5755 // If we make it this far we have a load with a 32-bit immediate offset.
5756 // It is OK to select this using a sgpr offset, because we have already
5757 // failed trying to select this load into one of the _IMM variants since
5758 // the _IMM Patterns are considered before the _SGPR patterns.
5759 Base = GEPI.SgprParts[0];
5760 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5761 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5762 .addImm(GEPI.Imm);
5763 return true;
5764 }
5765
5766 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5767 Register OffsetReg = GEPI.SgprParts[1];
5768 if (ScaleOffset)
5769 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5770 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5771 if (OffsetReg) {
5772 Base = GEPI.SgprParts[0];
5773 *SOffset = OffsetReg;
5774 return true;
5775 }
5776 }
5777
5778 return false;
5779}
5780
5781InstructionSelector::ComplexRendererFns
5782AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5783 Register Base;
5784 int64_t Offset;
5785 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5786 /* ScaleOffset */ nullptr))
5787 return std::nullopt;
5788
5789 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5790 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5791}
5792
5793InstructionSelector::ComplexRendererFns
5794AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5795 SmallVector<GEPInfo, 4> AddrInfo;
5796 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5797
5798 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5799 return std::nullopt;
5800
5801 const GEPInfo &GEPInfo = AddrInfo[0];
5802 Register PtrReg = GEPInfo.SgprParts[0];
5803 std::optional<int64_t> EncodedImm =
5804 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5805 if (!EncodedImm)
5806 return std::nullopt;
5807
5808 return {{
5809 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5810 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5811 }};
5812}
5813
5814InstructionSelector::ComplexRendererFns
5815AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5816 Register Base, SOffset;
5817 bool ScaleOffset;
5818 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5819 &ScaleOffset))
5820 return std::nullopt;
5821
5822 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5823 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5824 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5825 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5826}
5827
5828InstructionSelector::ComplexRendererFns
5829AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5830 Register Base, SOffset;
5831 int64_t Offset;
5832 bool ScaleOffset;
5833 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5834 return std::nullopt;
5835
5836 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5837 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5838 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5839 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5840 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5841}
5842
5843std::pair<Register, int>
5844AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5845 uint64_t FlatVariant) const {
5846 MachineInstr *MI = Root.getParent();
5847
5848 auto Default = std::pair(Root.getReg(), 0);
5849
5850 if (!STI.hasFlatInstOffsets())
5851 return Default;
5852
5853 Register PtrBase;
5854 int64_t ConstOffset;
5855 bool IsInBounds;
5856 std::tie(PtrBase, ConstOffset, IsInBounds) =
5857 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5858
5859 // Adding the offset to the base address with an immediate in a FLAT
5860 // instruction must not change the memory aperture in which the address falls.
5861 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5862 // instructions.
5863 if (ConstOffset == 0 ||
5864 (FlatVariant == SIInstrFlags::FlatScratch &&
5865 !isFlatScratchBaseLegal(Root.getReg())) ||
5866 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5867 return Default;
5868
5869 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5870 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5871 return Default;
5872
5873 return std::pair(PtrBase, ConstOffset);
5874}
5875
5876InstructionSelector::ComplexRendererFns
5877AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5878 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5879
5880 return {{
5881 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5882 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5883 }};
5884}
5885
5886InstructionSelector::ComplexRendererFns
5887AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5888 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5889
5890 return {{
5891 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5892 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5893 }};
5894}
5895
5896InstructionSelector::ComplexRendererFns
5897AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5898 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5899
5900 return {{
5901 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5902 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5903 }};
5904}
5905
5906// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5907InstructionSelector::ComplexRendererFns
5908AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5909 unsigned CPolBits,
5910 bool NeedIOffset) const {
5911 Register Addr = Root.getReg();
5912 Register PtrBase;
5913 int64_t ConstOffset;
5914 int64_t ImmOffset = 0;
5915
5916 // Match the immediate offset first, which canonically is moved as low as
5917 // possible.
5918 std::tie(PtrBase, ConstOffset, std::ignore) =
5919 getPtrBaseWithConstantOffset(Addr, *MRI);
5920
5921 if (ConstOffset != 0) {
5922 if (NeedIOffset &&
5923 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5924 SIInstrFlags::FlatGlobal)) {
5925 Addr = PtrBase;
5926 ImmOffset = ConstOffset;
5927 } else {
5928 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5929 if (isSGPR(PtrBaseDef->Reg)) {
5930 if (ConstOffset > 0) {
5931 // Offset is too large.
5932 //
5933 // saddr + large_offset -> saddr +
5934 // (voffset = large_offset & ~MaxOffset) +
5935 // (large_offset & MaxOffset);
5936 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5937 if (NeedIOffset) {
5938 std::tie(SplitImmOffset, RemainderOffset) =
5939 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5940 SIInstrFlags::FlatGlobal);
5941 }
5942
5943 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5944 : isUInt<32>(RemainderOffset)) {
5945 MachineInstr *MI = Root.getParent();
5946 MachineBasicBlock *MBB = MI->getParent();
5947 Register HighBits =
5948 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5949
5950 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5951 HighBits)
5952 .addImm(RemainderOffset);
5953
5954 if (NeedIOffset)
5955 return {{
5956 [=](MachineInstrBuilder &MIB) {
5957 MIB.addReg(PtrBase);
5958 }, // saddr
5959 [=](MachineInstrBuilder &MIB) {
5960 MIB.addReg(HighBits);
5961 }, // voffset
5962 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5963 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5964 }};
5965 return {{
5966 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5967 [=](MachineInstrBuilder &MIB) {
5968 MIB.addReg(HighBits);
5969 }, // voffset
5970 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5971 }};
5972 }
5973 }
5974
5975 // We are adding a 64 bit SGPR and a constant. If constant bus limit
5976 // is 1 we would need to perform 1 or 2 extra moves for each half of
5977 // the constant and it is better to do a scalar add and then issue a
5978 // single VALU instruction to materialize zero. Otherwise it is less
5979 // instructions to perform VALU adds with immediates or inline literals.
5980 unsigned NumLiterals =
5981 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5982 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5983 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5984 return std::nullopt;
5985 }
5986 }
5987 }
5988
5989 // Match the variable offset.
5990 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5991 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5992 // Look through the SGPR->VGPR copy.
5993 Register SAddr =
5994 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
5995
5996 if (isSGPR(SAddr)) {
5997 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
5998
5999 // It's possible voffset is an SGPR here, but the copy to VGPR will be
6000 // inserted later.
6001 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
6002 Subtarget->hasSignedGVSOffset());
6003 if (Register VOffset = matchExtendFromS32OrS32(
6004 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
6005 if (NeedIOffset)
6006 return {{[=](MachineInstrBuilder &MIB) { // saddr
6007 MIB.addReg(SAddr);
6008 },
6009 [=](MachineInstrBuilder &MIB) { // voffset
6010 MIB.addReg(VOffset);
6011 },
6012 [=](MachineInstrBuilder &MIB) { // offset
6013 MIB.addImm(ImmOffset);
6014 },
6015 [=](MachineInstrBuilder &MIB) { // cpol
6016 MIB.addImm(CPolBits |
6017 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6018 }}};
6019 return {{[=](MachineInstrBuilder &MIB) { // saddr
6020 MIB.addReg(SAddr);
6021 },
6022 [=](MachineInstrBuilder &MIB) { // voffset
6023 MIB.addReg(VOffset);
6024 },
6025 [=](MachineInstrBuilder &MIB) { // cpol
6026 MIB.addImm(CPolBits |
6027 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6028 }}};
6029 }
6030 }
6031 }
6032
6033 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
6034 // drop this.
6035 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
6036 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
6037 return std::nullopt;
6038
6039 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
6040 // moves required to copy a 64-bit SGPR to VGPR.
6041 MachineInstr *MI = Root.getParent();
6042 MachineBasicBlock *MBB = MI->getParent();
6043 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6044
6045 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
6046 .addImm(0);
6047
6048 if (NeedIOffset)
6049 return {{
6050 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6051 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6052 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6053 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6054 }};
6055 return {{
6056 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6057 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6058 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6059 }};
6060}
6061
6062InstructionSelector::ComplexRendererFns
6063AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
6064 return selectGlobalSAddr(Root, 0);
6065}
6066
6067InstructionSelector::ComplexRendererFns
6068AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
6069 const MachineInstr &I = *Root.getParent();
6070
6071 // We are assuming CPol is always the last operand of the intrinsic.
6072 auto PassedCPol =
6073 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6074 return selectGlobalSAddr(Root, PassedCPol);
6075}
6076
6077InstructionSelector::ComplexRendererFns
6078AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
6079 const MachineInstr &I = *Root.getParent();
6080
6081 // We are assuming CPol is the second-to-last operand of the intrinsic.
6082 auto PassedCPol =
6083 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6084 return selectGlobalSAddr(Root, PassedCPol);
6085}
6086
6087InstructionSelector::ComplexRendererFns
6088AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
6089 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
6090}
6091
6092InstructionSelector::ComplexRendererFns
6093AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
6094 MachineOperand &Root) const {
6095 const MachineInstr &I = *Root.getParent();
6096
6097 // We are assuming CPol is always the last operand of the intrinsic.
6098 auto PassedCPol =
6099 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6100 return selectGlobalSAddr(Root, PassedCPol, false);
6101}
6102
6103InstructionSelector::ComplexRendererFns
6104AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
6105 MachineOperand &Root) const {
6106 const MachineInstr &I = *Root.getParent();
6107
6108 // We are assuming CPol is the second-to-last operand of the intrinsic.
6109 auto PassedCPol =
6110 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6111 return selectGlobalSAddr(Root, PassedCPol, false);
6112}
6113
6114InstructionSelector::ComplexRendererFns
6115AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
6116 Register Addr = Root.getReg();
6117 Register PtrBase;
6118 int64_t ConstOffset;
6119 int64_t ImmOffset = 0;
6120
6121 // Match the immediate offset first, which canonically is moved as low as
6122 // possible.
6123 std::tie(PtrBase, ConstOffset, std::ignore) =
6124 getPtrBaseWithConstantOffset(Addr, *MRI);
6125
6126 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
6127 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6128 SIInstrFlags::FlatScratch)) {
6129 Addr = PtrBase;
6130 ImmOffset = ConstOffset;
6131 }
6132
6133 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6134 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6135 int FI = AddrDef->MI->getOperand(1).getIndex();
6136 return {{
6137 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6138 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6139 }};
6140 }
6141
6142 Register SAddr = AddrDef->Reg;
6143
6144 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6145 Register LHS = AddrDef->MI->getOperand(1).getReg();
6146 Register RHS = AddrDef->MI->getOperand(2).getReg();
6147 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6148 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
6149
6150 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
6151 isSGPR(RHSDef->Reg)) {
6152 int FI = LHSDef->MI->getOperand(1).getIndex();
6153 MachineInstr &I = *Root.getParent();
6154 MachineBasicBlock *BB = I.getParent();
6155 const DebugLoc &DL = I.getDebugLoc();
6156 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6157
6158 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
6159 .addFrameIndex(FI)
6160 .addReg(RHSDef->Reg)
6161 .setOperandDead(3); // Dead scc
6162 }
6163 }
6164
6165 if (!isSGPR(SAddr))
6166 return std::nullopt;
6167
6168 return {{
6169 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
6170 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6171 }};
6172}
6173
6174// Check whether the flat scratch SVS swizzle bug affects this access.
6175bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6176 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6177 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6178 return false;
6179
6180 // The bug affects the swizzling of SVS accesses if there is any carry out
6181 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6182 // voffset to (soffset + inst_offset).
6183 auto VKnown = VT->getKnownBits(VAddr);
6184 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
6185 KnownBits::makeConstant(APInt(32, ImmOffset)));
6186 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6187 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
6188 return (VMax & 3) + (SMax & 3) >= 4;
6189}
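// Worked example: if the known bits allow VAddr to end in 0b11 (VMax & 3 == 3)
// and soffset + inst_offset to end in 0b01 (SMax & 3 == 1), then 3 + 1 == 4
// can carry out of bit 1 into bit 2, so the access is conservatively rejected.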
6190
6191InstructionSelector::ComplexRendererFns
6192AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
6193 Register Addr = Root.getReg();
6194 Register PtrBase;
6195 int64_t ConstOffset;
6196 int64_t ImmOffset = 0;
6197
6198 // Match the immediate offset first, which canonically is moved as low as
6199 // possible.
6200 std::tie(PtrBase, ConstOffset, std::ignore) =
6201 getPtrBaseWithConstantOffset(Addr, *MRI);
6202
6203 Register OrigAddr = Addr;
6204 if (ConstOffset != 0 &&
6205 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6206 SIInstrFlags::FlatScratch)) {
6207 Addr = PtrBase;
6208 ImmOffset = ConstOffset;
6209 }
6210
6211 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6212 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6213 return std::nullopt;
6214
6215 Register RHS = AddrDef->MI->getOperand(2).getReg();
6216 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6217 return std::nullopt;
6218
6219 Register LHS = AddrDef->MI->getOperand(1).getReg();
6220 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6221
6222 if (OrigAddr != Addr) {
6223 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
6224 return std::nullopt;
6225 } else {
6226 if (!isFlatScratchBaseLegalSV(OrigAddr))
6227 return std::nullopt;
6228 }
6229
6230 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
6231 return std::nullopt;
6232
6233 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
6234 ? AMDGPU::CPol::SCAL
6235 : 0;
6236
6237 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6238 int FI = LHSDef->MI->getOperand(1).getIndex();
6239 return {{
6240 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6241 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6242 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6243 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6244 }};
6245 }
6246
6247 if (!isSGPR(LHS))
6248 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
6249 LHS = Def->Reg;
6250
6251 if (!isSGPR(LHS))
6252 return std::nullopt;
6253
6254 return {{
6255 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6256 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
6257 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6258 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6259 }};
6260}
6261
6262InstructionSelector::ComplexRendererFns
6263AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
6264 MachineInstr *MI = Root.getParent();
6265 MachineBasicBlock *MBB = MI->getParent();
6266 MachineFunction *MF = MBB->getParent();
6267 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6268
6269 int64_t Offset = 0;
6270 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
6271 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
6272 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6273
6274 // TODO: Should this be inside the render function? The iterator seems to
6275 // move.
6276 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
6277 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6278 HighBits)
6279 .addImm(Offset & ~MaxOffset);
6280
6281 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6282 MIB.addReg(Info->getScratchRSrcReg());
6283 },
6284 [=](MachineInstrBuilder &MIB) { // vaddr
6285 MIB.addReg(HighBits);
6286 },
6287 [=](MachineInstrBuilder &MIB) { // soffset
6288 // Use constant zero for soffset and rely on eliminateFrameIndex
6289 // to choose the appropriate frame register if need be.
6290 MIB.addImm(0);
6291 },
6292 [=](MachineInstrBuilder &MIB) { // offset
6293 MIB.addImm(Offset & MaxOffset);
6294 }}};
6295 }
6296
6297 assert(Offset == 0 || Offset == -1);
6298
6299 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6300 // offsets.
6301 std::optional<int> FI;
6302 Register VAddr = Root.getReg();
6303
6304 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6305 Register PtrBase;
6306 int64_t ConstOffset;
6307 std::tie(PtrBase, ConstOffset, std::ignore) =
6308 getPtrBaseWithConstantOffset(VAddr, *MRI);
6309 if (ConstOffset != 0) {
6310 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6311 (!STI.privateMemoryResourceIsRangeChecked() ||
6312 VT->signBitIsZero(PtrBase))) {
6313 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6314 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6315 FI = PtrBaseDef->getOperand(1).getIndex();
6316 else
6317 VAddr = PtrBase;
6318 Offset = ConstOffset;
6319 }
6320 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6321 FI = RootDef->getOperand(1).getIndex();
6322 }
6323
6324 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6325 MIB.addReg(Info->getScratchRSrcReg());
6326 },
6327 [=](MachineInstrBuilder &MIB) { // vaddr
6328 if (FI)
6329 MIB.addFrameIndex(*FI);
6330 else
6331 MIB.addReg(VAddr);
6332 },
6333 [=](MachineInstrBuilder &MIB) { // soffset
6334 // Use constant zero for soffset and rely on eliminateFrameIndex
6335 // to choose the appropriate frame register if need be.
6336 MIB.addImm(0);
6337 },
6338 [=](MachineInstrBuilder &MIB) { // offset
6339 MIB.addImm(Offset);
6340 }}};
6341}
6342
6343bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6344 int64_t Offset) const {
6345 if (!isUInt<16>(Offset))
6346 return false;
6347
6348 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6349 return true;
6350
6351 // On Southern Islands, instructions with a negative base value and an
6352 // offset don't seem to work.
6353 return VT->signBitIsZero(Base);
6354}
6355
6356bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6357 int64_t Offset1,
6358 unsigned Size) const {
6359 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6360 return false;
6361 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6362 return false;
6363
6364 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6365 return true;
6366
6367 // On Southern Islands, instructions with a negative base value and an
6368 // offset don't seem to work.
6369 return VT->signBitIsZero(Base);
6370}
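// E.g. for a ds_read2-style access with Size == 4, both offsets must be
// 4-byte aligned and the scaled values Offset0/4 and Offset1/4 must fit in
// the instruction's 8-bit offset fields (0..255).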
6371
6372// Return whether the operation has NoUnsignedWrap property.
6373static bool isNoUnsignedWrap(MachineInstr *Addr) {
6374 return Addr->getOpcode() == TargetOpcode::G_OR ||
6375 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6376 Addr->getFlag(MachineInstr::NoUWrap));
6377}
6378
6379// Check that the base address of a flat scratch load/store in the form of
6380// `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per the
6381// hardware requirement). We always treat the first operand as the base address.
6382bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6383 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6384
6385 if (isNoUnsignedWrap(AddrMI))
6386 return true;
6387
6388 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6389 // values.
6390 if (STI.hasSignedScratchOffsets())
6391 return true;
6392
6393 Register LHS = AddrMI->getOperand(1).getReg();
6394 Register RHS = AddrMI->getOperand(2).getReg();
6395
6396 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6397 std::optional<ValueAndVReg> RhsValReg =
6398 getIConstantVRegValWithLookThrough(RHS, *MRI);
6399 // If the immediate offset is negative and within certain range, the base
6400 // address cannot also be negative. If the base is also negative, the sum
6401 // would be either negative or much larger than the valid range of scratch
6402 // memory a thread can access.
6403 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6404 RhsValReg->Value.getSExtValue() > -0x40000000)
6405 return true;
6406 }
6407
6408 return VT->signBitIsZero(LHS);
6409}
6410
6411// Check that the address values in SGPR/VGPR are legal for flat scratch in
6412// the form: SGPR + VGPR.
6413bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6414 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6415
6416 if (isNoUnsignedWrap(AddrMI))
6417 return true;
6418
6419 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6420 // values.
6421 if (STI.hasSignedScratchOffsets())
6422 return true;
6423
6424 Register LHS = AddrMI->getOperand(1).getReg();
6425 Register RHS = AddrMI->getOperand(2).getReg();
6426 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6427}
6428
6429// Check that the address values in SGPR/VGPR are legal for flat scratch in
6430// the form: SGPR + VGPR + Imm.
6431bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6432 Register Addr) const {
6433 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6434 // values.
6435 if (STI.hasSignedScratchOffsets())
6436 return true;
6437
6438 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6439 Register Base = AddrMI->getOperand(1).getReg();
6440 std::optional<DefinitionAndSourceRegister> BaseDef =
6441 getDefSrcRegIgnoringCopies(Base, *MRI);
6442 std::optional<ValueAndVReg> RHSOffset =
6443 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
6444 assert(RHSOffset);
6445
6446 // If the immediate offset is negative and within certain range, the base
6447 // address cannot also be negative. If the base is also negative, the sum
6448 // would be either negative or much larger than the valid range of scratch
6449 // memory a thread can access.
6450 if (isNoUnsignedWrap(BaseDef->MI) &&
6451 (isNoUnsignedWrap(AddrMI) ||
6452 (RHSOffset->Value.getSExtValue() < 0 &&
6453 RHSOffset->Value.getSExtValue() > -0x40000000)))
6454 return true;
6455
6456 Register LHS = BaseDef->MI->getOperand(1).getReg();
6457 Register RHS = BaseDef->MI->getOperand(2).getReg();
6458 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6459}
6460
6461bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6462 unsigned ShAmtBits) const {
6463 assert(MI.getOpcode() == TargetOpcode::G_AND);
6464
6465 std::optional<APInt> RHS =
6466 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6467 if (!RHS)
6468 return false;
6469
6470 if (RHS->countr_one() >= ShAmtBits)
6471 return true;
6472
6473 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6474 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6475}
6476
6477InstructionSelector::ComplexRendererFns
6478AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6479 MachineOperand &Root) const {
6480 Register Reg = Root.getReg();
6481 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6482
6483 std::optional<DefinitionAndSourceRegister> Def =
6484 getDefSrcRegIgnoringCopies(Reg, *MRI);
6485 assert(Def && "this shouldn't be an optional result");
6486 Reg = Def->Reg;
6487
6488 if (Register WaveBase = getWaveAddress(Def->MI)) {
6489 return {{
6490 [=](MachineInstrBuilder &MIB) { // rsrc
6491 MIB.addReg(Info->getScratchRSrcReg());
6492 },
6493 [=](MachineInstrBuilder &MIB) { // soffset
6494 MIB.addReg(WaveBase);
6495 },
6496 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6497 }};
6498 }
6499
6500 int64_t Offset = 0;
6501
6502 // FIXME: Copy check is a hack
6503 Register BasePtr;
6504 if (mi_match(Reg, *MRI,
6505 m_GPtrAdd(m_Reg(BasePtr),
6506 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
6507 if (!TII.isLegalMUBUFImmOffset(Offset))
6508 return {};
6509 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6510 Register WaveBase = getWaveAddress(BasePtrDef);
6511 if (!WaveBase)
6512 return {};
6513
6514 return {{
6515 [=](MachineInstrBuilder &MIB) { // rsrc
6516 MIB.addReg(Info->getScratchRSrcReg());
6517 },
6518 [=](MachineInstrBuilder &MIB) { // soffset
6519 MIB.addReg(WaveBase);
6520 },
6521 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6522 }};
6523 }
6524
6525 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6526 !TII.isLegalMUBUFImmOffset(Offset))
6527 return {};
6528
6529 return {{
6530 [=](MachineInstrBuilder &MIB) { // rsrc
6531 MIB.addReg(Info->getScratchRSrcReg());
6532 },
6533 [=](MachineInstrBuilder &MIB) { // soffset
6534 MIB.addImm(0);
6535 },
6536 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6537 }};
6538}
6539
6540std::pair<Register, unsigned>
6541AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6542 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6543 int64_t ConstAddr = 0;
6544
6545 Register PtrBase;
6546 int64_t Offset;
6547 std::tie(PtrBase, Offset, std::ignore) =
6548 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6549
6550 if (Offset) {
6551 if (isDSOffsetLegal(PtrBase, Offset)) {
6552 // (add n0, c0)
6553 return std::pair(PtrBase, Offset);
6554 }
6555 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6556 // TODO
6557
6558
6559 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6560 // TODO
6561
6562 }
6563
6564 return std::pair(Root.getReg(), 0);
6565}
6566
6567InstructionSelector::ComplexRendererFns
6568AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6569 Register Reg;
6570 unsigned Offset;
6571 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6572 return {{
6573 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6574 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6575 }};
6576}
6577
6578InstructionSelector::ComplexRendererFns
6579AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6580 return selectDSReadWrite2(Root, 4);
6581}
6582
6583InstructionSelector::ComplexRendererFns
6584AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6585 return selectDSReadWrite2(Root, 8);
6586}
6587
6588InstructionSelector::ComplexRendererFns
6589AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6590 unsigned Size) const {
6591 Register Reg;
6592 unsigned Offset;
6593 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6594 return {{
6595 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6596 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6597 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6598 }};
6599}
6600
6601std::pair<Register, unsigned>
6602AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6603 unsigned Size) const {
6604 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6605 int64_t ConstAddr = 0;
6606
6607 Register PtrBase;
6608 int64_t Offset;
6609 std::tie(PtrBase, Offset, std::ignore) =
6610 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6611
6612 if (Offset) {
6613 int64_t OffsetValue0 = Offset;
6614 int64_t OffsetValue1 = Offset + Size;
6615 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6616 // (add n0, c0)
6617 return std::pair(PtrBase, OffsetValue0 / Size);
6618 }
6619 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6620 // TODO
6621
6622 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6623 // TODO
6624
6625 }
6626
6627 return std::pair(Root.getReg(), 0);
6628}
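// [Editorial worked example, not part of the upstream source.] With Size = 4
// (the selectDS64Bit4ByteAligned case), a pointer matched as (add %base, 40)
// gives OffsetValue0 = 40 and OffsetValue1 = 44; if isDSOffset2Legal accepts
// both, the Impl returns {%base, 40 / 4 = 10}, and selectDSReadWrite2 then
// renders offset0 = 10 and offset1 = 11, i.e. the offsets are in units of
// Size, as the two-address DS read2/write2 forms expect.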
6629
6630/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6631/// the base value with the constant offset, and if the offset computation is
6632/// known to be inbounds. There may be intervening copies between \p Root and
6633/// the identified constant. Returns \p Root, 0, false if this does not match
6634/// the pattern.
6635std::tuple<Register, int64_t, bool>
6636AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6637 Register Root, const MachineRegisterInfo &MRI) const {
6638 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6639 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6640 return {Root, 0, false};
6641
6642 MachineOperand &RHS = RootI->getOperand(2);
6643  std::optional<ValueAndVReg> MaybeOffset =
6644      getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
6645  if (!MaybeOffset)
6646 return {Root, 0, false};
6647 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6648 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6649 IsInBounds};
6650}
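// [Editorial worked example, not part of the upstream source.] For generic
// MIR such as:
//   %c:_(s64) = G_CONSTANT i64 16
//   %p:_(p1) = inbounds G_PTR_ADD %base, %c
// getPtrBaseWithConstantOffset(%p, MRI) returns {%base, 16, true}. Any root
// that is not a G_PTR_ADD with a constant RHS returns {Root, 0, false}.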
6651
6652static void addZeroImm(MachineInstrBuilder &MIB) {
6653  MIB.addImm(0);
6654}
6655
6656/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6657/// BasePtr is not valid, a null base pointer will be used.
6658static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6659                          uint32_t FormatLo, uint32_t FormatHi,
6660 Register BasePtr) {
6661 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6662 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6663 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6664 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6665
6666 B.buildInstr(AMDGPU::S_MOV_B32)
6667 .addDef(RSrc2)
6668 .addImm(FormatLo);
6669 B.buildInstr(AMDGPU::S_MOV_B32)
6670 .addDef(RSrc3)
6671 .addImm(FormatHi);
6672
6673 // Build the half of the subregister with the constants before building the
6674 // full 128-bit register. If we are building multiple resource descriptors,
6675 // this will allow CSEing of the 2-component register.
6676 B.buildInstr(AMDGPU::REG_SEQUENCE)
6677 .addDef(RSrcHi)
6678 .addReg(RSrc2)
6679 .addImm(AMDGPU::sub0)
6680 .addReg(RSrc3)
6681 .addImm(AMDGPU::sub1);
6682
6683 Register RSrcLo = BasePtr;
6684 if (!BasePtr) {
6685 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6686 B.buildInstr(AMDGPU::S_MOV_B64)
6687 .addDef(RSrcLo)
6688 .addImm(0);
6689 }
6690
6691 B.buildInstr(AMDGPU::REG_SEQUENCE)
6692 .addDef(RSrc)
6693 .addReg(RSrcLo)
6694 .addImm(AMDGPU::sub0_sub1)
6695 .addReg(RSrcHi)
6696 .addImm(AMDGPU::sub2_sub3);
6697
6698 return RSrc;
6699}
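// [Editorial note, not part of the upstream source.] The 128-bit descriptor
// built above is laid out as:
//   sub0_sub1 = 64-bit base pointer (S_MOV_B64 0 when BasePtr is null)
//   sub2      = FormatLo
//   sub3      = FormatHi
// The two callers below differ only in FormatLo (0 for the addr64 form, -1
// for the offset form); both take FormatHi from the default resource format.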
6700
6701static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6702                                const SIInstrInfo &TII, Register BasePtr) {
6703 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6704
6705 // FIXME: Why are half the "default" bits ignored based on the addressing
6706 // mode?
6707 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6708}
6709
6710static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6711                               const SIInstrInfo &TII, Register BasePtr) {
6712 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6713
6714 // FIXME: Why are half the "default" bits ignored based on the addressing
6715 // mode?
6716 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6717}
6718
6719AMDGPUInstructionSelector::MUBUFAddressData
6720AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6721 MUBUFAddressData Data;
6722 Data.N0 = Src;
6723
6724 Register PtrBase;
6725 int64_t Offset;
6726
6727 std::tie(PtrBase, Offset, std::ignore) =
6728 getPtrBaseWithConstantOffset(Src, *MRI);
6729 if (isUInt<32>(Offset)) {
6730 Data.N0 = PtrBase;
6731 Data.Offset = Offset;
6732 }
6733
6734 if (MachineInstr *InputAdd
6735 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6736 Data.N2 = InputAdd->getOperand(1).getReg();
6737 Data.N3 = InputAdd->getOperand(2).getReg();
6738
6739    // FIXME: Need to fix extra SGPR->VGPR copies inserted
6740    // FIXME: Don't know if this was defined by operand 0
6741 //
6742 // TODO: Remove this when we have copy folding optimizations after
6743 // RegBankSelect.
6744 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6745 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6746 }
6747
6748 return Data;
6749}
6750
6751/// Return whether the addr64 mubuf mode should be used for the given address.
6752bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6753 // (ptr_add N2, N3) -> addr64, or
6754 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6755 if (Addr.N2)
6756 return true;
6757
6758 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6759 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6760}
6761
6762/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6763/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6764/// component.
6765void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6766 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6767 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6768 return;
6769
6770 // Illegal offset, store it in soffset.
6771 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6772 B.buildInstr(AMDGPU::S_MOV_B32)
6773 .addDef(SOffset)
6774 .addImm(ImmOffset);
6775 ImmOffset = 0;
6776}
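// [Editorial worked example, not part of the upstream source.] On subtargets
// whose MUBUF immediate field tops out at 4095 (pre-GFX12; see
// SIInstrInfo::getMaxMUBUFImmOffset), an ImmOffset of 4100 fails
// TII.isLegalMUBUFImmOffset, so the helper emits
//   SOffset = S_MOV_B32 4100
// and resets ImmOffset to 0, moving the whole offset into soffset.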
6777
6778bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6779 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6780 Register &SOffset, int64_t &Offset) const {
6781 // FIXME: Predicates should stop this from reaching here.
6782 // addr64 bit was removed for volcanic islands.
6783 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6784 return false;
6785
6786 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6787 if (!shouldUseAddr64(AddrData))
6788 return false;
6789
6790 Register N0 = AddrData.N0;
6791 Register N2 = AddrData.N2;
6792 Register N3 = AddrData.N3;
6793 Offset = AddrData.Offset;
6794
6795 // Base pointer for the SRD.
6796 Register SRDPtr;
6797
6798 if (N2) {
6799 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6800 assert(N3);
6801 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6802 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6803 // addr64, and construct the default resource from a 0 address.
6804 VAddr = N0;
6805 } else {
6806 SRDPtr = N3;
6807 VAddr = N2;
6808 }
6809 } else {
6810 // N2 is not divergent.
6811 SRDPtr = N2;
6812 VAddr = N3;
6813 }
6814 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6815 // Use the default null pointer in the resource
6816 VAddr = N0;
6817 } else {
6818 // N0 -> offset, or
6819 // (N0 + C1) -> offset
6820 SRDPtr = N0;
6821 }
6822
6823 MachineIRBuilder B(*Root.getParent());
6824 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6825 splitIllegalMUBUFOffset(B, SOffset, Offset);
6826 return true;
6827}
6828
6829bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6830 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6831 int64_t &Offset) const {
6832
6833 // FIXME: Pattern should not reach here.
6834 if (STI.useFlatForGlobal())
6835 return false;
6836
6837 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6838 if (shouldUseAddr64(AddrData))
6839 return false;
6840
6841 // N0 -> offset, or
6842 // (N0 + C1) -> offset
6843 Register SRDPtr = AddrData.N0;
6844 Offset = AddrData.Offset;
6845
6846 // TODO: Look through extensions for 32-bit soffset.
6847 MachineIRBuilder B(*Root.getParent());
6848
6849 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6850 splitIllegalMUBUFOffset(B, SOffset, Offset);
6851 return true;
6852}
6853
6854InstructionSelector::ComplexRendererFns
6855AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6856 Register VAddr;
6857 Register RSrcReg;
6858 Register SOffset;
6859 int64_t Offset = 0;
6860
6861 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6862 return {};
6863
6864 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6865 // pattern.
6866 return {{
6867 [=](MachineInstrBuilder &MIB) { // rsrc
6868 MIB.addReg(RSrcReg);
6869 },
6870 [=](MachineInstrBuilder &MIB) { // vaddr
6871 MIB.addReg(VAddr);
6872 },
6873 [=](MachineInstrBuilder &MIB) { // soffset
6874 if (SOffset)
6875 MIB.addReg(SOffset);
6876 else if (STI.hasRestrictedSOffset())
6877 MIB.addReg(AMDGPU::SGPR_NULL);
6878 else
6879 MIB.addImm(0);
6880 },
6881 [=](MachineInstrBuilder &MIB) { // offset
6882 MIB.addImm(Offset);
6883 },
6884 addZeroImm, // cpol
6885 addZeroImm, // tfe
6886 addZeroImm // swz
6887 }};
6888}
6889
6890InstructionSelector::ComplexRendererFns
6891AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6892 Register RSrcReg;
6893 Register SOffset;
6894 int64_t Offset = 0;
6895
6896 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6897 return {};
6898
6899 return {{
6900 [=](MachineInstrBuilder &MIB) { // rsrc
6901 MIB.addReg(RSrcReg);
6902 },
6903 [=](MachineInstrBuilder &MIB) { // soffset
6904 if (SOffset)
6905 MIB.addReg(SOffset);
6906 else if (STI.hasRestrictedSOffset())
6907 MIB.addReg(AMDGPU::SGPR_NULL);
6908 else
6909 MIB.addImm(0);
6910 },
6911 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6912 addZeroImm, // cpol
6913 addZeroImm, // tfe
6914 addZeroImm, // swz
6915 }};
6916}
6917
6918InstructionSelector::ComplexRendererFns
6919AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6920
6921 Register SOffset = Root.getReg();
6922
6923 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6924 SOffset = AMDGPU::SGPR_NULL;
6925
6926 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
6927}
6928
6929/// Get an immediate that must be 32 bits, and treated as zero extended.
6930static std::optional<uint64_t>
6931getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
6932  // getIConstantVRegVal sexts any values, so see if that matters.
6933 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
6934 if (!OffsetVal || !isInt<32>(*OffsetVal))
6935 return std::nullopt;
6936 return Lo_32(*OffsetVal);
6937}
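// [Editorial worked example, not part of the upstream source.] An s32
// G_CONSTANT of -1 is returned by getIConstantVRegSExtVal as -1, passes
// isInt<32>, and Lo_32 yields 0xFFFFFFFF, the pattern a zero-extending
// consumer expects. A wider value such as 0x100000000 fails isInt<32> and
// produces std::nullopt.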
6938
6939InstructionSelector::ComplexRendererFns
6940AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6941 std::optional<uint64_t> OffsetVal =
6942 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
6943 if (!OffsetVal)
6944 return {};
6945
6946 std::optional<int64_t> EncodedImm =
6947 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
6948 if (!EncodedImm)
6949 return {};
6950
6951 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6952}
6953
6954InstructionSelector::ComplexRendererFns
6955AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6956 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6957
6958 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
6959 if (!OffsetVal)
6960 return {};
6961
6962 std::optional<int64_t> EncodedImm =
6963      AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
6964  if (!EncodedImm)
6965 return {};
6966
6967 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6968}
6969
6970InstructionSelector::ComplexRendererFns
6971AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6972 // Match the (soffset + offset) pair as a 32-bit register base and
6973 // an immediate offset.
6974 Register SOffset;
6975 unsigned Offset;
6976 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
6977 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
6978 if (!SOffset)
6979 return std::nullopt;
6980
6981 std::optional<int64_t> EncodedOffset =
6982 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
6983 if (!EncodedOffset)
6984 return std::nullopt;
6985
6986 assert(MRI->getType(SOffset) == LLT::scalar(32));
6987 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6988 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
6989}
6990
6991std::pair<Register, unsigned>
6992AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6993 bool &Matched) const {
6994 Matched = false;
6995
6996 Register Src;
6997 unsigned Mods;
6998 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
6999
7000 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
7001 assert(MRI->getType(Src) == LLT::scalar(16));
7002
7003    // Only change Src if a source modifier could be gained. In that case the
7004    // new Src could be an SGPR, but this does not violate the constant bus
7005    // restriction for the instruction being selected.
7006 Src = stripBitCast(Src, *MRI);
7007
7008 const auto CheckAbsNeg = [&]() {
7009 // Be careful about folding modifiers if we already have an abs. fneg is
7010 // applied last, so we don't want to apply an earlier fneg.
7011 if ((Mods & SISrcMods::ABS) == 0) {
7012 unsigned ModsTmp;
7013 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
7014
7015 if ((ModsTmp & SISrcMods::NEG) != 0)
7016 Mods ^= SISrcMods::NEG;
7017
7018 if ((ModsTmp & SISrcMods::ABS) != 0)
7019 Mods |= SISrcMods::ABS;
7020 }
7021 };
7022
7023 CheckAbsNeg();
7024
7025 // op_sel/op_sel_hi decide the source type and source.
7026 // If the source's op_sel_hi is set, it indicates to do a conversion from
7027  // fp16. If the source's op_sel is set, it picks the high half of the
7028 // source register.
7029
7030 Mods |= SISrcMods::OP_SEL_1;
7031
7032 if (isExtractHiElt(*MRI, Src, Src)) {
7033 Mods |= SISrcMods::OP_SEL_0;
7034 CheckAbsNeg();
7035 }
7036
7037 Matched = true;
7038 }
7039
7040 return {Src, Mods};
7041}
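// [Editorial note, not part of the upstream source.] Net effect of the Impl
// above: a source reached through G_FPEXT from s16 gets OP_SEL_1 (read the
// operand as fp16), additionally OP_SEL_0 when the half value sits in the
// high 16 bits of its 32-bit register, and fneg/fabs found along the way are
// folded into the NEG/ABS source modifiers rather than selected separately.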
7042
7043InstructionSelector::ComplexRendererFns
7044AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
7045 MachineOperand &Root) const {
7046 Register Src;
7047 unsigned Mods;
7048 bool Matched;
7049 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7050 if (!Matched)
7051 return {};
7052
7053 return {{
7054 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7055 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7056 }};
7057}
7058
7059InstructionSelector::ComplexRendererFns
7060AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
7061 Register Src;
7062 unsigned Mods;
7063 bool Matched;
7064 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7065
7066 return {{
7067 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7068 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7069 }};
7070}
7071
7072bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
7073 MachineInstr &I, Intrinsic::ID IntrID) const {
7074 MachineBasicBlock *MBB = I.getParent();
7075 const DebugLoc &DL = I.getDebugLoc();
7076 Register CCReg = I.getOperand(0).getReg();
7077
7078 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
7079 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
7080
7081 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
7082 .addImm(I.getOperand(2).getImm());
7083
7084 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
7085
7086 I.eraseFromParent();
7087 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
7088 *MRI);
7089}
7090
7091bool AMDGPUInstructionSelector::selectSGetBarrierState(
7092 MachineInstr &I, Intrinsic::ID IntrID) const {
7093 MachineBasicBlock *MBB = I.getParent();
7094 const DebugLoc &DL = I.getDebugLoc();
7095 const MachineOperand &BarOp = I.getOperand(2);
7096 std::optional<int64_t> BarValImm =
7097 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7098
7099 if (!BarValImm) {
7100 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7101 .addReg(BarOp.getReg());
7102 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7103 }
7104 MachineInstrBuilder MIB;
7105 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
7106 : AMDGPU::S_GET_BARRIER_STATE_M0;
7107 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7108
7109 auto DstReg = I.getOperand(0).getReg();
7110 const TargetRegisterClass *DstRC =
7111 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7112 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7113 return false;
7114 MIB.addDef(DstReg);
7115 if (BarValImm) {
7116 MIB.addImm(*BarValImm);
7117 }
7118 I.eraseFromParent();
7119 return true;
7120}
7121
7122unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
7123 if (HasInlineConst) {
7124 switch (IntrID) {
7125 default:
7126 llvm_unreachable("not a named barrier op");
7127 case Intrinsic::amdgcn_s_barrier_join:
7128 return AMDGPU::S_BARRIER_JOIN_IMM;
7129 case Intrinsic::amdgcn_s_wakeup_barrier:
7130 return AMDGPU::S_WAKEUP_BARRIER_IMM;
7131 case Intrinsic::amdgcn_s_get_named_barrier_state:
7132 return AMDGPU::S_GET_BARRIER_STATE_IMM;
7133 };
7134 } else {
7135 switch (IntrID) {
7136 default:
7137 llvm_unreachable("not a named barrier op");
7138 case Intrinsic::amdgcn_s_barrier_join:
7139 return AMDGPU::S_BARRIER_JOIN_M0;
7140 case Intrinsic::amdgcn_s_wakeup_barrier:
7141 return AMDGPU::S_WAKEUP_BARRIER_M0;
7142 case Intrinsic::amdgcn_s_get_named_barrier_state:
7143 return AMDGPU::S_GET_BARRIER_STATE_M0;
7144 };
7145 }
7146}
7147
7148bool AMDGPUInstructionSelector::selectNamedBarrierInit(
7149 MachineInstr &I, Intrinsic::ID IntrID) const {
7150 MachineBasicBlock *MBB = I.getParent();
7151 const DebugLoc &DL = I.getDebugLoc();
7152 const MachineOperand &BarOp = I.getOperand(1);
7153 const MachineOperand &CntOp = I.getOperand(2);
7154
7155 // A member count of 0 means "keep existing member count". That plus a known
7156 // constant value for the barrier ID lets us use the immarg form.
7157 if (IntrID == Intrinsic::amdgcn_s_barrier_signal_var) {
7158 std::optional<int64_t> CntImm =
7159 getIConstantVRegSExtVal(CntOp.getReg(), *MRI);
7160 if (CntImm && *CntImm == 0) {
7161 std::optional<int64_t> BarValImm =
7162 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7163 if (BarValImm) {
7164 auto BarID = ((*BarValImm) >> 4) & 0x3F;
7165 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
7166 .addImm(BarID);
7167 I.eraseFromParent();
7168 return true;
7169 }
7170 }
7171 }
7172
7173 // BarID = (BarOp >> 4) & 0x3F
7174 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7175 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7176 .add(BarOp)
7177 .addImm(4u)
7178 .setOperandDead(3); // Dead scc
7179
7180 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7181 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7182 .addReg(TmpReg0)
7183 .addImm(0x3F)
7184 .setOperandDead(3); // Dead scc
7185
7186  // M0 = ((CntOp & 0x3F) << ShAmt) | BarID
7187 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7188 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
7189 .add(CntOp)
7190 .addImm(0x3F)
7191 .setOperandDead(3); // Dead scc
7192
7193 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7194 constexpr unsigned ShAmt = 16;
7195 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
7196 .addReg(TmpReg2)
7197 .addImm(ShAmt)
7198 .setOperandDead(3); // Dead scc
7199
7200 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7201 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
7202 .addReg(TmpReg1)
7203 .addReg(TmpReg3)
7204      .setOperandDead(3); // Dead scc
7205
7206 auto CopyMIB =
7207 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
7208 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7209
7210 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7211 ? AMDGPU::S_BARRIER_INIT_M0
7212 : AMDGPU::S_BARRIER_SIGNAL_M0;
7213 MachineInstrBuilder MIB;
7214 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7215
7216 I.eraseFromParent();
7217 return true;
7218}
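// [Editorial worked example, not part of the upstream source.] For the M0
// encoding built above, with BarOp = 0x135 and CntOp = 5:
//   BarID = (0x135 >> 4) & 0x3F       = 0x13
//   M0    = ((5 & 0x3F) << 16) | 0x13 = 0x50013
// i.e. the member count lands in bits [21:16] and the barrier ID in [5:0].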
7219
7220bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7221 MachineInstr &I, Intrinsic::ID IntrID) const {
7222 MachineBasicBlock *MBB = I.getParent();
7223 const DebugLoc &DL = I.getDebugLoc();
7224 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7225 ? I.getOperand(2)
7226 : I.getOperand(1);
7227 std::optional<int64_t> BarValImm =
7228 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7229
7230 if (!BarValImm) {
7231 // BarID = (BarOp >> 4) & 0x3F
7232 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7233 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7234 .addReg(BarOp.getReg())
7235 .addImm(4u)
7236        .setOperandDead(3); // Dead scc
7237
7238 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7239 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7240 .addReg(TmpReg0)
7241 .addImm(0x3F)
7242        .setOperandDead(3); // Dead scc
7243
7244 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7245 .addReg(TmpReg1);
7246 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7247 }
7248
7249 MachineInstrBuilder MIB;
7250 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
7251 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7252
7253 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7254 auto DstReg = I.getOperand(0).getReg();
7255 const TargetRegisterClass *DstRC =
7256 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7257 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7258 return false;
7259 MIB.addDef(DstReg);
7260 }
7261
7262 if (BarValImm) {
7263 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7264 MIB.addImm(BarId);
7265 }
7266
7267 I.eraseFromParent();
7268 return true;
7269}
7270
7271void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7272 const MachineInstr &MI,
7273 int OpIdx) const {
7274 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7275 "Expected G_CONSTANT");
7276 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
7277}
7278
7279void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7280 const MachineInstr &MI,
7281 int OpIdx) const {
7282 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7283 "Expected G_CONSTANT");
7284 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
7285}
7286
7287void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7288 const MachineInstr &MI,
7289 int OpIdx) const {
7290 const MachineOperand &Op = MI.getOperand(1);
7291 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7292 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7293}
7294
7295void AMDGPUInstructionSelector::renderCountTrailingOnesImm(
7296 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7297 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7298 "Expected G_CONSTANT");
7299 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countTrailingOnes());
7300}
7301
7302/// This only really exists to satisfy DAG type checking machinery, so is a
7303/// no-op here.
7304void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7305 const MachineInstr &MI,
7306 int OpIdx) const {
7307 const MachineOperand &Op = MI.getOperand(OpIdx);
7308 int64_t Imm;
7309 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
7310 MIB.addImm(Imm);
7311 else
7312 MIB.addImm(Op.getImm());
7313}
7314
7315void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7316 const MachineInstr &MI,
7317 int OpIdx) const {
7318 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7319}
7320
7321void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7322 const MachineInstr &MI,
7323 int OpIdx) const {
7324 assert(OpIdx >= 0 && "expected to match an immediate operand");
7325 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7326}
7327
7328void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7329 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7330 assert(OpIdx >= 0 && "expected to match an immediate operand");
7331 MIB.addImm(
7332 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7333}
7334
7335void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7336 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7337 assert(OpIdx >= 0 && "expected to match an immediate operand");
7338 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
7339                 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
7340                 : (int64_t)SISrcMods::DST_OP_SEL);
7341}
7342
7343void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7344 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7345 assert(OpIdx >= 0 && "expected to match an immediate operand");
7346 MIB.addImm(
7347 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7348}
7349
7350void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7351 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7352 assert(OpIdx >= 0 && "expected to match an immediate operand");
7353 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7354 ? (int64_t)(SISrcMods::OP_SEL_0)
7355 : 0);
7356}
7357
7358void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7359 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7360 assert(OpIdx >= 0 && "expected to match an immediate operand");
7361 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7362 : 0);
7363}
7364
7365void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7366 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7367 assert(OpIdx >= 0 && "expected to match an immediate operand");
7368 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7369 : 0);
7370}
7371
7372void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7373 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7374 assert(OpIdx >= 0 && "expected to match an immediate operand");
7375 MIB.addImm(
7376 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7377}
7378
7379void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7380 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7381 assert(OpIdx >= 0 && "expected to match an immediate operand");
7382 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7383 ? (int64_t)SISrcMods::DST_OP_SEL
7384 : 0);
7385}
7386
7387void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7388 const MachineInstr &MI,
7389 int OpIdx) const {
7390 assert(OpIdx >= 0 && "expected to match an immediate operand");
7391 MIB.addImm(MI.getOperand(OpIdx).getImm() &
7392             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7393                                       : AMDGPU::CPol::ALL_pregfx12));
7394}
7395
7396void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7397 const MachineInstr &MI,
7398 int OpIdx) const {
7399 assert(OpIdx >= 0 && "expected to match an immediate operand");
7400 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7401                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
7402                                                 : AMDGPU::CPol::SWZ_pregfx12);
7403  MIB.addImm(Swizzle);
7404}
7405
7406void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7407 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7408 assert(OpIdx >= 0 && "expected to match an immediate operand");
7409 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7410                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7411                                                  : AMDGPU::CPol::ALL_pregfx12);
7412  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7413}
7414
7415void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7416 const MachineInstr &MI,
7417 int OpIdx) const {
7418 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7419}
7420
7421void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7422 const MachineInstr &MI,
7423 int OpIdx) const {
7424 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7425 int ExpVal = APF.getExactLog2Abs();
7426 assert(ExpVal != INT_MIN);
7427 MIB.addImm(ExpVal);
7428}
7429
7430void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7431 const MachineInstr &MI,
7432 int OpIdx) const {
7433 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7434 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7435 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7436  // "round.downward" -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7437 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7438}
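// [Editorial check, not part of the upstream source.] (Imm + 3) % 4 realizes
// the table above: 0 -> 3, 1 -> 0, 2 -> 1, 3 -> 2, mapping the rounding-mode
// operand onto the hardware FP_ROUND encodings.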
7439
7440void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7441 const MachineInstr &MI,
7442 int OpIdx) const {
7443 unsigned Mods = SISrcMods::OP_SEL_1;
7444 if (MI.getOperand(OpIdx).getImm())
7445 Mods ^= SISrcMods::NEG;
7446 MIB.addImm((int64_t)Mods);
7447}
7448
7449void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7450 const MachineInstr &MI,
7451 int OpIdx) const {
7452 unsigned Mods = SISrcMods::OP_SEL_1;
7453 if (MI.getOperand(OpIdx).getImm())
7454    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7455  MIB.addImm((int64_t)Mods);
7456}
7457
7458void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7459 const MachineInstr &MI,
7460 int OpIdx) const {
7461 unsigned Val = MI.getOperand(OpIdx).getImm();
7462 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7463 if (Val == 1) // neg
7464 Mods ^= SISrcMods::NEG;
7465 if (Val == 2) // abs
7466 Mods ^= SISrcMods::ABS;
7467 if (Val == 3) // neg and abs
7468 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7469 MIB.addImm((int64_t)Mods);
7470}
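// [Editorial note, not part of the upstream source.] Summary of the renderer
// above: 0 -> OP_SEL_1 only (no modifier), 1 -> OP_SEL_1 | NEG,
// 2 -> OP_SEL_1 | ABS, 3 -> OP_SEL_1 | NEG | ABS.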
7471
7472void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7473 const MachineInstr &MI,
7474 int OpIdx) const {
7475 uint32_t V = MI.getOperand(2).getImm();
7478 if (!Subtarget->hasSafeCUPrefetch())
7479 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7480 MIB.addImm(V);
7481}
7482
7483/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7484void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7485 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7486 unsigned Val = MI.getOperand(OpIdx).getImm();
7487 unsigned New = 0;
7488  if (Val & 0x1)
7489    New |= SISrcMods::OP_SEL_0;
7490  if (Val & 0x2)
7491    New |= SISrcMods::OP_SEL_1;
7492 MIB.addImm(New);
7493}
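// [Editorial note, not part of the upstream source.] The 2-bit operand maps
// bit 0 to OP_SEL_0 and bit 1 to OP_SEL_1: 0 -> no modifier, 1 -> OP_SEL_0,
// 2 -> OP_SEL_1, 3 -> OP_SEL_0 | OP_SEL_1.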
7494
7495bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7496 return TII.isInlineConstant(Imm);
7497}
7498
7499bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7500 return TII.isInlineConstant(Imm);
7501}