LLVM 23.0.0git
AMDGPUInstructionSelector.cpp
Go to the documentation of this file.
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI)
45 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), STI(STI),
47#include "AMDGPUGenGlobalISel.inc"
50#include "AMDGPUGenGlobalISel.inc"
52{
53}
54
55const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
56
67
68// Return the wave level SGPR base address if this is a wave address.
70 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
71 ? Def->getOperand(1).getReg()
72 : Register();
73}
74
76 const Function &F = I.getMF()->getFunction();
77 F.getContext().diagnose(DiagnosticInfoUnsupported(
78 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error));
79}
80
81bool AMDGPUInstructionSelector::isVCC(Register Reg,
82 const MachineRegisterInfo &MRI) const {
83 // The verifier is oblivious to s1 being a valid value for wavesize registers.
84 if (Reg.isPhysical())
85 return false;
86
87 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
88 const TargetRegisterClass *RC =
90 if (RC) {
91 const LLT Ty = MRI.getType(Reg);
92 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
93 return false;
94 // G_TRUNC s1 result is never vcc.
95 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
96 RC->hasSuperClassEq(TRI.getBoolRC());
97 }
98
99 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
100 return RB->getID() == AMDGPU::VCCRegBankID;
101}
102
103bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
104 unsigned NewOpc) const {
105 MI.setDesc(TII.get(NewOpc));
106 MI.removeOperand(1); // Remove intrinsic ID.
107 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
108
109 MachineOperand &Dst = MI.getOperand(0);
110 MachineOperand &Src = MI.getOperand(1);
111
112 // TODO: This should be legalized to s32 if needed
113 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
114 return false;
115
116 const TargetRegisterClass *DstRC
117 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
118 const TargetRegisterClass *SrcRC
119 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
120 if (!DstRC || DstRC != SrcRC)
121 return false;
122
123 if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
124 !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
125 return false;
126 const MCInstrDesc &MCID = MI.getDesc();
127 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
128 MI.getOperand(0).setIsEarlyClobber(true);
129 }
130 return true;
131}
132
133bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
134 const DebugLoc &DL = I.getDebugLoc();
135 MachineBasicBlock *BB = I.getParent();
136 I.setDesc(TII.get(TargetOpcode::COPY));
137
138 const MachineOperand &Src = I.getOperand(1);
139 MachineOperand &Dst = I.getOperand(0);
140 Register DstReg = Dst.getReg();
141 Register SrcReg = Src.getReg();
142
143 if (isVCC(DstReg, *MRI)) {
144 if (SrcReg == AMDGPU::SCC) {
145 const TargetRegisterClass *RC
146 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
147 if (!RC)
148 return true;
149 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
150 }
151
152 if (!isVCC(SrcReg, *MRI)) {
153 // TODO: Should probably leave the copy and let copyPhysReg expand it.
154 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
155 return false;
156
157 const TargetRegisterClass *SrcRC
158 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
159
160 std::optional<ValueAndVReg> ConstVal =
161 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
162 if (ConstVal) {
163 unsigned MovOpc =
164 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
165 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
166 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
167 } else {
168 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
169
170 // We can't trust the high bits at this point, so clear them.
171
172 // TODO: Skip masking high bits if def is known boolean.
173
174 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
175 assert(Subtarget->useRealTrue16Insts());
176 const int64_t NoMods = 0;
177 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
178 .addImm(NoMods)
179 .addImm(1)
180 .addImm(NoMods)
181 .addReg(SrcReg)
182 .addImm(NoMods);
183 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
184 .addImm(NoMods)
185 .addImm(0)
186 .addImm(NoMods)
187 .addReg(MaskedReg)
188 .addImm(NoMods);
189 } else {
190 bool IsSGPR = TRI.isSGPRClass(SrcRC);
191 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
192 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
193 .addImm(1)
194 .addReg(SrcReg);
195 if (IsSGPR)
196 And.setOperandDead(3); // Dead scc
197
198 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
199 .addImm(0)
200 .addReg(MaskedReg);
201 }
202 }
203
204 if (!MRI->getRegClassOrNull(SrcReg))
205 MRI->setRegClass(SrcReg, SrcRC);
206 I.eraseFromParent();
207 return true;
208 }
209
210 const TargetRegisterClass *RC =
211 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
212 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
213 return false;
214
215 return true;
216 }
217
218 for (const MachineOperand &MO : I.operands()) {
219 if (MO.getReg().isPhysical())
220 continue;
221
222 const TargetRegisterClass *RC =
223 TRI.getConstrainedRegClassForOperand(MO, *MRI);
224 if (!RC)
225 continue;
226 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
227 }
228 return true;
229}
230
231bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
232 const DebugLoc &DL = I.getDebugLoc();
233 MachineBasicBlock *BB = I.getParent();
234 Register VCCReg = I.getOperand(1).getReg();
235 MachineInstr *Cmp;
236
237 // Set SCC as a side effect with S_CMP or S_OR.
238 if (STI.hasScalarCompareEq64()) {
239 unsigned CmpOpc =
240 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
241 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
242 } else {
243 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
244 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
245 .addReg(VCCReg)
246 .addReg(VCCReg);
247 }
248
249 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
250
251 Register DstReg = I.getOperand(0).getReg();
252 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
253
254 I.eraseFromParent();
255 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
256}
257
258bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
259 const DebugLoc &DL = I.getDebugLoc();
260 MachineBasicBlock *BB = I.getParent();
261
262 Register DstReg = I.getOperand(0).getReg();
263 Register SrcReg = I.getOperand(1).getReg();
264 std::optional<ValueAndVReg> Arg =
265 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
266
267 if (Arg) {
268 const int64_t Value = Arg->Value.getZExtValue();
269 if (Value == 0) {
270 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
271 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
272 } else {
273 assert(Value == 1);
274 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
275 }
276 I.eraseFromParent();
277 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
278 }
279
280 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
281 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
282
283 unsigned SelectOpcode =
284 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
285 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
286 .addReg(TRI.getExec())
287 .addImm(0);
288
289 I.eraseFromParent();
291 return true;
292}
293
294bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
295 Register DstReg = I.getOperand(0).getReg();
296 Register SrcReg = I.getOperand(1).getReg();
297
298 const DebugLoc &DL = I.getDebugLoc();
299 MachineBasicBlock *BB = I.getParent();
300
301 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
302 .addReg(SrcReg);
303
304 I.eraseFromParent();
305 constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
306 return true;
307}
308
309bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
310 const Register DefReg = I.getOperand(0).getReg();
311 const LLT DefTy = MRI->getType(DefReg);
312
313 // S1 G_PHIs should not be selected in instruction-select, instead:
314 // - divergent S1 G_PHI should go through lane mask merging algorithm
315 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
316 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
317 if (DefTy == LLT::scalar(1))
318 return false;
319
320 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
321
322 const RegClassOrRegBank &RegClassOrBank =
323 MRI->getRegClassOrRegBank(DefReg);
324
325 const TargetRegisterClass *DefRC =
327 if (!DefRC) {
328 if (!DefTy.isValid()) {
329 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
330 return false;
331 }
332
333 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
334 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
335 if (!DefRC) {
336 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
337 return false;
338 }
339 }
340
341 // If inputs have register bank, assign corresponding reg class.
342 // Note: registers don't need to have the same reg bank.
343 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
344 const Register SrcReg = I.getOperand(i).getReg();
345
346 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
347 if (RB) {
348 const LLT SrcTy = MRI->getType(SrcReg);
349 const TargetRegisterClass *SrcRC =
350 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
351 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
352 return false;
353 }
354 }
355
356 I.setDesc(TII.get(TargetOpcode::PHI));
357 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
358}
359
361AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
362 const TargetRegisterClass &SubRC,
363 unsigned SubIdx) const {
364
365 MachineInstr *MI = MO.getParent();
366 MachineBasicBlock *BB = MO.getParent()->getParent();
367 Register DstReg = MRI->createVirtualRegister(&SubRC);
368
369 if (MO.isReg()) {
370 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
371 Register Reg = MO.getReg();
372 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
373 .addReg(Reg, {}, ComposedSubIdx);
374
375 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
376 MO.isKill(), MO.isDead(), MO.isUndef(),
377 MO.isEarlyClobber(), 0, MO.isDebug(),
378 MO.isInternalRead());
379 }
380
381 assert(MO.isImm());
382
383 APInt Imm(64, MO.getImm());
384
385 switch (SubIdx) {
386 default:
387 llvm_unreachable("do not know to split immediate with this sub index.");
388 case AMDGPU::sub0:
389 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
390 case AMDGPU::sub1:
391 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
392 }
393}
394
395static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
396 switch (Opc) {
397 case AMDGPU::G_AND:
398 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
399 case AMDGPU::G_OR:
400 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
401 case AMDGPU::G_XOR:
402 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
403 default:
404 llvm_unreachable("not a bit op");
405 }
406}
407
408bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
409 Register DstReg = I.getOperand(0).getReg();
410 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
411
412 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
413 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
414 DstRB->getID() != AMDGPU::VCCRegBankID)
415 return false;
416
417 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
418 STI.isWave64());
419 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
420
421 // Dead implicit-def of scc
422 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
423 true, // isImp
424 false, // isKill
425 true)); // isDead
426 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
427 return true;
428}
429
430bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
431 MachineBasicBlock *BB = I.getParent();
432 MachineFunction *MF = BB->getParent();
433 Register DstReg = I.getOperand(0).getReg();
434 const DebugLoc &DL = I.getDebugLoc();
435 LLT Ty = MRI->getType(DstReg);
436 if (Ty.isVector())
437 return false;
438
439 unsigned Size = Ty.getSizeInBits();
440 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
441 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
442 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
443
444 if (Size == 32) {
445 if (IsSALU) {
446 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
447 MachineInstr *Add =
448 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
449 .add(I.getOperand(1))
450 .add(I.getOperand(2))
451 .setOperandDead(3); // Dead scc
452 I.eraseFromParent();
453 constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
454 return true;
455 }
456
457 if (STI.hasAddNoCarryInsts()) {
458 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
459 I.setDesc(TII.get(Opc));
460 I.addOperand(*MF, MachineOperand::CreateImm(0));
461 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
462 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
463 return true;
464 }
465
466 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
467
468 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
469 MachineInstr *Add
470 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
471 .addDef(UnusedCarry, RegState::Dead)
472 .add(I.getOperand(1))
473 .add(I.getOperand(2))
474 .addImm(0);
475 I.eraseFromParent();
476 constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
477 return true;
478 }
479
480 assert(!Sub && "illegal sub should not reach here");
481
482 const TargetRegisterClass &RC
483 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
484 const TargetRegisterClass &HalfRC
485 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
486
487 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
488 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
489 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
490 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
491
492 Register DstLo = MRI->createVirtualRegister(&HalfRC);
493 Register DstHi = MRI->createVirtualRegister(&HalfRC);
494
495 if (IsSALU) {
496 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
497 .add(Lo1)
498 .add(Lo2);
499 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
500 .add(Hi1)
501 .add(Hi2)
502 .setOperandDead(3); // Dead scc
503 } else {
504 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
505 Register CarryReg = MRI->createVirtualRegister(CarryRC);
506 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
507 .addDef(CarryReg)
508 .add(Lo1)
509 .add(Lo2)
510 .addImm(0);
511 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
512 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
513 .add(Hi1)
514 .add(Hi2)
515 .addReg(CarryReg, RegState::Kill)
516 .addImm(0);
517
518 constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI);
519 }
520
521 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
522 .addReg(DstLo)
523 .addImm(AMDGPU::sub0)
524 .addReg(DstHi)
525 .addImm(AMDGPU::sub1);
526
527
528 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
529 return false;
530
531 I.eraseFromParent();
532 return true;
533}
534
535bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
536 MachineInstr &I) const {
537 MachineBasicBlock *BB = I.getParent();
538 MachineFunction *MF = BB->getParent();
539 const DebugLoc &DL = I.getDebugLoc();
540 Register Dst0Reg = I.getOperand(0).getReg();
541 Register Dst1Reg = I.getOperand(1).getReg();
542 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
543 I.getOpcode() == AMDGPU::G_UADDE;
544 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
545 I.getOpcode() == AMDGPU::G_USUBE;
546
547 if (isVCC(Dst1Reg, *MRI)) {
548 unsigned NoCarryOpc =
549 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
550 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
551 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
552 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
553 I.addOperand(*MF, MachineOperand::CreateImm(0));
554 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
555 return true;
556 }
557
558 Register Src0Reg = I.getOperand(2).getReg();
559 Register Src1Reg = I.getOperand(3).getReg();
560
561 if (HasCarryIn) {
562 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
563 .addReg(I.getOperand(4).getReg());
564 }
565
566 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
567 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
568
569 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
570 .add(I.getOperand(2))
571 .add(I.getOperand(3));
572
573 if (MRI->use_nodbg_empty(Dst1Reg)) {
574 CarryInst.setOperandDead(3); // Dead scc
575 } else {
576 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
577 .addReg(AMDGPU::SCC);
578 if (!MRI->getRegClassOrNull(Dst1Reg))
579 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
580 }
581
582 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
583 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
584 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
585 return false;
586
587 if (HasCarryIn &&
588 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
589 AMDGPU::SReg_32RegClass, *MRI))
590 return false;
591
592 I.eraseFromParent();
593 return true;
594}
595
596bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
597 MachineInstr &I) const {
598 MachineBasicBlock *BB = I.getParent();
599 MachineFunction *MF = BB->getParent();
600 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
601 bool UseNoCarry = Subtarget->hasMadNC64_32Insts() &&
602 MRI->use_nodbg_empty(I.getOperand(1).getReg());
603
604 unsigned Opc;
605 if (Subtarget->hasMADIntraFwdBug())
606 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
607 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
608 else if (UseNoCarry)
609 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
610 : AMDGPU::V_MAD_NC_I64_I32_e64;
611 else
612 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
613
614 if (UseNoCarry)
615 I.removeOperand(1);
616
617 I.setDesc(TII.get(Opc));
618 I.addOperand(*MF, MachineOperand::CreateImm(0));
619 I.addImplicitDefUseOperands(*MF);
620 I.getOperand(0).setIsEarlyClobber(true);
621 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
622 return true;
623}
624
625// TODO: We should probably legalize these to only using 32-bit results.
626bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
627 MachineBasicBlock *BB = I.getParent();
628 Register DstReg = I.getOperand(0).getReg();
629 Register SrcReg = I.getOperand(1).getReg();
630 LLT DstTy = MRI->getType(DstReg);
631 LLT SrcTy = MRI->getType(SrcReg);
632 const unsigned SrcSize = SrcTy.getSizeInBits();
633 unsigned DstSize = DstTy.getSizeInBits();
634
635 // TODO: Should handle any multiple of 32 offset.
636 unsigned Offset = I.getOperand(2).getImm();
637 if (Offset % 32 != 0 || DstSize > 128)
638 return false;
639
640 // 16-bit operations really use 32-bit registers.
641 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
642 if (DstSize == 16)
643 DstSize = 32;
644
645 const TargetRegisterClass *DstRC =
646 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
647 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
648 return false;
649
650 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
651 const TargetRegisterClass *SrcRC =
652 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
653 if (!SrcRC)
654 return false;
655 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
656 DstSize / 32);
657 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
658 if (!SrcRC)
659 return false;
660
661 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
662 *SrcRC, I.getOperand(1));
663 const DebugLoc &DL = I.getDebugLoc();
664 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
665 .addReg(SrcReg, {}, SubReg);
666
667 I.eraseFromParent();
668 return true;
669}
670
671bool AMDGPUInstructionSelector::selectS16MergeToS32(MachineInstr &MI) const {
672 Register Dst = MI.getOperand(0).getReg();
673 Register Src0 = MI.getOperand(1).getReg();
674 Register Src1 = MI.getOperand(2).getReg();
675
676 LLT Src0Ty = MRI->getType(Src0);
677 LLT Src1Ty = MRI->getType(Src1);
678
679 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
680 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, TRI);
681 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, TRI);
682 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
683
684 Register ShiftSrc0;
685 Register ShiftSrc1;
686
687 const DebugLoc &DL = MI.getDebugLoc();
688 MachineBasicBlock *BB = MI.getParent();
689
690 // VGPR case
691 if (IsVector) {
692 // If source are both VGPR16, use REG_SEQUENCE with lo16/hi16 subregisters
693 if (Src0Bank->getID() == AMDGPU::VGPRRegBankID &&
694 Src1Bank->getID() == AMDGPU::VGPRRegBankID &&
695 Src0Ty == LLT::scalar(16) && Src1Ty == LLT::scalar(16)) {
696 BuildMI(*BB, MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), Dst)
697 .addReg(Src0)
698 .addImm(AMDGPU::lo16)
699 .addReg(Src1)
700 .addImm(AMDGPU::hi16);
701
702 if (!RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI))
703 return false;
704
705 MI.eraseFromParent();
706 return true;
707 }
708
709 // Otherwise, use V_LSHL_OR_B32_e64
710 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
711 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
712 .addImm(0xFFFF)
713 .addReg(Src0);
714 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
715
716 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
717 .addReg(Src1)
718 .addImm(16)
719 .addReg(TmpReg);
720 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
721
722 MI.eraseFromParent();
723 return true;
724 }
725
726 // SGPR case -> S_PACK_*_B32_B16
727 // With multiple uses of the shift, this will duplicate the shift and
728 // increase register pressure.
729 //
730 // (merge (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
731 // => (S_PACK_HH_B32_B16 $src0, $src1)
732 // (merge (lshr_oneuse SReg_32:$src0, 16), $src1)
733 // => (S_PACK_HL_B32_B16 $src0, $src1)
734 // (merge $src0, (lshr_oneuse SReg_32:$src1, 16))
735 // => (S_PACK_LH_B32_B16 $src0, $src1)
736 // (merge $src0, $src1)
737 // => (S_PACK_LL_B32_B16 $src0, $src1)
738
739 bool Shift0 = mi_match(
740 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
741
742 bool Shift1 = mi_match(
743 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
744
745 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
746 if (Shift0 && Shift1) {
747 Opc = AMDGPU::S_PACK_HH_B32_B16;
748 MI.getOperand(1).setReg(ShiftSrc0);
749 MI.getOperand(2).setReg(ShiftSrc1);
750 } else if (Shift1) {
751 Opc = AMDGPU::S_PACK_LH_B32_B16;
752 MI.getOperand(2).setReg(ShiftSrc1);
753 } else if (Shift0) {
754 auto ConstSrc1 =
755 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
756 if (ConstSrc1 && ConstSrc1->Value == 0) {
757 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
758 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
759 .addReg(ShiftSrc0)
760 .addImm(16)
761 .setOperandDead(3); // Dead scc
762
763 MI.eraseFromParent();
764 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
765 return true;
766 }
767 if (STI.hasSPackHL()) {
768 Opc = AMDGPU::S_PACK_HL_B32_B16;
769 MI.getOperand(1).setReg(ShiftSrc0);
770 }
771 }
772
773 MI.setDesc(TII.get(Opc));
774 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
775 return true;
776}
777
778bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
779 MachineBasicBlock *BB = MI.getParent();
780 Register DstReg = MI.getOperand(0).getReg();
781 LLT DstTy = MRI->getType(DstReg);
782 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
783
784 const unsigned SrcSize = SrcTy.getSizeInBits();
785 if (SrcSize < 32) {
786 // Handle s32 <- G_MERGE_VALUES s16, s16
787 if (SrcSize == 16 && DstTy.getSizeInBits() == 32 &&
788 MI.getNumOperands() == 3) {
789 return selectS16MergeToS32(MI);
790 }
791 return selectImpl(MI, *CoverageInfo);
792 }
793
794 const DebugLoc &DL = MI.getDebugLoc();
795 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
796 const unsigned DstSize = DstTy.getSizeInBits();
797 const TargetRegisterClass *DstRC =
798 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
799 if (!DstRC)
800 return false;
801
802 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
803 MachineInstrBuilder MIB =
804 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
805 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
806 MachineOperand &Src = MI.getOperand(I + 1);
807 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
808 MIB.addImm(SubRegs[I]);
809
810 const TargetRegisterClass *SrcRC
811 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
812 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
813 return false;
814 }
815
816 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
817 return false;
818
819 MI.eraseFromParent();
820 return true;
821}
822
823bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
824 MachineBasicBlock *BB = MI.getParent();
825 const int NumDst = MI.getNumOperands() - 1;
826
827 MachineOperand &Src = MI.getOperand(NumDst);
828
829 Register SrcReg = Src.getReg();
830 Register DstReg0 = MI.getOperand(0).getReg();
831 LLT DstTy = MRI->getType(DstReg0);
832 LLT SrcTy = MRI->getType(SrcReg);
833
834 const unsigned DstSize = DstTy.getSizeInBits();
835 const unsigned SrcSize = SrcTy.getSizeInBits();
836 const DebugLoc &DL = MI.getDebugLoc();
837 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
838
839 const TargetRegisterClass *SrcRC =
840 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
841 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
842 return false;
843
844 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
845 // source, and this relies on the fact that the same subregister indices are
846 // used for both.
847 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
848 for (int I = 0, E = NumDst; I != E; ++I) {
849 MachineOperand &Dst = MI.getOperand(I);
850 // hi16:sreg_32 is not allowed so explicitly shift upper 16-bits.
851 if (SrcBank->getID() == AMDGPU::SGPRRegBankID &&
852 SubRegs[I] == AMDGPU::hi16) {
853 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst.getReg())
854 .addReg(SrcReg)
855 .addImm(16);
856 } else {
857 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
858 .addReg(SrcReg, {}, SubRegs[I]);
859 }
860
861 // Make sure the subregister index is valid for the source register.
862 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
863 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
864 return false;
865
866 const TargetRegisterClass *DstRC =
867 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
868 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
869 return false;
870 }
871
872 MI.eraseFromParent();
873 return true;
874}
875
876bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
877 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
878 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
879
880 Register Src0 = MI.getOperand(1).getReg();
881 Register Src1 = MI.getOperand(2).getReg();
882 LLT SrcTy = MRI->getType(Src0);
883 const unsigned SrcSize = SrcTy.getSizeInBits();
884
885 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
886 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
887 return selectG_MERGE_VALUES(MI);
888 }
889
890 // Selection logic below is for V2S16 only.
891 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
892 Register Dst = MI.getOperand(0).getReg();
893 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
894 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
895 SrcTy != LLT::scalar(32)))
896 return selectImpl(MI, *CoverageInfo);
897
898 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
899 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
900 return false;
901
902 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
903 DstBank->getID() == AMDGPU::VGPRRegBankID);
904 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
905
906 const DebugLoc &DL = MI.getDebugLoc();
907 MachineBasicBlock *BB = MI.getParent();
908
909 // First, before trying TableGen patterns, check if both sources are
910 // constants. In those cases, we can trivially compute the final constant
911 // and emit a simple move.
912 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
913 if (ConstSrc1) {
914 auto ConstSrc0 =
915 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
916 if (ConstSrc0) {
917 const int64_t K0 = ConstSrc0->Value.getSExtValue();
918 const int64_t K1 = ConstSrc1->Value.getSExtValue();
919 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
920 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
921 uint32_t Imm = Lo16 | (Hi16 << 16);
922
923 // VALU
924 if (IsVector) {
925 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
926 MI.eraseFromParent();
927 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
928 }
929
930 // SALU
931 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
932 MI.eraseFromParent();
933 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
934 }
935 }
936
937 // Now try TableGen patterns.
938 if (selectImpl(MI, *CoverageInfo))
939 return true;
940
941 // TODO: This should probably be a combine somewhere
942 // (build_vector $src0, undef) -> copy $src0
943 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
944 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
945 MI.setDesc(TII.get(AMDGPU::COPY));
946 MI.removeOperand(2);
947 const auto &RC =
948 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
949 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
950 RBI.constrainGenericRegister(Src0, RC, *MRI);
951 }
952
953 return selectS16MergeToS32(MI);
954}
955
956bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
957 const MachineOperand &MO = I.getOperand(0);
958
959 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
960 // regbank check here is to know why getConstrainedRegClassForOperand failed.
961 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
962 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
963 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
964 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
965 return true;
966 }
967
968 return false;
969}
970
971bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
972 MachineBasicBlock *BB = I.getParent();
973
974 Register DstReg = I.getOperand(0).getReg();
975 Register Src0Reg = I.getOperand(1).getReg();
976 Register Src1Reg = I.getOperand(2).getReg();
977 LLT Src1Ty = MRI->getType(Src1Reg);
978
979 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
980 unsigned InsSize = Src1Ty.getSizeInBits();
981
982 int64_t Offset = I.getOperand(3).getImm();
983
984 // FIXME: These cases should have been illegal and unnecessary to check here.
985 if (Offset % 32 != 0 || InsSize % 32 != 0)
986 return false;
987
988 // Currently not handled by getSubRegFromChannel.
989 if (InsSize > 128)
990 return false;
991
992 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
993 if (SubReg == AMDGPU::NoSubRegister)
994 return false;
995
996 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
997 const TargetRegisterClass *DstRC =
998 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
999 if (!DstRC)
1000 return false;
1001
1002 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
1003 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
1004 const TargetRegisterClass *Src0RC =
1005 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
1006 const TargetRegisterClass *Src1RC =
1007 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
1008
1009 // Deal with weird cases where the class only partially supports the subreg
1010 // index.
1011 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
1012 if (!Src0RC || !Src1RC)
1013 return false;
1014
1015 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
1016 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
1017 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
1018 return false;
1019
1020 const DebugLoc &DL = I.getDebugLoc();
1021 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
1022 .addReg(Src0Reg)
1023 .addReg(Src1Reg)
1024 .addImm(SubReg);
1025
1026 I.eraseFromParent();
1027 return true;
1028}
1029
1030bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
1031 Register DstReg = MI.getOperand(0).getReg();
1032 Register SrcReg = MI.getOperand(1).getReg();
1033 Register OffsetReg = MI.getOperand(2).getReg();
1034 Register WidthReg = MI.getOperand(3).getReg();
1035
1036 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
1037 "scalar BFX instructions are expanded in regbankselect");
1038 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
1039 "64-bit vector BFX instructions are expanded in regbankselect");
1040
1041 const DebugLoc &DL = MI.getDebugLoc();
1042 MachineBasicBlock *MBB = MI.getParent();
1043
1044 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
1045 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
1046 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
1047 .addReg(SrcReg)
1048 .addReg(OffsetReg)
1049 .addReg(WidthReg);
1050 MI.eraseFromParent();
1051 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1052 return true;
1053}
1054
1055bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
1056 if (STI.getLDSBankCount() != 16)
1057 return selectImpl(MI, *CoverageInfo);
1058
1059 Register Dst = MI.getOperand(0).getReg();
1060 Register Src0 = MI.getOperand(2).getReg();
1061 Register M0Val = MI.getOperand(6).getReg();
1062 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
1063 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
1064 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
1065 return false;
1066
1067 // This requires 2 instructions. It is possible to write a pattern to support
1068 // this, but the generated isel emitter doesn't correctly deal with multiple
1069 // output instructions using the same physical register input. The copy to m0
1070 // is incorrectly placed before the second instruction.
1071 //
1072 // TODO: Match source modifiers.
1073
1074 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1075 const DebugLoc &DL = MI.getDebugLoc();
1076 MachineBasicBlock *MBB = MI.getParent();
1077
1078 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1079 .addReg(M0Val);
1080 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1081 .addImm(2)
1082 .addImm(MI.getOperand(4).getImm()) // $attr
1083 .addImm(MI.getOperand(3).getImm()); // $attrchan
1084
1085 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1086 .addImm(0) // $src0_modifiers
1087 .addReg(Src0) // $src0
1088 .addImm(MI.getOperand(4).getImm()) // $attr
1089 .addImm(MI.getOperand(3).getImm()) // $attrchan
1090 .addImm(0) // $src2_modifiers
1091 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1092 .addImm(MI.getOperand(5).getImm()) // $high
1093 .addImm(0) // $clamp
1094 .addImm(0); // $omod
1095
1096 MI.eraseFromParent();
1097 return true;
1098}
1099
1100// Writelane is special in that it can use SGPR and M0 (which would normally
1101// count as using the constant bus twice - but in this case it is allowed since
1102// the lane selector doesn't count as a use of the constant bus). However, it is
1103// still required to abide by the 1 SGPR rule. Fix this up if we might have
1104// multiple SGPRs.
1105bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1106 // With a constant bus limit of at least 2, there's no issue.
1107 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1108 return selectImpl(MI, *CoverageInfo);
1109
1110 MachineBasicBlock *MBB = MI.getParent();
1111 const DebugLoc &DL = MI.getDebugLoc();
1112 Register VDst = MI.getOperand(0).getReg();
1113 Register Val = MI.getOperand(2).getReg();
1114 Register LaneSelect = MI.getOperand(3).getReg();
1115 Register VDstIn = MI.getOperand(4).getReg();
1116
1117 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1118
1119 std::optional<ValueAndVReg> ConstSelect =
1120 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1121 if (ConstSelect) {
1122 // The selector has to be an inline immediate, so we can use whatever for
1123 // the other operands.
1124 MIB.addReg(Val);
1125 MIB.addImm(ConstSelect->Value.getSExtValue() &
1126 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1127 } else {
1128 std::optional<ValueAndVReg> ConstVal =
1130
1131 // If the value written is an inline immediate, we can get away without a
1132 // copy to m0.
1133 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1134 STI.hasInv2PiInlineImm())) {
1135 MIB.addImm(ConstVal->Value.getSExtValue());
1136 MIB.addReg(LaneSelect);
1137 } else {
1138 MIB.addReg(Val);
1139
1140 // If the lane selector was originally in a VGPR and copied with
1141 // readfirstlane, there's a hazard to read the same SGPR from the
1142 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1143 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1144
1145 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1146 .addReg(LaneSelect);
1147 MIB.addReg(AMDGPU::M0);
1148 }
1149 }
1150
1151 MIB.addReg(VDstIn);
1152
1153 MI.eraseFromParent();
1154 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1155 return true;
1156}
1157
1158// We need to handle this here because tablegen doesn't support matching
1159// instructions with multiple outputs.
1160bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1161 Register Dst0 = MI.getOperand(0).getReg();
1162 Register Dst1 = MI.getOperand(1).getReg();
1163
1164 LLT Ty = MRI->getType(Dst0);
1165 unsigned Opc;
1166 if (Ty == LLT::scalar(32))
1167 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1168 else if (Ty == LLT::scalar(64))
1169 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1170 else
1171 return false;
1172
1173 // TODO: Match source modifiers.
1174
1175 const DebugLoc &DL = MI.getDebugLoc();
1176 MachineBasicBlock *MBB = MI.getParent();
1177
1178 Register Numer = MI.getOperand(3).getReg();
1179 Register Denom = MI.getOperand(4).getReg();
1180 unsigned ChooseDenom = MI.getOperand(5).getImm();
1181
1182 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1183
1184 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1185 .addDef(Dst1)
1186 .addImm(0) // $src0_modifiers
1187 .addUse(Src0) // $src0
1188 .addImm(0) // $src1_modifiers
1189 .addUse(Denom) // $src1
1190 .addImm(0) // $src2_modifiers
1191 .addUse(Numer) // $src2
1192 .addImm(0) // $clamp
1193 .addImm(0); // $omod
1194
1195 MI.eraseFromParent();
1196 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1197 return true;
1198}
1199
// Select an intrinsic instruction by dispatching on its intrinsic ID.
// Intrinsics with a dedicated helper are forwarded to it, a few are guarded by
// a subtarget feature check, and everything else is handed to the generated
// selectImpl. Returns true if I was selected.
1200bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1201 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1202 switch (IntrinsicID) {
1203 case Intrinsic::amdgcn_if_break: {
1204 MachineBasicBlock *BB = I.getParent();
1205
1206 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1207 // SelectionDAG uses for wave32 vs wave64.
1208 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1209 .add(I.getOperand(0))
1210 .add(I.getOperand(2))
1211 .add(I.getOperand(3));
1212
 // Read the registers before I is erased; they are re-classed below.
1213 Register DstReg = I.getOperand(0).getReg();
1214 Register Src0Reg = I.getOperand(2).getReg();
1215 Register Src1Reg = I.getOperand(3).getReg();
1216
1217 I.eraseFromParent();
1218
 // The result and both sources all get the wave mask register class.
1219 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1220 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1221
1222 return true;
1223 }
1224 case Intrinsic::amdgcn_interp_p1_f16:
1225 return selectInterpP1F16(I);
1226 case Intrinsic::amdgcn_wqm:
1227 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1228 case Intrinsic::amdgcn_softwqm:
1229 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
 // amdgcn_wwm is selected exactly like amdgcn_strict_wwm.
1230 case Intrinsic::amdgcn_strict_wwm:
1231 case Intrinsic::amdgcn_wwm:
1232 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1233 case Intrinsic::amdgcn_strict_wqm:
1234 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1235 case Intrinsic::amdgcn_writelane:
1236 return selectWritelane(I);
1237 case Intrinsic::amdgcn_div_scale:
1238 return selectDivScale(I);
 // Compare intrinsics try the generated patterns first and only then fall
 // back to the manual selection.
1239 case Intrinsic::amdgcn_icmp:
1240 case Intrinsic::amdgcn_fcmp:
1241 if (selectImpl(I, *CoverageInfo))
1242 return true;
1243 return selectIntrinsicCmp(I);
1244 case Intrinsic::amdgcn_ballot:
1245 return selectBallot(I);
1246 case Intrinsic::amdgcn_reloc_constant:
1247 return selectRelocConstant(I);
1248 case Intrinsic::amdgcn_groupstaticsize:
1249 return selectGroupStaticSize(I);
1250 case Intrinsic::returnaddress:
1251 return selectReturnAddress(I);
 // Every smfmac variant shares one helper.
1252 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1253 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1254 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1255 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1256 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1257 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1258 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1259 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1260 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1261 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1262 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1263 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1264 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1265 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1266 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1267 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1268 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1269 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1270 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1271 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1272 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1273 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1274 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1275 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1276 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1277 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1278 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1279 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1280 return selectSMFMACIntrin(I);
1281 case Intrinsic::amdgcn_permlane16_swap:
1282 case Intrinsic::amdgcn_permlane32_swap:
1283 return selectPermlaneSwapIntrin(I, IntrinsicID);
1284 case Intrinsic::amdgcn_wave_shuffle:
1285 return selectWaveShuffleIntrin(I);
 // The remaining explicit cases fail selection when the subtarget lacks the
 // required feature and otherwise use the generated patterns.
 // NOTE(review): the listing skips one line inside each of these guards (the
 // inner numbering jumps, e.g. 1287 -> 1289); presumably a diagnostic is
 // emitted before `return false` - confirm against the upstream file.
1286 case Intrinsic::amdgcn_fma_legacy:
1287 if (!STI.hasFmaLegacy32Insts()) {
1289 return false;
1290 }
1291 return selectImpl(I, *CoverageInfo);
1292 case Intrinsic::amdgcn_sudot4:
1293 case Intrinsic::amdgcn_sudot8:
1294 if (!STI.hasDot8Insts()) {
1296 return false;
1297 }
1298 return selectImpl(I, *CoverageInfo);
1299 case Intrinsic::amdgcn_permlane16:
1300 case Intrinsic::amdgcn_permlanex16:
1301 if (!STI.hasPermlane16Insts()) {
1303 return false;
1304 }
1305 return selectImpl(I, *CoverageInfo);
1306 case Intrinsic::amdgcn_mov_dpp8:
1307 if (!STI.hasDPP8()) {
1309 return false;
1310 }
1311 return selectImpl(I, *CoverageInfo);
1312 case Intrinsic::amdgcn_tanh:
1313 if (!STI.hasTanhInsts()) {
1315 return false;
1316 }
1317 return selectImpl(I, *CoverageInfo);
1318 default:
1319 return selectImpl(I, *CoverageInfo);
1320 }
1321}
1322
1324 const GCNSubtarget &ST) {
1325 if (Size != 16 && Size != 32 && Size != 64)
1326 return -1;
1327
1328 if (Size == 16 && !ST.has16BitInsts())
1329 return -1;
1330
1331 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1332 unsigned FakeS16Opc, unsigned S32Opc,
1333 unsigned S64Opc) {
1334 if (Size == 16)
1335 return ST.hasTrue16BitInsts()
1336 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1337 : S16Opc;
1338 if (Size == 32)
1339 return S32Opc;
1340 return S64Opc;
1341 };
1342
1343 switch (P) {
1344 default:
1345 llvm_unreachable("Unknown condition code!");
1346 case CmpInst::ICMP_NE:
1347 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1348 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1349 AMDGPU::V_CMP_NE_U64_e64);
1350 case CmpInst::ICMP_EQ:
1351 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1352 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1353 AMDGPU::V_CMP_EQ_U64_e64);
1354 case CmpInst::ICMP_SGT:
1355 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1356 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1357 AMDGPU::V_CMP_GT_I64_e64);
1358 case CmpInst::ICMP_SGE:
1359 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1360 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1361 AMDGPU::V_CMP_GE_I64_e64);
1362 case CmpInst::ICMP_SLT:
1363 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1364 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1365 AMDGPU::V_CMP_LT_I64_e64);
1366 case CmpInst::ICMP_SLE:
1367 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1368 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1369 AMDGPU::V_CMP_LE_I64_e64);
1370 case CmpInst::ICMP_UGT:
1371 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1372 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1373 AMDGPU::V_CMP_GT_U64_e64);
1374 case CmpInst::ICMP_UGE:
1375 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1376 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1377 AMDGPU::V_CMP_GE_U64_e64);
1378 case CmpInst::ICMP_ULT:
1379 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1380 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1381 AMDGPU::V_CMP_LT_U64_e64);
1382 case CmpInst::ICMP_ULE:
1383 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1384 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1385 AMDGPU::V_CMP_LE_U64_e64);
1386
1387 case CmpInst::FCMP_OEQ:
1388 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1389 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1390 AMDGPU::V_CMP_EQ_F64_e64);
1391 case CmpInst::FCMP_OGT:
1392 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1393 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1394 AMDGPU::V_CMP_GT_F64_e64);
1395 case CmpInst::FCMP_OGE:
1396 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1397 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1398 AMDGPU::V_CMP_GE_F64_e64);
1399 case CmpInst::FCMP_OLT:
1400 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1401 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1402 AMDGPU::V_CMP_LT_F64_e64);
1403 case CmpInst::FCMP_OLE:
1404 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1405 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1406 AMDGPU::V_CMP_LE_F64_e64);
1407 case CmpInst::FCMP_ONE:
1408 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1409 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1410 AMDGPU::V_CMP_NEQ_F64_e64);
1411 case CmpInst::FCMP_ORD:
1412 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1413 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1414 AMDGPU::V_CMP_O_F64_e64);
1415 case CmpInst::FCMP_UNO:
1416 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1417 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1418 AMDGPU::V_CMP_U_F64_e64);
1419 case CmpInst::FCMP_UEQ:
1420 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1421 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1422 AMDGPU::V_CMP_NLG_F64_e64);
1423 case CmpInst::FCMP_UGT:
1424 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1425 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1426 AMDGPU::V_CMP_NLE_F64_e64);
1427 case CmpInst::FCMP_UGE:
1428 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1429 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1430 AMDGPU::V_CMP_NLT_F64_e64);
1431 case CmpInst::FCMP_ULT:
1432 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1433 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1434 AMDGPU::V_CMP_NGE_F64_e64);
1435 case CmpInst::FCMP_ULE:
1436 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1437 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1438 AMDGPU::V_CMP_NGT_F64_e64);
1439 case CmpInst::FCMP_UNE:
1440 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1441 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1442 AMDGPU::V_CMP_NEQ_F64_e64);
1443 case CmpInst::FCMP_TRUE:
1444 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1445 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1446 AMDGPU::V_CMP_TRU_F64_e64);
1448 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1449 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1450 AMDGPU::V_CMP_F_F64_e64);
1451 }
1452}
1453
1454int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1455 unsigned Size) const {
1456 if (Size == 64) {
1457 if (!STI.hasScalarCompareEq64())
1458 return -1;
1459
1460 switch (P) {
1461 case CmpInst::ICMP_NE:
1462 return AMDGPU::S_CMP_LG_U64;
1463 case CmpInst::ICMP_EQ:
1464 return AMDGPU::S_CMP_EQ_U64;
1465 default:
1466 return -1;
1467 }
1468 }
1469
1470 if (Size == 32) {
1471 switch (P) {
1472 case CmpInst::ICMP_NE:
1473 return AMDGPU::S_CMP_LG_U32;
1474 case CmpInst::ICMP_EQ:
1475 return AMDGPU::S_CMP_EQ_U32;
1476 case CmpInst::ICMP_SGT:
1477 return AMDGPU::S_CMP_GT_I32;
1478 case CmpInst::ICMP_SGE:
1479 return AMDGPU::S_CMP_GE_I32;
1480 case CmpInst::ICMP_SLT:
1481 return AMDGPU::S_CMP_LT_I32;
1482 case CmpInst::ICMP_SLE:
1483 return AMDGPU::S_CMP_LE_I32;
1484 case CmpInst::ICMP_UGT:
1485 return AMDGPU::S_CMP_GT_U32;
1486 case CmpInst::ICMP_UGE:
1487 return AMDGPU::S_CMP_GE_U32;
1488 case CmpInst::ICMP_ULT:
1489 return AMDGPU::S_CMP_LT_U32;
1490 case CmpInst::ICMP_ULE:
1491 return AMDGPU::S_CMP_LE_U32;
1492 case CmpInst::FCMP_OEQ:
1493 return AMDGPU::S_CMP_EQ_F32;
1494 case CmpInst::FCMP_OGT:
1495 return AMDGPU::S_CMP_GT_F32;
1496 case CmpInst::FCMP_OGE:
1497 return AMDGPU::S_CMP_GE_F32;
1498 case CmpInst::FCMP_OLT:
1499 return AMDGPU::S_CMP_LT_F32;
1500 case CmpInst::FCMP_OLE:
1501 return AMDGPU::S_CMP_LE_F32;
1502 case CmpInst::FCMP_ONE:
1503 return AMDGPU::S_CMP_LG_F32;
1504 case CmpInst::FCMP_ORD:
1505 return AMDGPU::S_CMP_O_F32;
1506 case CmpInst::FCMP_UNO:
1507 return AMDGPU::S_CMP_U_F32;
1508 case CmpInst::FCMP_UEQ:
1509 return AMDGPU::S_CMP_NLG_F32;
1510 case CmpInst::FCMP_UGT:
1511 return AMDGPU::S_CMP_NLE_F32;
1512 case CmpInst::FCMP_UGE:
1513 return AMDGPU::S_CMP_NLT_F32;
1514 case CmpInst::FCMP_ULT:
1515 return AMDGPU::S_CMP_NGE_F32;
1516 case CmpInst::FCMP_ULE:
1517 return AMDGPU::S_CMP_NGT_F32;
1518 case CmpInst::FCMP_UNE:
1519 return AMDGPU::S_CMP_NEQ_F32;
1520 default:
1521 llvm_unreachable("Unknown condition code!");
1522 }
1523 }
1524
1525 if (Size == 16) {
1526 if (!STI.hasSALUFloatInsts())
1527 return -1;
1528
1529 switch (P) {
1530 case CmpInst::FCMP_OEQ:
1531 return AMDGPU::S_CMP_EQ_F16;
1532 case CmpInst::FCMP_OGT:
1533 return AMDGPU::S_CMP_GT_F16;
1534 case CmpInst::FCMP_OGE:
1535 return AMDGPU::S_CMP_GE_F16;
1536 case CmpInst::FCMP_OLT:
1537 return AMDGPU::S_CMP_LT_F16;
1538 case CmpInst::FCMP_OLE:
1539 return AMDGPU::S_CMP_LE_F16;
1540 case CmpInst::FCMP_ONE:
1541 return AMDGPU::S_CMP_LG_F16;
1542 case CmpInst::FCMP_ORD:
1543 return AMDGPU::S_CMP_O_F16;
1544 case CmpInst::FCMP_UNO:
1545 return AMDGPU::S_CMP_U_F16;
1546 case CmpInst::FCMP_UEQ:
1547 return AMDGPU::S_CMP_NLG_F16;
1548 case CmpInst::FCMP_UGT:
1549 return AMDGPU::S_CMP_NLE_F16;
1550 case CmpInst::FCMP_UGE:
1551 return AMDGPU::S_CMP_NLT_F16;
1552 case CmpInst::FCMP_ULT:
1553 return AMDGPU::S_CMP_NGE_F16;
1554 case CmpInst::FCMP_ULE:
1555 return AMDGPU::S_CMP_NGT_F16;
1556 case CmpInst::FCMP_UNE:
1557 return AMDGPU::S_CMP_NEQ_F16;
1558 default:
1559 llvm_unreachable("Unknown condition code!");
1560 }
1561 }
1562
1563 return -1;
1564}
1565
1566bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1567
1568 MachineBasicBlock *BB = I.getParent();
1569 const DebugLoc &DL = I.getDebugLoc();
1570
1571 Register SrcReg = I.getOperand(2).getReg();
1572 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1573
1574 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1575
1576 Register CCReg = I.getOperand(0).getReg();
1577 if (!isVCC(CCReg, *MRI)) {
1578 int Opcode = getS_CMPOpcode(Pred, Size);
1579 if (Opcode == -1)
1580 return false;
1581 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1582 .add(I.getOperand(2))
1583 .add(I.getOperand(3));
1584 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1585 .addReg(AMDGPU::SCC);
1586 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1587 bool Ret =
1588 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1589 I.eraseFromParent();
1590 return Ret;
1591 }
1592
1593 if (I.getOpcode() == AMDGPU::G_FCMP)
1594 return false;
1595
1596 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1597 if (Opcode == -1)
1598 return false;
1599
1600 MachineInstrBuilder ICmp;
1601 // t16 instructions
1602 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1603 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1604 .addImm(0)
1605 .add(I.getOperand(2))
1606 .addImm(0)
1607 .add(I.getOperand(3))
1608 .addImm(0); // op_sel
1609 } else {
1610 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1611 .add(I.getOperand(2))
1612 .add(I.getOperand(3));
1613 }
1614
1615 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1616 *TRI.getBoolRC(), *MRI);
1617 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1618 I.eraseFromParent();
1619 return true;
1620}
1621
1622bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1623 Register Dst = I.getOperand(0).getReg();
1624 if (isVCC(Dst, *MRI))
1625 return false;
1626
1627 LLT DstTy = MRI->getType(Dst);
1628 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1629 return false;
1630
1631 MachineBasicBlock *BB = I.getParent();
1632 const DebugLoc &DL = I.getDebugLoc();
1633 Register SrcReg = I.getOperand(2).getReg();
1634 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1635
1636 // i1 inputs are not supported in GlobalISel.
1637 if (Size == 1)
1638 return false;
1639
1640 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1641 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1642 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1643 I.eraseFromParent();
1644 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1645 }
1646
1647 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1648 if (Opcode == -1)
1649 return false;
1650
1651 MachineInstrBuilder SelectedMI;
1652 MachineOperand &LHS = I.getOperand(2);
1653 MachineOperand &RHS = I.getOperand(3);
1654 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1655 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1656 Register Src0Reg =
1657 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1658 Register Src1Reg =
1659 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1660 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1661 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1662 SelectedMI.addImm(Src0Mods);
1663 SelectedMI.addReg(Src0Reg);
1664 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1665 SelectedMI.addImm(Src1Mods);
1666 SelectedMI.addReg(Src1Reg);
1667 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1668 SelectedMI.addImm(0); // clamp
1669 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1670 SelectedMI.addImm(0); // op_sel
1671
1672 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1673 constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI);
1674
1675 I.eraseFromParent();
1676 return true;
1677}
1678
1679// Ballot has to zero bits in input lane-mask that are zero in current exec,
1680// Done as AND with exec. For inputs that are results of instruction that
1681// implicitly use same exec, for example compares in same basic block or SCC to
1682// VCC copy, use copy.
1685 MachineInstr *MI = MRI.getVRegDef(Reg);
1686 if (MI->getParent() != MBB)
1687 return false;
1688
1689 // Lane mask generated by SCC to VCC copy.
1690 if (MI->getOpcode() == AMDGPU::COPY) {
1691 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1692 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1693 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1694 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1695 return true;
1696 }
1697
1698 // Lane mask generated by SCC to VCC copy
1699 if (MI->getOpcode() == AMDGPU::G_AMDGPU_COPY_VCC_SCC)
1700 return true;
1701
1702 // Lane mask generated using compare with same exec.
1703 if (isa<GAnyCmp>(MI))
1704 return true;
1705
1706 Register LHS, RHS;
1707 // Look through AND.
1708 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1709 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1711
1712 return false;
1713}
1714
1715bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1716 MachineBasicBlock *BB = I.getParent();
1717 const DebugLoc &DL = I.getDebugLoc();
1718 Register DstReg = I.getOperand(0).getReg();
1719 Register SrcReg = I.getOperand(2).getReg();
1720 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1721 const unsigned WaveSize = STI.getWavefrontSize();
1722
1723 // In the common case, the return type matches the wave size.
1724 // However we also support emitting i64 ballots in wave32 mode.
1725 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1726 return false;
1727
1728 std::optional<ValueAndVReg> Arg =
1730
1731 Register Dst = DstReg;
1732 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1733 if (BallotSize != WaveSize) {
1734 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1735 }
1736
1737 if (Arg) {
1738 const int64_t Value = Arg->Value.getZExtValue();
1739 if (Value == 0) {
1740 // Dst = S_MOV 0
1741 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1742 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1743 } else {
1744 // Dst = COPY EXEC
1745 assert(Value == 1);
1746 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1747 }
1748 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1749 return false;
1750 } else {
1751 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1752 // Dst = COPY SrcReg
1753 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1754 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1755 return false;
1756 } else {
1757 // Dst = S_AND SrcReg, EXEC
1758 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1759 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1760 .addReg(SrcReg)
1761 .addReg(TRI.getExec())
1762 .setOperandDead(3); // Dead scc
1763 constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
1764 }
1765 }
1766
1767 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1768 if (BallotSize != WaveSize) {
1769 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1770 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1771 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1772 .addReg(Dst)
1773 .addImm(AMDGPU::sub0)
1774 .addReg(HiReg)
1775 .addImm(AMDGPU::sub1);
1776 }
1777
1778 I.eraseFromParent();
1779 return true;
1780}
1781
1782bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1783 Register DstReg = I.getOperand(0).getReg();
1784 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1785 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1786 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1787 return false;
1788
1789 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1790
1791 Module *M = MF->getFunction().getParent();
1792 const MDNode *Metadata = I.getOperand(2).getMetadata();
1793 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1794 auto *RelocSymbol = cast<GlobalVariable>(
1795 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1796
1797 MachineBasicBlock *BB = I.getParent();
1798 BuildMI(*BB, &I, I.getDebugLoc(),
1799 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1801
1802 I.eraseFromParent();
1803 return true;
1804}
1805
1806bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1807 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1808
1809 Register DstReg = I.getOperand(0).getReg();
1810 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1811 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1812 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1813
1814 MachineBasicBlock *MBB = I.getParent();
1815 const DebugLoc &DL = I.getDebugLoc();
1816
1817 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1818
1819 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1820 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1821 MIB.addImm(MFI->getLDSSize());
1822 } else {
1823 Module *M = MF->getFunction().getParent();
1824 const GlobalValue *GV =
1825 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1827 }
1828
1829 I.eraseFromParent();
1830 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1831 return true;
1832}
1833
1834bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1835 MachineBasicBlock *MBB = I.getParent();
1836 MachineFunction &MF = *MBB->getParent();
1837 const DebugLoc &DL = I.getDebugLoc();
1838
1839 MachineOperand &Dst = I.getOperand(0);
1840 Register DstReg = Dst.getReg();
1841 unsigned Depth = I.getOperand(2).getImm();
1842
1843 const TargetRegisterClass *RC
1844 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1845 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1846 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1847 return false;
1848
1849 // Check for kernel and shader functions
1850 if (Depth != 0 ||
1851 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1852 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1853 .addImm(0);
1854 I.eraseFromParent();
1855 return true;
1856 }
1857
1858 MachineFrameInfo &MFI = MF.getFrameInfo();
1859 // There is a call to @llvm.returnaddress in this function
1860 MFI.setReturnAddressIsTaken(true);
1861
1862 // Get the return address reg and mark it as an implicit live-in
1863 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1864 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1865 AMDGPU::SReg_64RegClass, DL);
1866 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1867 .addReg(LiveIn);
1868 I.eraseFromParent();
1869 return true;
1870}
1871
1872bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1873 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1874 // SelectionDAG uses for wave32 vs wave64.
1875 MachineBasicBlock *BB = MI.getParent();
1876 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1877 .add(MI.getOperand(1));
1878
1879 Register Reg = MI.getOperand(1).getReg();
1880 MI.eraseFromParent();
1881
1882 if (!MRI->getRegClassOrNull(Reg))
1883 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1884 return true;
1885}
1886
1887bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1888 MachineInstr &MI, Intrinsic::ID IntrID) const {
1889 MachineBasicBlock *MBB = MI.getParent();
1890 MachineFunction *MF = MBB->getParent();
1891 const DebugLoc &DL = MI.getDebugLoc();
1892
1893 unsigned IndexOperand = MI.getOperand(7).getImm();
1894 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1895 bool WaveDone = MI.getOperand(9).getImm() != 0;
1896
1897 if (WaveDone && !WaveRelease) {
1898 // TODO: Move this to IR verifier
1899 const Function &Fn = MF->getFunction();
1900 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1901 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1902 }
1903
1904 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1905 IndexOperand &= ~0x3f;
1906 unsigned CountDw = 0;
1907
1908 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1909 CountDw = (IndexOperand >> 24) & 0xf;
1910 IndexOperand &= ~(0xf << 24);
1911
1912 if (CountDw < 1 || CountDw > 4) {
1913 const Function &Fn = MF->getFunction();
1914 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1915 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1916 CountDw = 1;
1917 }
1918 }
1919
1920 if (IndexOperand) {
1921 const Function &Fn = MF->getFunction();
1922 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1923 Fn, "ds_ordered_count: bad index operand", DL));
1924 }
1925
1926 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1927 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1928
1929 unsigned Offset0 = OrderedCountIndex << 2;
1930 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1931
1932 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1933 Offset1 |= (CountDw - 1) << 6;
1934
1935 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1936 Offset1 |= ShaderType << 2;
1937
1938 unsigned Offset = Offset0 | (Offset1 << 8);
1939
1940 Register M0Val = MI.getOperand(2).getReg();
1941 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1942 .addReg(M0Val);
1943
1944 Register DstReg = MI.getOperand(0).getReg();
1945 Register ValReg = MI.getOperand(3).getReg();
1946 MachineInstrBuilder DS =
1947 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1948 .addReg(ValReg)
1949 .addImm(Offset)
1950 .cloneMemRefs(MI);
1951
1952 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1953 return false;
1954
1955 constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1956 MI.eraseFromParent();
1957 return true;
1958}
1959
1960static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1961 switch (IntrID) {
1962 case Intrinsic::amdgcn_ds_gws_init:
1963 return AMDGPU::DS_GWS_INIT;
1964 case Intrinsic::amdgcn_ds_gws_barrier:
1965 return AMDGPU::DS_GWS_BARRIER;
1966 case Intrinsic::amdgcn_ds_gws_sema_v:
1967 return AMDGPU::DS_GWS_SEMA_V;
1968 case Intrinsic::amdgcn_ds_gws_sema_br:
1969 return AMDGPU::DS_GWS_SEMA_BR;
1970 case Intrinsic::amdgcn_ds_gws_sema_p:
1971 return AMDGPU::DS_GWS_SEMA_P;
1972 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1973 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1974 default:
1975 llvm_unreachable("not a gws intrinsic");
1976 }
1977}
1978
1979bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1980 Intrinsic::ID IID) const {
1981 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1982 !STI.hasGWSSemaReleaseAll()))
1983 return false;
1984
1985 // intrinsic ID, vsrc, offset
1986 const bool HasVSrc = MI.getNumOperands() == 3;
1987 assert(HasVSrc || MI.getNumOperands() == 2);
1988
1989 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1990 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1991 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1992 return false;
1993
1994 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1995 unsigned ImmOffset;
1996
1997 MachineBasicBlock *MBB = MI.getParent();
1998 const DebugLoc &DL = MI.getDebugLoc();
1999
2000 MachineInstr *Readfirstlane = nullptr;
2001
2002 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
2003 // incoming offset, in case there's an add of a constant. We'll have to put it
2004 // back later.
2005 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
2006 Readfirstlane = OffsetDef;
2007 BaseOffset = OffsetDef->getOperand(1).getReg();
2008 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
2009 }
2010
2011 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
2012 // If we have a constant offset, try to use the 0 in m0 as the base.
2013 // TODO: Look into changing the default m0 initialization value. If the
2014 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2015 // the immediate offset.
2016
2017 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
2018 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2019 .addImm(0);
2020 } else {
2021 std::tie(BaseOffset, ImmOffset) =
2022 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
2023
2024 if (Readfirstlane) {
2025 // We have the constant offset now, so put the readfirstlane back on the
2026 // variable component.
2027 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
2028 return false;
2029
2030 Readfirstlane->getOperand(1).setReg(BaseOffset);
2031 BaseOffset = Readfirstlane->getOperand(0).getReg();
2032 } else {
2033 if (!RBI.constrainGenericRegister(BaseOffset,
2034 AMDGPU::SReg_32RegClass, *MRI))
2035 return false;
2036 }
2037
2038 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2039 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
2040 .addReg(BaseOffset)
2041 .addImm(16)
2042 .setOperandDead(3); // Dead scc
2043
2044 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2045 .addReg(M0Base);
2046 }
2047
2048 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2049 // offset field) % 64. Some versions of the programming guide omit the m0
2050 // part, or claim it's from offset 0.
2051
2052 unsigned Opc = gwsIntrinToOpcode(IID);
2053 const MCInstrDesc &InstrDesc = TII.get(Opc);
2054
2055 if (HasVSrc) {
2056 Register VSrc = MI.getOperand(1).getReg();
2057
2058 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
2059 const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
2060 const TargetRegisterClass *SubRC =
2061 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
2062
2063 if (!SubRC) {
2064 // 32-bit normal case.
2065 if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
2066 return false;
2067
2068 BuildMI(*MBB, &MI, DL, InstrDesc)
2069 .addReg(VSrc)
2070 .addImm(ImmOffset)
2071 .cloneMemRefs(MI);
2072 } else {
2073 // Requires even register alignment, so create 64-bit value and pad the
2074 // top half with undef.
2075 Register DataReg = MRI->createVirtualRegister(DataRC);
2076 if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
2077 return false;
2078
2079 Register UndefReg = MRI->createVirtualRegister(SubRC);
2080 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2081 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
2082 .addReg(VSrc)
2083 .addImm(AMDGPU::sub0)
2084 .addReg(UndefReg)
2085 .addImm(AMDGPU::sub1);
2086
2087 BuildMI(*MBB, &MI, DL, InstrDesc)
2088 .addReg(DataReg)
2089 .addImm(ImmOffset)
2090 .cloneMemRefs(MI);
2091 }
2092 } else {
2093 BuildMI(*MBB, &MI, DL, InstrDesc)
2094 .addImm(ImmOffset)
2095 .cloneMemRefs(MI);
2096 }
2097
2098 MI.eraseFromParent();
2099 return true;
2100}
2101
2102bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2103 bool IsAppend) const {
2104 Register PtrBase = MI.getOperand(2).getReg();
2105 LLT PtrTy = MRI->getType(PtrBase);
2106 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2107
2108 unsigned Offset;
2109 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
2110
2111 // TODO: Should this try to look through readfirstlane like GWS?
2112 if (!isDSOffsetLegal(PtrBase, Offset)) {
2113 PtrBase = MI.getOperand(2).getReg();
2114 Offset = 0;
2115 }
2116
2117 MachineBasicBlock *MBB = MI.getParent();
2118 const DebugLoc &DL = MI.getDebugLoc();
2119 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2120
2121 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2122 .addReg(PtrBase);
2123 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2124 return false;
2125
2126 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
2127 .addImm(Offset)
2128 .addImm(IsGDS ? -1 : 0)
2129 .cloneMemRefs(MI);
2130 MI.eraseFromParent();
2131 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2132 return true;
2133}
2134
2135bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2136 MachineFunction *MF = MI.getMF();
2137 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2138
2139 MFInfo->setInitWholeWave();
2140 return selectImpl(MI, *CoverageInfo);
2141}
2142
/// Decode a texfailctrl immediate.
///
/// Bit 0 is reported through \p TFE and bit 1 through \p LWE. \p IsTexFail is
/// set to true when \p TexFailCtrl is non-zero and is left untouched
/// otherwise.
///
/// \returns true if no bits other than bit 0 and bit 1 are set.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  constexpr uint64_t TFEBit = 0x1;
  constexpr uint64_t LWEBit = 0x2;

  if (TexFailCtrl != 0)
    IsTexFail = true;

  TFE = (TexFailCtrl & TFEBit) != 0;
  LWE = (TexFailCtrl & LWEBit) != 0;

  const uint64_t Unknown = TexFailCtrl & ~(TFEBit | LWEBit);
  return Unknown == 0;
}
2155
2156bool AMDGPUInstructionSelector::selectImageIntrinsic(
2157 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2158 MachineBasicBlock *MBB = MI.getParent();
2159 const DebugLoc &DL = MI.getDebugLoc();
2160 unsigned IntrOpcode = Intr->BaseOpcode;
2161
2162 // For image atomic: use no-return opcode if result is unused.
2163 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2164 Register ResultDef = MI.getOperand(0).getReg();
2165 if (MRI->use_nodbg_empty(ResultDef))
2166 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2167 }
2168
2169 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2171
2172 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2173 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2174 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2175 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2176 const bool IsGFX13Plus = AMDGPU::isGFX13Plus(STI);
2177
2178 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2179
2180 Register VDataIn = AMDGPU::NoRegister;
2181 Register VDataOut = AMDGPU::NoRegister;
2182 LLT VDataTy;
2183 int NumVDataDwords = -1;
2184 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2185 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2186
2187 bool Unorm;
2188 if (!BaseOpcode->Sampler)
2189 Unorm = true;
2190 else
2191 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2192
2193 bool TFE;
2194 bool LWE;
2195 bool IsTexFail = false;
2196 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2197 TFE, LWE, IsTexFail))
2198 return false;
2199
2200 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2201 const bool IsA16 = (Flags & 1) != 0;
2202 const bool IsG16 = (Flags & 2) != 0;
2203
2204 // A16 implies 16 bit gradients if subtarget doesn't support G16
2205 if (IsA16 && !STI.hasG16() && !IsG16)
2206 return false;
2207
2208 unsigned DMask = 0;
2209 unsigned DMaskLanes = 0;
2210
2211 if (BaseOpcode->Atomic) {
2212 if (!BaseOpcode->NoReturn)
2213 VDataOut = MI.getOperand(0).getReg();
2214 VDataIn = MI.getOperand(2).getReg();
2215 LLT Ty = MRI->getType(VDataIn);
2216
2217 // Be careful to allow atomic swap on 16-bit element vectors.
2218 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2219 Ty.getSizeInBits() == 128 :
2220 Ty.getSizeInBits() == 64;
2221
2222 if (BaseOpcode->AtomicX2) {
2223 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2224
2225 DMask = Is64Bit ? 0xf : 0x3;
2226 NumVDataDwords = Is64Bit ? 4 : 2;
2227 } else {
2228 DMask = Is64Bit ? 0x3 : 0x1;
2229 NumVDataDwords = Is64Bit ? 2 : 1;
2230 }
2231 } else {
2232 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2233 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2234
2235 if (BaseOpcode->Store) {
2236 VDataIn = MI.getOperand(1).getReg();
2237 VDataTy = MRI->getType(VDataIn);
2238 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2239 } else if (BaseOpcode->NoReturn) {
2240 NumVDataDwords = 0;
2241 } else {
2242 VDataOut = MI.getOperand(0).getReg();
2243 VDataTy = MRI->getType(VDataOut);
2244 NumVDataDwords = DMaskLanes;
2245
2246 if (IsD16 && !STI.hasUnpackedD16VMem())
2247 NumVDataDwords = (DMaskLanes + 1) / 2;
2248 }
2249 }
2250
2251 // Set G16 opcode
2252 if (Subtarget->hasG16() && IsG16) {
2253 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2255 assert(G16MappingInfo);
2256 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2257 }
2258
2259 // TODO: Check this in verifier.
2260 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2261
2262 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2263 // Keep GLC only when the atomic's result is actually used.
2264 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2266 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2268 return false;
2269
2270 int NumVAddrRegs = 0;
2271 int NumVAddrDwords = 0;
2272 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2273 // Skip the $noregs and 0s inserted during legalization.
2274 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2275 if (!AddrOp.isReg())
2276 continue; // XXX - Break?
2277
2278 Register Addr = AddrOp.getReg();
2279 if (!Addr)
2280 break;
2281
2282 ++NumVAddrRegs;
2283 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2284 }
2285
2286 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2287 // NSA, these should have been packed into a single value in the first
2288 // address register
2289 const bool UseNSA =
2290 NumVAddrRegs != 1 &&
2291 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2292 : NumVAddrDwords == NumVAddrRegs);
2293 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2294 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2295 return false;
2296 }
2297
2298 if (IsTexFail)
2299 ++NumVDataDwords;
2300
2301 int Opcode = -1;
2302 if (IsGFX13Plus) {
2303 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx13,
2304 NumVDataDwords, NumVAddrDwords);
2305 } else if (IsGFX12Plus) {
2306 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2307 NumVDataDwords, NumVAddrDwords);
2308 } else if (IsGFX11Plus) {
2309 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2310 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2311 : AMDGPU::MIMGEncGfx11Default,
2312 NumVDataDwords, NumVAddrDwords);
2313 } else if (IsGFX10Plus) {
2314 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2315 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2316 : AMDGPU::MIMGEncGfx10Default,
2317 NumVDataDwords, NumVAddrDwords);
2318 } else {
2319 if (Subtarget->hasGFX90AInsts()) {
2320 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2321 NumVDataDwords, NumVAddrDwords);
2322 if (Opcode == -1) {
2323 LLVM_DEBUG(
2324 dbgs()
2325 << "requested image instruction is not supported on this GPU\n");
2326 return false;
2327 }
2328 }
2329 if (Opcode == -1 &&
2330 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2331 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2332 NumVDataDwords, NumVAddrDwords);
2333 if (Opcode == -1)
2334 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2335 NumVDataDwords, NumVAddrDwords);
2336 }
2337 if (Opcode == -1)
2338 return false;
2339
2340 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2341 .cloneMemRefs(MI);
2342
2343 if (VDataOut) {
2344 if (BaseOpcode->AtomicX2) {
2345 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2346
2347 Register TmpReg = MRI->createVirtualRegister(
2348 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2349 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2350
2351 MIB.addDef(TmpReg);
2352 if (!MRI->use_empty(VDataOut)) {
2353 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2354 .addReg(TmpReg, RegState::Kill, SubReg);
2355 }
2356
2357 } else {
2358 MIB.addDef(VDataOut); // vdata output
2359 }
2360 }
2361
2362 if (VDataIn)
2363 MIB.addReg(VDataIn); // vdata input
2364
2365 for (int I = 0; I != NumVAddrRegs; ++I) {
2366 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2367 if (SrcOp.isReg()) {
2368 assert(SrcOp.getReg() != 0);
2369 MIB.addReg(SrcOp.getReg());
2370 }
2371 }
2372
2373 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2374 if (BaseOpcode->Sampler)
2375 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2376
2377 MIB.addImm(DMask); // dmask
2378
2379 if (IsGFX10Plus)
2380 MIB.addImm(DimInfo->Encoding);
2381 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2382 MIB.addImm(Unorm);
2383
2384 MIB.addImm(CPol);
2385 MIB.addImm(IsA16 && // a16 or r128
2386 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2387 if (IsGFX10Plus)
2388 MIB.addImm(IsA16 ? -1 : 0);
2389
2390 if (!Subtarget->hasGFX90AInsts()) {
2391 MIB.addImm(TFE); // tfe
2392 } else if (TFE) {
2393 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2394 return false;
2395 }
2396
2397 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2398 MIB.addImm(LWE); // lwe
2399 if (!IsGFX10Plus)
2400 MIB.addImm(DimInfo->DA ? -1 : 0);
2401 if (BaseOpcode->HasD16)
2402 MIB.addImm(IsD16 ? -1 : 0);
2403
2404 MI.eraseFromParent();
2405 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2406 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2407 return true;
2408}
2409
2410// We need to handle this here because tablegen doesn't support matching
2411// instructions with multiple outputs.
2412bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2413 MachineInstr &MI) const {
2414 Register Dst0 = MI.getOperand(0).getReg();
2415 Register Dst1 = MI.getOperand(1).getReg();
2416
2417 const DebugLoc &DL = MI.getDebugLoc();
2418 MachineBasicBlock *MBB = MI.getParent();
2419
2420 Register Addr = MI.getOperand(3).getReg();
2421 Register Data0 = MI.getOperand(4).getReg();
2422 Register Data1 = MI.getOperand(5).getReg();
2423 unsigned Offset = MI.getOperand(6).getImm();
2424
2425 unsigned Opc;
2426 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2427 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2428 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2429 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2430 break;
2431 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2432 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2433 break;
2434 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2435 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2436 break;
2437 }
2438
2439 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2440 .addDef(Dst1)
2441 .addUse(Addr)
2442 .addUse(Data0)
2443 .addUse(Data1)
2444 .addImm(Offset)
2445 .cloneMemRefs(MI);
2446
2447 MI.eraseFromParent();
2448 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2449 return true;
2450}
2451
2452bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2453 MachineInstr &I) const {
2454 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2455 switch (IntrinsicID) {
2456 case Intrinsic::amdgcn_end_cf:
2457 return selectEndCfIntrinsic(I);
2458 case Intrinsic::amdgcn_ds_ordered_add:
2459 case Intrinsic::amdgcn_ds_ordered_swap:
2460 return selectDSOrderedIntrinsic(I, IntrinsicID);
2461 case Intrinsic::amdgcn_ds_gws_init:
2462 case Intrinsic::amdgcn_ds_gws_barrier:
2463 case Intrinsic::amdgcn_ds_gws_sema_v:
2464 case Intrinsic::amdgcn_ds_gws_sema_br:
2465 case Intrinsic::amdgcn_ds_gws_sema_p:
2466 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2467 return selectDSGWSIntrinsic(I, IntrinsicID);
2468 case Intrinsic::amdgcn_ds_append:
2469 return selectDSAppendConsume(I, true);
2470 case Intrinsic::amdgcn_ds_consume:
2471 return selectDSAppendConsume(I, false);
2472 case Intrinsic::amdgcn_init_whole_wave:
2473 return selectInitWholeWave(I);
2474 case Intrinsic::amdgcn_raw_buffer_load_lds:
2475 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
2476 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2477 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
2478 case Intrinsic::amdgcn_struct_buffer_load_lds:
2479 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
2480 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2481 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
2482 return selectBufferLoadLds(I);
2483 // Until we can store both the address space of the global and the LDS
2484 // arguments by having tto MachineMemOperands on an intrinsic, we just trust
2485 // that the argument is a global pointer (buffer pointers have been handled by
2486 // a LLVM IR-level lowering).
2487 case Intrinsic::amdgcn_load_to_lds:
2488 case Intrinsic::amdgcn_load_async_to_lds:
2489 case Intrinsic::amdgcn_global_load_lds:
2490 case Intrinsic::amdgcn_global_load_async_lds:
2491 return selectGlobalLoadLds(I);
2492 case Intrinsic::amdgcn_tensor_load_to_lds:
2493 case Intrinsic::amdgcn_tensor_store_from_lds:
2494 return selectTensorLoadStore(I, IntrinsicID);
2495 case Intrinsic::amdgcn_asyncmark:
2496 case Intrinsic::amdgcn_wait_asyncmark:
2497 if (!Subtarget->hasAsyncMark())
2498 return false;
2499 break;
2500 case Intrinsic::amdgcn_exp_compr:
2501 if (!STI.hasCompressedExport()) {
2503 return false;
2504 }
2505 break;
2506 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2507 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2508 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2509 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2510 return selectDSBvhStackIntrinsic(I);
2511 case Intrinsic::amdgcn_s_alloc_vgpr: {
2512 // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets
2513 // SCC. We then need to COPY it into the result vreg.
2514 MachineBasicBlock *MBB = I.getParent();
2515 const DebugLoc &DL = I.getDebugLoc();
2516
2517 Register ResReg = I.getOperand(0).getReg();
2518
2519 MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
2520 .add(I.getOperand(2));
2521 (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
2522 .addReg(AMDGPU::SCC);
2523 I.eraseFromParent();
2524 constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI);
2525 return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
2526 }
2527 case Intrinsic::amdgcn_s_barrier_init:
2528 case Intrinsic::amdgcn_s_barrier_signal_var:
2529 return selectNamedBarrierInit(I, IntrinsicID);
2530 case Intrinsic::amdgcn_s_wakeup_barrier: {
2531 if (!STI.hasSWakeupBarrier()) {
2533 return false;
2534 }
2535 return selectNamedBarrierInst(I, IntrinsicID);
2536 }
2537 case Intrinsic::amdgcn_s_barrier_join:
2538 case Intrinsic::amdgcn_s_get_named_barrier_state:
2539 return selectNamedBarrierInst(I, IntrinsicID);
2540 case Intrinsic::amdgcn_s_get_barrier_state:
2541 return selectSGetBarrierState(I, IntrinsicID);
2542 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2543 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2544 }
2545 return selectImpl(I, *CoverageInfo);
2546}
2547
2548bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2549 if (selectImpl(I, *CoverageInfo))
2550 return true;
2551
2552 MachineBasicBlock *BB = I.getParent();
2553 const DebugLoc &DL = I.getDebugLoc();
2554
2555 Register DstReg = I.getOperand(0).getReg();
2556 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2557 assert(Size <= 32 || Size == 64);
2558 const MachineOperand &CCOp = I.getOperand(1);
2559 Register CCReg = CCOp.getReg();
2560 if (!isVCC(CCReg, *MRI)) {
2561 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2562 AMDGPU::S_CSELECT_B32;
2563 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2564 .addReg(CCReg);
2565
2566 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2567 // bank, because it does not cover the register class that we used to represent
2568 // for it. So we need to manually set the register class here.
2569 if (!MRI->getRegClassOrNull(CCReg))
2570 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2571 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2572 .add(I.getOperand(2))
2573 .add(I.getOperand(3));
2574
2576 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2577 I.eraseFromParent();
2578 return true;
2579 }
2580
2581 // Wide VGPR select should have been split in RegBankSelect.
2582 if (Size > 32)
2583 return false;
2584
2585 MachineInstr *Select =
2586 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2587 .addImm(0)
2588 .add(I.getOperand(3))
2589 .addImm(0)
2590 .add(I.getOperand(2))
2591 .add(I.getOperand(1));
2592
2594 I.eraseFromParent();
2595 return true;
2596}
2597
2598bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2599 Register DstReg = I.getOperand(0).getReg();
2600 Register SrcReg = I.getOperand(1).getReg();
2601 const LLT DstTy = MRI->getType(DstReg);
2602 const LLT SrcTy = MRI->getType(SrcReg);
2603 const LLT S1 = LLT::scalar(1);
2604
2605 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2606 const RegisterBank *DstRB;
2607 if (DstTy == S1) {
2608 // This is a special case. We don't treat s1 for legalization artifacts as
2609 // vcc booleans.
2610 DstRB = SrcRB;
2611 } else {
2612 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2613 if (SrcRB != DstRB)
2614 return false;
2615 }
2616
2617 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2618
2619 unsigned DstSize = DstTy.getSizeInBits();
2620 unsigned SrcSize = SrcTy.getSizeInBits();
2621
2622 const TargetRegisterClass *SrcRC =
2623 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2624 const TargetRegisterClass *DstRC =
2625 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2626 if (!SrcRC || !DstRC)
2627 return false;
2628
2629 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2630 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2631 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2632 return false;
2633 }
2634
2635 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2636 assert(STI.useRealTrue16Insts());
2637 const DebugLoc &DL = I.getDebugLoc();
2638 MachineBasicBlock *MBB = I.getParent();
2639 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2640 .addReg(SrcReg, {}, AMDGPU::lo16);
2641 I.eraseFromParent();
2642 return true;
2643 }
2644
2645 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2646 MachineBasicBlock *MBB = I.getParent();
2647 const DebugLoc &DL = I.getDebugLoc();
2648
2649 Register LoReg = MRI->createVirtualRegister(DstRC);
2650 Register HiReg = MRI->createVirtualRegister(DstRC);
2651 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2652 .addReg(SrcReg, {}, AMDGPU::sub0);
2653 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2654 .addReg(SrcReg, {}, AMDGPU::sub1);
2655
2656 if (IsVALU && STI.hasSDWA()) {
2657 // Write the low 16-bits of the high element into the high 16-bits of the
2658 // low element.
2659 MachineInstr *MovSDWA =
2660 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2661 .addImm(0) // $src0_modifiers
2662 .addReg(HiReg) // $src0
2663 .addImm(0) // $clamp
2664 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2665 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2666 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2667 .addReg(LoReg, RegState::Implicit);
2668 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2669 } else {
2670 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2671 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2672 Register ImmReg = MRI->createVirtualRegister(DstRC);
2673 if (IsVALU) {
2674 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2675 .addImm(16)
2676 .addReg(HiReg);
2677 } else {
2678 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2679 .addReg(HiReg)
2680 .addImm(16)
2681 .setOperandDead(3); // Dead scc
2682 }
2683
2684 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2685 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2686 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2687
2688 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2689 .addImm(0xffff);
2690 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2691 .addReg(LoReg)
2692 .addReg(ImmReg);
2693 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2694 .addReg(TmpReg0)
2695 .addReg(TmpReg1);
2696
2697 if (!IsVALU) {
2698 And.setOperandDead(3); // Dead scc
2699 Or.setOperandDead(3); // Dead scc
2700 }
2701 }
2702
2703 I.eraseFromParent();
2704 return true;
2705 }
2706
2707 if (!DstTy.isScalar())
2708 return false;
2709
2710 if (SrcSize > 32) {
2711 unsigned SubRegIdx = DstSize < 32
2712 ? static_cast<unsigned>(AMDGPU::sub0)
2713 : TRI.getSubRegFromChannel(0, DstSize / 32);
2714 if (SubRegIdx == AMDGPU::NoSubRegister)
2715 return false;
2716
2717 // Deal with weird cases where the class only partially supports the subreg
2718 // index.
2719 const TargetRegisterClass *SrcWithSubRC
2720 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2721 if (!SrcWithSubRC)
2722 return false;
2723
2724 if (SrcWithSubRC != SrcRC) {
2725 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2726 return false;
2727 }
2728
2729 I.getOperand(1).setSubReg(SubRegIdx);
2730 }
2731
2732 I.setDesc(TII.get(TargetOpcode::COPY));
2733 return true;
2734}
2735
/// \returns true if a bitmask for \p Size bits will be an inline immediate.
///
/// \p Mask is an out-parameter. It always receives the mask with the low
/// \p Size bits set, regardless of the returned value.
static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
  // Compute the low-bit mask before it is inspected. Sizes of 32 or more
  // saturate to all-ones so the shift is never by the full width of the type.
  Mask = Size >= 32 ? ~0u : (1u << Size) - 1u;

  // The range test is done on the signed interpretation of the mask, so an
  // all-ones mask (-1) is accepted as well.
  const int SignedMask = static_cast<int>(Mask);
  return SignedMask >= -16 && SignedMask <= 64;
}
2742
2743// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2744const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2745 Register Reg, const MachineRegisterInfo &MRI,
2746 const TargetRegisterInfo &TRI) const {
2747 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2748 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2749 return RB;
2750
2751 // Ignore the type, since we don't use vcc in artifacts.
2752 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2753 return &RBI.getRegBankFromRegClass(*RC, LLT());
2754 return nullptr;
2755}
2756
2757bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2758 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2759 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2760 const DebugLoc &DL = I.getDebugLoc();
2761 MachineBasicBlock &MBB = *I.getParent();
2762 const Register DstReg = I.getOperand(0).getReg();
2763 const Register SrcReg = I.getOperand(1).getReg();
2764
2765 const LLT DstTy = MRI->getType(DstReg);
2766 const LLT SrcTy = MRI->getType(SrcReg);
2767 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2768 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2769 const unsigned DstSize = DstTy.getSizeInBits();
2770 if (!DstTy.isScalar())
2771 return false;
2772
2773 // Artifact casts should never use vcc.
2774 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2775
2776 // FIXME: This should probably be illegal and split earlier.
2777 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2778 if (DstSize <= 32)
2779 return selectCOPY(I);
2780
2781 const TargetRegisterClass *SrcRC =
2782 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2783 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2784 const TargetRegisterClass *DstRC =
2785 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2786
2787 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2788 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2789 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2790 .addReg(SrcReg)
2791 .addImm(AMDGPU::sub0)
2792 .addReg(UndefReg)
2793 .addImm(AMDGPU::sub1);
2794 I.eraseFromParent();
2795
2796 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2797 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2798 }
2799
2800 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2801 // 64-bit should have been split up in RegBankSelect
2802
2803 // Try to use an and with a mask if it will save code size.
2804 unsigned Mask;
2805 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2806 MachineInstr *ExtI =
2807 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2808 .addImm(Mask)
2809 .addReg(SrcReg);
2810 I.eraseFromParent();
2811 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2812 return true;
2813 }
2814
2815 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2816 MachineInstr *ExtI =
2817 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2818 .addReg(SrcReg)
2819 .addImm(0) // Offset
2820 .addImm(SrcSize); // Width
2821 I.eraseFromParent();
2822 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2823 return true;
2824 }
2825
2826 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2827 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2828 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2829 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2830 return false;
2831
2832 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2833 const unsigned SextOpc = SrcSize == 8 ?
2834 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2835 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2836 .addReg(SrcReg);
2837 I.eraseFromParent();
2838 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2839 }
2840
2841 // Using a single 32-bit SALU to calculate the high half is smaller than
2842 // S_BFE with a literal constant operand.
2843 if (DstSize > 32 && SrcSize == 32) {
2844 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2845 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2846 if (Signed) {
2847 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2848 .addReg(SrcReg, {}, SubReg)
2849 .addImm(31)
2850 .setOperandDead(3); // Dead scc
2851 } else {
2852 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2853 .addImm(0);
2854 }
2855 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2856 .addReg(SrcReg, {}, SubReg)
2857 .addImm(AMDGPU::sub0)
2858 .addReg(HiReg)
2859 .addImm(AMDGPU::sub1);
2860 I.eraseFromParent();
2861 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2862 *MRI);
2863 }
2864
2865 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2866 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2867
2868 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2869 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2870 // We need a 64-bit register source, but the high bits don't matter.
2871 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2872 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2873 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2874
2875 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2876 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2877 .addReg(SrcReg, {}, SubReg)
2878 .addImm(AMDGPU::sub0)
2879 .addReg(UndefReg)
2880 .addImm(AMDGPU::sub1);
2881
2882 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2883 .addReg(ExtReg)
2884 .addImm(SrcSize << 16);
2885
2886 I.eraseFromParent();
2887 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2888 }
2889
2890 unsigned Mask;
2891 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2892 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2893 .addReg(SrcReg)
2894 .addImm(Mask)
2895 .setOperandDead(3); // Dead scc
2896 } else {
2897 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2898 .addReg(SrcReg)
2899 .addImm(SrcSize << 16);
2900 }
2901
2902 I.eraseFromParent();
2903 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2904 }
2905
2906 return false;
2907}
2908
2912
2914 Register BitcastSrc;
2915 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2916 Reg = BitcastSrc;
2917 return Reg;
2918}
2919
2921 Register &Out) {
2922 // When unmerging a register that is composed of 2 x 16-bit values allow to
2923 // use an extract hi instruction for the upper 16 bits. We only need to check
2924 // the size of `In` as all defs are guaranteed to be the same type for
2925 // GUnmerge.
2926 if (auto *Unmerge = dyn_cast<GUnmerge>(MRI.getVRegDef(In))) {
2927 if (Unmerge->getNumDefs() == 2 && Unmerge->getOperand(1).getReg() == In &&
2928 MRI.getType(In).getSizeInBits() == 16) {
2929 Out = Unmerge->getSourceReg();
2930 return true;
2931 }
2932 }
2933
2934 Register Trunc;
2935 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2936 return false;
2937
2938 Register LShlSrc;
2939 Register Cst;
2940 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2941 Cst = stripCopy(Cst, MRI);
2942 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2943 Out = stripBitCast(LShlSrc, MRI);
2944 return true;
2945 }
2946 }
2947
2948 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2949 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2950 return false;
2951
2952 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2953 LLT::fixed_vector(2, 16));
2954
2955 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2956 assert(Mask.size() == 2);
2957
2958 if (Mask[0] == 1 && Mask[1] <= 1) {
2959 Out = Shuffle->getOperand(0).getReg();
2960 return true;
2961 }
2962
2963 return false;
2964}
2965
2966bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2967 if (!Subtarget->hasSALUFloatInsts())
2968 return false;
2969
2970 Register Dst = I.getOperand(0).getReg();
2971 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2972 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2973 return false;
2974
2975 Register Src = I.getOperand(1).getReg();
2976
2977 if (MRI->getType(Dst) == LLT::scalar(32) &&
2978 MRI->getType(Src) == LLT::scalar(16)) {
2979 if (isExtractHiElt(*MRI, Src, Src)) {
2980 MachineBasicBlock *BB = I.getParent();
2981 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2982 .addUse(Src);
2983 I.eraseFromParent();
2984 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2985 }
2986 }
2987
2988 return false;
2989}
2990
2991bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2992 // Only manually handle the f64 SGPR case.
2993 //
2994 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2995 // the bit ops theoretically have a second result due to the implicit def of
2996 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2997 // that is easy by disabling the check. The result works, but uses a
2998 // nonsensical sreg32orlds_and_sreg_1 regclass.
2999 //
3000 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
3001 // the variadic REG_SEQUENCE operands.
3002
3003 Register Dst = MI.getOperand(0).getReg();
3004 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
3005 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
3006 MRI->getType(Dst) != LLT::scalar(64))
3007 return false;
3008
3009 Register Src = MI.getOperand(1).getReg();
3010 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
3011 if (Fabs)
3012 Src = Fabs->getOperand(1).getReg();
3013
3014 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
3015 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
3016 return false;
3017
3018 MachineBasicBlock *BB = MI.getParent();
3019 const DebugLoc &DL = MI.getDebugLoc();
3020 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3021 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3022 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3023 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3024
3025 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
3026 .addReg(Src, {}, AMDGPU::sub0);
3027 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
3028 .addReg(Src, {}, AMDGPU::sub1);
3029 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
3030 .addImm(0x80000000);
3031
3032 // Set or toggle sign bit.
3033 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
3034 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
3035 .addReg(HiReg)
3036 .addReg(ConstReg)
3037 .setOperandDead(3); // Dead scc
3038 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
3039 .addReg(LoReg)
3040 .addImm(AMDGPU::sub0)
3041 .addReg(OpReg)
3042 .addImm(AMDGPU::sub1);
3043 MI.eraseFromParent();
3044 return true;
3045}
3046
3047// FIXME: This is a workaround for the same tablegen problems as G_FNEG
3048bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
3049 Register Dst = MI.getOperand(0).getReg();
3050 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
3051 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
3052 MRI->getType(Dst) != LLT::scalar(64))
3053 return false;
3054
3055 Register Src = MI.getOperand(1).getReg();
3056 MachineBasicBlock *BB = MI.getParent();
3057 const DebugLoc &DL = MI.getDebugLoc();
3058 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3059 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3060 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3061 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3062
3063 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
3064 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
3065 return false;
3066
3067 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
3068 .addReg(Src, {}, AMDGPU::sub0);
3069 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
3070 .addReg(Src, {}, AMDGPU::sub1);
3071 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
3072 .addImm(0x7fffffff);
3073
3074 // Clear sign bit.
3075 // TODO: Should this used S_BITSET0_*?
3076 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
3077 .addReg(HiReg)
3078 .addReg(ConstReg)
3079 .setOperandDead(3); // Dead scc
3080 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
3081 .addReg(LoReg)
3082 .addImm(AMDGPU::sub0)
3083 .addReg(OpReg)
3084 .addImm(AMDGPU::sub1);
3085
3086 MI.eraseFromParent();
3087 return true;
3088}
3089
3090static bool isConstant(const MachineInstr &MI) {
3091 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
3092}
3093
3094void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
3095 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
3096
3097 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
3098 const MachineInstr *PtrMI =
3099 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
3100
3101 assert(PtrMI);
3102
3103 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
3104 return;
3105
3106 GEPInfo GEPInfo;
3107
3108 for (unsigned i = 1; i != 3; ++i) {
3109 const MachineOperand &GEPOp = PtrMI->getOperand(i);
3110 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
3111 assert(OpDef);
3112 if (i == 2 && isConstant(*OpDef)) {
3113 // TODO: Could handle constant base + variable offset, but a combine
3114 // probably should have commuted it.
3115 assert(GEPInfo.Imm == 0);
3116 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
3117 continue;
3118 }
3119 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
3120 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
3121 GEPInfo.SgprParts.push_back(GEPOp.getReg());
3122 else
3123 GEPInfo.VgprParts.push_back(GEPOp.getReg());
3124 }
3125
3126 AddrInfo.push_back(GEPInfo);
3127 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
3128}
3129
3130bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
3131 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
3132}
3133
3134bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
3135 if (!MI.hasOneMemOperand())
3136 return false;
3137
3138 const MachineMemOperand *MMO = *MI.memoperands_begin();
3139 const Value *Ptr = MMO->getValue();
3140
3141 // UndefValue means this is a load of a kernel input. These are uniform.
3142 // Sometimes LDS instructions have constant pointers.
3143 // If Ptr is null, then that means this mem operand contains a
3144 // PseudoSourceValue like GOT.
3146 return true;
3147
3149 return true;
3150
3151 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3152 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
3153 AMDGPU::SGPRRegBankID;
3154
3155 const Instruction *I = dyn_cast<Instruction>(Ptr);
3156 return I && I->getMetadata("amdgpu.uniform");
3157}
3158
3159bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3160 for (const GEPInfo &GEPInfo : AddrInfo) {
3161 if (!GEPInfo.VgprParts.empty())
3162 return true;
3163 }
3164 return false;
3165}
3166
3167void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3168 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
3169 unsigned AS = PtrTy.getAddressSpace();
3171 STI.ldsRequiresM0Init()) {
3172 MachineBasicBlock *BB = I.getParent();
3173
3174 // If DS instructions require M0 initialization, insert it before selecting.
3175 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3176 .addImm(-1);
3177 }
3178}
3179
3180bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3181 MachineInstr &I) const {
3182 initM0(I);
3183 return selectImpl(I, *CoverageInfo);
3184}
3185
3187 if (Reg.isPhysical())
3188 return false;
3189
3191 const unsigned Opcode = MI.getOpcode();
3192
3193 if (Opcode == AMDGPU::COPY)
3194 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3195
3196 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3197 Opcode == AMDGPU::G_XOR)
3198 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3199 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3200
3201 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3202 return GI->is(Intrinsic::amdgcn_class);
3203
3204 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3205}
3206
3207bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3208 MachineBasicBlock *BB = I.getParent();
3209 MachineOperand &CondOp = I.getOperand(0);
3210 Register CondReg = CondOp.getReg();
3211 const DebugLoc &DL = I.getDebugLoc();
3212
3213 unsigned BrOpcode;
3214 Register CondPhysReg;
3215 const TargetRegisterClass *ConstrainRC;
3216
3217 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3218 // whether the branch is uniform when selecting the instruction. In
3219 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3220 // RegBankSelect knows what it's doing if the branch condition is scc, even
3221 // though it currently does not.
3222 if (!isVCC(CondReg, *MRI)) {
3223 if (MRI->getType(CondReg) != LLT::scalar(32))
3224 return false;
3225
3226 CondPhysReg = AMDGPU::SCC;
3227 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3228 ConstrainRC = &AMDGPU::SReg_32RegClass;
3229 } else {
3230 // FIXME: Should scc->vcc copies and with exec?
3231
3232 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3233 // need to insert an and with exec.
3234 if (!isVCmpResult(CondReg, *MRI)) {
3235 const bool Is64 = STI.isWave64();
3236 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3237 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3238
3239 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3240 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3241 .addReg(CondReg)
3242 .addReg(Exec)
3243 .setOperandDead(3); // Dead scc
3244 CondReg = TmpReg;
3245 }
3246
3247 CondPhysReg = TRI.getVCC();
3248 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3249 ConstrainRC = TRI.getBoolRC();
3250 }
3251
3252 if (!MRI->getRegClassOrNull(CondReg))
3253 MRI->setRegClass(CondReg, ConstrainRC);
3254
3255 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3256 .addReg(CondReg);
3257 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3258 .addMBB(I.getOperand(1).getMBB());
3259
3260 I.eraseFromParent();
3261 return true;
3262}
3263
3264bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3265 MachineInstr &I) const {
3266 Register DstReg = I.getOperand(0).getReg();
3267 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3268 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3269 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3270 if (IsVGPR)
3271 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3272
3273 return RBI.constrainGenericRegister(
3274 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3275}
3276
3277bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3278 Register DstReg = I.getOperand(0).getReg();
3279 Register SrcReg = I.getOperand(1).getReg();
3280 Register MaskReg = I.getOperand(2).getReg();
3281 LLT Ty = MRI->getType(DstReg);
3282 LLT MaskTy = MRI->getType(MaskReg);
3283 MachineBasicBlock *BB = I.getParent();
3284 const DebugLoc &DL = I.getDebugLoc();
3285
3286 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3287 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3288 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3289 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3290 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3291 return false;
3292
3293 // Try to avoid emitting a bit operation when we only need to touch half of
3294 // the 64-bit pointer.
3295 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3296 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3297 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3298
3299 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3300 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3301
3302 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3303 !CanCopyLow32 && !CanCopyHi32) {
3304 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3305 .addReg(SrcReg)
3306 .addReg(MaskReg)
3307 .setOperandDead(3); // Dead scc
3308 I.eraseFromParent();
3309 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3310 return true;
3311 }
3312
3313 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3314 const TargetRegisterClass &RegRC
3315 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3316
3317 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3318 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3319 const TargetRegisterClass *MaskRC =
3320 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3321
3322 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3323 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3324 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3325 return false;
3326
3327 if (Ty.getSizeInBits() == 32) {
3328 assert(MaskTy.getSizeInBits() == 32 &&
3329 "ptrmask should have been narrowed during legalize");
3330
3331 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3332 .addReg(SrcReg)
3333 .addReg(MaskReg);
3334
3335 if (!IsVGPR)
3336 NewOp.setOperandDead(3); // Dead scc
3337 I.eraseFromParent();
3338 return true;
3339 }
3340
3341 Register HiReg = MRI->createVirtualRegister(&RegRC);
3342 Register LoReg = MRI->createVirtualRegister(&RegRC);
3343
3344 // Extract the subregisters from the source pointer.
3345 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3346 .addReg(SrcReg, {}, AMDGPU::sub0);
3347 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3348 .addReg(SrcReg, {}, AMDGPU::sub1);
3349
3350 Register MaskedLo, MaskedHi;
3351
3352 if (CanCopyLow32) {
3353 // If all the bits in the low half are 1, we only need a copy for it.
3354 MaskedLo = LoReg;
3355 } else {
3356 // Extract the mask subregister and apply the and.
3357 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3358 MaskedLo = MRI->createVirtualRegister(&RegRC);
3359
3360 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3361 .addReg(MaskReg, {}, AMDGPU::sub0);
3362 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3363 .addReg(LoReg)
3364 .addReg(MaskLo);
3365 }
3366
3367 if (CanCopyHi32) {
3368 // If all the bits in the high half are 1, we only need a copy for it.
3369 MaskedHi = HiReg;
3370 } else {
3371 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3372 MaskedHi = MRI->createVirtualRegister(&RegRC);
3373
3374 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3375 .addReg(MaskReg, {}, AMDGPU::sub1);
3376 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3377 .addReg(HiReg)
3378 .addReg(MaskHi);
3379 }
3380
3381 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3382 .addReg(MaskedLo)
3383 .addImm(AMDGPU::sub0)
3384 .addReg(MaskedHi)
3385 .addImm(AMDGPU::sub1);
3386 I.eraseFromParent();
3387 return true;
3388}
3389
3390/// Return the register to use for the index value, and the subregister to use
3391/// for the indirectly accessed register.
3392static std::pair<Register, unsigned>
3394 const TargetRegisterClass *SuperRC, Register IdxReg,
3395 unsigned EltSize, GISelValueTracking &ValueTracking) {
3396 Register IdxBaseReg;
3397 int Offset;
3398
3399 std::tie(IdxBaseReg, Offset) =
3400 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3401 if (IdxBaseReg == AMDGPU::NoRegister) {
3402 // This will happen if the index is a known constant. This should ordinarily
3403 // be legalized out, but handle it as a register just in case.
3404 assert(Offset == 0);
3405 IdxBaseReg = IdxReg;
3406 }
3407
3408 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3409
3410 // Skip out of bounds offsets, or else we would end up using an undefined
3411 // register.
3412 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3413 return std::pair(IdxReg, SubRegs[0]);
3414 return std::pair(IdxBaseReg, SubRegs[Offset]);
3415}
3416
3417bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3418 MachineInstr &MI) const {
3419 Register DstReg = MI.getOperand(0).getReg();
3420 Register SrcReg = MI.getOperand(1).getReg();
3421 Register IdxReg = MI.getOperand(2).getReg();
3422
3423 LLT DstTy = MRI->getType(DstReg);
3424 LLT SrcTy = MRI->getType(SrcReg);
3425
3426 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3427 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3428 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3429
3430 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3431 // into a waterfall loop.
3432 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3433 return false;
3434
3435 const TargetRegisterClass *SrcRC =
3436 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3437 const TargetRegisterClass *DstRC =
3438 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3439 if (!SrcRC || !DstRC)
3440 return false;
3441 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3442 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3443 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3444 return false;
3445
3446 MachineBasicBlock *BB = MI.getParent();
3447 const DebugLoc &DL = MI.getDebugLoc();
3448 const bool Is64 = DstTy.getSizeInBits() == 64;
3449
3450 unsigned SubReg;
3451 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3452 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3453
3454 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3455 if (DstTy.getSizeInBits() != 32 && !Is64)
3456 return false;
3457
3458 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3459 .addReg(IdxReg);
3460
3461 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3462 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3463 .addReg(SrcReg, {}, SubReg)
3464 .addReg(SrcReg, RegState::Implicit);
3465 MI.eraseFromParent();
3466 return true;
3467 }
3468
3469 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3470 return false;
3471
3472 if (!STI.useVGPRIndexMode()) {
3473 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3474 .addReg(IdxReg);
3475 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3476 .addReg(SrcReg, {}, SubReg)
3477 .addReg(SrcReg, RegState::Implicit);
3478 MI.eraseFromParent();
3479 return true;
3480 }
3481
3482 const MCInstrDesc &GPRIDXDesc =
3483 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3484 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3485 .addReg(SrcReg)
3486 .addReg(IdxReg)
3487 .addImm(SubReg);
3488
3489 MI.eraseFromParent();
3490 return true;
3491}
3492
3493// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3494bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3495 MachineInstr &MI) const {
3496 Register DstReg = MI.getOperand(0).getReg();
3497 Register VecReg = MI.getOperand(1).getReg();
3498 Register ValReg = MI.getOperand(2).getReg();
3499 Register IdxReg = MI.getOperand(3).getReg();
3500
3501 LLT VecTy = MRI->getType(DstReg);
3502 LLT ValTy = MRI->getType(ValReg);
3503 unsigned VecSize = VecTy.getSizeInBits();
3504 unsigned ValSize = ValTy.getSizeInBits();
3505
3506 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3507 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3508 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3509
3510 assert(VecTy.getElementType() == ValTy);
3511
3512 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3513 // into a waterfall loop.
3514 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3515 return false;
3516
3517 const TargetRegisterClass *VecRC =
3518 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3519 const TargetRegisterClass *ValRC =
3520 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3521
3522 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3523 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3524 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3525 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3526 return false;
3527
3528 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3529 return false;
3530
3531 unsigned SubReg;
3532 std::tie(IdxReg, SubReg) =
3533 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3534
3535 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3536 STI.useVGPRIndexMode();
3537
3538 MachineBasicBlock *BB = MI.getParent();
3539 const DebugLoc &DL = MI.getDebugLoc();
3540
3541 if (!IndexMode) {
3542 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3543 .addReg(IdxReg);
3544
3545 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3546 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3547 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3548 .addReg(VecReg)
3549 .addReg(ValReg)
3550 .addImm(SubReg);
3551 MI.eraseFromParent();
3552 return true;
3553 }
3554
3555 const MCInstrDesc &GPRIDXDesc =
3556 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3557 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3558 .addReg(VecReg)
3559 .addReg(ValReg)
3560 .addReg(IdxReg)
3561 .addImm(SubReg);
3562
3563 MI.eraseFromParent();
3564 return true;
3565}
3566
3567static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
3568 switch (Intr) {
3569 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3570 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
3571 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3572 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
3573 case Intrinsic::amdgcn_load_async_to_lds:
3574 case Intrinsic::amdgcn_global_load_async_lds:
3575 return true;
3576 }
3577 return false;
3578}
3579
/// Select a buffer-load-to-LDS intrinsic into a BUFFER_LOAD_*_LDS_* opcode.
///
/// Operands of \p MI as read below: 1 = rsrc, 2 = value copied into M0,
/// 3 = load size in bytes (immediate), then an optional vindex (only when
/// \p MI has 9 operands), voffset, soffset, immediate offset and aux bits.
///
/// \returns false, leaving \p MI untouched, when the subtarget has no
/// VMEM-to-LDS loads, when the size is not one of 1/2/4/12/16, or when a
/// 12/16-byte load is requested without hasLDSLoadB96_B128(). Otherwise the
/// replacement instruction is built, \p MI is erased and true is returned.
3580bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3581 if (!Subtarget->hasVMemToLDSLoad())
3582 return false;
3583 unsigned Opc;
3584 unsigned Size = MI.getOperand(3).getImm();
3585 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3586
3587 // The struct intrinsic variants add one additional operand over raw.
3588 const bool HasVIndex = MI.getNumOperands() == 9;
3589 Register VIndex;
// OpOffset shifts every operand index after vindex by one for the struct form.
3590 int OpOffset = 0;
3591 if (HasVIndex) {
3592 VIndex = MI.getOperand(4).getReg();
3593 OpOffset = 1;
3594 }
3595
3596 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3597 std::optional<ValueAndVReg> MaybeVOffset =
// NOTE(review): the initializer expression for MaybeVOffset is not visible in
// this listing (a line appears to be missing here) - presumably a constant
// lookup on VOffset; confirm against the upstream file.
// A voffset operand is emitted unless the value is a known constant zero.
3599 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3600
// Pick the opcode from the load size and from which of vindex/voffset are
// present: BOTHEN (both), IDXEN (vindex only), OFFEN (voffset only), OFFSET
// (neither).
3601 switch (Size) {
3602 default:
3603 return false;
3604 case 1:
3605 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3606 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3607 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3608 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3609 break;
3610 case 2:
3611 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3612 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3613 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3614 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3615 break;
3616 case 4:
3617 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3618 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3619 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3620 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3621 break;
3622 case 12:
// 12- and 16-byte forms are additionally gated on hasLDSLoadB96_B128().
3623 if (!Subtarget->hasLDSLoadB96_B128())
3624 return false;
3625
3626 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3627 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3628 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3629 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3630 break;
3631 case 16:
3632 if (!Subtarget->hasLDSLoadB96_B128())
3633 return false;
3634
3635 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3636 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3637 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3638 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3639 break;
3640 }
3641
3642 MachineBasicBlock *MBB = MI.getParent();
3643 const DebugLoc &DL = MI.getDebugLoc();
// Operand 2 of the intrinsic is copied into M0 ahead of the load.
3644 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3645 .add(MI.getOperand(2));
3646
3647 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3648
// Address register operand: with both vindex and voffset they are packed into
// one 64-bit VGPR tuple (vindex in sub0, voffset in sub1), built immediately
// before the load; with only one of them that register is used directly.
3649 if (HasVIndex && HasVOffset) {
3650 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3651 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3652 .addReg(VIndex)
3653 .addImm(AMDGPU::sub0)
3654 .addReg(VOffset)
3655 .addImm(AMDGPU::sub1);
3656
3657 MIB.addReg(IdxReg);
3658 } else if (HasVIndex) {
3659 MIB.addReg(VIndex);
3660 } else if (HasVOffset) {
3661 MIB.addReg(VOffset);
3662 }
3663
3664 MIB.add(MI.getOperand(1)); // rsrc
3665 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3666 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
// The aux immediate is split into the cache-policy bits and a 0/1 swizzle
// flag; the masks used depend on whether the subtarget is GFX12+.
3667 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3668 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3669 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3670 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3671 MIB.addImm(
3672 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3673 ? 1
3674 : 0); // swz
// Final immediate: whether the intrinsic is one of the async LDS-DMA variants.
3675 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3676
// The single memory operand of the intrinsic is replaced by a load/store pair
// on the new instruction.
3677 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3678 // Don't set the offset value here because the pointer points to the base of
3679 // the buffer.
3680 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3681
3682 MachinePointerInfo StorePtrI = LoadPtrI;
3683 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
// NOTE(review): the rest of this statement (the address-space argument) and
// any following address-space assignments on LoadPtrI/StorePtrI are not
// visible in this listing - confirm against the upstream file.
3687
3688 auto F = LoadMMO->getFlags() &
// NOTE(review): the mask applied to the flags is not visible in this listing -
// presumably it clears the load/store bits that are re-added below; confirm.
3690 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3691 Size, LoadMMO->getBaseAlign());
3692
// The store side is described as a sizeof(int32_t)-byte access using the same
// base alignment as the load.
3693 MachineMemOperand *StoreMMO =
3694 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3695 sizeof(int32_t), LoadMMO->getBaseAlign());
3696
3697 MIB.setMemRefs({LoadMMO, StoreMMO});
3698
// MI is erased first; the result of constraining the new instruction's
// register operands is not checked.
3699 MI.eraseFromParent();
3700 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3701 return true;
3702}
3703
3704/// Match a zero extend from a 32-bit value to 64-bits.
3705Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3706 Register ZExtSrc;
3707 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3708 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3709
3710 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3711 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3712 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3713 return Register();
3714
3715 assert(Def->getNumOperands() == 3 &&
3716 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3717 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3718 return Def->getOperand(1).getReg();
3719 }
3720
3721 return Register();
3722}
3723
3724/// Match a sign extend from a 32-bit value to 64-bits.
3725Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3726 Register SExtSrc;
3727 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3728 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3729
3730 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), G_ASHR((S32 %x, 31))
3731 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3732 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3733 return Register();
3734
3735 assert(Def->getNumOperands() == 3 &&
3736 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3737 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3738 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3739 m_SpecificICst(31))))
3740 return Def->getOperand(1).getReg();
3741
3742 if (VT->signBitIsZero(Reg))
3743 return matchZeroExtendFromS32(Reg);
3744
3745 return Register();
3746}
3747
3748/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3749/// is 32-bit.
///
/// \returns \p Reg unchanged when its type is s32; otherwise whatever
/// matchZeroExtendFromS32(Reg) returns (an invalid Register on no match).
// NOTE(review): the return-type line of this definition is not visible in this
// listing - presumably `Register`, matching the values returned; confirm.
3751AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3752 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3753 : matchZeroExtendFromS32(Reg);
3754}
3755
3756/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3757/// is 32-bit.
///
/// \returns \p Reg unchanged when its type is s32; otherwise whatever
/// matchSignExtendFromS32(Reg) returns (an invalid Register on no match).
// NOTE(review): the return-type line of this definition is not visible in this
// listing - presumably `Register`, matching the values returned; confirm.
3759AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3760 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3761 : matchSignExtendFromS32(Reg);
3762}
3763
/// Dispatch to the signed or unsigned "extend from s32, or s32 itself"
/// matcher.
///
/// \param Reg      register to inspect.
/// \param IsSigned when true, forwards to matchSignExtendFromS32OrS32;
///                 otherwise to matchZeroExtendFromS32OrS32.
/// \returns the result of the chosen matcher.
// NOTE(review): the return-type line of this definition is not visible in this
// listing - presumably `Register`, as returned by both callees; confirm.
3765AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3766 bool IsSigned) const {
3767 if (IsSigned)
3768 return matchSignExtendFromS32OrS32(Reg);
3769
3770 return matchZeroExtendFromS32OrS32(Reg);
3771}
3772
3773Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3774 Register AnyExtSrc;
3775 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3776 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3777
3778 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3779 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3780 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3781 return Register();
3782
3783 assert(Def->getNumOperands() == 3 &&
3784 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3785
3786 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3787 return Def->getOperand(1).getReg();
3788
3789 return Register();
3790}
3791
/// Select a global-load-to-LDS intrinsic into a GLOBAL_LOAD_LDS_* opcode.
///
/// Operands of \p MI as read below: 1 = global address, 2 = value copied into
/// M0, 3 = load size in bytes (immediate), 4 = immediate offset, 5 = aux bits.
///
/// \returns false when the subtarget has no VMEM-to-LDS loads, when the size
/// is not one of 1/2/4/12/16, or when a 12/16-byte load is requested without
/// hasLDSLoadB96_B128(). Otherwise the replacement instruction is built,
/// \p MI is erased and true is returned.
3792bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3793 if (!Subtarget->hasVMemToLDSLoad())
3794 return false;
3795
3796 unsigned Opc;
3797 unsigned Size = MI.getOperand(3).getImm();
3798 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3799
// The opcode depends only on the access size; 12 and 16 bytes are gated on
// hasLDSLoadB96_B128().
3800 switch (Size) {
3801 default:
3802 return false;
3803 case 1:
3804 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3805 break;
3806 case 2:
3807 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3808 break;
3809 case 4:
3810 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3811 break;
3812 case 12:
3813 if (!Subtarget->hasLDSLoadB96_B128())
3814 return false;
3815 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3816 break;
3817 case 16:
3818 if (!Subtarget->hasLDSLoadB96_B128())
3819 return false;
3820 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3821 break;
3822 }
3823
3824 MachineBasicBlock *MBB = MI.getParent();
3825 const DebugLoc &DL = MI.getDebugLoc();
// Operand 2 of the intrinsic is copied into M0 ahead of the load.
3826 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3827 .add(MI.getOperand(2));
3828
3829 Register Addr = MI.getOperand(1).getReg();
3830 Register VOffset;
3831 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3832 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
// Two cases replace Addr with an SGPR: the address is a copy of an SGPR, or
// it is G_PTR_ADD(SGPR base, zext(s32 offset)), in which case the s32 offset
// becomes VOffset.
3833 if (!isSGPR(Addr)) {
3834 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3835 if (isSGPR(AddrDef->Reg)) {
3836 Addr = AddrDef->Reg;
3837 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3838 Register SAddr =
3839 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3840 if (isSGPR(SAddr)) {
3841 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3842 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3843 Addr = SAddr;
3844 VOffset = Off;
3845 }
3846 }
3847 }
3848 }
3849
3850 if (isSGPR(Addr)) {
// NOTE(review): a statement appears to be missing from this listing at this
// point - presumably the switch of Opc to its SGPR-address form, since the
// operand list built below differs in this case; confirm against upstream.
// With an SGPR address a VGPR offset is always supplied; if none was matched
// above, a zero is materialized with V_MOV_B32.
3852 if (!VOffset) {
3853 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3854 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3855 .addImm(0);
3856 }
3857 }
3858
3859 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3860 .addReg(Addr);
3861
3862 if (isSGPR(Addr))
3863 MIB.addReg(VOffset);
3864
3865 MIB.add(MI.getOperand(4)); // offset
3866
3867 unsigned Aux = MI.getOperand(5).getImm();
3868 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
// Final immediate: whether the intrinsic is one of the async LDS-DMA variants.
3869 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3870
// The single memory operand of the intrinsic is replaced by a load/store pair
// on the new instruction; both start from the original pointer info with the
// immediate offset applied.
3871 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3872 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3873 LoadPtrI.Offset = MI.getOperand(4).getImm();
3874 MachinePointerInfo StorePtrI = LoadPtrI;
3875 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
// NOTE(review): the rest of this statement (the address-space argument) and
// any following address-space assignments on LoadPtrI/StorePtrI are not
// visible in this listing - confirm against the upstream file.
3879 auto F = LoadMMO->getFlags() &
// NOTE(review): the mask applied to the flags is not visible in this listing -
// presumably it clears the load/store bits that are re-added below; confirm.
3881 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3882 Size, LoadMMO->getBaseAlign());
// The store side is described as a sizeof(int32_t)-byte access aligned to 4.
3883 MachineMemOperand *StoreMMO =
3884 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3885 sizeof(int32_t), Align(4));
3886
3887 MIB.setMemRefs({LoadMMO, StoreMMO});
3888
// MI is erased first; the result of constraining the new instruction's
// register operands is not checked.
3889 MI.eraseFromParent();
3890 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3891 return true;
3892}
3893
3894bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3895 Intrinsic::ID IID) const {
3896 bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3897 unsigned Opc =
3898 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3899 int NumGroups = 4;
3900
3901 // A lamda function to check whether an operand is a vector of all 0s.
3902 const auto isAllZeros = [&](MachineOperand &Opnd) {
3903 const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
3904 if (!DefMI)
3905 return false;
3906 return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
3907 };
3908
3909 // Use _D2 version if both group 2 and 3 are zero-initialized.
3910 if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
3911 NumGroups = 2;
3912 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3913 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3914 }
3915
3916 // TODO: Handle the fifth group: MI.getOpetand(5), which is silently ignored
3917 // for now because all existing targets only support up to 4 groups.
3918 MachineBasicBlock *MBB = MI.getParent();
3919 auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
3920 .add(MI.getOperand(1)) // D# group 0
3921 .add(MI.getOperand(2)); // D# group 1
3922
3923 if (NumGroups >= 4) { // Has at least 4 groups
3924 MIB.add(MI.getOperand(3)) // D# group 2
3925 .add(MI.getOperand(4)); // D# group 3
3926 }
3927
3928 MIB.addImm(0) // r128
3929 .add(MI.getOperand(6)); // cpol
3930
3931 MI.eraseFromParent();
3932 return true;
3933}
3934
3935bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3936 MachineInstr &MI) const {
3937 unsigned OpcodeOpIdx =
3938 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3939 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3940 MI.removeOperand(OpcodeOpIdx);
3941 MI.addImplicitDefUseOperands(*MI.getMF());
3942 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3943 return true;
3944}
3945
3946// FIXME: This should be removed and let the patterns select. We just need the
3947// AGPR/VGPR combination versions.
3948bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3949 unsigned Opc;
3950 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3951 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3952 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3953 break;
3954 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3955 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3956 break;
3957 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3958 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3959 break;
3960 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3961 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3962 break;
3963 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3964 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3965 break;
3966 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3967 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3968 break;
3969 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3970 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3971 break;
3972 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3973 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3974 break;
3975 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3976 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3977 break;
3978 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3979 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3980 break;
3981 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3982 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3983 break;
3984 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3985 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3986 break;
3987 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3988 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3989 break;
3990 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3991 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3992 break;
3993 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3994 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3995 break;
3996 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3997 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3998 break;
3999 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
4000 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
4001 break;
4002 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
4003 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
4004 break;
4005 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
4006 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
4007 break;
4008 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
4009 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
4010 break;
4011 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
4012 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
4013 break;
4014 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
4015 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
4016 break;
4017 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
4018 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
4019 break;
4020 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
4021 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
4022 break;
4023 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
4024 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
4025 break;
4026 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
4027 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
4028 break;
4029 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
4030 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
4031 break;
4032 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
4033 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
4034 break;
4035 default:
4036 llvm_unreachable("unhandled smfmac intrinsic");
4037 }
4038
4039 auto VDst_In = MI.getOperand(4);
4040
4041 MI.setDesc(TII.get(Opc));
4042 MI.removeOperand(4); // VDst_In
4043 MI.removeOperand(1); // Intrinsic ID
4044 MI.addOperand(VDst_In); // Readd VDst_In to the end
4045 MI.addImplicitDefUseOperands(*MI.getMF());
4046 const MCInstrDesc &MCID = MI.getDesc();
4047 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
4048 MI.getOperand(0).setIsEarlyClobber(true);
4049 }
4050 return true;
4051}
4052
/// Rewrite a permlane16/permlane32 swap intrinsic in place into
/// V_PERMLANE16_SWAP_B32_e64 or V_PERMLANE32_SWAP_B32_e64.
///
/// \param MI     the intrinsic instruction; modified in place, not erased.
/// \param IntrID amdgcn_permlane16_swap selects the permlane16 opcode; any
///               other value selects the permlane32 opcode.
/// \returns false when \p IntrID is one of the two swap intrinsics and the
/// subtarget lacks the corresponding feature; true otherwise.
4053bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
4054 MachineInstr &MI, Intrinsic::ID IntrID) const {
4055 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
4056 !Subtarget->hasPermlane16Swap())
4057 return false;
4058 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
4059 !Subtarget->hasPermlane32Swap())
4060 return false;
4061
4062 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
4063 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
4064 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
4065
// Operand 2 is dropped before the descriptor is switched - presumably the
// intrinsic ID operand (there are two leading operands before it); confirm.
4066 MI.removeOperand(2);
4067 MI.setDesc(TII.get(Opcode));
// EXEC is appended as an extra register operand (isDef = false, isImp = true
// per the CreateReg argument order - confirm against MachineOperand).
4068 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
4069
4070 MachineOperand &FI = MI.getOperand(4);
// NOTE(review): FI is not used in the lines visible here; a statement that
// updates operand 4 appears to be missing from this listing - confirm against
// the upstream file.
4072
// The result of constraining the register operands is not checked.
4073 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
4074 return true;
4075}
4076
4077bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
4078 Register DstReg = MI.getOperand(0).getReg();
4079 Register SrcReg = MI.getOperand(1).getReg();
4080 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4081 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4082 MachineBasicBlock *MBB = MI.getParent();
4083 const DebugLoc &DL = MI.getDebugLoc();
4084
4085 if (IsVALU) {
4086 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
4087 .addImm(Subtarget->getWavefrontSizeLog2())
4088 .addReg(SrcReg);
4089 } else {
4090 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
4091 .addReg(SrcReg)
4092 .addImm(Subtarget->getWavefrontSizeLog2())
4093 .setOperandDead(3); // Dead scc
4094 }
4095
4096 const TargetRegisterClass &RC =
4097 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
4098 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
4099 return false;
4100
4101 MI.eraseFromParent();
4102 return true;
4103}
4104
4105bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
4106 MachineInstr &MI) const {
4107 assert(MI.getNumOperands() == 4);
4108 MachineBasicBlock *MBB = MI.getParent();
4109 const DebugLoc &DL = MI.getDebugLoc();
4110
4111 Register DstReg = MI.getOperand(0).getReg();
4112 Register ValReg = MI.getOperand(2).getReg();
4113 Register IdxReg = MI.getOperand(3).getReg();
4114
4115 const LLT DstTy = MRI->getType(DstReg);
4116 unsigned DstSize = DstTy.getSizeInBits();
4117 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4118 const TargetRegisterClass *DstRC =
4119 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
4120
4121 if (DstTy != LLT::scalar(32))
4122 return false;
4123
4124 if (!Subtarget->supportsBPermute())
4125 return false;
4126
4127 // If we can bpermute across the whole wave, then just do that
4128 if (Subtarget->supportsWaveWideBPermute()) {
4129 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4130 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4131 .addImm(2)
4132 .addReg(IdxReg);
4133
4134 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
4135 .addReg(ShiftIdxReg)
4136 .addReg(ValReg)
4137 .addImm(0);
4138 } else {
4139 // Otherwise, we need to make use of whole wave mode
4140 assert(Subtarget->isWave64());
4141
4142 // Set inactive lanes to poison
4143 Register UndefValReg =
4144 MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
4145 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
4146
4147 Register UndefExecReg = MRI->createVirtualRegister(
4148 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4149 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
4150
4151 Register PoisonValReg = MRI->createVirtualRegister(DstRC);
4152 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
4153 .addImm(0)
4154 .addReg(ValReg)
4155 .addImm(0)
4156 .addReg(UndefValReg)
4157 .addReg(UndefExecReg);
4158
4159 // ds_bpermute requires index to be multiplied by 4
4160 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4161 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4162 .addImm(2)
4163 .addReg(IdxReg);
4164
4165 Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
4166 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
4167 .addImm(0)
4168 .addReg(ShiftIdxReg)
4169 .addImm(0)
4170 .addReg(UndefValReg)
4171 .addReg(UndefExecReg);
4172
4173 // Get permutation of each half, then we'll select which one to use
4174 Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
4175 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
4176 .addReg(PoisonIdxReg)
4177 .addReg(PoisonValReg)
4178 .addImm(0);
4179
4180 Register SwappedValReg = MRI->createVirtualRegister(DstRC);
4181 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
4182 .addReg(PoisonValReg);
4183
4184 Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
4185 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
4186 .addReg(PoisonIdxReg)
4187 .addReg(SwappedValReg)
4188 .addImm(0);
4189
4190 Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
4191 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
4192 .addReg(OppSidePermReg);
4193
4194 // Select which side to take the permute from
4195 // We can get away with only using mbcnt_lo here since we're only
4196 // trying to detect which side of 32 each lane is on, and mbcnt_lo
4197 // returns 32 for lanes 32-63.
4198 Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
4199 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
4200 .addImm(-1)
4201 .addImm(0);
4202
4203 Register XORReg = MRI->createVirtualRegister(DstRC);
4204 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
4205 .addReg(ThreadIDReg)
4206 .addReg(PoisonIdxReg);
4207
4208 Register ANDReg = MRI->createVirtualRegister(DstRC);
4209 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
4210 .addReg(XORReg)
4211 .addImm(32);
4212
4213 Register CompareReg = MRI->createVirtualRegister(
4214 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4215 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
4216 .addReg(ANDReg)
4217 .addImm(0);
4218
4219 // Finally do the selection
4220 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
4221 .addImm(0)
4222 .addReg(WWMSwapPermReg)
4223 .addImm(0)
4224 .addReg(SameSidePermReg)
4225 .addReg(CompareReg);
4226 }
4227
4228 MI.eraseFromParent();
4229 return true;
4230}
4231
4232// Match BITOP3 operation and return a number of matched instructions plus
4233// truth table.
//
// R is the root register of the expression to match. Src (used throughout the
// body) collects the up-to-three distinct source registers of the expression;
// it is read and modified here and by the recursive calls.
// NOTE(review): the parameter line declaring Src is not visible in this
// listing - presumably a by-reference vector of Register between R and MRI;
// confirm against the upstream file.
//
// Returns {0, 0} when R is not defined by G_AND/G_OR/G_XOR or when its
// operands cannot be expressed with at most three sources (Src is restored to
// its state on entry in the latter case). Otherwise returns {number of matched
// logic instructions, 8-bit truth table}.
4234static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
4236 const MachineRegisterInfo &MRI) {
4237 unsigned NumOpcodes = 0;
4238 uint8_t LHSBits, RHSBits;
4239
// getOperandBits: compute the truth-table column for operand Op, registering
// Op in Src if needed. Returns false when Op cannot be accommodated.
4240 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
4241 // Define truth table given Src0, Src1, Src2 bits permutations:
4242 // 0 0 0
4243 // 0 0 1
4244 // 0 1 0
4245 // 0 1 1
4246 // 1 0 0
4247 // 1 0 1
4248 // 1 1 0
4249 // 1 1 1
4250 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4251
// Constant all-ones / zero operands need no source slot.
4252 if (mi_match(Op, MRI, m_AllOnesInt())) {
4253 Bits = 0xff;
4254 return true;
4255 }
4256 if (mi_match(Op, MRI, m_ZeroInt())) {
4257 Bits = 0;
4258 return true;
4259 }
4260
4261 for (unsigned I = 0; I < Src.size(); ++I) {
4262 // Try to find existing reused operand
4263 if (Src[I] == Op) {
4264 Bits = SrcBits[I];
4265 return true;
4266 }
4267 // Try to replace parent operator
4268 if (Src[I] == R) {
4269 Bits = SrcBits[I];
4270 Src[I] = Op;
4271 return true;
4272 }
4273 }
4274
4275 if (Src.size() == 3) {
4276 // No room left for operands. Try one last time, there can be a 'not' of
4277 // one of our source operands. In this case we can compute the bits
4278 // without growing Src vector.
4279 Register LHS;
4280 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
// NOTE(review): a statement appears to be missing from this listing at this
// point - presumably LHS is looked through copies before the comparison
// below (findSlot further down does so for its equivalent check); confirm.
4282 for (unsigned I = 0; I < Src.size(); ++I) {
4283 if (Src[I] == LHS) {
4284 Bits = ~SrcBits[I];
4285 return true;
4286 }
4287 }
4288 }
4289
4290 return false;
4291 }
4292
// Otherwise Op takes the next free slot.
4293 Bits = SrcBits[Src.size()];
4294 Src.push_back(Op);
4295 return true;
4296 };
4297
4298 MachineInstr *MI = MRI.getVRegDef(R);
4299 switch (MI->getOpcode()) {
4300 case TargetOpcode::G_AND:
4301 case TargetOpcode::G_OR:
4302 case TargetOpcode::G_XOR: {
4303 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
4304 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
4305
// Snapshot Src so a failed match leaves the caller's vector untouched.
4306 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
4307 if (!getOperandBits(LHS, LHSBits) ||
4308 !getOperandBits(RHS, RHSBits)) {
4309 Src = std::move(Backup);
4310 return std::make_pair(0, 0);
4311 }
4312
4313 // Recursion is naturally limited by the size of the operand vector.
4314 //
4315 // When LHS and RHS share a common sub-expression, one side's recursion
4316 // may decompose that sub-expression and replace the Src slot the other
4317 // side occupies with sub-operands via the "replace parent" path in
4318 // getOperandBits. The other side's cached bit-pattern then refers to a
4319 // slot whose contents changed, producing a wrong truth table.
4320 //
4321 // We detect this in three ways:
4322 // (A) If LHS recursed, its truth table is valid against the Src state
4323 // when LHS recursion completed (SrcAfterLHS). If RHS recursion
4324 // then mutates a Src slot that LHSBits depends on, LHSBits is
4325 // stale.
4326 // (B) If RHS did not recurse, RHSBits came from getOperandBits and
4327 // refers to a specific Src slot. If that slot's contents changed
4328 // (by either recursion), RHSBits is stale.
4329 // (C) Symmetrically for LHS if it did not recurse.
4330 SmallVector<Register, 3> SrcBeforeRecurse(Src.begin(), Src.end());
4331 uint8_t LHSBitsOrig = LHSBits;
4332 uint8_t RHSBitsOrig = RHSBits;
4333
// Try to expand each operand in turn; a non-zero count means the operand was
// itself a matched logic op and its own truth table replaces the slot bits.
4334 auto LHSOp = BitOp3_Op(LHS, Src, MRI);
4335 if (LHSOp.first) {
4336 NumOpcodes += LHSOp.first;
4337 LHSBits = LHSOp.second;
4338 }
4339
4340 SmallVector<Register, 3> SrcAfterLHS(Src.begin(), Src.end());
4341
4342 auto RHSOp = BitOp3_Op(RHS, Src, MRI);
4343 if (RHSOp.first) {
4344 NumOpcodes += RHSOp.first;
4345 RHSBits = RHSOp.second;
4346 }
4347
4348 // dependsOnSlot: true iff the truth table TT varies with slot Slot.
// It compares the halves of TT that differ only in that slot's bit.
4349 auto dependsOnSlot = [](uint8_t TT, int Slot) -> bool {
4350 if (Slot < 0 || Slot > 2)
4351 return false;
4352 const uint8_t Masks[3] = {0x0f, 0x33, 0x55};
4353 const int Shifts[3] = {4, 2, 1};
4354 return ((TT ^ (TT >> Shifts[Slot])) & Masks[Slot]) != 0;
4355 };
4356
4357 // findSlot: locate the Src slot a getOperandBits result depends on,
4358 // including negated (NOT) patterns that getOperandBits resolves via
4359 // the ~SrcBits[I] shortcut.
// Returns -1 when no slot of S corresponds to (Bits, Op), e.g. for constants.
4360 const uint8_t SrcBitsConst[3] = {0xf0, 0xcc, 0xaa};
4361 auto findSlot = [&](uint8_t Bits, Register Op,
4362 const SmallVectorImpl<Register> &S) -> int {
4363 Register NegatedInner;
4364 bool IsNegationOp = mi_match(Op, MRI, m_Not(m_Reg(NegatedInner)));
4365 if (IsNegationOp)
4366 NegatedInner = getSrcRegIgnoringCopies(NegatedInner, MRI);
4367 for (int I = 0; I < (int)S.size(); I++) {
4368 if (Bits == SrcBitsConst[I] && S[I] == Op)
4369 return I;
4370 if (IsNegationOp && Bits == (uint8_t)~SrcBitsConst[I] &&
4371 S[I] == NegatedInner)
4372 return I;
4373 }
4374 return -1;
4375 };
4376
4377 bool Stale = false;
4378
4379 // (A) LHS recursed: its truth table is against SrcAfterLHS.
4380 // Check if RHS recursion mutated a slot that LHSBits uses.
4381 if (LHSOp.first) {
4382 for (int I = 0; I < (int)SrcAfterLHS.size() && I < 3; I++) {
4383 if (I < (int)Src.size() && Src[I] != SrcAfterLHS[I] &&
4384 dependsOnSlot(LHSBits, I)) {
4385 Stale = true;
4386 break;
4387 }
4388 }
4389 }
4390
4391 // (B) RHS did not recurse: RHSBits from getOperandBits is against
4392 // SrcBeforeRecurse. Check if that slot was mutated since then.
4393 if (!Stale && !RHSOp.first) {
4394 int Slot = findSlot(RHSBitsOrig, RHS, SrcBeforeRecurse);
4395 if (Slot >= 0 &&
4396 (Slot >= (int)Src.size() || Src[Slot] != SrcBeforeRecurse[Slot]))
4397 Stale = true;
4398 }
4399
4400 // (C) LHS did not recurse: LHSBits from getOperandBits is against
4401 // SrcBeforeRecurse. Check if that slot was mutated since then.
4402 if (!Stale && !LHSOp.first) {
4403 int Slot = findSlot(LHSBitsOrig, LHS, SrcBeforeRecurse);
4404 if (Slot >= 0 &&
4405 (Slot >= (int)Src.size() || Src[Slot] != SrcBeforeRecurse[Slot]))
4406 Stale = true;
4407 }
4408
// On a stale result, discard both recursions and treat LHS/RHS as plain
// leaves of this single operation.
4409 if (Stale) {
4410 Src = std::move(SrcBeforeRecurse);
4411 LHSBits = LHSBitsOrig;
4412 RHSBits = RHSBitsOrig;
4413 NumOpcodes = 0;
4414 }
4415 break;
4416 }
4417 default:
4418 return std::make_pair(0, 0);
4419 }
4420
// Combine the operand truth tables with this instruction's own operation. The
// default case cannot be reached: non-logic opcodes returned above.
4421 uint8_t TTbl;
4422 switch (MI->getOpcode()) {
4423 case TargetOpcode::G_AND:
4424 TTbl = LHSBits & RHSBits;
4425 break;
4426 case TargetOpcode::G_OR:
4427 TTbl = LHSBits | RHSBits;
4428 break;
4429 case TargetOpcode::G_XOR:
4430 TTbl = LHSBits ^ RHSBits;
4431 break;
4432 default:
4433 break;
4434 }
4435
// +1 accounts for the instruction defining R itself.
4436 return std::make_pair(NumOpcodes + 1, TTbl);
4437}
4438
/// Try to select the logic-op tree rooted at \p MI into a single V_BITOP3
/// instruction, using BitOp3_Op to collect the sources and the truth table.
///
/// \returns false (leaving \p MI for other selection paths) when the
/// subtarget has no BITOP3 instructions, the destination is not on the VGPR
/// bank, fewer than two logic ops were matched, no source was collected, or
/// the profitability checks below reject the match. On success the new
/// instruction is built, \p MI is erased and true is returned.
4439bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4440 if (!Subtarget->hasBitOp3Insts())
4441 return false;
4442
4443 Register DstReg = MI.getOperand(0).getReg();
4444 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4445 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4446 if (!IsVALU)
4447 return false;
4448
// NOTE(review): the declaration of Src (the source-register vector used
// below) is not visible in this listing - confirm against the upstream file.
4450 uint8_t TTbl;
4451 unsigned NumOpcodes;
4452
4453 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
4454
4455 // Src.empty() case can happen if all operands are all zero or all ones.
4456 // Normally it shall be optimized out before reaching this.
4457 if (NumOpcodes < 2 || Src.empty())
4458 return false;
4459
// Any destination type other than s32 is selected with the B16 opcodes below.
4460 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
4461 if (NumOpcodes == 2 && IsB32) {
4462 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4463 // asm more readable. This cannot be modeled with AddedComplexity because
4464 // selector does not know how many operations did we match.
4465 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
4466 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4467 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4468 return false;
4469 } else if (NumOpcodes < 4) {
4470 // For a uniform case threshold should be higher to account for moves
4471 // between VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be
4472 // in SGPRs and a readtfirstlane after.
4473 return false;
4474 }
4475
4476 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4477 if (!IsB32 && STI.hasTrue16BitInsts())
4478 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4479 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4480 unsigned CBL = STI.getConstantBusLimit(Opc);
4481 MachineBasicBlock *MBB = MI.getParent();
4482 const DebugLoc &DL = MI.getDebugLoc();
4483
// SGPR-bank sources are used directly while the constant bus limit for Opc
// allows; any further SGPR source is first copied into a fresh VGPR.
4484 for (unsigned I = 0; I < Src.size(); ++I) {
4485 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4486 if (RB->getID() != AMDGPU::SGPRRegBankID)
4487 continue;
4488 if (CBL > 0) {
4489 --CBL;
4490 continue;
4491 }
4492 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4493 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4494 .addReg(Src[I]);
4495 Src[I] = NewReg;
4496 }
4497
4498 // Last operand can be ignored, turning a ternary operation into a binary.
4499 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4500 // 'c' with 'a' here without changing the answer. In some pathological
4501 // cases it should be possible to get an operation with a single operand
4502 // too if optimizer would not catch it.
4503 while (Src.size() < 3)
4504 Src.push_back(Src[0]);
4505
// Operand list: three sources followed by the truth table. The non-B32 forms
// additionally take a zero source-modifier immediate before each source and a
// trailing zero op_sel.
4506 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4507 if (!IsB32)
4508 MIB.addImm(0); // src_mod0
4509 MIB.addReg(Src[0]);
4510 if (!IsB32)
4511 MIB.addImm(0); // src_mod1
4512 MIB.addReg(Src[1]);
4513 if (!IsB32)
4514 MIB.addImm(0); // src_mod2
4515 MIB.addReg(Src[2])
4516 .addImm(TTbl);
4517 if (!IsB32)
4518 MIB.addImm(0); // op_sel
4519
// The result of constraining the register operands is not checked.
4520 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4521 MI.eraseFromParent();
4522
4523 return true;
4524}
4525
4526bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4527 Register SrcReg = MI.getOperand(0).getReg();
4528 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4529 return false;
4530
4531 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4532 Register SP =
4533 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4534 Register WaveAddr = getWaveAddress(DefMI);
4535 MachineBasicBlock *MBB = MI.getParent();
4536 const DebugLoc &DL = MI.getDebugLoc();
4537
4538 if (!WaveAddr) {
4539 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4540 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4541 .addReg(SrcReg)
4542 .addImm(Subtarget->getWavefrontSizeLog2())
4543 .setOperandDead(3); // Dead scc
4544 }
4545
4546 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4547 .addReg(WaveAddr);
4548
4549 MI.eraseFromParent();
4550 return true;
4551}
4552
4554
4555 if (!I.isPreISelOpcode()) {
4556 if (I.isCopy())
4557 return selectCOPY(I);
4558 return true;
4559 }
4560
4561 switch (I.getOpcode()) {
4562 case TargetOpcode::G_AND:
4563 case TargetOpcode::G_OR:
4564 case TargetOpcode::G_XOR:
4565 if (selectBITOP3(I))
4566 return true;
4567 if (selectImpl(I, *CoverageInfo))
4568 return true;
4569 return selectG_AND_OR_XOR(I);
4570 case TargetOpcode::G_ADD:
4571 case TargetOpcode::G_SUB:
4572 case TargetOpcode::G_PTR_ADD:
4573 if (selectImpl(I, *CoverageInfo))
4574 return true;
4575 return selectG_ADD_SUB(I);
4576 case TargetOpcode::G_UADDO:
4577 case TargetOpcode::G_USUBO:
4578 case TargetOpcode::G_UADDE:
4579 case TargetOpcode::G_USUBE:
4580 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4581 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4582 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4583 return selectG_AMDGPU_MAD_64_32(I);
4584 case TargetOpcode::G_INTTOPTR:
4585 case TargetOpcode::G_BITCAST:
4586 case TargetOpcode::G_PTRTOINT:
4587 case TargetOpcode::G_FREEZE:
4588 return selectCOPY(I);
4589 case TargetOpcode::G_FNEG:
4590 if (selectImpl(I, *CoverageInfo))
4591 return true;
4592 return selectG_FNEG(I);
4593 case TargetOpcode::G_FABS:
4594 if (selectImpl(I, *CoverageInfo))
4595 return true;
4596 return selectG_FABS(I);
4597 case TargetOpcode::G_EXTRACT:
4598 return selectG_EXTRACT(I);
4599 case TargetOpcode::G_MERGE_VALUES:
4600 case TargetOpcode::G_CONCAT_VECTORS:
4601 return selectG_MERGE_VALUES(I);
4602 case TargetOpcode::G_UNMERGE_VALUES:
4603 return selectG_UNMERGE_VALUES(I);
4604 case TargetOpcode::G_BUILD_VECTOR:
4605 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4606 return selectG_BUILD_VECTOR(I);
4607 case TargetOpcode::G_IMPLICIT_DEF:
4608 return selectG_IMPLICIT_DEF(I);
4609 case TargetOpcode::G_INSERT:
4610 return selectG_INSERT(I);
4611 case TargetOpcode::G_INTRINSIC:
4612 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4613 return selectG_INTRINSIC(I);
4614 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4615 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4616 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4617 case TargetOpcode::G_ICMP:
4618 case TargetOpcode::G_FCMP:
4619 if (selectG_ICMP_or_FCMP(I))
4620 return true;
4621 return selectImpl(I, *CoverageInfo);
4622 case TargetOpcode::G_LOAD:
4623 case TargetOpcode::G_ZEXTLOAD:
4624 case TargetOpcode::G_SEXTLOAD:
4625 case TargetOpcode::G_STORE:
4626 case TargetOpcode::G_ATOMIC_CMPXCHG:
4627 case TargetOpcode::G_ATOMICRMW_XCHG:
4628 case TargetOpcode::G_ATOMICRMW_ADD:
4629 case TargetOpcode::G_ATOMICRMW_SUB:
4630 case TargetOpcode::G_ATOMICRMW_AND:
4631 case TargetOpcode::G_ATOMICRMW_OR:
4632 case TargetOpcode::G_ATOMICRMW_XOR:
4633 case TargetOpcode::G_ATOMICRMW_MIN:
4634 case TargetOpcode::G_ATOMICRMW_MAX:
4635 case TargetOpcode::G_ATOMICRMW_UMIN:
4636 case TargetOpcode::G_ATOMICRMW_UMAX:
4637 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4638 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4639 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4640 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4641 case TargetOpcode::G_ATOMICRMW_FADD:
4642 case TargetOpcode::G_ATOMICRMW_FMIN:
4643 case TargetOpcode::G_ATOMICRMW_FMAX:
4644 return selectG_LOAD_STORE_ATOMICRMW(I);
4645 case TargetOpcode::G_SELECT:
4646 return selectG_SELECT(I);
4647 case TargetOpcode::G_TRUNC:
4648 return selectG_TRUNC(I);
4649 case TargetOpcode::G_SEXT:
4650 case TargetOpcode::G_ZEXT:
4651 case TargetOpcode::G_ANYEXT:
4652 case TargetOpcode::G_SEXT_INREG:
4653 // This is a workaround. For extension from type i1, `selectImpl()` uses
4654 // patterns from the TD file and generates an illegal VGPR to SGPR COPY, as
4655 // type i1 can only be held in an SGPR class.
4656 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4657 selectImpl(I, *CoverageInfo))
4658 return true;
4659 return selectG_SZA_EXT(I);
4660 case TargetOpcode::G_FPEXT:
4661 if (selectG_FPEXT(I))
4662 return true;
4663 return selectImpl(I, *CoverageInfo);
4664 case TargetOpcode::G_BRCOND:
4665 return selectG_BRCOND(I);
4666 case TargetOpcode::G_GLOBAL_VALUE:
4667 return selectG_GLOBAL_VALUE(I);
4668 case TargetOpcode::G_PTRMASK:
4669 return selectG_PTRMASK(I);
4670 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4671 return selectG_EXTRACT_VECTOR_ELT(I);
4672 case TargetOpcode::G_INSERT_VECTOR_ELT:
4673 return selectG_INSERT_VECTOR_ELT(I);
4674 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4675 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4676 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4677 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4678 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4679 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4681 assert(Intr && "not an image intrinsic with image pseudo");
4682 return selectImageIntrinsic(I, Intr);
4683 }
4684 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4685 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4686 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4687 return selectBVHIntersectRayIntrinsic(I);
4688 case AMDGPU::G_SBFX:
4689 case AMDGPU::G_UBFX:
4690 return selectG_SBFX_UBFX(I);
4691 case AMDGPU::G_SI_CALL:
4692 I.setDesc(TII.get(AMDGPU::SI_CALL));
4693 return true;
4694 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4695 return selectWaveAddress(I);
4696 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4697 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4698 return true;
4699 }
4700 case AMDGPU::G_STACKRESTORE:
4701 return selectStackRestore(I);
4702 case AMDGPU::G_PHI:
4703 return selectPHI(I);
4704 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4705 return selectCOPY_SCC_VCC(I);
4706 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4707 return selectCOPY_VCC_SCC(I);
4708 case AMDGPU::G_AMDGPU_READANYLANE:
4709 return selectReadAnyLane(I);
4710 case TargetOpcode::G_CONSTANT:
4711 case TargetOpcode::G_FCONSTANT:
4712 default:
4713 return selectImpl(I, *CoverageInfo);
4714 }
4715 return false;
4716}
4717
4719AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4720 return {{
4721 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4722 }};
4723
4724}
4725
4726std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4727 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4728 unsigned Mods = 0;
4729 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4730
4731 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4732 Src = MI->getOperand(1).getReg();
4733 Mods |= SISrcMods::NEG;
4734 MI = getDefIgnoringCopies(Src, *MRI);
4735 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4736 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4737 // denormal mode, but we're implicitly canonicalizing in a source operand.
4738 const ConstantFP *LHS =
4739 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4740 if (LHS && LHS->isZero()) {
4741 Mods |= SISrcMods::NEG;
4742 Src = MI->getOperand(2).getReg();
4743 }
4744 }
4745
4746 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4747 Src = MI->getOperand(1).getReg();
4748 Mods |= SISrcMods::ABS;
4749 }
4750
4751 if (OpSel)
4752 Mods |= SISrcMods::OP_SEL_0;
4753
4754 return std::pair(Src, Mods);
4755}
4756
4757std::pair<Register, unsigned>
4758AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const {
4759 unsigned Mods;
4760 std::tie(Src, Mods) = selectVOP3ModsImpl(Src);
4761 Mods |= SISrcMods::OP_SEL_1;
4762 return std::pair(Src, Mods);
4763}
4764
4765Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4766 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4767 bool ForceVGPR) const {
4768 if ((Mods != 0 || ForceVGPR) &&
4769 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4770
4771 // If we looked through copies to find source modifiers on an SGPR operand,
4772 // we now have an SGPR register source. To avoid potentially violating the
4773 // constant bus restriction, we need to insert a copy to a VGPR.
4774 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4775 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4776 TII.get(AMDGPU::COPY), VGPRSrc)
4777 .addReg(Src);
4778 Src = VGPRSrc;
4779 }
4780
4781 return Src;
4782}
4783
4784///
4785/// This will select either an SGPR or VGPR operand and will save us from
4786/// having to write an extra tablegen pattern.
4788AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4789 return {{
4790 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4791 }};
4792}
4793
4795AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4796 Register Src;
4797 unsigned Mods;
4798 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4799
4800 return {{
4801 [=](MachineInstrBuilder &MIB) {
4802 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4803 },
4804 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4805 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4806 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4807 }};
4808}
4809
4811AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4812 Register Src;
4813 unsigned Mods;
4814 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4815 /*IsCanonicalizing=*/true,
4816 /*AllowAbs=*/false);
4817
4818 return {{
4819 [=](MachineInstrBuilder &MIB) {
4820 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4821 },
4822 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4823 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4824 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4825 }};
4826}
4827
4829AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4830 return {{
4831 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4832 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4833 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4834 }};
4835}
4836
4838AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4839 Register Src;
4840 unsigned Mods;
4841 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4842
4843 return {{
4844 [=](MachineInstrBuilder &MIB) {
4845 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4846 },
4847 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4848 }};
4849}
4850
4852AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4853 MachineOperand &Root) const {
4854 Register Src;
4855 unsigned Mods;
4856 std::tie(Src, Mods) =
4857 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4858
4859 return {{
4860 [=](MachineInstrBuilder &MIB) {
4861 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4862 },
4863 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4864 }};
4865}
4866
4868AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4869 Register Src;
4870 unsigned Mods;
4871 std::tie(Src, Mods) =
4872 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4873 /*AllowAbs=*/false);
4874
4875 return {{
4876 [=](MachineInstrBuilder &MIB) {
4877 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4878 },
4879 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4880 }};
4881}
4882
4884AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4885 Register Reg = Root.getReg();
4886 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4887 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4888 return {};
4889 return {{
4890 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4891 }};
4892}
4893
4894enum class SrcStatus {
4899 // This means current op = [op_upper, op_lower] and src = -op_lower.
4902 // This means current op = [op_upper, op_lower] and src = [op_upper,
4903 // -op_lower].
4911};
4912/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4913static bool isTruncHalf(const MachineInstr *MI,
4914 const MachineRegisterInfo &MRI) {
4915 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4916 return false;
4917
4918 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4919 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4920 return DstSize * 2 == SrcSize;
4921}
4922
4923/// Test if the MI is logic shift right with half bits,
4924/// such as `%reg0:2n =G_LSHR %reg1:2n, CONST(n)`
4925static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4926 if (MI->getOpcode() != AMDGPU::G_LSHR)
4927 return false;
4928
4929 Register ShiftSrc;
4930 std::optional<ValueAndVReg> ShiftAmt;
4931 if (mi_match(MI->getOperand(0).getReg(), MRI,
4932 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4933 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4934 unsigned Shift = ShiftAmt->Value.getZExtValue();
4935 return Shift * 2 == SrcSize;
4936 }
4937 return false;
4938}
4939
4940/// Test if the MI is shift left with half bits,
4941/// such as `%reg0:2n =G_SHL %reg1:2n, CONST(n)`
4942static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4943 if (MI->getOpcode() != AMDGPU::G_SHL)
4944 return false;
4945
4946 Register ShiftSrc;
4947 std::optional<ValueAndVReg> ShiftAmt;
4948 if (mi_match(MI->getOperand(0).getReg(), MRI,
4949 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4950 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4951 unsigned Shift = ShiftAmt->Value.getZExtValue();
4952 return Shift * 2 == SrcSize;
4953 }
4954 return false;
4955}
4956
4957/// Test function, if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4958static bool isUnmergeHalf(const MachineInstr *MI,
4959 const MachineRegisterInfo &MRI) {
4960 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4961 return false;
4962 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4963 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4964}
4965
4967
4969 const MachineRegisterInfo &MRI) {
4970 LLT OpTy = MRI.getType(Reg);
4971 if (OpTy.isScalar())
4972 return TypeClass::SCALAR;
4973 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4976}
4977
4979 const MachineRegisterInfo &MRI) {
4980 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4981 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4982 return SrcStatus::INVALID;
4983
4984 switch (S) {
4985 case SrcStatus::IS_SAME:
4986 if (NegType == TypeClass::VECTOR_OF_TWO) {
4987 // Vector of 2:
4988 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4989 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4990 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4991 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4993 }
4994 if (NegType == TypeClass::SCALAR) {
4995 // Scalar:
4996 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4997 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4998 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4999 // [SrcHi, SrcLo] = [-OpHi, OpLo]
5000 return SrcStatus::IS_HI_NEG;
5001 }
5002 break;
5004 if (NegType == TypeClass::VECTOR_OF_TWO) {
5005 // Vector of 2:
5006 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5007 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
5008 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
5009 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
5010 return SrcStatus::IS_LO_NEG;
5011 }
5012 if (NegType == TypeClass::SCALAR) {
5013 // Scalar:
5014 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5015 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
5016 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
5017 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
5018 return SrcStatus::IS_SAME;
5019 }
5020 break;
5022 if (NegType == TypeClass::VECTOR_OF_TWO) {
5023 // Vector of 2:
5024 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
5025 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
5026 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
5027 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
5028 return SrcStatus::IS_HI_NEG;
5029 }
5030 if (NegType == TypeClass::SCALAR) {
5031 // Scalar:
5032 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
5033 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
5034 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
5035 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
5037 }
5038 break;
5040 if (NegType == TypeClass::VECTOR_OF_TWO) {
5041 // Vector of 2:
5042 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
5043 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
5044 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
5045 // [SrcHi, SrcLo] = [OpHi, OpLo]
5046 return SrcStatus::IS_SAME;
5047 }
5048 if (NegType == TypeClass::SCALAR) {
5049 // Scalar:
5050 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
5051 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
5052 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
5053 // [SrcHi, SrcLo] = [OpHi, -OpLo]
5054 return SrcStatus::IS_LO_NEG;
5055 }
5056 break;
5058 // Vector of 2:
5059 // Src = CurrUpper
5060 // Curr = [CurrUpper, CurrLower]
5061 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5062 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5063 // Src = -OpUpper
5064 //
5065 // Scalar:
5066 // Src = CurrUpper
5067 // Curr = [CurrUpper, CurrLower]
5068 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5069 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5070 // Src = -OpUpper
5073 if (NegType == TypeClass::VECTOR_OF_TWO) {
5074 // Vector of 2:
5075 // Src = CurrLower
5076 // Curr = [CurrUpper, CurrLower]
5077 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5078 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5079 // Src = -OpLower
5081 }
5082 if (NegType == TypeClass::SCALAR) {
5083 // Scalar:
5084 // Src = CurrLower
5085 // Curr = [CurrUpper, CurrLower]
5086 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5087 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5088 // Src = OpLower
5090 }
5091 break;
5093 // Vector of 2:
5094 // Src = -CurrUpper
5095 // Curr = [CurrUpper, CurrLower]
5096 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5097 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5098 // Src = -(-OpUpper) = OpUpper
5099 //
5100 // Scalar:
5101 // Src = -CurrUpper
5102 // Curr = [CurrUpper, CurrLower]
5103 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5104 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5105 // Src = -(-OpUpper) = OpUpper
5108 if (NegType == TypeClass::VECTOR_OF_TWO) {
5109 // Vector of 2:
5110 // Src = -CurrLower
5111 // Curr = [CurrUpper, CurrLower]
5112 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5113 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5114 // Src = -(-OpLower) = OpLower
5116 }
5117 if (NegType == TypeClass::SCALAR) {
5118 // Scalar:
5119 // Src = -CurrLower
5120 // Curr = [CurrUpper, CurrLower]
5121 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5122 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5123 // Src = -OpLower
5125 }
5126 break;
5127 default:
5128 break;
5129 }
5130 llvm_unreachable("unexpected SrcStatus & NegType combination");
5131}
5132
5133static std::optional<std::pair<Register, SrcStatus>>
5134calcNextStatus(std::pair<Register, SrcStatus> Curr,
5135 const MachineRegisterInfo &MRI) {
5136 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
5137
5138 unsigned Opc = MI->getOpcode();
5139
5140 // Handle general Opc cases.
5141 switch (Opc) {
5142 case AMDGPU::G_BITCAST:
5143 return std::optional<std::pair<Register, SrcStatus>>(
5144 {MI->getOperand(1).getReg(), Curr.second});
5145 case AMDGPU::COPY:
5146 if (MI->getOperand(1).getReg().isPhysical())
5147 return std::nullopt;
5148 return std::optional<std::pair<Register, SrcStatus>>(
5149 {MI->getOperand(1).getReg(), Curr.second});
5150 case AMDGPU::G_FNEG: {
5151 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
5152 if (Stat == SrcStatus::INVALID)
5153 return std::nullopt;
5154 return std::optional<std::pair<Register, SrcStatus>>(
5155 {MI->getOperand(1).getReg(), Stat});
5156 }
5157 default:
5158 break;
5159 }
5160
5161 // Calc next Stat from current Stat.
5162 switch (Curr.second) {
5163 case SrcStatus::IS_SAME:
5164 if (isTruncHalf(MI, MRI))
5165 return std::optional<std::pair<Register, SrcStatus>>(
5166 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
5167 else if (isUnmergeHalf(MI, MRI)) {
5168 if (Curr.first == MI->getOperand(0).getReg())
5169 return std::optional<std::pair<Register, SrcStatus>>(
5170 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
5171 return std::optional<std::pair<Register, SrcStatus>>(
5172 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
5173 }
5174 break;
5176 if (isTruncHalf(MI, MRI)) {
5177 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5178 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
5179 // = [OpLowerHi, OpLowerLo]
5180 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5181 // = [-OpLowerHi, OpLowerLo]
5182 // = -OpLower
5183 return std::optional<std::pair<Register, SrcStatus>>(
5184 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5185 }
5186 if (isUnmergeHalf(MI, MRI)) {
5187 if (Curr.first == MI->getOperand(0).getReg())
5188 return std::optional<std::pair<Register, SrcStatus>>(
5189 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5190 return std::optional<std::pair<Register, SrcStatus>>(
5191 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5192 }
5193 break;
5195 if (isShlHalf(MI, MRI))
5196 return std::optional<std::pair<Register, SrcStatus>>(
5197 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
5198 break;
5200 if (isLshrHalf(MI, MRI))
5201 return std::optional<std::pair<Register, SrcStatus>>(
5202 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
5203 break;
5205 if (isShlHalf(MI, MRI))
5206 return std::optional<std::pair<Register, SrcStatus>>(
5207 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5208 break;
5210 if (isLshrHalf(MI, MRI))
5211 return std::optional<std::pair<Register, SrcStatus>>(
5212 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5213 break;
5214 default:
5215 break;
5216 }
5217 return std::nullopt;
5218}
5219
5220/// Controls which source statuses the current MI supports. For example, a
5221/// non-floating-point intrinsic such as @llvm.amdgcn.sdot2 does not support the
5222/// NEG bit on VOP3P.
5223/// The class can be further extended to recognize support for the SEL, NEG and
5224/// ABS bits for different MIs on different architectures.
5226private:
5227 bool HasNeg = false;
5228 // Assume all complex pattern of VOP3P have opsel.
5229 bool HasOpsel = true;
5230
5231public:
5233 const MachineInstr *MI = MRI.getVRegDef(Reg);
5234 unsigned Opc = MI->getOpcode();
5235
5236 if (Opc == TargetOpcode::G_INTRINSIC) {
5237 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
5238 // Only float point intrinsic has neg & neg_hi bits.
5239 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
5240 HasNeg = true;
5242 // Keep same for generic op.
5243 HasNeg = true;
5244 }
5245 }
5246 bool checkOptions(SrcStatus Stat) const {
5247 if (!HasNeg &&
5248 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
5249 return false;
5250 }
5251 if (!HasOpsel &&
5252 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
5253 return false;
5254 }
5255 return true;
5256 }
5257};
5258
5261 int MaxDepth = 3) {
5262 int Depth = 0;
5263 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
5265
5266 while (Depth <= MaxDepth && Curr.has_value()) {
5267 Depth++;
5268 if (SO.checkOptions(Curr.value().second))
5269 Statlist.push_back(Curr.value());
5270 Curr = calcNextStatus(Curr.value(), MRI);
5271 }
5272
5273 return Statlist;
5274}
5275
5276static std::pair<Register, SrcStatus>
5278 int MaxDepth = 3) {
5279 int Depth = 0;
5280 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
5281 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
5282
5283 while (Depth <= MaxDepth && Curr.has_value()) {
5284 Depth++;
5285 SrcStatus Stat = Curr.value().second;
5286 if (SO.checkOptions(Stat)) {
5287 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
5289 LastSameOrNeg = Curr.value();
5290 }
5291 Curr = calcNextStatus(Curr.value(), MRI);
5292 }
5293
5294 return LastSameOrNeg;
5295}
5296
5297static bool isSameBitWidth(Register Reg1, Register Reg2,
5298 const MachineRegisterInfo &MRI) {
5299 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
5300 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
5301 return Width1 == Width2;
5302}
5303
5304static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5305 // SrcStatus::IS_LOWER_HALF remain 0.
5306 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5307 Mods ^= SISrcMods::NEG_HI;
5308 Mods |= SISrcMods::OP_SEL_1;
5309 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5310 Mods |= SISrcMods::OP_SEL_1;
5311 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5312 Mods ^= SISrcMods::NEG_HI;
5313 else if (HiStat == SrcStatus::IS_HI_NEG)
5314 Mods ^= SISrcMods::NEG_HI;
5315
5316 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5317 Mods ^= SISrcMods::NEG;
5318 Mods |= SISrcMods::OP_SEL_0;
5319 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5320 Mods |= SISrcMods::OP_SEL_0;
5321 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5322 Mods |= SISrcMods::NEG;
5323 else if (LoStat == SrcStatus::IS_HI_NEG)
5324 Mods ^= SISrcMods::NEG;
5325
5326 return Mods;
5327}
5328
5329static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5330 Register RootReg, const SIInstrInfo &TII,
5331 const MachineRegisterInfo &MRI) {
5332 auto IsHalfState = [](SrcStatus S) {
5335 };
5336 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
5337 IsHalfState(HiStat);
5338}
5339
5340std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5341 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5342 unsigned Mods = 0;
5343 // No modification if Root type is not form of <2 x Type>.
5344 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5345 Mods |= SISrcMods::OP_SEL_1;
5346 return {RootReg, Mods};
5347 }
5348
5349 SearchOptions SO(RootReg, MRI);
5350
5351 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
5352
5353 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5355 else if (Stat.second == SrcStatus::IS_HI_NEG)
5356 Mods ^= SISrcMods::NEG_HI;
5357 else if (Stat.second == SrcStatus::IS_LO_NEG)
5358 Mods ^= SISrcMods::NEG;
5359
5360 // 64-bit VOP3P instructions do not have OPSEL or ABS. Bail on v2f64 or v2i64.
5361 // TODO: Select NEG_LO and NEG_HI modifiers from BUILD_VECTOR.
5362 if (MRI.getType(RootReg).getSizeInBits() == 128) {
5363 Mods |= SISrcMods::OP_SEL_1; // Just the default, OPSEL unsupported.
5364 return {Stat.first, Mods};
5365 }
5366
5367 MachineInstr *MI = MRI.getVRegDef(Stat.first);
5368
5369 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5370 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5371 Mods |= SISrcMods::OP_SEL_1;
5372 return {Stat.first, Mods};
5373 }
5374
5376 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
5377
5378 if (StatlistHi.empty()) {
5379 Mods |= SISrcMods::OP_SEL_1;
5380 return {Stat.first, Mods};
5381 }
5382
5384 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
5385
5386 if (StatlistLo.empty()) {
5387 Mods |= SISrcMods::OP_SEL_1;
5388 return {Stat.first, Mods};
5389 }
5390
5391 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5392 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5393 if (StatlistHi[I].first == StatlistLo[J].first &&
5394 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
5395 StatlistHi[I].first, RootReg, TII, MRI))
5396 return {StatlistHi[I].first,
5397 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
5398 }
5399 }
5400 // Packed instructions do not have abs modifiers.
5401 Mods |= SISrcMods::OP_SEL_1;
5402
5403 return {Stat.first, Mods};
5404}
5405
5406// Removed unused function `getAllKindImm` to eliminate dead code.
5407
5408static bool checkRB(Register Reg, unsigned int RBNo,
5409 const AMDGPURegisterBankInfo &RBI,
5410 const MachineRegisterInfo &MRI,
5411 const TargetRegisterInfo &TRI) {
5412 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5413 return RB->getID() == RBNo;
5414}
5415
5416// This function is used to get the correct register bank for returned reg.
5417// Assume:
5418// 1. VOP3P is always legal for VGPR.
5419// 2. RootOp's regbank is legal.
5420// Thus
5421// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5422// 2. If RootOp is VGPR, then NewOp must be VGPR.
5424 const AMDGPURegisterBankInfo &RBI,
5426 const TargetRegisterInfo &TRI,
5427 const SIInstrInfo &TII) {
5428 // RootOp can only be VGPR or SGPR (some hand written cases such as.
5429 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
5430 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5431 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5432 return NewReg;
5433
5434 MachineInstr *MI = MRI.getVRegDef(RootReg);
5435 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
5436 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5437 return RootReg;
5438 }
5439
5440 MachineBasicBlock *BB = MI->getParent();
5441 Register DstReg = MRI.cloneVirtualRegister(RootReg);
5442
5444 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
5445 .addReg(NewReg);
5446
5447 // Only accept VGPR.
5448 return MIB->getOperand(0).getReg();
5449}
5450
5452AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5453 bool IsDOT) const {
5454 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5455 Register Reg;
5456 unsigned Mods;
5457 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
5458
5459 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
5460 return {{
5461 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5462 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5463 }};
5464}
5465
5467AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5468
5469 return selectVOP3PRetHelper(Root);
5470}
5471
5473AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5474
5475 return selectVOP3PRetHelper(Root, true);
5476}
5477
5479AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const {
5480 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5481 Register Src;
5482 unsigned Mods;
5483 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true /*IsDOT*/);
5484 if (Mods != SISrcMods::OP_SEL_1)
5485 return {};
5486
5487 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5488}
5489
5491AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const {
5492 Register Src;
5493 unsigned Mods;
5494 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5495
5496 return {{
5497 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5498 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5499 }};
5500}
5501
5503AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const {
5504 Register Src;
5505 unsigned Mods;
5506 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5507 if (Mods != SISrcMods::OP_SEL_1)
5508 return {};
5509
5510 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5511}
5512
5514AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5515 MachineOperand &Root) const {
5516 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5517 "expected i1 value");
5518 unsigned Mods = SISrcMods::OP_SEL_1;
5519 if (Root.getImm() != 0)
5520 Mods |= SISrcMods::OP_SEL_0;
5521
5522 return {{
5523 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5524 }};
5525}
5526
// Builds a REG_SEQUENCE at InsertPt that defines a fresh virtual register of
// class VReg_64 / VReg_128 / VReg_256 (for 2 / 4 / 8 elements) from the
// registers in Elts, and returns that new register. Any other element count
// hits llvm_unreachable.
// NOTE(review): the first line of this definition (return type, name and the
// Elts parameter) is absent from this listing; later call sites in this file
// use the name buildRegSequence — confirm against the original source.
5528 MachineInstr *InsertPt,
5529 MachineRegisterInfo &MRI) {
5530 const TargetRegisterClass *DstRegClass;
5531 switch (Elts.size()) {
5532 case 8:
5533 DstRegClass = &AMDGPU::VReg_256RegClass;
5534 break;
5535 case 4:
5536 DstRegClass = &AMDGPU::VReg_128RegClass;
5537 break;
5538 case 2:
5539 DstRegClass = &AMDGPU::VReg_64RegClass;
5540 break;
5541 default:
5542 llvm_unreachable("unhandled Reg sequence size");
5543 }
5544
5545 MachineIRBuilder B(*InsertPt);
5546 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5547 .addDef(MRI.createVirtualRegister(DstRegClass));
5548 for (unsigned i = 0; i < Elts.size(); ++i) {
5549 MIB.addReg(Elts[i]);
// NOTE(review): listing line 5550 is absent here; presumably it adds the
// sub-register index that pairs with Elts[i] — confirm against the original.
5551 }
// Operand 0 is the register defined by the addDef() call above.
5552 return MIB->getOperand(0).getReg();
5553}
5554
// Folds a modifier that was found on every element of a vector source into
// the source-modifier bits in Mods, and replaces Src with a register sequence
// built (via buildRegSequence) from the unmodified element registers.
//   ModOpcode == G_FNEG: sets NEG. If every element is additionally defined
//     by a G_FABS, NEG_HI is set as well and the fabs inputs are used.
//   otherwise (asserted to be G_FABS): sets NEG_HI.
// NOTE(review): a parameter line (listing line 5556) is absent from this
// listing; the body uses Elts and Src, which are presumably declared there.
5555static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5557 MachineInstr *InsertPt,
5558 MachineRegisterInfo &MRI) {
5559 if (ModOpcode == TargetOpcode::G_FNEG) {
5560 Mods |= SISrcMods::NEG;
5561 // Check if all elements also have abs modifier
5562 SmallVector<Register, 8> NegAbsElts;
5563 for (auto El : Elts) {
5564 Register FabsSrc;
5565 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5566 break;
5567 NegAbsElts.push_back(FabsSrc);
5568 }
// The loop stops at the first non-fabs element, so equal sizes mean that
// every element matched.
5569 if (Elts.size() != NegAbsElts.size()) {
5570 // Neg
5571 Src = buildRegSequence(Elts, InsertPt, MRI);
5572 } else {
5573 // Neg and Abs
5574 Mods |= SISrcMods::NEG_HI;
5575 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5576 }
5577 } else {
5578 assert(ModOpcode == TargetOpcode::G_FABS);
5579 // Abs
5580 Mods |= SISrcMods::NEG_HI;
5581 Src = buildRegSequence(Elts, InsertPt, MRI);
5582 }
5583}
5584
// Renders (source register, modifier immediate) for Root. When Root is
// defined by a G_BUILD_VECTOR whose sources are all G_FNEG, or all G_FABS,
// the modifier is folded into Mods and Src is rebuilt by
// selectWMMAModsNegAbs; otherwise Root's register is used with OP_SEL_1 only.
5586AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5587 Register Src = Root.getReg();
5588 unsigned Mods = SISrcMods::OP_SEL_1;
// NOTE(review): listing line 5589 is absent here; it presumably declares the
// EltsF32 container used below — confirm against the original file.
5590
5591 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5592 assert(BV->getNumSources() > 0);
5593 // Based on first element decide which mod we match, neg or abs
5594 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5595 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5596 ? AMDGPU::G_FNEG
5597 : AMDGPU::G_FABS;
// Collect the modifier inputs, stopping at the first source that is not
// defined by ModOpcode.
5598 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5599 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5600 if (ElF32->getOpcode() != ModOpcode)
5601 break;
5602 EltsF32.push_back(ElF32->getOperand(1).getReg());
5603 }
5604
5605 // All elements had ModOpcode modifier
5606 if (BV->getNumSources() == EltsF32.size()) {
5607 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5608 *MRI);
5609 }
5610 }
5611
5612 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5613 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5614}
5615
// Renders (source register, modifier immediate) for Root. When Root is
// defined by a G_CONCAT_VECTORS whose sources are all G_FNEG, both NEG and
// NEG_HI are set and Src becomes a register sequence of the fneg inputs;
// otherwise Root's register is used with OP_SEL_1 only.
5617AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5618 Register Src = Root.getReg();
5619 unsigned Mods = SISrcMods::OP_SEL_1;
5620 SmallVector<Register, 8> EltsV2F16;
5621
5622 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
// Collect the fneg inputs, stopping at the first source that is not an fneg.
5623 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5624 Register FNegSrc;
5625 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5626 break;
5627 EltsV2F16.push_back(FNegSrc);
5628 }
5629
5630 // All elements had ModOpcode modifier
5631 if (CV->getNumSources() == EltsV2F16.size()) {
5632 Mods |= SISrcMods::NEG;
5633 Mods |= SISrcMods::NEG_HI;
5634 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5635 }
5636 }
5637
5638 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5639 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5640}
5641
// Renders (source register, modifier immediate) for Root. When Root is
// defined by a G_CONCAT_VECTORS whose sources are all G_FNEG, or all G_FABS,
// the modifier is folded into Mods and Src is rebuilt by
// selectWMMAModsNegAbs; otherwise Root's register is used with OP_SEL_1 only.
5643AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5644 Register Src = Root.getReg();
5645 unsigned Mods = SISrcMods::OP_SEL_1;
5646 SmallVector<Register, 8> EltsV2F16;
5647
5648 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5649 assert(CV->getNumSources() > 0);
5650 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5651 // Based on first element decide which mod we match, neg or abs
5652 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5653 ? AMDGPU::G_FNEG
5654 : AMDGPU::G_FABS;
5655
// Collect the modifier inputs, stopping at the first source that is not
// defined by ModOpcode.
5656 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5657 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5658 if (ElV2F16->getOpcode() != ModOpcode)
5659 break;
5660 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5661 }
5662
5663 // All elements had ModOpcode modifier
5664 if (CV->getNumSources() == EltsV2F16.size()) {
// NOTE(review): B is not referenced again in this function and looks like a
// leftover; consider removing it.
5665 MachineIRBuilder B(*Root.getParent());
5666 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5667 *MRI);
5668 }
5669 }
5670
5671 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5672 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5673}
5674
// Renders Root as a single immediate when it is a constant (or constant
// splat) that TII accepts as an inline constant. Floating-point constants are
// tried first and rendered as the sign-extended bit pattern; integer constants
// are rendered as their sign-extended value. Returns an empty result in every
// other case.
5676AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5677 std::optional<FPValueAndVReg> FPValReg;
5678 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5679 if (TII.isInlineConstant(FPValReg->Value)) {
5680 return {{[=](MachineInstrBuilder &MIB) {
5681 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5682 }}};
5683 }
5684 // Non-inlineable splat floats should not fall-through for integer immediate
5685 // checks.
5686 return {};
5687 }
5688
5689 APInt ICst;
5690 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5691 if (TII.isInlineConstant(ICst)) {
5692 return {
5693 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5694 }
5695 }
5696
5697 return {};
5698}
5699
// Renders (source register, index_key immediate). Starting from the
// definition of Root with copies ignored, a logical shift right of a 32-bit
// value by a constant multiple of 8 is folded away: the shifted value becomes
// the source and the key becomes shift / 8. Otherwise the key is 0.
5701AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5702 Register Src =
5703 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5704 unsigned Key = 0;
5705
5706 Register ShiftSrc;
5707 std::optional<ValueAndVReg> ShiftAmt;
5708 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5709 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5710 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5711 Key = ShiftAmt->Value.getZExtValue() / 8;
5712 Src = ShiftSrc;
5713 }
5714
5715 return {{
5716 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5717 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5718 }};
5719}
5720
// Renders (source register, index_key immediate). Starting from the
// definition of Root with copies ignored, a logical shift right of a 32-bit
// value by exactly 16 is folded away: the shifted value becomes the source
// and the key becomes 1. Otherwise the key is 0.
5722AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5723
5724 Register Src =
5725 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5726 unsigned Key = 0;
5727
5728 Register ShiftSrc;
5729 std::optional<ValueAndVReg> ShiftAmt;
5730 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5731 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5732 ShiftAmt->Value.getZExtValue() == 16) {
5733 Src = ShiftSrc;
5734 Key = 1;
5735 }
5736
5737 return {{
5738 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5739 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5740 }};
5741}
5742
// Renders (source register, index_key immediate). If the source is a zero- or
// any-extension of a 32-bit value that is the first result of a two-result
// G_UNMERGE_VALUES (directly or through a copy), the unmerge input becomes the
// source and the key becomes 1. Otherwise the key is 0.
5744AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5745 Register Src =
5746 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5747 unsigned Key = 0;
5748
// Prefer a zero-extension match and fall back to an any-extension match.
5749 Register S32 = matchZeroExtendFromS32(Src);
5750 if (!S32)
5751 S32 = matchAnyExtendFromS32(Src);
5752
5753 if (S32) {
5754 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5755 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
// Three operands: two results (operands 0 and 1) and one input (operand 2).
5756 assert(Def->getNumOperands() == 3);
5757 Register DstReg1 = Def->getOperand(1).getReg();
5758 if (mi_match(S32, *MRI,
5759 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5760 Src = Def->getOperand(2).getReg();
5761 Key = 1;
5762 }
5763 }
5764 }
5765
5766 return {{
5767 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5768 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5769 }};
5770}
5771
// Renders (source register, source-modifier immediate) as computed by
// selectVOP3ModsImpl with its default arguments. op_sel is not handled yet
// (see the FIXME below).
5773AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5774 Register Src;
5775 unsigned Mods;
5776 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5777
5778 // FIXME: Handle op_sel
5779 return {{
5780 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5781 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5782 }};
5783}
5784
5785// FIXME-TRUE16 remove when fake16 is removed
// Renders (source register, src0 modifier immediate) using selectVOP3ModsImpl
// with IsCanonicalizing = true, AllowAbs = false and OpSel = false. The
// register is passed through copyToVGPRIfSrcFolded with ForceVGPR = true
// before being added.
5787AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5788 Register Src;
5789 unsigned Mods;
5790 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5791 /*IsCanonicalizing=*/true,
5792 /*AllowAbs=*/false,
5793 /*OpSel=*/false);
5794
5795 return {{
5796 [=](MachineInstrBuilder &MIB) {
5797 MIB.addReg(
5798 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5799 },
5800 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5801 }};
5802}
5803
// Same as selectVINTERPMods except that selectVOP3ModsImpl is called with
// OpSel = true. Renders (source register, src0 modifier immediate); the
// register is passed through copyToVGPRIfSrcFolded with ForceVGPR = true.
5805AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5806 Register Src;
5807 unsigned Mods;
5808 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5809 /*IsCanonicalizing=*/true,
5810 /*AllowAbs=*/false,
5811 /*OpSel=*/true);
5812
5813 return {{
5814 [=](MachineInstrBuilder &MIB) {
5815 MIB.addReg(
5816 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5817 },
5818 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5819 }};
5820}
5821
5822// Given \p Offset and load specified by the \p Root operand check if \p Offset
5823// is a multiple of the load byte size. If it is update \p Offset to a
5824// pre-scaled value and return true.
//
// Always returns false when the subtarget lacks scale-offset support or when
// the size of the first memory operand is unknown. \p IsSigned selects which
// extension and multiply opcodes are accepted.
// NOTE(review): the parameter line declaring Offset (listing line 5826) and
// several matcher lines further down are absent from this listing; the
// comments below describe only what is visible.
5825bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5827 bool IsSigned) const {
5828 if (!Subtarget->hasScaleOffset())
5829 return false;
5830
5831 const MachineInstr &MI = *Root.getParent();
5832 MachineMemOperand *MMO = *MI.memoperands_begin();
5833
5834 if (!MMO->getSize().hasValue())
5835 return false;
5836
5837 uint64_t Size = MMO->getSize().getValue();
5838
// Look through an extension from 32 bits (if there is one) and through copies
// before matching the scaling operation.
5839 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5840 if (!OffsetReg)
5841 OffsetReg = Offset;
5842
5843 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5844 OffsetReg = Def->Reg;
5845
// Accepted forms: a shift (only when Size is a power of two), a multiply by
// Size, or a MAD whose multiplier is Size and whose addend is zero. Op0
// receives the unscaled offset.
5846 Register Op0;
5847 MachineInstr *Mul;
5848 bool ScaleOffset =
5849 (isPowerOf2_64(Size) &&
5850 mi_match(OffsetReg, *MRI,
5851 m_GShl(m_Reg(Op0),
// NOTE(review): listing lines 5852-5853 and 5855 are absent around here;
// presumably they hold the shift-amount matcher and the start of a multiply
// matcher — confirm against the original file.
5854 mi_match(OffsetReg, *MRI,
5856 m_Copy(m_SpecificICst(Size))))) ||
5857 mi_match(
5858 OffsetReg, *MRI,
5859 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5860 m_Reg(Op0), m_SpecificICst(Size))) ||
5861 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5862 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5863 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5864 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5865 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5866 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5867 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5868 mi_match(Mul->getOperand(3).getReg(), *MRI,
// NOTE(review): listing line 5869 is absent here.
5870 m_Copy(m_SpecificICst(Size))))) &&
5871 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5872
// Offset is only overwritten when a scaled form was recognised.
5873 if (ScaleOffset)
5874 Offset = Op0;
5875
5876 return ScaleOffset;
5877}
5878
5879bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5880 Register &Base,
5881 Register *SOffset,
5882 int64_t *Offset,
5883 bool *ScaleOffset) const {
5884 MachineInstr *MI = Root.getParent();
5885 MachineBasicBlock *MBB = MI->getParent();
5886
5887 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5888 // then we can select all ptr + 32-bit offsets.
5889 SmallVector<GEPInfo, 4> AddrInfo;
5890 getAddrModeInfo(*MI, *MRI, AddrInfo);
5891
5892 if (AddrInfo.empty())
5893 return false;
5894
5895 const GEPInfo &GEPI = AddrInfo[0];
5896 std::optional<int64_t> EncodedImm;
5897
5898 if (ScaleOffset)
5899 *ScaleOffset = false;
5900
5901 if (SOffset && Offset) {
5902 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5903 /*HasSOffset=*/true);
5904 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5905 AddrInfo.size() > 1) {
5906 const GEPInfo &GEPI2 = AddrInfo[1];
5907 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5908 Register OffsetReg = GEPI2.SgprParts[1];
5909 if (ScaleOffset)
5910 *ScaleOffset =
5911 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5912 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5913 if (OffsetReg) {
5914 Base = GEPI2.SgprParts[0];
5915 *SOffset = OffsetReg;
5916 *Offset = *EncodedImm;
5917 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5918 return true;
5919
5920 // For unbuffered smem loads, it is illegal for the Immediate Offset
5921 // to be negative if the resulting (Offset + (M0 or SOffset or zero)
5922 // is negative. Handle the case where the Immediate Offset + SOffset
5923 // is negative.
5924 auto SKnown = VT->getKnownBits(*SOffset);
5925 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5926 return false;
5927
5928 return true;
5929 }
5930 }
5931 }
5932 return false;
5933 }
5934
5935 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5936 /*HasSOffset=*/false);
5937 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5938 Base = GEPI.SgprParts[0];
5939 *Offset = *EncodedImm;
5940 return true;
5941 }
5942
5943 // SGPR offset is unsigned.
5944 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5945 GEPI.Imm != 0) {
5946 // If we make it this far we have a load with an 32-bit immediate offset.
5947 // It is OK to select this using a sgpr offset, because we have already
5948 // failed trying to select this load into one of the _IMM variants since
5949 // the _IMM Patterns are considered before the _SGPR patterns.
5950 Base = GEPI.SgprParts[0];
5951 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5952 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5953 .addImm(GEPI.Imm);
5954 return true;
5955 }
5956
5957 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5958 Register OffsetReg = GEPI.SgprParts[1];
5959 if (ScaleOffset)
5960 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5961 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5962 if (OffsetReg) {
5963 Base = GEPI.SgprParts[0];
5964 *SOffset = OffsetReg;
5965 return true;
5966 }
5967 }
5968
5969 return false;
5970}
5971
// Base + immediate form: asks selectSmrdOffset for a base register and an
// encoded immediate only (no SGPR offset, no scale flag) and renders them in
// that order. Returns std::nullopt if the address does not match.
5973AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5974 Register Base;
5975 int64_t Offset;
5976 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5977 /* ScaleOffset */ nullptr))
5978 return std::nullopt;
5979
5980 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5981 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5982}
5983
// Base + 32-bit literal offset form. Requires the outermost address entry to
// have exactly one SGPR part and an immediate that
// getSMRDEncodedLiteralOffset32 can encode; renders the pointer register and
// the encoded offset. Returns std::nullopt otherwise.
5985AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5986 SmallVector<GEPInfo, 4> AddrInfo;
5987 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5988
5989 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5990 return std::nullopt;
5991
5992 const GEPInfo &GEPInfo = AddrInfo[0];
5993 Register PtrReg = GEPInfo.SgprParts[0];
5994 std::optional<int64_t> EncodedImm =
5995 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5996 if (!EncodedImm)
5997 return std::nullopt;
5998
5999 return {{
6000 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
6001 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
6002 }};
6003}
6004
// Base + SGPR offset form: asks selectSmrdOffset for a base and an SGPR
// offset (no immediate) and renders base, offset and a cache-policy immediate
// that is CPol::SCAL when the offset was recognised as scaled, else 0.
// Returns std::nullopt if the address does not match.
6006AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
6007 Register Base, SOffset;
6008 bool ScaleOffset;
6009 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
6010 &ScaleOffset))
6011 return std::nullopt;
6012
6013 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
6014 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
6015 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6016 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
6017}
6018
// Base + SGPR offset + immediate form: asks selectSmrdOffset for all three
// parts and renders base, SGPR offset, immediate and a cache-policy immediate
// that is CPol::SCAL when the offset was recognised as scaled, else 0.
// Returns std::nullopt if the address does not match.
6020AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
6021 Register Base, SOffset;
6022 int64_t Offset;
6023 bool ScaleOffset;
6024 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
6025 return std::nullopt;
6026
6027 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
6028 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
6029 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6030 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6031 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
6032}
6033
6034std::pair<Register, int> AMDGPUInstructionSelector::selectFlatOffsetImpl(
6035 MachineOperand &Root, AMDGPU::FlatAddrSpace FlatVariant) const {
6036 MachineInstr *MI = Root.getParent();
6037
6038 auto Default = std::pair(Root.getReg(), 0);
6039
6040 if (!STI.hasFlatInstOffsets())
6041 return Default;
6042
6043 Register PtrBase;
6044 int64_t ConstOffset;
6045 bool IsInBounds;
6046 std::tie(PtrBase, ConstOffset, IsInBounds) =
6047 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6048
6049 // Adding the offset to the base address with an immediate in a FLAT
6050 // instruction must not change the memory aperture in which the address falls.
6051 // Therefore we can only fold offsets from inbounds GEPs into FLAT
6052 // instructions.
6053 if (ConstOffset == 0 ||
6054 (FlatVariant == AMDGPU::FlatAddrSpace::FlatScratch &&
6055 !isFlatScratchBaseLegal(Root.getReg())) ||
6056 (FlatVariant == AMDGPU::FlatAddrSpace::FLAT && !IsInBounds))
6057 return Default;
6058
6059 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
6060 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
6061 return Default;
6062
6063 return std::pair(PtrBase, ConstOffset);
6064}
6065
// Renders (pointer register, immediate offset) as chosen by
// selectFlatOffsetImpl for the FLAT variant.
6067AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
6068 auto PtrWithOffset = selectFlatOffsetImpl(Root, AMDGPU::FlatAddrSpace::FLAT);
6069
6070 return {{
6071 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
6072 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
6073 }};
6074}
6075
// Renders (pointer register, immediate offset) as chosen by
// selectFlatOffsetImpl for the FlatGlobal variant.
6077AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
6078 auto PtrWithOffset =
6079 selectFlatOffsetImpl(Root, AMDGPU::FlatAddrSpace::FlatGlobal);
6080
6081 return {{
6082 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
6083 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
6084 }};
6085}
6086
// Renders (pointer register, immediate offset) as chosen by
// selectFlatOffsetImpl for the FlatScratch variant.
6088AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
6089 auto PtrWithOffset =
6090 selectFlatOffsetImpl(Root, AMDGPU::FlatAddrSpace::FlatScratch);
6091
6092 return {{
6093 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
6094 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
6095 }};
6096}
6097
6098// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
//
// Renders saddr, voffset, the immediate offset (only when NeedIOffset is
// true) and a cache-policy immediate built from CPolBits. Three outcomes are
// possible besides std::nullopt:
//   1. SGPR base + positive constant that is too large (or not wanted) as an
//      immediate: the remainder is moved into a new VGPR used as voffset.
//   2. SGPR base + variable offset that is (an extension of) a 32-bit value:
//      that value is used as voffset, with CPol::SCAL added when
//      selectScaleOffset recognises a scaled offset.
//   3. Plain SGPR address: a VGPR holding zero is created for voffset.
// NOTE(review): two argument lines (listing lines 6116 and 6132) are absent
// from this listing; see the notes at those positions.
6100AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
6101 unsigned CPolBits,
6102 bool NeedIOffset) const {
6103 Register Addr = Root.getReg();
6104 Register PtrBase;
6105 int64_t ConstOffset;
6106 int64_t ImmOffset = 0;
6107
6108 // Match the immediate offset first, which canonically is moved as low as
6109 // possible.
6110 std::tie(PtrBase, ConstOffset, std::ignore) =
6111 getPtrBaseWithConstantOffset(Addr, *MRI);
6112
6113 if (ConstOffset != 0) {
6114 if (NeedIOffset &&
6115 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
// NOTE(review): listing line 6116 is absent here; presumably it supplies the
// remaining argument and closes the condition — confirm against the original.
6117 Addr = PtrBase;
6118 ImmOffset = ConstOffset;
6119 } else {
6120 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
6121 if (isSGPR(PtrBaseDef->Reg)) {
6122 if (ConstOffset > 0) {
6123 // Offset is too large.
6124 //
6125 // saddr + large_offset -> saddr +
6126 // (voffset = large_offset & ~MaxOffset) +
6127 // (large_offset & MaxOffset);
6128 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
6129 if (NeedIOffset) {
6130 std::tie(SplitImmOffset, RemainderOffset) =
6131 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
// NOTE(review): listing line 6132 is absent here; presumably it supplies the
// remaining argument of splitFlatOffset — confirm against the original.
6133 }
6134
// The remainder has to fit the 32-bit voffset: signed when the subtarget has
// a signed GVS offset, unsigned otherwise.
6135 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
6136 : isUInt<32>(RemainderOffset)) {
6137 MachineInstr *MI = Root.getParent();
6138 MachineBasicBlock *MBB = MI->getParent();
6139 Register HighBits =
6140 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6141
6142 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6143 HighBits)
6144 .addImm(RemainderOffset);
6145
6146 if (NeedIOffset)
6147 return {{
6148 [=](MachineInstrBuilder &MIB) {
6149 MIB.addReg(PtrBase);
6150 }, // saddr
6151 [=](MachineInstrBuilder &MIB) {
6152 MIB.addReg(HighBits);
6153 }, // voffset
6154 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
6155 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
6156 }};
6157 return {{
6158 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
6159 [=](MachineInstrBuilder &MIB) {
6160 MIB.addReg(HighBits);
6161 }, // voffset
6162 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
6163 }};
6164 }
6165 }
6166
6167 // We are adding a 64 bit SGPR and a constant. If constant bus limit
6168 // is 1 we would need to perform 1 or 2 extra moves for each half of
6169 // the constant and it is better to do a scalar add and then issue a
6170 // single VALU instruction to materialize zero. Otherwise it is less
6171 // instructions to perform VALU adds with immediates or inline literals.
6172 unsigned NumLiterals =
6173 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
6174 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
6175 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
6176 return std::nullopt;
6177 }
6178 }
6179 }
6180
6181 // Match the variable offset.
6182 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6183 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6184 // Look through the SGPR->VGPR copy.
6185 Register SAddr =
6186 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
6187
6188 if (isSGPR(SAddr)) {
6189 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
6190
6191 // It's possible voffset is an SGPR here, but the copy to VGPR will be
6192 // inserted later.
// selectScaleOffset may rewrite PtrBaseOffset to the unscaled offset before
// the extension match below.
6193 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
6194 Subtarget->hasSignedGVSOffset());
6195 if (Register VOffset = matchExtendFromS32OrS32(
6196 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
6197 if (NeedIOffset)
6198 return {{[=](MachineInstrBuilder &MIB) { // saddr
6199 MIB.addReg(SAddr);
6200 },
6201 [=](MachineInstrBuilder &MIB) { // voffset
6202 MIB.addReg(VOffset);
6203 },
6204 [=](MachineInstrBuilder &MIB) { // offset
6205 MIB.addImm(ImmOffset);
6206 },
6207 [=](MachineInstrBuilder &MIB) { // cpol
6208 MIB.addImm(CPolBits |
6209 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6210 }}};
6211 return {{[=](MachineInstrBuilder &MIB) { // saddr
6212 MIB.addReg(SAddr);
6213 },
6214 [=](MachineInstrBuilder &MIB) { // voffset
6215 MIB.addReg(VOffset);
6216 },
6217 [=](MachineInstrBuilder &MIB) { // cpol
6218 MIB.addImm(CPolBits |
6219 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6220 }}};
6221 }
6222 }
6223 }
6224
6225 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
6226 // drop this.
6227 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
6228 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
6229 return std::nullopt;
6230
6231 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
6232 // moves required to copy a 64-bit SGPR to VGPR.
6233 MachineInstr *MI = Root.getParent();
6234 MachineBasicBlock *MBB = MI->getParent();
6235 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6236
6237 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
6238 .addImm(0);
6239
6240 if (NeedIOffset)
6241 return {{
6242 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6243 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6244 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6245 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6246 }};
6247 return {{
6248 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6249 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6250 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6251 }};
6252}
6253
// Convenience overload: forwards to the multi-argument selectGlobalSAddr with
// no cache-policy bits (CPolBits = 0).
6255AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
6256 return selectGlobalSAddr(Root, 0);
6257}
6258
// Forwards to selectGlobalSAddr with the cache-policy immediate read from the
// last operand of Root's parent instruction, with the SCAL bit cleared.
6260AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
6261 const MachineInstr &I = *Root.getParent();
6262
6263 // We are assuming CPol is always the last operand of the intrinsic.
6264 auto PassedCPol =
6265 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6266 return selectGlobalSAddr(Root, PassedCPol);
6267}
6268
// Forwards to selectGlobalSAddr with the cache-policy immediate read from the
// second-to-last operand of Root's parent instruction, with the SCAL bit
// cleared.
6270AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
6271 const MachineInstr &I = *Root.getParent();
6272
6273 // We are assuming CPol is second from last operand of the intrinsic.
6274 auto PassedCPol =
6275 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6276 return selectGlobalSAddr(Root, PassedCPol);
6277}
6278
// Forwards to selectGlobalSAddr with CPol::GLC as the cache-policy bits.
6280AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
6281 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
6282}
6283
// Like selectGlobalSAddrCPol (cache policy from the last operand, SCAL bit
// cleared) but calls selectGlobalSAddr with NeedIOffset = false, so no
// immediate-offset operand is rendered.
6285AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
6286 MachineOperand &Root) const {
6287 const MachineInstr &I = *Root.getParent();
6288
6289 // We are assuming CPol is always the last operand of the intrinsic.
6290 auto PassedCPol =
6291 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6292 return selectGlobalSAddr(Root, PassedCPol, false);
6293}
6294
// Like selectGlobalSAddrCPolM0 (cache policy from the second-to-last operand,
// SCAL bit cleared) but calls selectGlobalSAddr with NeedIOffset = false, so
// no immediate-offset operand is rendered.
6296AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
6297 MachineOperand &Root) const {
6298 const MachineInstr &I = *Root.getParent();
6299
6300 // We are assuming CPol is second from last operand of the intrinsic.
6301 auto PassedCPol =
6302 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6303 return selectGlobalSAddr(Root, PassedCPol, false);
6304}
6305
// Renders (saddr, immediate offset) for a scratch access. saddr is either a
// frame index, a new SGPR holding frame index + SGPR (built with S_ADD_I32),
// or the address register itself when it is an SGPR. Returns std::nullopt
// when the resulting saddr is not an SGPR.
// NOTE(review): one argument line (listing line 6320) is absent from this
// listing; see the note at that position.
6307AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
6308 Register Addr = Root.getReg();
6309 Register PtrBase;
6310 int64_t ConstOffset;
6311 int64_t ImmOffset = 0;
6312
6313 // Match the immediate offset first, which canonically is moved as low as
6314 // possible.
6315 std::tie(PtrBase, ConstOffset, std::ignore) =
6316 getPtrBaseWithConstantOffset(Addr, *MRI);
6317
6318 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
6319 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
// NOTE(review): listing line 6320 is absent here; presumably it supplies the
// remaining argument and closes the condition — confirm against the original.
6321 Addr = PtrBase;
6322 ImmOffset = ConstOffset;
6323 }
6324
6325 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6326 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6327 int FI = AddrDef->MI->getOperand(1).getIndex();
6328 return {{
6329 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6330 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6331 }};
6332 }
6333
6334 Register SAddr = AddrDef->Reg;
6335
// Fold (frame index + SGPR) into a single scalar add whose result is saddr.
6336 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6337 Register LHS = AddrDef->MI->getOperand(1).getReg();
6338 Register RHS = AddrDef->MI->getOperand(2).getReg();
6339 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6340 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
6341
6342 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
6343 isSGPR(RHSDef->Reg)) {
6344 int FI = LHSDef->MI->getOperand(1).getIndex();
6345 MachineInstr &I = *Root.getParent();
6346 MachineBasicBlock *BB = I.getParent();
6347 const DebugLoc &DL = I.getDebugLoc();
6348 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6349
6350 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
6351 .addFrameIndex(FI)
6352 .addReg(RHSDef->Reg)
6353 .setOperandDead(3); // Dead scc
6354 }
6355 }
6356
6357 if (!isSGPR(SAddr))
6358 return std::nullopt;
6359
6360 return {{
6361 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
6362 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6363 }};
6364}
6365
6366// Check whether the flat scratch SVS swizzle bug affects this access.
6367bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6368 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6369 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6370 return false;
6371
6372 // The bug affects the swizzling of SVS accesses if there is any carry out
6373 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6374 // voffset to (soffset + inst_offset).
6375 auto VKnown = VT->getKnownBits(VAddr);
6376 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
6377 KnownBits::makeConstant(APInt(32, ImmOffset)));
6378 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6379 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
6380 return (VMax & 3) + (SMax & 3) >= 4;
6381}
6382
// Renders (vaddr, saddr, immediate offset, cache policy) for a scratch access
// whose address is a G_PTR_ADD with a VGPR-bank right-hand side. saddr is
// either a frame index or an SGPR. Returns std::nullopt when the address is
// not such a G_PTR_ADD, the base is not legal, the SVS swizzle bug check
// fires, or the left-hand side is not an SGPR.
// NOTE(review): two lines (listing lines 6398 and 6426) are absent from this
// listing; see the notes at those positions.
6384AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
6385 Register Addr = Root.getReg();
6386 Register PtrBase;
6387 int64_t ConstOffset;
6388 int64_t ImmOffset = 0;
6389
6390 // Match the immediate offset first, which canonically is moved as low as
6391 // possible.
6392 std::tie(PtrBase, ConstOffset, std::ignore) =
6393 getPtrBaseWithConstantOffset(Addr, *MRI);
6394
6395 Register OrigAddr = Addr;
6396 if (ConstOffset != 0 &&
6397 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
// NOTE(review): listing line 6398 is absent here; presumably it supplies the
// remaining argument and closes the condition — confirm against the original.
6399 Addr = PtrBase;
6400 ImmOffset = ConstOffset;
6401 }
6402
6403 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6404 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6405 return std::nullopt;
6406
6407 Register RHS = AddrDef->MI->getOperand(2).getReg();
6408 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6409 return std::nullopt;
6410
6411 Register LHS = AddrDef->MI->getOperand(1).getReg();
6412 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6413
// OrigAddr != Addr means an immediate offset was peeled off above, which
// selects the legality check that accounts for the immediate.
6414 if (OrigAddr != Addr) {
6415 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
6416 return std::nullopt;
6417 } else {
6418 if (!isFlatScratchBaseLegalSV(OrigAddr))
6419 return std::nullopt;
6420 }
6421
6422 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
6423 return std::nullopt;
6424
// selectScaleOffset may rewrite RHS to the unscaled offset register.
6425 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
// NOTE(review): listing line 6426 is absent here; presumably it is the
// "true" arm of this conditional — confirm against the original.
6427 : 0;
6428
6429 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6430 int FI = LHSDef->MI->getOperand(1).getIndex();
6431 return {{
6432 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6433 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6434 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6435 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6436 }};
6437 }
6438
// If LHS itself is not an SGPR, retry with its source with copies ignored.
6439 if (!isSGPR(LHS))
6440 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
6441 LHS = Def->Reg;
6442
6443 if (!isSGPR(LHS))
6444 return std::nullopt;
6445
6446 return {{
6447 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6448 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
6449 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6450 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6451 }};
6452}
6453
// Renders (rsrc, vaddr, soffset, immediate offset) for a MUBUF scratch access
// in offen form. rsrc is always the function's scratch resource register and
// soffset is always the constant 0.
//   - Constant address: the bits above the maximum MUBUF immediate are moved
//     into a new VGPR used as vaddr; the low bits become the immediate.
//   - Otherwise: vaddr is a frame index or a register, with a legal constant
//     offset folded into the immediate where possible.
// NOTE(review): one condition line (listing line 6463) is absent from this
// listing; see the note at that position.
6455AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
6456 MachineInstr *MI = Root.getParent();
6457 MachineBasicBlock *MBB = MI->getParent();
6458 MachineFunction *MF = MBB->getParent();
6459 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6460
6461 int64_t Offset = 0;
6462 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
// NOTE(review): listing line 6463 is absent here; it holds the rest of this
// condition — confirm against the original file.
6464 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6465
6466 // TODO: Should this be inside the render function? The iterator seems to
6467 // move.
6468 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
6469 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6470 HighBits)
6471 .addImm(Offset & ~MaxOffset);
6472
6473 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6474 MIB.addReg(Info->getScratchRSrcReg());
6475 },
6476 [=](MachineInstrBuilder &MIB) { // vaddr
6477 MIB.addReg(HighBits);
6478 },
6479 [=](MachineInstrBuilder &MIB) { // soffset
6480 // Use constant zero for soffset and rely on eliminateFrameIndex
6481 // to choose the appropriate frame register if need be.
6482 MIB.addImm(0);
6483 },
6484 [=](MachineInstrBuilder &MIB) { // offset
6485 MIB.addImm(Offset & MaxOffset);
6486 }}};
6487 }
6488
6489 assert(Offset == 0 || Offset == -1);
6490
6491 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6492 // offsets.
6493 std::optional<int> FI;
6494 Register VAddr = Root.getReg();
6495
6496 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6497 Register PtrBase;
6498 int64_t ConstOffset;
6499 std::tie(PtrBase, ConstOffset, std::ignore) =
6500 getPtrBaseWithConstantOffset(VAddr, *MRI);
6501 if (ConstOffset != 0) {
// Fold the constant only if it is a legal MUBUF immediate and, when private
// memory is range checked, the base is known to be non-negative.
6502 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6503 (!STI.privateMemoryResourceIsRangeChecked() ||
6504 VT->signBitIsZero(PtrBase))) {
6505 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6506 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6507 FI = PtrBaseDef->getOperand(1).getIndex();
6508 else
6509 VAddr = PtrBase;
6510 Offset = ConstOffset;
6511 }
6512 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6513 FI = RootDef->getOperand(1).getIndex();
6514 }
6515
6516 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6517 MIB.addReg(Info->getScratchRSrcReg());
6518 },
6519 [=](MachineInstrBuilder &MIB) { // vaddr
6520 if (FI)
6521 MIB.addFrameIndex(*FI);
6522 else
6523 MIB.addReg(VAddr);
6524 },
6525 [=](MachineInstrBuilder &MIB) { // soffset
6526 // Use constant zero for soffset and rely on eliminateFrameIndex
6527 // to choose the appropriate frame register if need be.
6528 MIB.addImm(0);
6529 },
6530 [=](MachineInstrBuilder &MIB) { // offset
6531 MIB.addImm(Offset);
6532 }}};
6533}
6534
6535bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6536 int64_t Offset) const {
6537 if (!isUInt<16>(Offset))
6538 return false;
6539
6540 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6541 return true;
6542
6543 // On Southern Islands instruction with a negative base value and an offset
6544 // don't seem to work.
6545 return VT->signBitIsZero(Base);
6546}
6547
6548bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6549 int64_t Offset1,
6550 unsigned Size) const {
6551 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6552 return false;
6553 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6554 return false;
6555
6556 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6557 return true;
6558
6559 // On Southern Islands instruction with a negative base value and an offset
6560 // don't seem to work.
6561 return VT->signBitIsZero(Base);
6562}
6563
6564// Return whether the operation has NoUnsignedWrap property.
// A G_OR is always accepted. A G_PTR_ADD is accepted only if a further
// condition holds.
// NOTE(review): that further condition sits on embedded line 6568, which is
// absent from this listing, so it is not described here.
6565static bool isNoUnsignedWrap(MachineInstr *Addr) {
6566 return Addr->getOpcode() == TargetOpcode::G_OR ||
6567 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6569}
6570
6571// Check that the base address of flat scratch load/store in the form of `base +
6572// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6573// requirement). We always treat the first operand as the base address here.
// Returns true when the base is acceptable. The checks run in this order:
// no-unsigned-wrap add, subtarget with signed scratch offsets, small negative
// constant offset on a G_PTR_ADD, and finally a known-zero sign bit on the
// first operand.
6574bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6575 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6576
6577 if (isNoUnsignedWrap(AddrMI))
6578 return true;
6579
6580 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6581 // values.
6582 if (STI.hasSignedScratchOffsets())
6583 return true;
6584
// Operand 1 is treated as the base, operand 2 as the offset.
6585 Register LHS = AddrMI->getOperand(1).getReg();
6586 Register RHS = AddrMI->getOperand(2).getReg();
6587
6588 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
// NOTE(review): the initializer of RhsValReg (embedded line 6590) is absent
// from this listing; presumably it looks up a constant value for RHS.
6589 std::optional<ValueAndVReg> RhsValReg =
6591 // If the immediate offset is negative and within certain range, the base
6592 // address cannot also be negative. If the base is also negative, the sum
6593 // would be either negative or much larger than the valid range of scratch
6594 // memory a thread can access.
// The accepted range is the open interval (-0x40000000, 0).
6595 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6596 RhsValReg->Value.getSExtValue() > -0x40000000)
6597 return true;
6598 }
6599
6600 return VT->signBitIsZero(LHS);
6601}
6602
6603// Check address value in SGPR/VGPR are legal for flat scratch in the form
6604// of: SGPR + VGPR.
6605bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6606 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6607
6608 if (isNoUnsignedWrap(AddrMI))
6609 return true;
6610
6611 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6612 // values.
6613 if (STI.hasSignedScratchOffsets())
6614 return true;
6615
6616 Register LHS = AddrMI->getOperand(1).getReg();
6617 Register RHS = AddrMI->getOperand(2).getReg();
6618 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6619}
6620
6621// Check address value in SGPR/VGPR are legal for flat scratch in the form
6622// of: SGPR + VGPR + Imm.
// \p Addr is expected to be the outer add; its operand 1 is the inner
// (SGPR + VGPR) value and its operand 2 supplies the immediate.
6623bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6624 Register Addr) const {
6625 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6626 // values.
6627 if (STI.hasSignedScratchOffsets())
6628 return true;
6629
6630 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6631 Register Base = AddrMI->getOperand(1).getReg();
// NOTE(review): the initializers of BaseDef and RHSOffset (embedded lines
// 6633 and 6635) are absent from this listing. Presumably BaseDef resolves
// the definition of Base and RHSOffset the constant in operand 2 of AddrMI;
// confirm against the full file.
6632 std::optional<DefinitionAndSourceRegister> BaseDef =
6634 std::optional<ValueAndVReg> RHSOffset =
6636 assert(RHSOffset);
6637
6638 // If the immediate offset is negative and within certain range, the base
6639 // address cannot also be negative. If the base is also negative, the sum
6640 // would be either negative or much larger than the valid range of scratch
6641 // memory a thread can access.
// Accepted when the inner add is no-unsigned-wrap and either the outer add is
// too or the immediate lies in the open interval (-0x40000000, 0).
6642 if (isNoUnsignedWrap(BaseDef->MI) &&
6643 (isNoUnsignedWrap(AddrMI) ||
6644 (RHSOffset->Value.getSExtValue() < 0 &&
6645 RHSOffset->Value.getSExtValue() > -0x40000000)))
6646 return true;
6647
// Fallback: both operands of the inner add need a known-zero sign bit.
6648 Register LHS = BaseDef->MI->getOperand(1).getReg();
6649 Register RHS = BaseDef->MI->getOperand(2).getReg();
6650 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6651}
6652
6653bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6654 unsigned ShAmtBits) const {
6655 assert(MI.getOpcode() == TargetOpcode::G_AND);
6656
6657 std::optional<APInt> RHS =
6658 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6659 if (!RHS)
6660 return false;
6661
6662 if (RHS->countr_one() >= ShAmtBits)
6663 return true;
6664
6665 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6666 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6667}
6668
6670AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6671 MachineOperand &Root) const {
// Complex-pattern matcher for a MUBUF scratch access without a VGPR address.
// On success it returns three operand renderers, in this order: rsrc,
// soffset, offset. It returns an empty result when no form matches.
// NOTE(review): embedded lines 6669 (would carry the return type), 6676,
// 6695 and 6698 are absent from this listing; the notes below mark each gap.
6672 Register Reg = Root.getReg();
6673 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6674
// NOTE(review): the initializer of Def (embedded line 6676) is missing here.
6675 std::optional<DefinitionAndSourceRegister> Def =
6677 assert(Def && "this shouldn't be an optional result");
6678 Reg = Def->Reg;
6679
// Form 1: the address is itself a wave address; its SGPR base becomes
// soffset and the immediate offset is 0.
6680 if (Register WaveBase = getWaveAddress(Def->MI)) {
6681 return {{
6682 [=](MachineInstrBuilder &MIB) { // rsrc
6683 MIB.addReg(Info->getScratchRSrcReg());
6684 },
6685 [=](MachineInstrBuilder &MIB) { // soffset
6686 MIB.addReg(WaveBase);
6687 },
6688 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6689 }};
6690 }
6691
6692 int64_t Offset = 0;
6693
// Form 2: a pointer add of a wave address and a constant.
// NOTE(review): the declaration of BasePtr (embedded line 6695) and the
// second operand of the m_GPtrAdd matcher (embedded line 6698) are missing
// from this listing.
6694 // FIXME: Copy check is a hack
6696 if (mi_match(Reg, *MRI,
6697 m_GPtrAdd(m_Reg(BasePtr),
6699 if (!TII.isLegalMUBUFImmOffset(Offset))
6700 return {};
6701 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6702 Register WaveBase = getWaveAddress(BasePtrDef);
6703 if (!WaveBase)
6704 return {};
6705
6706 return {{
6707 [=](MachineInstrBuilder &MIB) { // rsrc
6708 MIB.addReg(Info->getScratchRSrcReg());
6709 },
6710 [=](MachineInstrBuilder &MIB) { // soffset
6711 MIB.addReg(WaveBase);
6712 },
6713 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6714 }};
6715 }
6716
// Form 3: the whole address is a constant that is a legal MUBUF immediate;
// soffset is rendered as the immediate 0.
6717 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6718 !TII.isLegalMUBUFImmOffset(Offset))
6719 return {};
6720
6721 return {{
6722 [=](MachineInstrBuilder &MIB) { // rsrc
6723 MIB.addReg(Info->getScratchRSrcReg());
6724 },
6725 [=](MachineInstrBuilder &MIB) { // soffset
6726 MIB.addImm(0);
6727 },
6728 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6729 }};
6730}
6731
6732std::pair<Register, unsigned>
6733AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6734 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6735 int64_t ConstAddr = 0;
6736
6737 Register PtrBase;
6738 int64_t Offset;
6739 std::tie(PtrBase, Offset, std::ignore) =
6740 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6741
6742 if (Offset) {
6743 if (isDSOffsetLegal(PtrBase, Offset)) {
6744 // (add n0, c0)
6745 return std::pair(PtrBase, Offset);
6746 }
6747 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6748 // TODO
6749
6750
6751 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6752 // TODO
6753
6754 }
6755
6756 return std::pair(Root.getReg(), 0);
6757}
6758
6760AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6761 Register Reg;
6762 unsigned Offset;
6763 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6764 return {{
6765 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6766 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6767 }};
6768}
6769
6771AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6772 return selectDSReadWrite2(Root, 4);
6773}
6774
6776AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6777 return selectDSReadWrite2(Root, 8);
6778}
6779
6781AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6782 unsigned Size) const {
6783 Register Reg;
6784 unsigned Offset;
6785 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6786 return {{
6787 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6788 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6789 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6790 }};
6791}
6792
6793std::pair<Register, unsigned>
6794AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6795 unsigned Size) const {
6796 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6797 int64_t ConstAddr = 0;
6798
6799 Register PtrBase;
6800 int64_t Offset;
6801 std::tie(PtrBase, Offset, std::ignore) =
6802 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6803
6804 if (Offset) {
6805 int64_t OffsetValue0 = Offset;
6806 int64_t OffsetValue1 = Offset + Size;
6807 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6808 // (add n0, c0)
6809 return std::pair(PtrBase, OffsetValue0 / Size);
6810 }
6811 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6812 // TODO
6813
6814 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6815 // TODO
6816
6817 }
6818
6819 return std::pair(Root.getReg(), 0);
6820}
6821
6822/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6823/// the base value with the constant offset, and if the offset computation is
6824/// known to be inbounds. There may be intervening copies between \p Root and
6825/// the identified constant. Returns \p Root, 0, false if this does not match
6826/// the pattern.
6827std::tuple<Register, int64_t, bool>
6828AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6829 Register Root, const MachineRegisterInfo &MRI) const {
6830 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6831 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6832 return {Root, 0, false};
6833
6834 MachineOperand &RHS = RootI->getOperand(2);
// NOTE(review): the initializer of MaybeOffset (embedded line 6836) is absent
// from this listing; presumably it resolves RHS to a constant value.
6835 std::optional<ValueAndVReg> MaybeOffset =
6837 if (!MaybeOffset)
6838 return {Root, 0, false};
// The inbounds result is read from the G_PTR_ADD's InBounds flag. The base is
// operand 1 and the offset is the sign-extended constant.
6839 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6840 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6841 IsInBounds};
6842}
6843
// Body of a helper that appends a literal 0 immediate operand to MIB.
// NOTE(review): the signature line (embedded number 6844) is absent from this
// listing; presumably this is the addZeroImm helper that the MUBUF renderers
// below pass as the cpol/tfe/swz operands. Confirm against the full file.
6845 MIB.addImm(0);
6846}
6847
6848/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6849/// BasePtr is not valid, a null base pointer will be used.
// The descriptor is a 128-bit SGPR value laid out as:
//   sub0_sub1 = BasePtr (or a 64-bit zero), sub2 = FormatLo, sub3 = FormatHi.
// NOTE(review): the first line of the signature (embedded number 6850) is
// absent from this listing; the body uses B and MRI, which are presumably the
// parameters declared there.
6851 uint32_t FormatLo, uint32_t FormatHi,
6852 Register BasePtr) {
6853 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6854 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6855 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6856 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6857
// Materialize the two 32-bit format words.
6858 B.buildInstr(AMDGPU::S_MOV_B32)
6859 .addDef(RSrc2)
6860 .addImm(FormatLo);
6861 B.buildInstr(AMDGPU::S_MOV_B32)
6862 .addDef(RSrc3)
6863 .addImm(FormatHi);
6864
6865 // Build the half of the subregister with the constants before building the
6866 // full 128-bit register. If we are building multiple resource descriptors,
6867 // this will allow CSEing of the 2-component register.
6868 B.buildInstr(AMDGPU::REG_SEQUENCE)
6869 .addDef(RSrcHi)
6870 .addReg(RSrc2)
6871 .addImm(AMDGPU::sub0)
6872 .addReg(RSrc3)
6873 .addImm(AMDGPU::sub1);
6874
// With no valid BasePtr, use a 64-bit zero as the low half.
6875 Register RSrcLo = BasePtr;
6876 if (!BasePtr) {
6877 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6878 B.buildInstr(AMDGPU::S_MOV_B64)
6879 .addDef(RSrcLo)
6880 .addImm(0);
6881 }
6882
// Combine the pointer half and the format half into the full descriptor.
6883 B.buildInstr(AMDGPU::REG_SEQUENCE)
6884 .addDef(RSrc)
6885 .addReg(RSrcLo)
6886 .addImm(AMDGPU::sub0_sub1)
6887 .addReg(RSrcHi)
6888 .addImm(AMDGPU::sub2_sub3);
6889
6890 return RSrc;
6891}
6892
// Builds a resource descriptor via buildRSRC with FormatLo = 0 and FormatHi
// set to the high 32 bits of the default resource data format.
// NOTE(review): the first line of the signature (embedded number 6893) is
// absent from this listing; presumably this is the buildAddr64RSrc helper
// called from selectMUBUFAddr64Impl. Confirm against the full file.
6894 const SIInstrInfo &TII, Register BasePtr) {
6895 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6896
6897 // FIXME: Why are half the "default" bits ignored based on the addressing
6898 // mode?
6899 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6900}
6901
// Builds a resource descriptor via buildRSRC with FormatLo = -1 (all ones
// once converted to uint32_t) and FormatHi set to the high 32 bits of the
// default resource data format.
// NOTE(review): the first line of the signature (embedded number 6902) is
// absent from this listing; presumably this is the buildOffsetSrc helper
// called from selectMUBUFOffsetImpl. Confirm against the full file.
6903 const SIInstrInfo &TII, Register BasePtr) {
6904 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6905
6906 // FIXME: Why are half the "default" bits ignored based on the addressing
6907 // mode?
6908 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6909}
6910
6911AMDGPUInstructionSelector::MUBUFAddressData
6912AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6913 MUBUFAddressData Data;
6914 Data.N0 = Src;
6915
6916 Register PtrBase;
6917 int64_t Offset;
6918
6919 std::tie(PtrBase, Offset, std::ignore) =
6920 getPtrBaseWithConstantOffset(Src, *MRI);
6921 if (isUInt<32>(Offset)) {
6922 Data.N0 = PtrBase;
6923 Data.Offset = Offset;
6924 }
6925
6926 if (MachineInstr *InputAdd
6927 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6928 Data.N2 = InputAdd->getOperand(1).getReg();
6929 Data.N3 = InputAdd->getOperand(2).getReg();
6930
6931 // FIXME: Need to fix extra SGPR->VGPRcopies inserted
6932 // FIXME: Don't know this was defined by operand 0
6933 //
6934 // TODO: Remove this when we have copy folding optimizations after
6935 // RegBankSelect.
6936 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6937 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6938 }
6939
6940 return Data;
6941}
6942
6943/// Return if the addr64 mubuf mode should be used for the given address.
6944bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6945 // (ptr_add N2, N3) -> addr64, or
6946 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6947 if (Addr.N2)
6948 return true;
6949
6950 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6951 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6952}
6953
6954/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6955/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6956/// component.
6957void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6958 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6959 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6960 return;
6961
6962 // Illegal offset, store it in soffset.
6963 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6964 B.buildInstr(AMDGPU::S_MOV_B32)
6965 .addDef(SOffset)
6966 .addImm(ImmOffset);
6967 ImmOffset = 0;
6968}
6969
6970bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6971 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6972 Register &SOffset, int64_t &Offset) const {
6973 // FIXME: Predicates should stop this from reaching here.
6974 // addr64 bit was removed for volcanic islands.
6975 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6976 return false;
6977
6978 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6979 if (!shouldUseAddr64(AddrData))
6980 return false;
6981
6982 Register N0 = AddrData.N0;
6983 Register N2 = AddrData.N2;
6984 Register N3 = AddrData.N3;
6985 Offset = AddrData.Offset;
6986
6987 // Base pointer for the SRD.
6988 Register SRDPtr;
6989
6990 if (N2) {
6991 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6992 assert(N3);
6993 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6994 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6995 // addr64, and construct the default resource from a 0 address.
6996 VAddr = N0;
6997 } else {
6998 SRDPtr = N3;
6999 VAddr = N2;
7000 }
7001 } else {
7002 // N2 is not divergent.
7003 SRDPtr = N2;
7004 VAddr = N3;
7005 }
7006 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
7007 // Use the default null pointer in the resource
7008 VAddr = N0;
7009 } else {
7010 // N0 -> offset, or
7011 // (N0 + C1) -> offset
7012 SRDPtr = N0;
7013 }
7014
7015 MachineIRBuilder B(*Root.getParent());
7016 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
7017 splitIllegalMUBUFOffset(B, SOffset, Offset);
7018 return true;
7019}
7020
7021bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
7022 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
7023 int64_t &Offset) const {
7024
7025 // FIXME: Pattern should not reach here.
7026 if (STI.useFlatForGlobal())
7027 return false;
7028
7029 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
7030 if (shouldUseAddr64(AddrData))
7031 return false;
7032
7033 // N0 -> offset, or
7034 // (N0 + C1) -> offset
7035 Register SRDPtr = AddrData.N0;
7036 Offset = AddrData.Offset;
7037
7038 // TODO: Look through extensions for 32-bit soffset.
7039 MachineIRBuilder B(*Root.getParent());
7040
7041 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
7042 splitIllegalMUBUFOffset(B, SOffset, Offset);
7043 return true;
7044}
7045
7047AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
7048 Register VAddr;
7049 Register RSrcReg;
7050 Register SOffset;
7051 int64_t Offset = 0;
7052
7053 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
7054 return {};
7055
7056 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
7057 // pattern.
7058 return {{
7059 [=](MachineInstrBuilder &MIB) { // rsrc
7060 MIB.addReg(RSrcReg);
7061 },
7062 [=](MachineInstrBuilder &MIB) { // vaddr
7063 MIB.addReg(VAddr);
7064 },
7065 [=](MachineInstrBuilder &MIB) { // soffset
7066 if (SOffset)
7067 MIB.addReg(SOffset);
7068 else if (STI.hasRestrictedSOffset())
7069 MIB.addReg(AMDGPU::SGPR_NULL);
7070 else
7071 MIB.addImm(0);
7072 },
7073 [=](MachineInstrBuilder &MIB) { // offset
7074 MIB.addImm(Offset);
7075 },
7076 addZeroImm, // cpol
7077 addZeroImm, // tfe
7078 addZeroImm // swz
7079 }};
7080}
7081
7083AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
7084 Register RSrcReg;
7085 Register SOffset;
7086 int64_t Offset = 0;
7087
7088 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
7089 return {};
7090
7091 return {{
7092 [=](MachineInstrBuilder &MIB) { // rsrc
7093 MIB.addReg(RSrcReg);
7094 },
7095 [=](MachineInstrBuilder &MIB) { // soffset
7096 if (SOffset)
7097 MIB.addReg(SOffset);
7098 else if (STI.hasRestrictedSOffset())
7099 MIB.addReg(AMDGPU::SGPR_NULL);
7100 else
7101 MIB.addImm(0);
7102 },
7103 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
7104 addZeroImm, // cpol
7105 addZeroImm, // tfe
7106 addZeroImm, // swz
7107 }};
7108}
7109
7111AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
7112
7113 Register SOffset = Root.getReg();
7114
7115 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
7116 SOffset = AMDGPU::SGPR_NULL;
7117
7118 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
7119}
7120
7121/// Get an immediate that must be 32-bits, and treated as zero extended.
// Returns std::nullopt when Reg is not a known integer constant or when its
// sign-extended value does not fit in a signed 32-bit integer. Otherwise
// returns the low 32 bits of that value.
// NOTE(review): the line carrying the function name and parameters (embedded
// number 7123) is absent from this listing; the body uses Reg and MRI, and
// callers below refer to it as getConstantZext32Val.
7122static std::optional<uint64_t>
7124 // getIConstantVRegVal sexts any values, so see if that matters.
7125 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
7126 if (!OffsetVal || !isInt<32>(*OffsetVal))
7127 return std::nullopt;
7128 return Lo_32(*OffsetVal);
7129}
7130
7132AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
7133 std::optional<uint64_t> OffsetVal =
7134 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
7135 if (!OffsetVal)
7136 return {};
7137
7138 std::optional<int64_t> EncodedImm =
7139 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
7140 if (!EncodedImm)
7141 return {};
7142
7143 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
7144}
7145
7147AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
// Renders a single operand: an encoded immediate derived from a constant
// register. Asserts that the subtarget generation is SEA_ISLANDS.
// NOTE(review): embedded lines 7146 (would carry the return type) and 7155
// (the initializer of EncodedImm) are absent from this listing.
7148 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
7149
7150 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
7151 if (!OffsetVal)
7152 return {};
7153
7154 std::optional<int64_t> EncodedImm =
7156 if (!EncodedImm)
7157 return {};
7158
7159 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
7160}
7161
7163AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
7164 // Match the (soffset + offset) pair as a 32-bit register base and
7165 // an immediate offset.
7166 Register SOffset;
7167 unsigned Offset;
7168 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
7169 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
7170 if (!SOffset)
7171 return std::nullopt;
7172
7173 std::optional<int64_t> EncodedOffset =
7174 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
7175 if (!EncodedOffset)
7176 return std::nullopt;
7177
7178 assert(MRI->getType(SOffset) == LLT::scalar(32));
7179 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
7180 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
7181}
7182
// Compute the source register and source-modifier bits for a mad-mix operand.
// \p Matched is set to true only when the operand, after stripping the
// modifiers found by selectVOP3ModsImpl, is a floating-point extension (the
// source of which is asserted to be a 16-bit scalar); otherwise it is false
// and the result is just what selectVOP3ModsImpl produced.
7183std::pair<Register, unsigned>
7184AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
7185 bool &Matched) const {
7186 Matched = false;
7187
7188 Register Src;
7189 unsigned Mods;
7190 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
7191
// On a match, Src is rebound to the extension's input.
7192 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
7193 assert(MRI->getType(Src) == LLT::scalar(16));
7194
7195 // Only change Src if src modifier could be gained. In such cases new Src
7196 // could be sgpr but this does not violate constant bus restriction for
7197 // instruction that is being selected.
7198 Src = stripBitCast(Src, *MRI);
7199
// CheckAbsNeg captures Src and Mods by reference and updates both, so the
// order of its two invocations below matters.
7200 const auto CheckAbsNeg = [&]() {
7201 // Be careful about folding modifiers if we already have an abs. fneg is
7202 // applied last, so we don't want to apply an earlier fneg.
7203 if ((Mods & SISrcMods::ABS) == 0) {
7204 unsigned ModsTmp;
7205 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
7206
// A second negation cancels an existing one, hence the XOR.
7207 if ((ModsTmp & SISrcMods::NEG) != 0)
7208 Mods ^= SISrcMods::NEG;
7209
7210 if ((ModsTmp & SISrcMods::ABS) != 0)
7211 Mods |= SISrcMods::ABS;
7212 }
7213 };
7214
7215 CheckAbsNeg();
7216
7217 // op_sel/op_sel_hi decide the source type and source.
7218 // If the source's op_sel_hi is set, it indicates to do a conversion from
7219 // fp16. If the sources's op_sel is set, it picks the high half of the
7220 // source register.
7221
7222 Mods |= SISrcMods::OP_SEL_1;
7223
// When the value is the high element of a wider register, select it with
// OP_SEL_0 and retry the modifier fold on the underlying register.
7224 if (isExtractHiElt(*MRI, Src, Src)) {
7225 Mods |= SISrcMods::OP_SEL_0;
7226 CheckAbsNeg();
7227 }
7228
7229 Matched = true;
7230 }
7231
7232 return {Src, Mods};
7233}
7234
7236AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
7237 MachineOperand &Root) const {
7238 Register Src;
7239 unsigned Mods;
7240 bool Matched;
7241 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7242 if (!Matched)
7243 return {};
7244
7245 return {{
7246 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7247 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7248 }};
7249}
7250
7252AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
7253 Register Src;
7254 unsigned Mods;
7255 bool Matched;
7256 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7257
7258 return {{
7259 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7260 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7261 }};
7262}
7263
7264bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
7265 MachineInstr &I, Intrinsic::ID IntrID) const {
7266 MachineBasicBlock *MBB = I.getParent();
7267 const DebugLoc &DL = I.getDebugLoc();
7268 Register CCReg = I.getOperand(0).getReg();
7269
7270 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
7271 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
7272
7273 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
7274 .addImm(I.getOperand(2).getImm());
7275
7276 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
7277
7278 I.eraseFromParent();
7279 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
7280 *MRI);
7281}
7282
7283bool AMDGPUInstructionSelector::selectSGetBarrierState(
7284 MachineInstr &I, Intrinsic::ID IntrID) const {
7285 MachineBasicBlock *MBB = I.getParent();
7286 const DebugLoc &DL = I.getDebugLoc();
7287 const MachineOperand &BarOp = I.getOperand(2);
7288 std::optional<int64_t> BarValImm =
7289 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7290
7291 if (!BarValImm) {
7292 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7293 .addReg(BarOp.getReg());
7294 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7295 }
7296 MachineInstrBuilder MIB;
7297 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
7298 : AMDGPU::S_GET_BARRIER_STATE_M0;
7299 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7300
7301 auto DstReg = I.getOperand(0).getReg();
7302 const TargetRegisterClass *DstRC =
7303 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7304 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7305 return false;
7306 MIB.addDef(DstReg);
7307 if (BarValImm) {
7308 MIB.addImm(*BarValImm);
7309 }
7310 I.eraseFromParent();
7311 return true;
7312}
7313
7314unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
7315 if (HasInlineConst) {
7316 switch (IntrID) {
7317 default:
7318 llvm_unreachable("not a named barrier op");
7319 case Intrinsic::amdgcn_s_barrier_join:
7320 return AMDGPU::S_BARRIER_JOIN_IMM;
7321 case Intrinsic::amdgcn_s_wakeup_barrier:
7322 return AMDGPU::S_WAKEUP_BARRIER_IMM;
7323 case Intrinsic::amdgcn_s_get_named_barrier_state:
7324 return AMDGPU::S_GET_BARRIER_STATE_IMM;
7325 };
7326 } else {
7327 switch (IntrID) {
7328 default:
7329 llvm_unreachable("not a named barrier op");
7330 case Intrinsic::amdgcn_s_barrier_join:
7331 return AMDGPU::S_BARRIER_JOIN_M0;
7332 case Intrinsic::amdgcn_s_wakeup_barrier:
7333 return AMDGPU::S_WAKEUP_BARRIER_M0;
7334 case Intrinsic::amdgcn_s_get_named_barrier_state:
7335 return AMDGPU::S_GET_BARRIER_STATE_M0;
7336 };
7337 }
7338}
7339
7340bool AMDGPUInstructionSelector::selectNamedBarrierInit(
7341 MachineInstr &I, Intrinsic::ID IntrID) const {
7342 MachineBasicBlock *MBB = I.getParent();
7343 const DebugLoc &DL = I.getDebugLoc();
7344 const MachineOperand &BarOp = I.getOperand(1);
7345 const MachineOperand &CntOp = I.getOperand(2);
7346
7347 // A member count of 0 means "keep existing member count". That plus a known
7348 // constant value for the barrier ID lets us use the immarg form.
7349 if (IntrID == Intrinsic::amdgcn_s_barrier_signal_var) {
7350 std::optional<int64_t> CntImm =
7351 getIConstantVRegSExtVal(CntOp.getReg(), *MRI);
7352 if (CntImm && *CntImm == 0) {
7353 std::optional<int64_t> BarValImm =
7354 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7355 if (BarValImm) {
7356 auto BarID = ((*BarValImm) >> 4) & 0x3F;
7357 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
7358 .addImm(BarID);
7359 I.eraseFromParent();
7360 return true;
7361 }
7362 }
7363 }
7364
7365 // BarID = (BarOp >> 4) & 0x3F
7366 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7367 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7368 .add(BarOp)
7369 .addImm(4u)
7370 .setOperandDead(3); // Dead scc
7371
7372 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7373 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7374 .addReg(TmpReg0)
7375 .addImm(0x3F)
7376 .setOperandDead(3); // Dead scc
7377
7378 // MO = ((CntOp & 0x3F) << shAmt) | BarID
7379 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7380 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
7381 .add(CntOp)
7382 .addImm(0x3F)
7383 .setOperandDead(3); // Dead scc
7384
7385 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7386 constexpr unsigned ShAmt = 16;
7387 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
7388 .addReg(TmpReg2)
7389 .addImm(ShAmt)
7390 .setOperandDead(3); // Dead scc
7391
7392 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7393 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
7394 .addReg(TmpReg1)
7395 .addReg(TmpReg3)
7396 .setOperandDead(3); // Dead scc;
7397
7398 auto CopyMIB =
7399 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
7400 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7401
7402 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7403 ? AMDGPU::S_BARRIER_INIT_M0
7404 : AMDGPU::S_BARRIER_SIGNAL_M0;
7405 MachineInstrBuilder MIB;
7406 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7407
7408 I.eraseFromParent();
7409 return true;
7410}
7411
7412bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7413 MachineInstr &I, Intrinsic::ID IntrID) const {
7414 MachineBasicBlock *MBB = I.getParent();
7415 const DebugLoc &DL = I.getDebugLoc();
7416 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7417 ? I.getOperand(2)
7418 : I.getOperand(1);
7419 std::optional<int64_t> BarValImm =
7420 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7421
7422 if (!BarValImm) {
7423 // BarID = (BarOp >> 4) & 0x3F
7424 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7425 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7426 .addReg(BarOp.getReg())
7427 .addImm(4u)
7428 .setOperandDead(3); // Dead scc;
7429
7430 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7431 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7432 .addReg(TmpReg0)
7433 .addImm(0x3F)
7434 .setOperandDead(3); // Dead scc;
7435
7436 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7437 .addReg(TmpReg1);
7438 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7439 }
7440
7441 MachineInstrBuilder MIB;
7442 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
7443 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7444
7445 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7446 auto DstReg = I.getOperand(0).getReg();
7447 const TargetRegisterClass *DstRC =
7448 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7449 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7450 return false;
7451 MIB.addDef(DstReg);
7452 }
7453
7454 if (BarValImm) {
7455 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7456 MIB.addImm(BarId);
7457 }
7458
7459 I.eraseFromParent();
7460 return true;
7461}
7462
7463void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7464 const MachineInstr &MI,
7465 int OpIdx) const {
7466 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7467 "Expected G_CONSTANT");
7468 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
7469}
7470
7471void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7472 const MachineInstr &MI,
7473 int OpIdx) const {
7474 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7475 "Expected G_CONSTANT");
7476 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
7477}
7478
7479void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7480 const MachineInstr &MI,
7481 int OpIdx) const {
7482 const MachineOperand &Op = MI.getOperand(1);
7483 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7484 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7485}
7486
7487void AMDGPUInstructionSelector::renderCountTrailingOnesImm(
7488 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7489 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7490 "Expected G_CONSTANT");
7491 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countTrailingOnes());
7492}
7493
7494/// This only really exists to satisfy DAG type checking machinery, so is a
7495/// no-op here.
7496void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7497 const MachineInstr &MI,
7498 int OpIdx) const {
7499 const MachineOperand &Op = MI.getOperand(OpIdx);
7500 int64_t Imm;
7501 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
7502 MIB.addImm(Imm);
7503 else
7504 MIB.addImm(Op.getImm());
7505}
7506
7507void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7508 const MachineInstr &MI,
7509 int OpIdx) const {
7510 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7511}
7512
7513void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7514 const MachineInstr &MI,
7515 int OpIdx) const {
7516 assert(OpIdx >= 0 && "expected to match an immediate operand");
7517 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7518}
7519
7520void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7521 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7522 assert(OpIdx >= 0 && "expected to match an immediate operand");
7523 MIB.addImm(
7524 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7525}
7526
7527void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7528 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7529 assert(OpIdx >= 0 && "expected to match an immediate operand");
7530 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
7532 : (int64_t)SISrcMods::DST_OP_SEL);
7533}
7534
7535void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7536 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7537 assert(OpIdx >= 0 && "expected to match an immediate operand");
7538 MIB.addImm(
7539 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7540}
7541
7542void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7543 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7544 assert(OpIdx >= 0 && "expected to match an immediate operand");
7545 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7546 ? (int64_t)(SISrcMods::OP_SEL_0)
7547 : 0);
7548}
7549
7550void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7551 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7552 assert(OpIdx >= 0 && "expected to match an immediate operand");
7553 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7554 : 0);
7555}
7556
7557void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7558 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7559 assert(OpIdx >= 0 && "expected to match an immediate operand");
7560 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7561 : 0);
7562}
7563
7564void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7565 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7566 assert(OpIdx >= 0 && "expected to match an immediate operand");
7567 MIB.addImm(
7568 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7569}
7570
7571void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7572 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7573 assert(OpIdx >= 0 && "expected to match an immediate operand");
7574 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7575 ? (int64_t)SISrcMods::DST_OP_SEL
7576 : 0);
7577}
7578
7579void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7580 const MachineInstr &MI,
7581 int OpIdx) const {
7582 assert(OpIdx >= 0 && "expected to match an immediate operand");
7583 MIB.addImm(MI.getOperand(OpIdx).getImm() &
7586}
7587
7588void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7589 const MachineInstr &MI,
7590 int OpIdx) const {
7591 assert(OpIdx >= 0 && "expected to match an immediate operand");
7592 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7595 MIB.addImm(Swizzle);
7596}
7597
7598void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7599 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7600 assert(OpIdx >= 0 && "expected to match an immediate operand");
7601 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7604 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7605}
7606
7607void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7608 const MachineInstr &MI,
7609 int OpIdx) const {
7610 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7611}
7612
7613void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7614 const MachineInstr &MI,
7615 int OpIdx) const {
7616 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7617 int ExpVal = APF.getExactLog2Abs();
7618 assert(ExpVal != INT_MIN);
7619 MIB.addImm(ExpVal);
7620}
7621
7622void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7623 const MachineInstr &MI,
7624 int OpIdx) const {
7625 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7626 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7627 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7628 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7629 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7630}
7631
7632void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7633 const MachineInstr &MI,
7634 int OpIdx) const {
7635 unsigned Mods = SISrcMods::OP_SEL_1;
7636 if (MI.getOperand(OpIdx).getImm())
7637 Mods ^= SISrcMods::NEG;
7638 MIB.addImm((int64_t)Mods);
7639}
7640
7641void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7642 const MachineInstr &MI,
7643 int OpIdx) const {
7644 unsigned Mods = SISrcMods::OP_SEL_1;
7645 if (MI.getOperand(OpIdx).getImm())
7647 MIB.addImm((int64_t)Mods);
7648}
7649
7650void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7651 const MachineInstr &MI,
7652 int OpIdx) const {
7653 unsigned Val = MI.getOperand(OpIdx).getImm();
7654 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7655 if (Val == 1) // neg
7656 Mods ^= SISrcMods::NEG;
7657 if (Val == 2) // abs
7658 Mods ^= SISrcMods::ABS;
7659 if (Val == 3) // neg and abs
7660 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7661 MIB.addImm((int64_t)Mods);
7662}
7663
7664void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7665 const MachineInstr &MI,
7666 int OpIdx) const {
7667 uint32_t V = MI.getOperand(2).getImm();
7670 if (!Subtarget->hasSafeCUPrefetch())
7671 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7672 MIB.addImm(V);
7673}
7674
7675/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7676void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7677 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7678 unsigned Val = MI.getOperand(OpIdx).getImm();
7679 unsigned New = 0;
7680 if (Val & 0x1)
7682 if (Val & 0x2)
7684 MIB.addImm(New);
7685}
7686
7687bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7688 return TII.isInlineConstant(Imm);
7689}
7690
7691bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7692 return TII.isInlineConstant(Imm);
7693}
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
#define GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static Register getLegalRegBank(Register NewReg, Register RootReg, const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const SIInstrInfo &TII)
static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is shift left with half bits, such as reg0:2n =G_SHL reg1:2n, CONST(n)
static bool isNoUnsignedWrap(MachineInstr *Addr)
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
static bool checkRB(Register Reg, unsigned int RBNo, const AMDGPURegisterBankInfo &RBI, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI)
static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods)
static bool isTruncHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is truncating to half, such as reg0:n = G_TRUNC reg1:2n
static Register getWaveAddress(const MachineInstr *Def)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static TypeClass isVectorOfTwoOrScalar(Register Reg, const MachineRegisterInfo &MRI)
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void addZeroImm(MachineInstrBuilder &MIB)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static bool isConstant(const MachineInstr &MI)
static bool isSameBitWidth(Register Reg1, Register Reg2, const MachineRegisterInfo &MRI)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void diagnoseUnsupportedIntrinsic(const MachineInstr &I)
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelValueTracking &ValueTracking)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
static std::pair< Register, SrcStatus > getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
static std::optional< std::pair< Register, SrcStatus > > calcNextStatus(std::pair< Register, SrcStatus > Curr, const MachineRegisterInfo &MRI)
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg, Register RootReg, const SIInstrInfo &TII, const MachineRegisterInfo &MRI)
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
static SmallVector< std::pair< Register, SrcStatus > > getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static bool isUnmergeHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test function, if the MI is reg0:n, reg1:n = G_UNMERGE_VALUES reg2:2n
static SrcStatus getNegStatus(Register Reg, SrcStatus S, const MachineRegisterInfo &MRI)
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is logic shift right with half bits, such as reg0:2n =G_LSHR reg1:2n,...
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
This file declares the targeting of the InstructionSelector class for AMDGPU.
constexpr LLT S1
constexpr LLT S32
AMDGPU Register Bank Select
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool isAllZeros(StringRef Arr)
Return true if the array is empty or all zeros.
dxil translate DXIL Translate Metadata
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
#define P(N)
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
#define LLVM_DEBUG(...)
Definition Debug.h:119
Value * RHS
Value * LHS
This is used to control valid status that current MI supports.
bool checkOptions(SrcStatus Stat) const
SearchOptions(Register Reg, const MachineRegisterInfo &MRI)
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI)
static const char * getName()
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
void setupMF(MachineFunction &MF, GISelValueTracking *VT, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1600
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:755
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:745
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:754
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:752
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:756
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:753
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:742
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
bool isFPPredicate() const
Definition InstrTypes.h:845
bool isIntPredicate() const
Definition InstrTypes.h:846
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
DILocation * get() const
Get the underlying DILocation.
Definition DebugLoc.h:218
Diagnostic information for unsupported feature in backend.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
virtual void setupMF(MachineFunction &mf, GISelValueTracking *vt, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool hasValue() const
TypeSize getValue() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void setReturnAddressIsTaken(bool s)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Helper class to build MachineInstr.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Analysis providing profile information.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static bool isGenericOpcode(unsigned Opc)
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
bool isGFX13Plus(const MCSubtargetInfo &STI)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
IndexMode
ARM Index Modes.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
operand_type_match m_Reg()
SpecificConstantMatch m_SpecificICst(const APInt &RequestedValue)
Matches a constant equal to RequestedValue.
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_SEXT > m_GSExt(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
ImplicitDefMatch m_GImplicitDef()
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
BinaryOp_match< LHS, RHS, TargetOpcode::G_ASHR, false > m_GAShr(const LHS &L, const RHS &R)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
SpecificRegisterMatch m_SpecificReg(Register RequestedReg)
Matches a register only if it is equal to RequestedReg.
BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_ANYEXT > m_GAnyExt(const SrcTy &Src)
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, TargetOpcode::G_MUL, true > m_GMul(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:861
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI bool isBuildVectorAllZeros(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndef=false)
Return true if the specified instruction is a G_BUILD_VECTOR or G_BUILD_VECTOR_TRUNC where all of the...
Definition Utils.cpp:1447
LLVM_ABI Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition Utils.cpp:60
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:656
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:464
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:297
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:159
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition Utils.cpp:497
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
Definition Utils.cpp:317
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition Utils.cpp:442
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:436
LLVM_ABI std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, and the underlying value Register, folding away any copies.
Definition Utils.cpp:472
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition Utils.cpp:504
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
constexpr RegState getUndefRegState(bool B)
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.