1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 dyn_cast_if_present<const TargetRegisterClass *>(RegClassOrBank);
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
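// Rewrite a copy-like intrinsic (WQM, WWM, etc.) to its pseudo opcode: drop
// the intrinsic ID, add an implicit EXEC use, and constrain the source and
// destination to a common register class.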
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
120 !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
121 return false;
122 const MCInstrDesc &MCID = MI.getDesc();
123 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
124 MI.getOperand(0).setIsEarlyClobber(true);
125 }
126 return true;
127}
128
129bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
130 const DebugLoc &DL = I.getDebugLoc();
131 MachineBasicBlock *BB = I.getParent();
132 I.setDesc(TII.get(TargetOpcode::COPY));
133
134 const MachineOperand &Src = I.getOperand(1);
135 MachineOperand &Dst = I.getOperand(0);
136 Register DstReg = Dst.getReg();
137 Register SrcReg = Src.getReg();
138
139 if (isVCC(DstReg, *MRI)) {
140 if (SrcReg == AMDGPU::SCC) {
141 const TargetRegisterClass *RC
142 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
143 if (!RC)
144 return true;
145 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
146 }
147
148 if (!isVCC(SrcReg, *MRI)) {
149 // TODO: Should probably leave the copy and let copyPhysReg expand it.
150 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
151 return false;
152
153 const TargetRegisterClass *SrcRC
154 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
155
156 std::optional<ValueAndVReg> ConstVal =
157 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
158 if (ConstVal) {
159 unsigned MovOpc =
160 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
161 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
162 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
163 } else {
164 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
165
166 // We can't trust the high bits at this point, so clear them.
167
168 // TODO: Skip masking high bits if def is known boolean.
169
170 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
171 assert(Subtarget->useRealTrue16Insts());
172 const int64_t NoMods = 0;
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
174 .addImm(NoMods)
175 .addImm(1)
176 .addImm(NoMods)
177 .addReg(SrcReg)
178 .addImm(NoMods);
179 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
180 .addImm(NoMods)
181 .addImm(0)
182 .addImm(NoMods)
183 .addReg(MaskedReg)
184 .addImm(NoMods);
185 } else {
186 bool IsSGPR = TRI.isSGPRClass(SrcRC);
187 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
188 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
189 .addImm(1)
190 .addReg(SrcReg);
191 if (IsSGPR)
192 And.setOperandDead(3); // Dead scc
193
194 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
195 .addImm(0)
196 .addReg(MaskedReg);
197 }
198 }
199
200 if (!MRI->getRegClassOrNull(SrcReg))
201 MRI->setRegClass(SrcReg, SrcRC);
202 I.eraseFromParent();
203 return true;
204 }
205
206 const TargetRegisterClass *RC =
207 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
208 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
209 return false;
210
211 return true;
212 }
213
214 for (const MachineOperand &MO : I.operands()) {
215 if (MO.getReg().isPhysical())
216 continue;
217
218 const TargetRegisterClass *RC =
219 TRI.getConstrainedRegClassForOperand(MO, *MRI);
220 if (!RC)
221 continue;
222 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
223 }
224 return true;
225}
226
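// Copy a VCC lane mask into a 32-bit SGPR: materialize SCC = (vcc != 0) via a
// scalar compare (or S_OR with a dead result), then copy SCC into the
// destination.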
227bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
228 const DebugLoc &DL = I.getDebugLoc();
229 MachineBasicBlock *BB = I.getParent();
230 Register VCCReg = I.getOperand(1).getReg();
231 MachineInstr *Cmp;
232
233 // Set SCC as a side effect with S_CMP or S_OR.
234 if (STI.hasScalarCompareEq64()) {
235 unsigned CmpOpc =
236 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
237 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
238 } else {
239 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
240 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
241 .addReg(VCCReg)
242 .addReg(VCCReg);
243 }
244
245 if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
246 return false;
247
248 Register DstReg = I.getOperand(0).getReg();
249 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
250
251 I.eraseFromParent();
252 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
253}
254
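// Copy a 32-bit SGPR boolean into a wave-size lane mask. Constant inputs
// become 0 or a copy of EXEC; otherwise the boolean is copied into SCC and
// S_CSELECT picks between EXEC and 0.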
255bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
256 const DebugLoc &DL = I.getDebugLoc();
257 MachineBasicBlock *BB = I.getParent();
258
259 Register DstReg = I.getOperand(0).getReg();
260 Register SrcReg = I.getOperand(1).getReg();
261 std::optional<ValueAndVReg> Arg =
262 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
263
264 if (Arg) {
265 const int64_t Value = Arg->Value.getZExtValue();
266 if (Value == 0) {
267 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
268 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
269 } else {
270 assert(Value == 1);
271 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
272 }
273 I.eraseFromParent();
274 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
275 }
276
277 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
278 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
279
280 unsigned SelectOpcode =
281 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
282 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
283 .addReg(TRI.getExec())
284 .addImm(0);
285
286 I.eraseFromParent();
287 return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
288}
289
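// Lower a read-any-lane to V_READFIRSTLANE_B32.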
290bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 Register SrcReg = I.getOperand(1).getReg();
293
294 const DebugLoc &DL = I.getDebugLoc();
295 MachineBasicBlock *BB = I.getParent();
296
297 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
298 .addReg(SrcReg);
299
300 I.eraseFromParent();
301 return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
302}
303
304bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
305 const Register DefReg = I.getOperand(0).getReg();
306 const LLT DefTy = MRI->getType(DefReg);
307
308 // S1 G_PHIs should not be selected in instruction-select, instead:
309 // - divergent S1 G_PHI should go through lane mask merging algorithm
310 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
311 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
312 if (DefTy == LLT::scalar(1))
313 return false;
314
315 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
316
317 const RegClassOrRegBank &RegClassOrBank =
318 MRI->getRegClassOrRegBank(DefReg);
319
320 const TargetRegisterClass *DefRC =
321 dyn_cast_if_present<const TargetRegisterClass *>(RegClassOrBank);
322 if (!DefRC) {
323 if (!DefTy.isValid()) {
324 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
325 return false;
326 }
327
328 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
329 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
330 if (!DefRC) {
331 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
332 return false;
333 }
334 }
335
336 // If inputs have register bank, assign corresponding reg class.
337 // Note: registers don't need to have the same reg bank.
338 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
339 const Register SrcReg = I.getOperand(i).getReg();
340
341 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
342 if (RB) {
343 const LLT SrcTy = MRI->getType(SrcReg);
344 const TargetRegisterClass *SrcRC =
345 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
346 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
347 return false;
348 }
349 }
350
351 I.setDesc(TII.get(TargetOpcode::PHI));
352 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
353}
354
355MachineOperand
356AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
357 const TargetRegisterClass &SubRC,
358 unsigned SubIdx) const {
359
360 MachineInstr *MI = MO.getParent();
361 MachineBasicBlock *BB = MO.getParent()->getParent();
362 Register DstReg = MRI->createVirtualRegister(&SubRC);
363
364 if (MO.isReg()) {
365 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
366 Register Reg = MO.getReg();
367 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
368 .addReg(Reg, 0, ComposedSubIdx);
369
370 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
371 MO.isKill(), MO.isDead(), MO.isUndef(),
372 MO.isEarlyClobber(), 0, MO.isDebug(),
373 MO.isInternalRead());
374 }
375
376 assert(MO.isImm());
377
378 APInt Imm(64, MO.getImm());
379
380 switch (SubIdx) {
381 default:
382 llvm_unreachable("do not know to split immediate with this sub index.");
383 case AMDGPU::sub0:
384 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
385 case AMDGPU::sub1:
386 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
387 }
388}
389
390static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
391 switch (Opc) {
392 case AMDGPU::G_AND:
393 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
394 case AMDGPU::G_OR:
395 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
396 case AMDGPU::G_XOR:
397 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
398 default:
399 llvm_unreachable("not a bit op");
400 }
401}
402
403bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
404 Register DstReg = I.getOperand(0).getReg();
405 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
406
407 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
408 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
409 DstRB->getID() != AMDGPU::VCCRegBankID)
410 return false;
411
412 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
413 STI.isWave64());
414 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
415
416 // Dead implicit-def of scc
417 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
418 true, // isImp
419 false, // isKill
420 true)); // isDead
421 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
422}
423
424bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
425 MachineBasicBlock *BB = I.getParent();
426 MachineFunction *MF = BB->getParent();
427 Register DstReg = I.getOperand(0).getReg();
428 const DebugLoc &DL = I.getDebugLoc();
429 LLT Ty = MRI->getType(DstReg);
430 if (Ty.isVector())
431 return false;
432
433 unsigned Size = Ty.getSizeInBits();
434 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
435 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
436 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
437
438 if (Size == 32) {
439 if (IsSALU) {
440 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
441 MachineInstr *Add =
442 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
443 .add(I.getOperand(1))
444 .add(I.getOperand(2))
445 .setOperandDead(3); // Dead scc
446 I.eraseFromParent();
447 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
448 }
449
450 if (STI.hasAddNoCarry()) {
451 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
452 I.setDesc(TII.get(Opc));
453 I.addOperand(*MF, MachineOperand::CreateImm(0));
454 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
455 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
456 }
457
458 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
459
460 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
461 MachineInstr *Add
462 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
463 .addDef(UnusedCarry, RegState::Dead)
464 .add(I.getOperand(1))
465 .add(I.getOperand(2))
466 .addImm(0);
467 I.eraseFromParent();
468 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
469 }
470
471 assert(!Sub && "illegal sub should not reach here");
472
473 const TargetRegisterClass &RC
474 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
475 const TargetRegisterClass &HalfRC
476 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
477
478 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
479 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
480 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
481 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
482
483 Register DstLo = MRI->createVirtualRegister(&HalfRC);
484 Register DstHi = MRI->createVirtualRegister(&HalfRC);
485
486 if (IsSALU) {
487 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
488 .add(Lo1)
489 .add(Lo2);
490 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
491 .add(Hi1)
492 .add(Hi2)
493 .setOperandDead(3); // Dead scc
494 } else {
495 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
496 Register CarryReg = MRI->createVirtualRegister(CarryRC);
497 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
498 .addDef(CarryReg)
499 .add(Lo1)
500 .add(Lo2)
501 .addImm(0);
502 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
503 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
504 .add(Hi1)
505 .add(Hi2)
506 .addReg(CarryReg, RegState::Kill)
507 .addImm(0);
508
509 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
510 return false;
511 }
512
513 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
514 .addReg(DstLo)
515 .addImm(AMDGPU::sub0)
516 .addReg(DstHi)
517 .addImm(AMDGPU::sub1);
518
519
520 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
521 return false;
522
523 I.eraseFromParent();
524 return true;
525}
526
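// Select 32-bit add/sub with carry-out (and optional carry-in). A VCC-banked
// carry uses the VALU carry instructions; an SGPR carry goes through SCC with
// the scalar forms.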
527bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
528 MachineInstr &I) const {
529 MachineBasicBlock *BB = I.getParent();
530 MachineFunction *MF = BB->getParent();
531 const DebugLoc &DL = I.getDebugLoc();
532 Register Dst0Reg = I.getOperand(0).getReg();
533 Register Dst1Reg = I.getOperand(1).getReg();
534 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
535 I.getOpcode() == AMDGPU::G_UADDE;
536 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
537 I.getOpcode() == AMDGPU::G_USUBE;
538
539 if (isVCC(Dst1Reg, *MRI)) {
540 unsigned NoCarryOpc =
541 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
542 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
543 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
544 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
545 I.addOperand(*MF, MachineOperand::CreateImm(0));
546 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
547 }
548
549 Register Src0Reg = I.getOperand(2).getReg();
550 Register Src1Reg = I.getOperand(3).getReg();
551
552 if (HasCarryIn) {
553 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
554 .addReg(I.getOperand(4).getReg());
555 }
556
557 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
558 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
559
560 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
561 .add(I.getOperand(2))
562 .add(I.getOperand(3));
563
564 if (MRI->use_nodbg_empty(Dst1Reg)) {
565 CarryInst.setOperandDead(3); // Dead scc
566 } else {
567 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
568 .addReg(AMDGPU::SCC);
569 if (!MRI->getRegClassOrNull(Dst1Reg))
570 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
571 }
572
573 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
574 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
575 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
576 return false;
577
578 if (HasCarryIn &&
579 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
580 AMDGPU::SReg_32RegClass, *MRI))
581 return false;
582
583 I.eraseFromParent();
584 return true;
585}
586
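// Select the MAD_64_32 pseudos to V_MAD_U64_U32 / V_MAD_I64_I32 (or the
// no-carry variants when the carry-out is unused and available), marking the
// result as early-clobber.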
587bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
588 MachineInstr &I) const {
589 MachineBasicBlock *BB = I.getParent();
590 MachineFunction *MF = BB->getParent();
591 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
592 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
593 MRI->use_nodbg_empty(I.getOperand(1).getReg());
594
595 unsigned Opc;
596 if (Subtarget->hasMADIntraFwdBug())
597 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
598 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
599 else if (UseNoCarry)
600 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
601 : AMDGPU::V_MAD_NC_I64_I32_e64;
602 else
603 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
604
605 if (UseNoCarry)
606 I.removeOperand(1);
607
608 I.setDesc(TII.get(Opc));
609 I.addOperand(*MF, MachineOperand::CreateImm(0));
610 I.addImplicitDefUseOperands(*MF);
611 I.getOperand(0).setIsEarlyClobber(true);
612 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
613}
614
615// TODO: We should probably legalize these to only using 32-bit results.
616bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
617 MachineBasicBlock *BB = I.getParent();
618 Register DstReg = I.getOperand(0).getReg();
619 Register SrcReg = I.getOperand(1).getReg();
620 LLT DstTy = MRI->getType(DstReg);
621 LLT SrcTy = MRI->getType(SrcReg);
622 const unsigned SrcSize = SrcTy.getSizeInBits();
623 unsigned DstSize = DstTy.getSizeInBits();
624
625 // TODO: Should handle any multiple of 32 offset.
626 unsigned Offset = I.getOperand(2).getImm();
627 if (Offset % 32 != 0 || DstSize > 128)
628 return false;
629
630 // 16-bit operations really use 32-bit registers.
631 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
632 if (DstSize == 16)
633 DstSize = 32;
634
635 const TargetRegisterClass *DstRC =
636 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
637 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
638 return false;
639
640 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
641 const TargetRegisterClass *SrcRC =
642 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
643 if (!SrcRC)
644 return false;
645 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
646 DstSize / 32);
647 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
648 if (!SrcRC)
649 return false;
650
651 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
652 *SrcRC, I.getOperand(1));
653 const DebugLoc &DL = I.getDebugLoc();
654 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
655 .addReg(SrcReg, 0, SubReg);
656
657 I.eraseFromParent();
658 return true;
659}
660
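// Select G_MERGE_VALUES of 32-bit or wider pieces as a REG_SEQUENCE; narrower
// sources fall back to the imported TableGen patterns.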
661bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
662 MachineBasicBlock *BB = MI.getParent();
663 Register DstReg = MI.getOperand(0).getReg();
664 LLT DstTy = MRI->getType(DstReg);
665 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
666
667 const unsigned SrcSize = SrcTy.getSizeInBits();
668 if (SrcSize < 32)
669 return selectImpl(MI, *CoverageInfo);
670
671 const DebugLoc &DL = MI.getDebugLoc();
672 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
673 const unsigned DstSize = DstTy.getSizeInBits();
674 const TargetRegisterClass *DstRC =
675 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
676 if (!DstRC)
677 return false;
678
679 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
680 MachineInstrBuilder MIB =
681 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
682 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
683 MachineOperand &Src = MI.getOperand(I + 1);
684 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
685 MIB.addImm(SubRegs[I]);
686
687 const TargetRegisterClass *SrcRC
688 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
689 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
690 return false;
691 }
692
693 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
694 return false;
695
696 MI.eraseFromParent();
697 return true;
698}
699
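// Select G_UNMERGE_VALUES as a series of subregister copies out of the source
// register.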
700bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
701 MachineBasicBlock *BB = MI.getParent();
702 const int NumDst = MI.getNumOperands() - 1;
703
704 MachineOperand &Src = MI.getOperand(NumDst);
705
706 Register SrcReg = Src.getReg();
707 Register DstReg0 = MI.getOperand(0).getReg();
708 LLT DstTy = MRI->getType(DstReg0);
709 LLT SrcTy = MRI->getType(SrcReg);
710
711 const unsigned DstSize = DstTy.getSizeInBits();
712 const unsigned SrcSize = SrcTy.getSizeInBits();
713 const DebugLoc &DL = MI.getDebugLoc();
714 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
715
716 const TargetRegisterClass *SrcRC =
717 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
718 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
719 return false;
720
721 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
722 // source, and this relies on the fact that the same subregister indices are
723 // used for both.
724 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
725 for (int I = 0, E = NumDst; I != E; ++I) {
726 MachineOperand &Dst = MI.getOperand(I);
727 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
728 .addReg(SrcReg, 0, SubRegs[I]);
729
730 // Make sure the subregister index is valid for the source register.
731 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
732 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
733 return false;
734
735 const TargetRegisterClass *DstRC =
736 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
737 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
738 return false;
739 }
740
741 MI.eraseFromParent();
742 return true;
743}
744
745bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
746 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
747 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
748
749 Register Src0 = MI.getOperand(1).getReg();
750 Register Src1 = MI.getOperand(2).getReg();
751 LLT SrcTy = MRI->getType(Src0);
752 const unsigned SrcSize = SrcTy.getSizeInBits();
753
754 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
755 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
756 return selectG_MERGE_VALUES(MI);
757 }
758
759 // Selection logic below is for V2S16 only.
760 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
761 Register Dst = MI.getOperand(0).getReg();
762 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
763 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
764 SrcTy != LLT::scalar(32)))
765 return selectImpl(MI, *CoverageInfo);
766
767 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
768 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
769 return false;
770
771 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
772 DstBank->getID() == AMDGPU::VGPRRegBankID);
773 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
774
775 const DebugLoc &DL = MI.getDebugLoc();
776 MachineBasicBlock *BB = MI.getParent();
777
778 // First, before trying TableGen patterns, check if both sources are
779 // constants. In those cases, we can trivially compute the final constant
780 // and emit a simple move.
781 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
782 if (ConstSrc1) {
783 auto ConstSrc0 =
784 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
785 if (ConstSrc0) {
786 const int64_t K0 = ConstSrc0->Value.getSExtValue();
787 const int64_t K1 = ConstSrc1->Value.getSExtValue();
788 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
789 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
790 uint32_t Imm = Lo16 | (Hi16 << 16);
791
792 // VALU
793 if (IsVector) {
794 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
795 MI.eraseFromParent();
796 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
797 }
798
799 // SALU
800 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
801 MI.eraseFromParent();
802 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
803 }
804 }
805
806 // Now try TableGen patterns.
807 if (selectImpl(MI, *CoverageInfo))
808 return true;
809
810 // TODO: This should probably be a combine somewhere
811 // (build_vector $src0, undef) -> copy $src0
812 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
813 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
814 MI.setDesc(TII.get(AMDGPU::COPY));
815 MI.removeOperand(2);
816 const auto &RC =
817 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
818 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
819 RBI.constrainGenericRegister(Src0, RC, *MRI);
820 }
821
822 // TODO: Can be improved?
823 if (IsVector) {
824 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
825 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
826 .addImm(0xFFFF)
827 .addReg(Src0);
828 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
829 return false;
830
831 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
832 .addReg(Src1)
833 .addImm(16)
834 .addReg(TmpReg);
835 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
836 return false;
837
838 MI.eraseFromParent();
839 return true;
840 }
841
842 Register ShiftSrc0;
843 Register ShiftSrc1;
844
845 // With multiple uses of the shift, this will duplicate the shift and
846 // increase register pressure.
847 //
848 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
849 // => (S_PACK_HH_B32_B16 $src0, $src1)
850 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
851 // => (S_PACK_HL_B32_B16 $src0, $src1)
852 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
853 // => (S_PACK_LH_B32_B16 $src0, $src1)
854 // (build_vector $src0, $src1)
855 // => (S_PACK_LL_B32_B16 $src0, $src1)
856
857 bool Shift0 = mi_match(
858 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
859
860 bool Shift1 = mi_match(
861 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
862
863 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
864 if (Shift0 && Shift1) {
865 Opc = AMDGPU::S_PACK_HH_B32_B16;
866 MI.getOperand(1).setReg(ShiftSrc0);
867 MI.getOperand(2).setReg(ShiftSrc1);
868 } else if (Shift1) {
869 Opc = AMDGPU::S_PACK_LH_B32_B16;
870 MI.getOperand(2).setReg(ShiftSrc1);
871 } else if (Shift0) {
872 auto ConstSrc1 =
873 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
874 if (ConstSrc1 && ConstSrc1->Value == 0) {
875 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
876 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
877 .addReg(ShiftSrc0)
878 .addImm(16)
879 .setOperandDead(3); // Dead scc
880
881 MI.eraseFromParent();
882 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
883 }
884 if (STI.hasSPackHL()) {
885 Opc = AMDGPU::S_PACK_HL_B32_B16;
886 MI.getOperand(1).setReg(ShiftSrc0);
887 }
888 }
889
890 MI.setDesc(TII.get(Opc));
891 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
892}
893
894bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
895 const MachineOperand &MO = I.getOperand(0);
896
897 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
898 // regbank check here is to know why getConstrainedRegClassForOperand failed.
899 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
900 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
901 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
902 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
903 return true;
904 }
905
906 return false;
907}
908
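// Select G_INSERT as an INSERT_SUBREG; only 32-bit aligned offsets and sizes
// are handled here.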
909bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
910 MachineBasicBlock *BB = I.getParent();
911
912 Register DstReg = I.getOperand(0).getReg();
913 Register Src0Reg = I.getOperand(1).getReg();
914 Register Src1Reg = I.getOperand(2).getReg();
915 LLT Src1Ty = MRI->getType(Src1Reg);
916
917 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
918 unsigned InsSize = Src1Ty.getSizeInBits();
919
920 int64_t Offset = I.getOperand(3).getImm();
921
922 // FIXME: These cases should have been illegal and unnecessary to check here.
923 if (Offset % 32 != 0 || InsSize % 32 != 0)
924 return false;
925
926 // Currently not handled by getSubRegFromChannel.
927 if (InsSize > 128)
928 return false;
929
930 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
931 if (SubReg == AMDGPU::NoSubRegister)
932 return false;
933
934 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
935 const TargetRegisterClass *DstRC =
936 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
937 if (!DstRC)
938 return false;
939
940 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
941 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
942 const TargetRegisterClass *Src0RC =
943 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
944 const TargetRegisterClass *Src1RC =
945 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
946
947 // Deal with weird cases where the class only partially supports the subreg
948 // index.
949 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
950 if (!Src0RC || !Src1RC)
951 return false;
952
953 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
954 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
955 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
956 return false;
957
958 const DebugLoc &DL = I.getDebugLoc();
959 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
960 .addReg(Src0Reg)
961 .addReg(Src1Reg)
962 .addImm(SubReg);
963
964 I.eraseFromParent();
965 return true;
966}
967
968bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
969 Register DstReg = MI.getOperand(0).getReg();
970 Register SrcReg = MI.getOperand(1).getReg();
971 Register OffsetReg = MI.getOperand(2).getReg();
972 Register WidthReg = MI.getOperand(3).getReg();
973
974 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
975 "scalar BFX instructions are expanded in regbankselect");
976 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
977 "64-bit vector BFX instructions are expanded in regbankselect");
978
979 const DebugLoc &DL = MI.getDebugLoc();
980 MachineBasicBlock *MBB = MI.getParent();
981
982 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
983 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
984 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
985 .addReg(SrcReg)
986 .addReg(OffsetReg)
987 .addReg(WidthReg);
988 MI.eraseFromParent();
989 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
990}
991
992bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
993 if (STI.getLDSBankCount() != 16)
994 return selectImpl(MI, *CoverageInfo);
995
996 Register Dst = MI.getOperand(0).getReg();
997 Register Src0 = MI.getOperand(2).getReg();
998 Register M0Val = MI.getOperand(6).getReg();
999 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
1000 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
1001 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
1002 return false;
1003
1004 // This requires 2 instructions. It is possible to write a pattern to support
1005 // this, but the generated isel emitter doesn't correctly deal with multiple
1006 // output instructions using the same physical register input. The copy to m0
1007 // is incorrectly placed before the second instruction.
1008 //
1009 // TODO: Match source modifiers.
1010
1011 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1012 const DebugLoc &DL = MI.getDebugLoc();
1013 MachineBasicBlock *MBB = MI.getParent();
1014
1015 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1016 .addReg(M0Val);
1017 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1018 .addImm(2)
1019 .addImm(MI.getOperand(4).getImm()) // $attr
1020 .addImm(MI.getOperand(3).getImm()); // $attrchan
1021
1022 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1023 .addImm(0) // $src0_modifiers
1024 .addReg(Src0) // $src0
1025 .addImm(MI.getOperand(4).getImm()) // $attr
1026 .addImm(MI.getOperand(3).getImm()) // $attrchan
1027 .addImm(0) // $src2_modifiers
1028 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1029 .addImm(MI.getOperand(5).getImm()) // $high
1030 .addImm(0) // $clamp
1031 .addImm(0); // $omod
1032
1033 MI.eraseFromParent();
1034 return true;
1035}
1036
1037// Writelane is special in that it can use SGPR and M0 (which would normally
1038// count as using the constant bus twice - but in this case it is allowed since
1039// the lane selector doesn't count as a use of the constant bus). However, it is
1040// still required to abide by the 1 SGPR rule. Fix this up if we might have
1041// multiple SGPRs.
1042bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1043 // With a constant bus limit of at least 2, there's no issue.
1044 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1045 return selectImpl(MI, *CoverageInfo);
1046
1047 MachineBasicBlock *MBB = MI.getParent();
1048 const DebugLoc &DL = MI.getDebugLoc();
1049 Register VDst = MI.getOperand(0).getReg();
1050 Register Val = MI.getOperand(2).getReg();
1051 Register LaneSelect = MI.getOperand(3).getReg();
1052 Register VDstIn = MI.getOperand(4).getReg();
1053
1054 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1055
1056 std::optional<ValueAndVReg> ConstSelect =
1057 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1058 if (ConstSelect) {
1059 // The selector has to be an inline immediate, so we can use whatever for
1060 // the other operands.
1061 MIB.addReg(Val);
1062 MIB.addImm(ConstSelect->Value.getSExtValue() &
1063 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1064 } else {
1065 std::optional<ValueAndVReg> ConstVal =
1066 getIConstantVRegValWithLookThrough(Val, *MRI);
1067
1068 // If the value written is an inline immediate, we can get away without a
1069 // copy to m0.
1070 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1071 STI.hasInv2PiInlineImm())) {
1072 MIB.addImm(ConstVal->Value.getSExtValue());
1073 MIB.addReg(LaneSelect);
1074 } else {
1075 MIB.addReg(Val);
1076
1077 // If the lane selector was originally in a VGPR and copied with
1078 // readfirstlane, there's a hazard to read the same SGPR from the
1079 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1080 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1081
1082 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1083 .addReg(LaneSelect);
1084 MIB.addReg(AMDGPU::M0);
1085 }
1086 }
1087
1088 MIB.addReg(VDstIn);
1089
1090 MI.eraseFromParent();
1091 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1092}
1093
1094// We need to handle this here because tablegen doesn't support matching
1095// instructions with multiple outputs.
1096bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1097 Register Dst0 = MI.getOperand(0).getReg();
1098 Register Dst1 = MI.getOperand(1).getReg();
1099
1100 LLT Ty = MRI->getType(Dst0);
1101 unsigned Opc;
1102 if (Ty == LLT::scalar(32))
1103 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1104 else if (Ty == LLT::scalar(64))
1105 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1106 else
1107 return false;
1108
1109 // TODO: Match source modifiers.
1110
1111 const DebugLoc &DL = MI.getDebugLoc();
1112 MachineBasicBlock *MBB = MI.getParent();
1113
1114 Register Numer = MI.getOperand(3).getReg();
1115 Register Denom = MI.getOperand(4).getReg();
1116 unsigned ChooseDenom = MI.getOperand(5).getImm();
1117
1118 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1119
1120 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1121 .addDef(Dst1)
1122 .addImm(0) // $src0_modifiers
1123 .addUse(Src0) // $src0
1124 .addImm(0) // $src1_modifiers
1125 .addUse(Denom) // $src1
1126 .addImm(0) // $src2_modifiers
1127 .addUse(Numer) // $src2
1128 .addImm(0) // $clamp
1129 .addImm(0); // $omod
1130
1131 MI.eraseFromParent();
1132 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1133}
1134
1135bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1136 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1137 switch (IntrinsicID) {
1138 case Intrinsic::amdgcn_if_break: {
1139 MachineBasicBlock *BB = I.getParent();
1140
1141 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1142 // SelectionDAG uses for wave32 vs wave64.
1143 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1144 .add(I.getOperand(0))
1145 .add(I.getOperand(2))
1146 .add(I.getOperand(3));
1147
1148 Register DstReg = I.getOperand(0).getReg();
1149 Register Src0Reg = I.getOperand(2).getReg();
1150 Register Src1Reg = I.getOperand(3).getReg();
1151
1152 I.eraseFromParent();
1153
1154 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1155 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1156
1157 return true;
1158 }
1159 case Intrinsic::amdgcn_interp_p1_f16:
1160 return selectInterpP1F16(I);
1161 case Intrinsic::amdgcn_wqm:
1162 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1163 case Intrinsic::amdgcn_softwqm:
1164 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1165 case Intrinsic::amdgcn_strict_wwm:
1166 case Intrinsic::amdgcn_wwm:
1167 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1168 case Intrinsic::amdgcn_strict_wqm:
1169 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1170 case Intrinsic::amdgcn_writelane:
1171 return selectWritelane(I);
1172 case Intrinsic::amdgcn_div_scale:
1173 return selectDivScale(I);
1174 case Intrinsic::amdgcn_icmp:
1175 case Intrinsic::amdgcn_fcmp:
1176 if (selectImpl(I, *CoverageInfo))
1177 return true;
1178 return selectIntrinsicCmp(I);
1179 case Intrinsic::amdgcn_ballot:
1180 return selectBallot(I);
1181 case Intrinsic::amdgcn_reloc_constant:
1182 return selectRelocConstant(I);
1183 case Intrinsic::amdgcn_groupstaticsize:
1184 return selectGroupStaticSize(I);
1185 case Intrinsic::returnaddress:
1186 return selectReturnAddress(I);
1187 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1188 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1189 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1190 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1191 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1192 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1193 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1194 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1195 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1196 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1197 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1198 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1199 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1200 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1201 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1202 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1203 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1204 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1205 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1206 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1207 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1208 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1209 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1210 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1211 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1212 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1213 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1214 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1215 return selectSMFMACIntrin(I);
1216 case Intrinsic::amdgcn_permlane16_swap:
1217 case Intrinsic::amdgcn_permlane32_swap:
1218 return selectPermlaneSwapIntrin(I, IntrinsicID);
1219 default:
1220 return selectImpl(I, *CoverageInfo);
1221 }
1222}
1223
1224static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1225 const GCNSubtarget &ST) {
1226 if (Size != 16 && Size != 32 && Size != 64)
1227 return -1;
1228
1229 if (Size == 16 && !ST.has16BitInsts())
1230 return -1;
1231
1232 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1233 unsigned FakeS16Opc, unsigned S32Opc,
1234 unsigned S64Opc) {
1235 if (Size == 16)
1236 return ST.hasTrue16BitInsts()
1237 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1238 : S16Opc;
1239 if (Size == 32)
1240 return S32Opc;
1241 return S64Opc;
1242 };
1243
1244 switch (P) {
1245 default:
1246 llvm_unreachable("Unknown condition code!");
1247 case CmpInst::ICMP_NE:
1248 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1249 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1250 AMDGPU::V_CMP_NE_U64_e64);
1251 case CmpInst::ICMP_EQ:
1252 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1253 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1254 AMDGPU::V_CMP_EQ_U64_e64);
1255 case CmpInst::ICMP_SGT:
1256 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1257 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1258 AMDGPU::V_CMP_GT_I64_e64);
1259 case CmpInst::ICMP_SGE:
1260 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1261 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1262 AMDGPU::V_CMP_GE_I64_e64);
1263 case CmpInst::ICMP_SLT:
1264 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1265 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1266 AMDGPU::V_CMP_LT_I64_e64);
1267 case CmpInst::ICMP_SLE:
1268 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1269 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1270 AMDGPU::V_CMP_LE_I64_e64);
1271 case CmpInst::ICMP_UGT:
1272 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1273 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1274 AMDGPU::V_CMP_GT_U64_e64);
1275 case CmpInst::ICMP_UGE:
1276 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1277 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1278 AMDGPU::V_CMP_GE_U64_e64);
1279 case CmpInst::ICMP_ULT:
1280 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1281 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1282 AMDGPU::V_CMP_LT_U64_e64);
1283 case CmpInst::ICMP_ULE:
1284 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1285 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1286 AMDGPU::V_CMP_LE_U64_e64);
1287
1288 case CmpInst::FCMP_OEQ:
1289 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1290 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1291 AMDGPU::V_CMP_EQ_F64_e64);
1292 case CmpInst::FCMP_OGT:
1293 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1294 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1295 AMDGPU::V_CMP_GT_F64_e64);
1296 case CmpInst::FCMP_OGE:
1297 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1298 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1299 AMDGPU::V_CMP_GE_F64_e64);
1300 case CmpInst::FCMP_OLT:
1301 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1302 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1303 AMDGPU::V_CMP_LT_F64_e64);
1304 case CmpInst::FCMP_OLE:
1305 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1306 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1307 AMDGPU::V_CMP_LE_F64_e64);
1308 case CmpInst::FCMP_ONE:
1309 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1310 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1311 AMDGPU::V_CMP_NEQ_F64_e64);
1312 case CmpInst::FCMP_ORD:
1313 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1314 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1315 AMDGPU::V_CMP_O_F64_e64);
1316 case CmpInst::FCMP_UNO:
1317 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1318 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1319 AMDGPU::V_CMP_U_F64_e64);
1320 case CmpInst::FCMP_UEQ:
1321 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1322 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1323 AMDGPU::V_CMP_NLG_F64_e64);
1324 case CmpInst::FCMP_UGT:
1325 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1326 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1327 AMDGPU::V_CMP_NLE_F64_e64);
1328 case CmpInst::FCMP_UGE:
1329 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1330 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1331 AMDGPU::V_CMP_NLT_F64_e64);
1332 case CmpInst::FCMP_ULT:
1333 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1334 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1335 AMDGPU::V_CMP_NGE_F64_e64);
1336 case CmpInst::FCMP_ULE:
1337 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1338 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1339 AMDGPU::V_CMP_NGT_F64_e64);
1340 case CmpInst::FCMP_UNE:
1341 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1342 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1343 AMDGPU::V_CMP_NEQ_F64_e64);
1344 case CmpInst::FCMP_TRUE:
1345 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1346 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1347 AMDGPU::V_CMP_TRU_F64_e64);
1348 case CmpInst::FCMP_FALSE:
1349 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1350 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1351 AMDGPU::V_CMP_F_F64_e64);
1352 }
1353}
1354
1355int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1356 unsigned Size) const {
1357 if (Size == 64) {
1358 if (!STI.hasScalarCompareEq64())
1359 return -1;
1360
1361 switch (P) {
1362 case CmpInst::ICMP_NE:
1363 return AMDGPU::S_CMP_LG_U64;
1364 case CmpInst::ICMP_EQ:
1365 return AMDGPU::S_CMP_EQ_U64;
1366 default:
1367 return -1;
1368 }
1369 }
1370
1371 if (Size == 32) {
1372 switch (P) {
1373 case CmpInst::ICMP_NE:
1374 return AMDGPU::S_CMP_LG_U32;
1375 case CmpInst::ICMP_EQ:
1376 return AMDGPU::S_CMP_EQ_U32;
1377 case CmpInst::ICMP_SGT:
1378 return AMDGPU::S_CMP_GT_I32;
1379 case CmpInst::ICMP_SGE:
1380 return AMDGPU::S_CMP_GE_I32;
1381 case CmpInst::ICMP_SLT:
1382 return AMDGPU::S_CMP_LT_I32;
1383 case CmpInst::ICMP_SLE:
1384 return AMDGPU::S_CMP_LE_I32;
1385 case CmpInst::ICMP_UGT:
1386 return AMDGPU::S_CMP_GT_U32;
1387 case CmpInst::ICMP_UGE:
1388 return AMDGPU::S_CMP_GE_U32;
1389 case CmpInst::ICMP_ULT:
1390 return AMDGPU::S_CMP_LT_U32;
1391 case CmpInst::ICMP_ULE:
1392 return AMDGPU::S_CMP_LE_U32;
1393 case CmpInst::FCMP_OEQ:
1394 return AMDGPU::S_CMP_EQ_F32;
1395 case CmpInst::FCMP_OGT:
1396 return AMDGPU::S_CMP_GT_F32;
1397 case CmpInst::FCMP_OGE:
1398 return AMDGPU::S_CMP_GE_F32;
1399 case CmpInst::FCMP_OLT:
1400 return AMDGPU::S_CMP_LT_F32;
1401 case CmpInst::FCMP_OLE:
1402 return AMDGPU::S_CMP_LE_F32;
1403 case CmpInst::FCMP_ONE:
1404 return AMDGPU::S_CMP_LG_F32;
1405 case CmpInst::FCMP_ORD:
1406 return AMDGPU::S_CMP_O_F32;
1407 case CmpInst::FCMP_UNO:
1408 return AMDGPU::S_CMP_U_F32;
1409 case CmpInst::FCMP_UEQ:
1410 return AMDGPU::S_CMP_NLG_F32;
1411 case CmpInst::FCMP_UGT:
1412 return AMDGPU::S_CMP_NLE_F32;
1413 case CmpInst::FCMP_UGE:
1414 return AMDGPU::S_CMP_NLT_F32;
1415 case CmpInst::FCMP_ULT:
1416 return AMDGPU::S_CMP_NGE_F32;
1417 case CmpInst::FCMP_ULE:
1418 return AMDGPU::S_CMP_NGT_F32;
1419 case CmpInst::FCMP_UNE:
1420 return AMDGPU::S_CMP_NEQ_F32;
1421 default:
1422 llvm_unreachable("Unknown condition code!");
1423 }
1424 }
1425
1426 if (Size == 16) {
1427 if (!STI.hasSALUFloatInsts())
1428 return -1;
1429
1430 switch (P) {
1431 case CmpInst::FCMP_OEQ:
1432 return AMDGPU::S_CMP_EQ_F16;
1433 case CmpInst::FCMP_OGT:
1434 return AMDGPU::S_CMP_GT_F16;
1435 case CmpInst::FCMP_OGE:
1436 return AMDGPU::S_CMP_GE_F16;
1437 case CmpInst::FCMP_OLT:
1438 return AMDGPU::S_CMP_LT_F16;
1439 case CmpInst::FCMP_OLE:
1440 return AMDGPU::S_CMP_LE_F16;
1441 case CmpInst::FCMP_ONE:
1442 return AMDGPU::S_CMP_LG_F16;
1443 case CmpInst::FCMP_ORD:
1444 return AMDGPU::S_CMP_O_F16;
1445 case CmpInst::FCMP_UNO:
1446 return AMDGPU::S_CMP_U_F16;
1447 case CmpInst::FCMP_UEQ:
1448 return AMDGPU::S_CMP_NLG_F16;
1449 case CmpInst::FCMP_UGT:
1450 return AMDGPU::S_CMP_NLE_F16;
1451 case CmpInst::FCMP_UGE:
1452 return AMDGPU::S_CMP_NLT_F16;
1453 case CmpInst::FCMP_ULT:
1454 return AMDGPU::S_CMP_NGE_F16;
1455 case CmpInst::FCMP_ULE:
1456 return AMDGPU::S_CMP_NGT_F16;
1457 case CmpInst::FCMP_UNE:
1458 return AMDGPU::S_CMP_NEQ_F16;
1459 default:
1460 llvm_unreachable("Unknown condition code!");
1461 }
1462 }
1463
1464 return -1;
1465}
1466
1467bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1468
1469 MachineBasicBlock *BB = I.getParent();
1470 const DebugLoc &DL = I.getDebugLoc();
1471
1472 Register SrcReg = I.getOperand(2).getReg();
1473 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1474
1475 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1476
1477 Register CCReg = I.getOperand(0).getReg();
1478 if (!isVCC(CCReg, *MRI)) {
1479 int Opcode = getS_CMPOpcode(Pred, Size);
1480 if (Opcode == -1)
1481 return false;
1482 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1483 .add(I.getOperand(2))
1484 .add(I.getOperand(3));
1485 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1486 .addReg(AMDGPU::SCC);
1487 bool Ret =
1488 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1489 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1490 I.eraseFromParent();
1491 return Ret;
1492 }
1493
1494 if (I.getOpcode() == AMDGPU::G_FCMP)
1495 return false;
1496
1497 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1498 if (Opcode == -1)
1499 return false;
1500
1501 MachineInstrBuilder ICmp;
1502 // t16 instructions
1503 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1504 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1505 .addImm(0)
1506 .add(I.getOperand(2))
1507 .addImm(0)
1508 .add(I.getOperand(3))
1509 .addImm(0); // op_sel
1510 } else {
1511 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1512 .add(I.getOperand(2))
1513 .add(I.getOperand(3));
1514 }
1515
1516 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1517 *TRI.getBoolRC(), *MRI);
1518 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1519 I.eraseFromParent();
1520 return Ret;
1521}
1522
1523bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1524 Register Dst = I.getOperand(0).getReg();
1525 if (isVCC(Dst, *MRI))
1526 return false;
1527
1528 LLT DstTy = MRI->getType(Dst);
1529 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1530 return false;
1531
1532 MachineBasicBlock *BB = I.getParent();
1533 const DebugLoc &DL = I.getDebugLoc();
1534 Register SrcReg = I.getOperand(2).getReg();
1535 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1536
1537 // i1 inputs are not supported in GlobalISel.
1538 if (Size == 1)
1539 return false;
1540
1541 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1542 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1543 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1544 I.eraseFromParent();
1545 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1546 }
1547
1548 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1549 if (Opcode == -1)
1550 return false;
1551
1552 MachineInstrBuilder SelectedMI;
1553 MachineOperand &LHS = I.getOperand(2);
1554 MachineOperand &RHS = I.getOperand(3);
1555 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1556 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1557 Register Src0Reg =
1558 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1559 Register Src1Reg =
1560 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1561 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1562 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1563 SelectedMI.addImm(Src0Mods);
1564 SelectedMI.addReg(Src0Reg);
1565 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1566 SelectedMI.addImm(Src1Mods);
1567 SelectedMI.addReg(Src1Reg);
1568 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1569 SelectedMI.addImm(0); // clamp
1570 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1571 SelectedMI.addImm(0); // op_sel
1572
1573 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1574 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1575 return false;
1576
1577 I.eraseFromParent();
1578 return true;
1579}
1580
1581// Ballot has to zero the bits of the input lane mask that are zero in the
1582// current exec; this is done as an AND with exec. For inputs produced by an
1583// instruction that implicitly uses the same exec (for example a compare in the
1584// same basic block, or an SCC to VCC copy), a plain copy suffices.
1585static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1586 MachineBasicBlock *MBB) {
1587 MachineInstr *MI = MRI.getVRegDef(Reg);
1588 if (MI->getParent() != MBB)
1589 return false;
1590
1591 // Lane mask generated by SCC to VCC copy.
1592 if (MI->getOpcode() == AMDGPU::COPY) {
1593 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1594 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1595 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1596 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1597 return true;
1598 }
1599
1600 // Lane mask generated using compare with same exec.
1601 if (isa<GAnyCmp>(MI))
1602 return true;
1603
1604 Register LHS, RHS;
1605 // Look through AND.
1606 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1607 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1608 isLaneMaskFromSameBlock(RHS, MRI, MBB);
1609
1610 return false;
1611}
1612
1613bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1614 MachineBasicBlock *BB = I.getParent();
1615 const DebugLoc &DL = I.getDebugLoc();
1616 Register DstReg = I.getOperand(0).getReg();
1617 Register SrcReg = I.getOperand(2).getReg();
1618 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1619 const unsigned WaveSize = STI.getWavefrontSize();
1620
1621 // In the common case, the return type matches the wave size.
1622 // However we also support emitting i64 ballots in wave32 mode.
1623 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1624 return false;
1625
1626 std::optional<ValueAndVReg> Arg =
1627 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1628
1629 Register Dst = DstReg;
1630 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1631 if (BallotSize != WaveSize) {
1632 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1633 }
1634
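 // A constant ballot argument folds directly: ballot(false) is simply 0 and
 // ballot(true) is the current exec mask, so no AND is needed in either case.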
1635 if (Arg) {
1636 const int64_t Value = Arg->Value.getZExtValue();
1637 if (Value == 0) {
1638 // Dst = S_MOV 0
1639 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1640 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1641 } else {
1642 // Dst = COPY EXEC
1643 assert(Value == 1);
1644 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1645 }
1646 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1647 return false;
1648 } else {
1649 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1650 // Dst = COPY SrcReg
1651 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1652 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1653 return false;
1654 } else {
1655 // Dst = S_AND SrcReg, EXEC
1656 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1657 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1658 .addReg(SrcReg)
1659 .addReg(TRI.getExec())
1660 .setOperandDead(3); // Dead scc
1661 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1662 return false;
1663 }
1664 }
1665
1666 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
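 // The upper 32 bits of a wave32 ballot are always zero, so it is enough to
 // pair the 32-bit result with a zeroed SGPR in a REG_SEQUENCE.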
1667 if (BallotSize != WaveSize) {
1668 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1669 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1670 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1671 .addReg(Dst)
1672 .addImm(AMDGPU::sub0)
1673 .addReg(HiReg)
1674 .addImm(AMDGPU::sub1);
1675 }
1676
1677 I.eraseFromParent();
1678 return true;
1679}
1680
1681bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1682 Register DstReg = I.getOperand(0).getReg();
1683 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1684 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1685 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1686 return false;
1687
1688 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1689
1690 Module *M = MF->getFunction().getParent();
1691 const MDNode *Metadata = I.getOperand(2).getMetadata();
1692 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1693 auto *RelocSymbol = cast<GlobalVariable>(
1694 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1695
1696 MachineBasicBlock *BB = I.getParent();
1697 BuildMI(*BB, &I, I.getDebugLoc(),
1698 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1699 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1700
1701 I.eraseFromParent();
1702 return true;
1703}
1704
1705bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1706 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1707
1708 Register DstReg = I.getOperand(0).getReg();
1709 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1710 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1711 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1712
1713 MachineBasicBlock *MBB = I.getParent();
1714 const DebugLoc &DL = I.getDebugLoc();
1715
1716 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1717
1718 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1719 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1720 MIB.addImm(MFI->getLDSSize());
1721 } else {
1722 Module *M = MF->getFunction().getParent();
1723 const GlobalValue *GV =
1724 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1725 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1726 }
1727
1728 I.eraseFromParent();
1729 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1730}
1731
1732bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1733 MachineBasicBlock *MBB = I.getParent();
1734 MachineFunction &MF = *MBB->getParent();
1735 const DebugLoc &DL = I.getDebugLoc();
1736
1737 MachineOperand &Dst = I.getOperand(0);
1738 Register DstReg = Dst.getReg();
1739 unsigned Depth = I.getOperand(2).getImm();
1740
1741 const TargetRegisterClass *RC
1742 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1743 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1744 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1745 return false;
1746
1747 // Check for kernel and shader functions
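 // Entry functions (kernels and shaders) have no caller, and non-zero depths
 // are not supported, so in those cases llvm.returnaddress folds to zero.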
1748 if (Depth != 0 ||
1749 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1750 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1751 .addImm(0);
1752 I.eraseFromParent();
1753 return true;
1754 }
1755
1756 MachineFrameInfo &MFI = MF.getFrameInfo();
1757 // There is a call to @llvm.returnaddress in this function
1758 MFI.setReturnAddressIsTaken(true);
1759
1760 // Get the return address reg and mark it as an implicit live-in
1761 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1762 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1763 AMDGPU::SReg_64RegClass, DL);
1764 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1765 .addReg(LiveIn);
1766 I.eraseFromParent();
1767 return true;
1768}
1769
1770bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1771 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1772 // SelectionDAG uses for wave32 vs wave64.
1773 MachineBasicBlock *BB = MI.getParent();
1774 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1775 .add(MI.getOperand(1));
1776
1777 Register Reg = MI.getOperand(1).getReg();
1778 MI.eraseFromParent();
1779
1780 if (!MRI->getRegClassOrNull(Reg))
1781 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1782 return true;
1783}
1784
1785bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1786 MachineInstr &MI, Intrinsic::ID IntrID) const {
1787 MachineBasicBlock *MBB = MI.getParent();
1788 MachineFunction *MF = MBB->getParent();
1789 const DebugLoc &DL = MI.getDebugLoc();
1790
1791 unsigned IndexOperand = MI.getOperand(7).getImm();
1792 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1793 bool WaveDone = MI.getOperand(9).getImm() != 0;
1794
1795 if (WaveDone && !WaveRelease) {
1796 // TODO: Move this to IR verifier
1797 const Function &Fn = MF->getFunction();
1798 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1799 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1800 }
1801
1802 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1803 IndexOperand &= ~0x3f;
1804 unsigned CountDw = 0;
1805
1806 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1807 CountDw = (IndexOperand >> 24) & 0xf;
1808 IndexOperand &= ~(0xf << 24);
1809
1810 if (CountDw < 1 || CountDw > 4) {
1811 const Function &Fn = MF->getFunction();
1812 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1813 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1814 CountDw = 1;
1815 }
1816 }
1817
1818 if (IndexOperand) {
1819 const Function &Fn = MF->getFunction();
1820 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1821 Fn, "ds_ordered_count: bad index operand", DL));
1822 }
1823
1824 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1825 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1826
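 // Roughly, the 16-bit DS offset is packed as: offset0 = ordered-count index
 // scaled by 4, and offset1 = wave_release (bit 0), wave_done (bit 1),
 // shader type (bits 3:2, pre-GFX11), add-vs-swap (bit 4) and dword count - 1
 // (bits 7:6, GFX10+).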
1827 unsigned Offset0 = OrderedCountIndex << 2;
1828 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1829
1830 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1831 Offset1 |= (CountDw - 1) << 6;
1832
1833 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1834 Offset1 |= ShaderType << 2;
1835
1836 unsigned Offset = Offset0 | (Offset1 << 8);
1837
1838 Register M0Val = MI.getOperand(2).getReg();
1839 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1840 .addReg(M0Val);
1841
1842 Register DstReg = MI.getOperand(0).getReg();
1843 Register ValReg = MI.getOperand(3).getReg();
1844 MachineInstrBuilder DS =
1845 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1846 .addReg(ValReg)
1847 .addImm(Offset)
1848 .cloneMemRefs(MI);
1849
1850 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1851 return false;
1852
1853 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1854 MI.eraseFromParent();
1855 return Ret;
1856}
1857
1858static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1859 switch (IntrID) {
1860 case Intrinsic::amdgcn_ds_gws_init:
1861 return AMDGPU::DS_GWS_INIT;
1862 case Intrinsic::amdgcn_ds_gws_barrier:
1863 return AMDGPU::DS_GWS_BARRIER;
1864 case Intrinsic::amdgcn_ds_gws_sema_v:
1865 return AMDGPU::DS_GWS_SEMA_V;
1866 case Intrinsic::amdgcn_ds_gws_sema_br:
1867 return AMDGPU::DS_GWS_SEMA_BR;
1868 case Intrinsic::amdgcn_ds_gws_sema_p:
1869 return AMDGPU::DS_GWS_SEMA_P;
1870 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1871 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1872 default:
1873 llvm_unreachable("not a gws intrinsic");
1874 }
1875}
1876
1877bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1878 Intrinsic::ID IID) const {
1879 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1880 !STI.hasGWSSemaReleaseAll()))
1881 return false;
1882
1883 // intrinsic ID, vsrc, offset
1884 const bool HasVSrc = MI.getNumOperands() == 3;
1885 assert(HasVSrc || MI.getNumOperands() == 2);
1886
1887 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1888 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1889 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1890 return false;
1891
1892 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1893 unsigned ImmOffset;
1894
1895 MachineBasicBlock *MBB = MI.getParent();
1896 const DebugLoc &DL = MI.getDebugLoc();
1897
1898 MachineInstr *Readfirstlane = nullptr;
1899
1900 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1901 // incoming offset, in case there's an add of a constant. We'll have to put it
1902 // back later.
1903 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1904 Readfirstlane = OffsetDef;
1905 BaseOffset = OffsetDef->getOperand(1).getReg();
1906 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1907 }
1908
1909 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1910 // If we have a constant offset, try to use the 0 in m0 as the base.
1911 // TODO: Look into changing the default m0 initialization value. If the
1912 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1913 // the immediate offset.
1914
1915 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1916 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1917 .addImm(0);
1918 } else {
1919 std::tie(BaseOffset, ImmOffset) =
1920 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1921
1922 if (Readfirstlane) {
1923 // We have the constant offset now, so put the readfirstlane back on the
1924 // variable component.
1925 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1926 return false;
1927
1928 Readfirstlane->getOperand(1).setReg(BaseOffset);
1929 BaseOffset = Readfirstlane->getOperand(0).getReg();
1930 } else {
1931 if (!RBI.constrainGenericRegister(BaseOffset,
1932 AMDGPU::SReg_32RegClass, *MRI))
1933 return false;
1934 }
1935
1936 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1937 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1938 .addReg(BaseOffset)
1939 .addImm(16)
1940 .setOperandDead(3); // Dead scc
1941
1942 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1943 .addReg(M0Base);
1944 }
1945
1946 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1947 // offset field) % 64. Some versions of the programming guide omit the m0
1948 // part, or claim it's from offset 0.
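 // Hence, in the non-constant case, the variable component of the offset is
 // shifted into M0[21:16] above, while the constant component ends up in the
 // instruction's offset field.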
1949
1950 unsigned Opc = gwsIntrinToOpcode(IID);
1951 const MCInstrDesc &InstrDesc = TII.get(Opc);
1952
1953 if (HasVSrc) {
1954 Register VSrc = MI.getOperand(1).getReg();
1955
1956 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
1957 const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
1958 const TargetRegisterClass *SubRC =
1959 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
1960
1961 if (!SubRC) {
1962 // 32-bit normal case.
1963 if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
1964 return false;
1965
1966 BuildMI(*MBB, &MI, DL, InstrDesc)
1967 .addReg(VSrc)
1968 .addImm(ImmOffset)
1969 .cloneMemRefs(MI);
1970 } else {
1971 // Requires even register alignment, so create 64-bit value and pad the
1972 // top half with undef.
1973 Register DataReg = MRI->createVirtualRegister(DataRC);
1974 if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
1975 return false;
1976
1977 Register UndefReg = MRI->createVirtualRegister(SubRC);
1978 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1979 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
1980 .addReg(VSrc)
1981 .addImm(AMDGPU::sub0)
1982 .addReg(UndefReg)
1983 .addImm(AMDGPU::sub1);
1984
1985 BuildMI(*MBB, &MI, DL, InstrDesc)
1986 .addReg(DataReg)
1987 .addImm(ImmOffset)
1988 .cloneMemRefs(MI);
1989 }
1990 } else {
1991 BuildMI(*MBB, &MI, DL, InstrDesc)
1992 .addImm(ImmOffset)
1993 .cloneMemRefs(MI);
1994 }
1995
1996 MI.eraseFromParent();
1997 return true;
1998}
1999
2000bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2001 bool IsAppend) const {
2002 Register PtrBase = MI.getOperand(2).getReg();
2003 LLT PtrTy = MRI->getType(PtrBase);
2004 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2005
2006 unsigned Offset;
2007 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
2008
2009 // TODO: Should this try to look through readfirstlane like GWS?
2010 if (!isDSOffsetLegal(PtrBase, Offset)) {
2011 PtrBase = MI.getOperand(2).getReg();
2012 Offset = 0;
2013 }
2014
2015 MachineBasicBlock *MBB = MI.getParent();
2016 const DebugLoc &DL = MI.getDebugLoc();
2017 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2018
2019 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2020 .addReg(PtrBase);
2021 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2022 return false;
2023
2024 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
2025 .addImm(Offset)
2026 .addImm(IsGDS ? -1 : 0)
2027 .cloneMemRefs(MI);
2028 MI.eraseFromParent();
2029 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2030}
2031
2032bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2033 MachineFunction *MF = MI.getMF();
2034 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2035
2036 MFInfo->setInitWholeWave();
2037 return selectImpl(MI, *CoverageInfo);
2038}
2039
2040static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2041 bool &IsTexFail) {
2042 if (TexFailCtrl)
2043 IsTexFail = true;
2044
2045 TFE = TexFailCtrl & 0x1;
2046 TexFailCtrl &= ~(uint64_t)0x1;
2047 LWE = TexFailCtrl & 0x2;
2048 TexFailCtrl &= ~(uint64_t)0x2;
2049
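 // Only the tfe (bit 0) and lwe (bit 1) flags are recognized; any other bit
 // left in TexFailCtrl makes the control word invalid and selection fails.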
2050 return TexFailCtrl == 0;
2051}
2052
2053bool AMDGPUInstructionSelector::selectImageIntrinsic(
2054 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2055 MachineBasicBlock *MBB = MI.getParent();
2056 const DebugLoc &DL = MI.getDebugLoc();
2057 unsigned IntrOpcode = Intr->BaseOpcode;
2058
2059 // For image atomic: use no-return opcode if result is unused.
2060 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2061 Register ResultDef = MI.getOperand(0).getReg();
2062 if (MRI->use_nodbg_empty(ResultDef))
2063 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2064 }
2065
2066 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2067 AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
2068
2069 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2070 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2071 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2072 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2073
2074 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2075
2076 Register VDataIn = AMDGPU::NoRegister;
2077 Register VDataOut = AMDGPU::NoRegister;
2078 LLT VDataTy;
2079 int NumVDataDwords = -1;
2080 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2081 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2082
2083 bool Unorm;
2084 if (!BaseOpcode->Sampler)
2085 Unorm = true;
2086 else
2087 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2088
2089 bool TFE;
2090 bool LWE;
2091 bool IsTexFail = false;
2092 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2093 TFE, LWE, IsTexFail))
2094 return false;
2095
2096 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2097 const bool IsA16 = (Flags & 1) != 0;
2098 const bool IsG16 = (Flags & 2) != 0;
2099
2100 // A16 implies 16 bit gradients if subtarget doesn't support G16
2101 if (IsA16 && !STI.hasG16() && !IsG16)
2102 return false;
2103
2104 unsigned DMask = 0;
2105 unsigned DMaskLanes = 0;
2106
2107 if (BaseOpcode->Atomic) {
2108 if (!BaseOpcode->NoReturn)
2109 VDataOut = MI.getOperand(0).getReg();
2110 VDataIn = MI.getOperand(2).getReg();
2111 LLT Ty = MRI->getType(VDataIn);
2112
2113 // Be careful to allow atomic swap on 16-bit element vectors.
2114 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2115 Ty.getSizeInBits() == 128 :
2116 Ty.getSizeInBits() == 64;
2117
2118 if (BaseOpcode->AtomicX2) {
2119 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2120
2121 DMask = Is64Bit ? 0xf : 0x3;
2122 NumVDataDwords = Is64Bit ? 4 : 2;
2123 } else {
2124 DMask = Is64Bit ? 0x3 : 0x1;
2125 NumVDataDwords = Is64Bit ? 2 : 1;
2126 }
2127 } else {
2128 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2129 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2130
2131 if (BaseOpcode->Store) {
2132 VDataIn = MI.getOperand(1).getReg();
2133 VDataTy = MRI->getType(VDataIn);
2134 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2135 } else if (BaseOpcode->NoReturn) {
2136 NumVDataDwords = 0;
2137 } else {
2138 VDataOut = MI.getOperand(0).getReg();
2139 VDataTy = MRI->getType(VDataOut);
2140 NumVDataDwords = DMaskLanes;
2141
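 // With packed D16 memory ops, two 16-bit lanes share a dword; e.g. a D16
 // load with three enabled dmask lanes only needs two result dwords.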
2142 if (IsD16 && !STI.hasUnpackedD16VMem())
2143 NumVDataDwords = (DMaskLanes + 1) / 2;
2144 }
2145 }
2146
2147 // Set G16 opcode
2148 if (Subtarget->hasG16() && IsG16) {
2149 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2150 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2151 assert(G16MappingInfo);
2152 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2153 }
2154
2155 // TODO: Check this in verifier.
2156 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2157
2158 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2159 // Keep GLC only when the atomic's result is actually used.
2160 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2161 CPol |= AMDGPU::CPol::GLC;
2162 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2163 AMDGPU::CPol::VOLATILE))
2164 return false;
2165
2166 int NumVAddrRegs = 0;
2167 int NumVAddrDwords = 0;
2168 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2169 // Skip the $noregs and 0s inserted during legalization.
2170 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2171 if (!AddrOp.isReg())
2172 continue; // XXX - Break?
2173
2174 Register Addr = AddrOp.getReg();
2175 if (!Addr)
2176 break;
2177
2178 ++NumVAddrRegs;
2179 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2180 }
2181
2182 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2183 // NSA, these should have been packed into a single value in the first
2184 // address register
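 // Roughly: NSA only makes sense with more than one address register, and
 // only when each register carries a single dword (with partial NSA support,
 // a trailing register may carry the remaining dwords).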
2185 const bool UseNSA =
2186 NumVAddrRegs != 1 &&
2187 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2188 : NumVAddrDwords == NumVAddrRegs);
2189 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2190 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2191 return false;
2192 }
2193
2194 if (IsTexFail)
2195 ++NumVDataDwords;
2196
2197 int Opcode = -1;
2198 if (IsGFX12Plus) {
2199 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2200 NumVDataDwords, NumVAddrDwords);
2201 } else if (IsGFX11Plus) {
2202 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2203 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2204 : AMDGPU::MIMGEncGfx11Default,
2205 NumVDataDwords, NumVAddrDwords);
2206 } else if (IsGFX10Plus) {
2207 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2208 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2209 : AMDGPU::MIMGEncGfx10Default,
2210 NumVDataDwords, NumVAddrDwords);
2211 } else {
2212 if (Subtarget->hasGFX90AInsts()) {
2213 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2214 NumVDataDwords, NumVAddrDwords);
2215 if (Opcode == -1) {
2216 LLVM_DEBUG(
2217 dbgs()
2218 << "requested image instruction is not supported on this GPU\n");
2219 return false;
2220 }
2221 }
2222 if (Opcode == -1 &&
2223 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2224 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2225 NumVDataDwords, NumVAddrDwords);
2226 if (Opcode == -1)
2227 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2228 NumVDataDwords, NumVAddrDwords);
2229 }
2230 if (Opcode == -1)
2231 return false;
2232
2233 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2234 .cloneMemRefs(MI);
2235
2236 if (VDataOut) {
2237 if (BaseOpcode->AtomicX2) {
2238 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2239
2240 Register TmpReg = MRI->createVirtualRegister(
2241 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2242 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2243
2244 MIB.addDef(TmpReg);
2245 if (!MRI->use_empty(VDataOut)) {
2246 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2247 .addReg(TmpReg, RegState::Kill, SubReg);
2248 }
2249
2250 } else {
2251 MIB.addDef(VDataOut); // vdata output
2252 }
2253 }
2254
2255 if (VDataIn)
2256 MIB.addReg(VDataIn); // vdata input
2257
2258 for (int I = 0; I != NumVAddrRegs; ++I) {
2259 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2260 if (SrcOp.isReg()) {
2261 assert(SrcOp.getReg() != 0);
2262 MIB.addReg(SrcOp.getReg());
2263 }
2264 }
2265
2266 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2267 if (BaseOpcode->Sampler)
2268 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2269
2270 MIB.addImm(DMask); // dmask
2271
2272 if (IsGFX10Plus)
2273 MIB.addImm(DimInfo->Encoding);
2274 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2275 MIB.addImm(Unorm);
2276
2277 MIB.addImm(CPol);
2278 MIB.addImm(IsA16 && // a16 or r128
2279 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2280 if (IsGFX10Plus)
2281 MIB.addImm(IsA16 ? -1 : 0);
2282
2283 if (!Subtarget->hasGFX90AInsts()) {
2284 MIB.addImm(TFE); // tfe
2285 } else if (TFE) {
2286 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2287 return false;
2288 }
2289
2290 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2291 MIB.addImm(LWE); // lwe
2292 if (!IsGFX10Plus)
2293 MIB.addImm(DimInfo->DA ? -1 : 0);
2294 if (BaseOpcode->HasD16)
2295 MIB.addImm(IsD16 ? -1 : 0);
2296
2297 MI.eraseFromParent();
2298 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2299 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2300 return true;
2301}
2302
2303// We need to handle this here because tablegen doesn't support matching
2304// instructions with multiple outputs.
2305bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2306 MachineInstr &MI) const {
2307 Register Dst0 = MI.getOperand(0).getReg();
2308 Register Dst1 = MI.getOperand(1).getReg();
2309
2310 const DebugLoc &DL = MI.getDebugLoc();
2311 MachineBasicBlock *MBB = MI.getParent();
2312
2313 Register Addr = MI.getOperand(3).getReg();
2314 Register Data0 = MI.getOperand(4).getReg();
2315 Register Data1 = MI.getOperand(5).getReg();
2316 unsigned Offset = MI.getOperand(6).getImm();
2317
2318 unsigned Opc;
2319 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2320 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2321 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2322 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2323 break;
2324 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2325 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2326 break;
2327 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2328 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2329 break;
2330 }
2331
2332 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2333 .addDef(Dst1)
2334 .addUse(Addr)
2335 .addUse(Data0)
2336 .addUse(Data1)
2337 .addImm(Offset)
2338 .cloneMemRefs(MI);
2339
2340 MI.eraseFromParent();
2341 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2342}
2343
2344bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2345 MachineInstr &I) const {
2346 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2347 switch (IntrinsicID) {
2348 case Intrinsic::amdgcn_end_cf:
2349 return selectEndCfIntrinsic(I);
2350 case Intrinsic::amdgcn_ds_ordered_add:
2351 case Intrinsic::amdgcn_ds_ordered_swap:
2352 return selectDSOrderedIntrinsic(I, IntrinsicID);
2353 case Intrinsic::amdgcn_ds_gws_init:
2354 case Intrinsic::amdgcn_ds_gws_barrier:
2355 case Intrinsic::amdgcn_ds_gws_sema_v:
2356 case Intrinsic::amdgcn_ds_gws_sema_br:
2357 case Intrinsic::amdgcn_ds_gws_sema_p:
2358 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2359 return selectDSGWSIntrinsic(I, IntrinsicID);
2360 case Intrinsic::amdgcn_ds_append:
2361 return selectDSAppendConsume(I, true);
2362 case Intrinsic::amdgcn_ds_consume:
2363 return selectDSAppendConsume(I, false);
2364 case Intrinsic::amdgcn_init_whole_wave:
2365 return selectInitWholeWave(I);
2366 case Intrinsic::amdgcn_raw_buffer_load_lds:
2367 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2368 case Intrinsic::amdgcn_struct_buffer_load_lds:
2369 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2370 return selectBufferLoadLds(I);
2371 // Until we can store both the address space of the global and the LDS
2372 // arguments by having two MachineMemOperands on an intrinsic, we just trust
2373 // that the argument is a global pointer (buffer pointers have been handled by
2374 // an LLVM IR-level lowering).
2375 case Intrinsic::amdgcn_load_to_lds:
2376 case Intrinsic::amdgcn_global_load_lds:
2377 return selectGlobalLoadLds(I);
2378 case Intrinsic::amdgcn_exp_compr:
2379 if (!STI.hasCompressedExport()) {
2380 Function &F = I.getMF()->getFunction();
2381 F.getContext().diagnose(
2382 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2383 I.getDebugLoc(), DS_Error));
2384 return false;
2385 }
2386 break;
2387 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2388 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2389 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2390 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2391 return selectDSBvhStackIntrinsic(I);
2392 case Intrinsic::amdgcn_s_barrier_init:
2393 case Intrinsic::amdgcn_s_barrier_signal_var:
2394 return selectNamedBarrierInit(I, IntrinsicID);
2395 case Intrinsic::amdgcn_s_wakeup_barrier: {
2396 if (!STI.hasSWakeupBarrier()) {
2397 Function &F = I.getMF()->getFunction();
2398 F.getContext().diagnose(
2399 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2400 I.getDebugLoc(), DS_Error));
2401 return false;
2402 }
2403 return selectNamedBarrierInst(I, IntrinsicID);
2404 }
2405 case Intrinsic::amdgcn_s_barrier_join:
2406 case Intrinsic::amdgcn_s_get_named_barrier_state:
2407 return selectNamedBarrierInst(I, IntrinsicID);
2408 case Intrinsic::amdgcn_s_get_barrier_state:
2409 return selectSGetBarrierState(I, IntrinsicID);
2410 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2411 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2412 }
2413 return selectImpl(I, *CoverageInfo);
2414}
2415
2416bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2417 if (selectImpl(I, *CoverageInfo))
2418 return true;
2419
2420 MachineBasicBlock *BB = I.getParent();
2421 const DebugLoc &DL = I.getDebugLoc();
2422
2423 Register DstReg = I.getOperand(0).getReg();
2424 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2425 assert(Size <= 32 || Size == 64);
2426 const MachineOperand &CCOp = I.getOperand(1);
2427 Register CCReg = CCOp.getReg();
2428 if (!isVCC(CCReg, *MRI)) {
2429 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2430 AMDGPU::S_CSELECT_B32;
2431 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2432 .addReg(CCReg);
2433
2434 // The generic constrainSelectedInstRegOperands doesn't work for the scc
2435 // register bank, because it does not cover the register class that we use to
2436 // represent it. So we need to manually set the register class here.
2437 if (!MRI->getRegClassOrNull(CCReg))
2438 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2439 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2440 .add(I.getOperand(2))
2441 .add(I.getOperand(3));
2442
2443 bool Ret = false;
2444 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2445 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2446 I.eraseFromParent();
2447 return Ret;
2448 }
2449
2450 // Wide VGPR select should have been split in RegBankSelect.
2451 if (Size > 32)
2452 return false;
2453
2454 MachineInstr *Select =
2455 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2456 .addImm(0)
2457 .add(I.getOperand(3))
2458 .addImm(0)
2459 .add(I.getOperand(2))
2460 .add(I.getOperand(1));
2461
2462 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2463 I.eraseFromParent();
2464 return Ret;
2465}
2466
2467bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2468 Register DstReg = I.getOperand(0).getReg();
2469 Register SrcReg = I.getOperand(1).getReg();
2470 const LLT DstTy = MRI->getType(DstReg);
2471 const LLT SrcTy = MRI->getType(SrcReg);
2472 const LLT S1 = LLT::scalar(1);
2473
2474 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2475 const RegisterBank *DstRB;
2476 if (DstTy == S1) {
2477 // This is a special case. We don't treat s1 for legalization artifacts as
2478 // vcc booleans.
2479 DstRB = SrcRB;
2480 } else {
2481 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2482 if (SrcRB != DstRB)
2483 return false;
2484 }
2485
2486 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2487
2488 unsigned DstSize = DstTy.getSizeInBits();
2489 unsigned SrcSize = SrcTy.getSizeInBits();
2490
2491 const TargetRegisterClass *SrcRC =
2492 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2493 const TargetRegisterClass *DstRC =
2494 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2495 if (!SrcRC || !DstRC)
2496 return false;
2497
2498 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2499 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2500 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2501 return false;
2502 }
2503
2504 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2505 assert(STI.useRealTrue16Insts());
2506 const DebugLoc &DL = I.getDebugLoc();
2507 MachineBasicBlock *MBB = I.getParent();
2508 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2509 .addReg(SrcReg, 0, AMDGPU::lo16);
2510 I.eraseFromParent();
2511 return true;
2512 }
2513
2514 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2515 MachineBasicBlock *MBB = I.getParent();
2516 const DebugLoc &DL = I.getDebugLoc();
2517
2518 Register LoReg = MRI->createVirtualRegister(DstRC);
2519 Register HiReg = MRI->createVirtualRegister(DstRC);
2520 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2521 .addReg(SrcReg, 0, AMDGPU::sub0);
2522 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2523 .addReg(SrcReg, 0, AMDGPU::sub1);
2524
2525 if (IsVALU && STI.hasSDWA()) {
2526 // Write the low 16-bits of the high element into the high 16-bits of the
2527 // low element.
2528 MachineInstr *MovSDWA =
2529 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2530 .addImm(0) // $src0_modifiers
2531 .addReg(HiReg) // $src0
2532 .addImm(0) // $clamp
2533 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2534 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2535 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2536 .addReg(LoReg, RegState::Implicit);
2537 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2538 } else {
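 // Without SDWA, build the packed value manually as
 // (Hi << 16) | (Lo & 0xffff).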
2539 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2540 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2541 Register ImmReg = MRI->createVirtualRegister(DstRC);
2542 if (IsVALU) {
2543 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2544 .addImm(16)
2545 .addReg(HiReg);
2546 } else {
2547 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2548 .addReg(HiReg)
2549 .addImm(16)
2550 .setOperandDead(3); // Dead scc
2551 }
2552
2553 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2554 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2555 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2556
2557 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2558 .addImm(0xffff);
2559 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2560 .addReg(LoReg)
2561 .addReg(ImmReg);
2562 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2563 .addReg(TmpReg0)
2564 .addReg(TmpReg1);
2565
2566 if (!IsVALU) {
2567 And.setOperandDead(3); // Dead scc
2568 Or.setOperandDead(3); // Dead scc
2569 }
2570 }
2571
2572 I.eraseFromParent();
2573 return true;
2574 }
2575
2576 if (!DstTy.isScalar())
2577 return false;
2578
2579 if (SrcSize > 32) {
2580 unsigned SubRegIdx = DstSize < 32
2581 ? static_cast<unsigned>(AMDGPU::sub0)
2582 : TRI.getSubRegFromChannel(0, DstSize / 32);
2583 if (SubRegIdx == AMDGPU::NoSubRegister)
2584 return false;
2585
2586 // Deal with weird cases where the class only partially supports the subreg
2587 // index.
2588 const TargetRegisterClass *SrcWithSubRC
2589 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2590 if (!SrcWithSubRC)
2591 return false;
2592
2593 if (SrcWithSubRC != SrcRC) {
2594 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2595 return false;
2596 }
2597
2598 I.getOperand(1).setSubReg(SubRegIdx);
2599 }
2600
2601 I.setDesc(TII.get(TargetOpcode::COPY));
2602 return true;
2603}
2604
2605/// \returns true if a bitmask for \p Size bits will be an inline immediate.
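/// For example, Size == 5 gives a mask of 0x1f, which is an inline immediate,
/// while Size == 16 gives 0xffff, which is not.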
2606static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2607 Mask = maskTrailingOnes<unsigned>(Size);
2608 int SignedMask = static_cast<int>(Mask);
2609 return SignedMask >= -16 && SignedMask <= 64;
2610}
2611
2612// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2613const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2614 Register Reg, const MachineRegisterInfo &MRI,
2615 const TargetRegisterInfo &TRI) const {
2616 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2617 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2618 return RB;
2619
2620 // Ignore the type, since we don't use vcc in artifacts.
2621 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2622 return &RBI.getRegBankFromRegClass(*RC, LLT());
2623 return nullptr;
2624}
2625
2626bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2627 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2628 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2629 const DebugLoc &DL = I.getDebugLoc();
2630 MachineBasicBlock &MBB = *I.getParent();
2631 const Register DstReg = I.getOperand(0).getReg();
2632 const Register SrcReg = I.getOperand(1).getReg();
2633
2634 const LLT DstTy = MRI->getType(DstReg);
2635 const LLT SrcTy = MRI->getType(SrcReg);
2636 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2637 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2638 const unsigned DstSize = DstTy.getSizeInBits();
2639 if (!DstTy.isScalar())
2640 return false;
2641
2642 // Artifact casts should never use vcc.
2643 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2644
2645 // FIXME: This should probably be illegal and split earlier.
2646 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2647 if (DstSize <= 32)
2648 return selectCOPY(I);
2649
2650 const TargetRegisterClass *SrcRC =
2651 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2652 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2653 const TargetRegisterClass *DstRC =
2654 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2655
2656 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2657 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2658 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2659 .addReg(SrcReg)
2660 .addImm(AMDGPU::sub0)
2661 .addReg(UndefReg)
2662 .addImm(AMDGPU::sub1);
2663 I.eraseFromParent();
2664
2665 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2666 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2667 }
2668
2669 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2670 // 64-bit should have been split up in RegBankSelect
2671
2672 // Try to use an and with a mask if it will save code size.
2673 unsigned Mask;
2674 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2675 MachineInstr *ExtI =
2676 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2677 .addImm(Mask)
2678 .addReg(SrcReg);
2679 I.eraseFromParent();
2680 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2681 }
2682
2683 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2684 MachineInstr *ExtI =
2685 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2686 .addReg(SrcReg)
2687 .addImm(0) // Offset
2688 .addImm(SrcSize); // Width
2689 I.eraseFromParent();
2690 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2691 }
2692
2693 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2694 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2695 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2696 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2697 return false;
2698
2699 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2700 const unsigned SextOpc = SrcSize == 8 ?
2701 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2702 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2703 .addReg(SrcReg);
2704 I.eraseFromParent();
2705 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2706 }
2707
2708 // Using a single 32-bit SALU to calculate the high half is smaller than
2709 // S_BFE with a literal constant operand.
2710 if (DstSize > 32 && SrcSize == 32) {
2711 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2712 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2713 if (Signed) {
2714 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2715 .addReg(SrcReg, 0, SubReg)
2716 .addImm(31)
2717 .setOperandDead(3); // Dead scc
2718 } else {
2719 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2720 .addImm(0);
2721 }
2722 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2723 .addReg(SrcReg, 0, SubReg)
2724 .addImm(AMDGPU::sub0)
2725 .addReg(HiReg)
2726 .addImm(AMDGPU::sub1);
2727 I.eraseFromParent();
2728 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2729 *MRI);
2730 }
2731
2732 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2733 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2734
2735 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
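 // For example, a sign extension from 8 bits uses an immediate of
 // 8 << 16 == 0x80000, i.e. offset 0 and width 8.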
2736 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2737 // We need a 64-bit register source, but the high bits don't matter.
2738 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2739 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2740 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2741
2742 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2743 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2744 .addReg(SrcReg, 0, SubReg)
2745 .addImm(AMDGPU::sub0)
2746 .addReg(UndefReg)
2747 .addImm(AMDGPU::sub1);
2748
2749 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2750 .addReg(ExtReg)
2751 .addImm(SrcSize << 16);
2752
2753 I.eraseFromParent();
2754 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2755 }
2756
2757 unsigned Mask;
2758 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2759 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2760 .addReg(SrcReg)
2761 .addImm(Mask)
2762 .setOperandDead(3); // Dead scc
2763 } else {
2764 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2765 .addReg(SrcReg)
2766 .addImm(SrcSize << 16);
2767 }
2768
2769 I.eraseFromParent();
2770 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2771 }
2772
2773 return false;
2774}
2775
2779
2780static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2781 Register BitcastSrc;
2782 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2783 Reg = BitcastSrc;
2784 return Reg;
2785}
2786
2787static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2788 Register &Out) {
2789 Register Trunc;
2790 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2791 return false;
2792
2793 Register LShlSrc;
2794 Register Cst;
2795 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2796 Cst = stripCopy(Cst, MRI);
2797 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2798 Out = stripBitCast(LShlSrc, MRI);
2799 return true;
2800 }
2801 }
2802
2803 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2804 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2805 return false;
2806
2807 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2808 LLT::fixed_vector(2, 16));
2809
2810 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2811 assert(Mask.size() == 2);
2812
2813 if (Mask[0] == 1 && Mask[1] <= 1) {
2814 Out = Shuffle->getOperand(0).getReg();
2815 return true;
2816 }
2817
2818 return false;
2819}
2820
2821bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2822 if (!Subtarget->hasSALUFloatInsts())
2823 return false;
2824
2825 Register Dst = I.getOperand(0).getReg();
2826 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2827 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2828 return false;
2829
2830 Register Src = I.getOperand(1).getReg();
2831
2832 if (MRI->getType(Dst) == LLT::scalar(32) &&
2833 MRI->getType(Src) == LLT::scalar(16)) {
2834 if (isExtractHiElt(*MRI, Src, Src)) {
2835 MachineBasicBlock *BB = I.getParent();
2836 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2837 .addUse(Src);
2838 I.eraseFromParent();
2839 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2840 }
2841 }
2842
2843 return false;
2844}
2845
2846bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2847 // Only manually handle the f64 SGPR case.
2848 //
2849 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2850 // the bit ops theoretically have a second result due to the implicit def of
2851 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2852 // that is easy by disabling the check. The result works, but uses a
2853 // nonsensical sreg32orlds_and_sreg_1 regclass.
2854 //
2855 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2856 // the variadic REG_SEQUENCE operands.
2857
2858 Register Dst = MI.getOperand(0).getReg();
2859 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2860 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2861 MRI->getType(Dst) != LLT::scalar(64))
2862 return false;
2863
2864 Register Src = MI.getOperand(1).getReg();
2865 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2866 if (Fabs)
2867 Src = Fabs->getOperand(1).getReg();
2868
2869 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2870 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2871 return false;
2872
2873 MachineBasicBlock *BB = MI.getParent();
2874 const DebugLoc &DL = MI.getDebugLoc();
2875 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2876 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2877 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2878 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2879
2880 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2881 .addReg(Src, 0, AMDGPU::sub0);
2882 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2883 .addReg(Src, 0, AMDGPU::sub1);
2884 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2885 .addImm(0x80000000);
2886
2887 // Set or toggle sign bit.
2888 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2889 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2890 .addReg(HiReg)
2891 .addReg(ConstReg)
2892 .setOperandDead(3); // Dead scc
2893 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2894 .addReg(LoReg)
2895 .addImm(AMDGPU::sub0)
2896 .addReg(OpReg)
2897 .addImm(AMDGPU::sub1);
2898 MI.eraseFromParent();
2899 return true;
2900}
2901
2902// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2903bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2904 Register Dst = MI.getOperand(0).getReg();
2905 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2906 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2907 MRI->getType(Dst) != LLT::scalar(64))
2908 return false;
2909
2910 Register Src = MI.getOperand(1).getReg();
2911 MachineBasicBlock *BB = MI.getParent();
2912 const DebugLoc &DL = MI.getDebugLoc();
2913 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2914 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2915 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2916 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2917
2918 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2919 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2920 return false;
2921
2922 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2923 .addReg(Src, 0, AMDGPU::sub0);
2924 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2925 .addReg(Src, 0, AMDGPU::sub1);
2926 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2927 .addImm(0x7fffffff);
2928
2929 // Clear sign bit.
2930 // TODO: Should this use S_BITSET0_*?
2931 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2932 .addReg(HiReg)
2933 .addReg(ConstReg)
2934 .setOperandDead(3); // Dead scc
2935 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2936 .addReg(LoReg)
2937 .addImm(AMDGPU::sub0)
2938 .addReg(OpReg)
2939 .addImm(AMDGPU::sub1);
2940
2941 MI.eraseFromParent();
2942 return true;
2943}
2944
2945static bool isConstant(const MachineInstr &MI) {
2946 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2947}
2948
2949void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2950 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2951
2952 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2953 const MachineInstr *PtrMI =
2954 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2955
2956 assert(PtrMI);
2957
2958 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2959 return;
2960
2961 GEPInfo GEPInfo;
2962
2963 for (unsigned i = 1; i != 3; ++i) {
2964 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2965 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2966 assert(OpDef);
2967 if (i == 2 && isConstant(*OpDef)) {
2968 // TODO: Could handle constant base + variable offset, but a combine
2969 // probably should have commuted it.
2970 assert(GEPInfo.Imm == 0);
2971 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2972 continue;
2973 }
2974 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2975 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2976 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2977 else
2978 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2979 }
2980
2981 AddrInfo.push_back(GEPInfo);
2982 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2983}
2984
2985bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2986 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2987}
2988
2989bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2990 if (!MI.hasOneMemOperand())
2991 return false;
2992
2993 const MachineMemOperand *MMO = *MI.memoperands_begin();
2994 const Value *Ptr = MMO->getValue();
2995
2996 // UndefValue means this is a load of a kernel input. These are uniform.
2997 // Sometimes LDS instructions have constant pointers.
2998 // If Ptr is null, then that means this mem operand contains a
2999 // PseudoSourceValue like GOT.
3000 if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Ptr))
3001 return true;
3002
3003 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
3004 return true;
3005
3006 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3007 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
3008 AMDGPU::SGPRRegBankID;
3009
3010 const Instruction *I = dyn_cast<Instruction>(Ptr);
3011 return I && I->getMetadata("amdgpu.uniform");
3012}
3013
3014bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3015 for (const GEPInfo &GEPInfo : AddrInfo) {
3016 if (!GEPInfo.VgprParts.empty())
3017 return true;
3018 }
3019 return false;
3020}
3021
3022void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3023 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
3024 unsigned AS = PtrTy.getAddressSpace();
3025 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
3026 STI.ldsRequiresM0Init()) {
3027 MachineBasicBlock *BB = I.getParent();
3028
3029 // If DS instructions require M0 initialization, insert it before selecting.
3030 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3031 .addImm(-1);
3032 }
3033}
3034
3035bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3036 MachineInstr &I) const {
3037 initM0(I);
3038 return selectImpl(I, *CoverageInfo);
3039}
3040
3041static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3042 if (Reg.isPhysical())
3043 return false;
3044
3045 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
3046 const unsigned Opcode = MI.getOpcode();
3047
3048 if (Opcode == AMDGPU::COPY)
3049 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3050
3051 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3052 Opcode == AMDGPU::G_XOR)
3053 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3054 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3055
3056 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3057 return GI->is(Intrinsic::amdgcn_class);
3058
3059 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3060}
3061
3062bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3063 MachineBasicBlock *BB = I.getParent();
3064 MachineOperand &CondOp = I.getOperand(0);
3065 Register CondReg = CondOp.getReg();
3066 const DebugLoc &DL = I.getDebugLoc();
3067
3068 unsigned BrOpcode;
3069 Register CondPhysReg;
3070 const TargetRegisterClass *ConstrainRC;
3071
3072 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3073 // whether the branch is uniform when selecting the instruction. In
3074 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3075 // RegBankSelect knows what it's doing if the branch condition is scc, even
3076 // though it currently does not.
3077 if (!isVCC(CondReg, *MRI)) {
3078 if (MRI->getType(CondReg) != LLT::scalar(32))
3079 return false;
3080
3081 CondPhysReg = AMDGPU::SCC;
3082 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3083 ConstrainRC = &AMDGPU::SReg_32RegClass;
3084 } else {
3085 // FIXME: Should scc->vcc copies AND with exec?
3086
3087 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3088 // need to insert an and with exec.
3089 if (!isVCmpResult(CondReg, *MRI)) {
3090 const bool Is64 = STI.isWave64();
3091 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3092 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3093
3094 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3095 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3096 .addReg(CondReg)
3097 .addReg(Exec)
3098 .setOperandDead(3); // Dead scc
3099 CondReg = TmpReg;
3100 }
3101
3102 CondPhysReg = TRI.getVCC();
3103 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3104 ConstrainRC = TRI.getBoolRC();
3105 }
3106
3107 if (!MRI->getRegClassOrNull(CondReg))
3108 MRI->setRegClass(CondReg, ConstrainRC);
3109
3110 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3111 .addReg(CondReg);
3112 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3113 .addMBB(I.getOperand(1).getMBB());
3114
3115 I.eraseFromParent();
3116 return true;
3117}
3118
3119bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3120 MachineInstr &I) const {
3121 Register DstReg = I.getOperand(0).getReg();
3122 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3123 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3124 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3125 if (IsVGPR)
3126 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3127
3128 return RBI.constrainGenericRegister(
3129 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3130}
3131
3132bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3133 Register DstReg = I.getOperand(0).getReg();
3134 Register SrcReg = I.getOperand(1).getReg();
3135 Register MaskReg = I.getOperand(2).getReg();
3136 LLT Ty = MRI->getType(DstReg);
3137 LLT MaskTy = MRI->getType(MaskReg);
3138 MachineBasicBlock *BB = I.getParent();
3139 const DebugLoc &DL = I.getDebugLoc();
3140
3141 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3142 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3143 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3144 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3145 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3146 return false;
3147
3148 // Try to avoid emitting a bit operation when we only need to touch half of
3149 // the 64-bit pointer.
3150 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3151 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3152 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3153
3154 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3155 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3156
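 // For example, masking a 64-bit pointer with ~0xfff only clears bits in the
 // low half, so the high half can be forwarded with a plain copy and only the
 // low half needs an AND.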
3157 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3158 !CanCopyLow32 && !CanCopyHi32) {
3159 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3160 .addReg(SrcReg)
3161 .addReg(MaskReg)
3162 .setOperandDead(3); // Dead scc
3163 I.eraseFromParent();
3164 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3165 }
3166
3167 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3168 const TargetRegisterClass &RegRC
3169 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3170
3171 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3172 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3173 const TargetRegisterClass *MaskRC =
3174 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3175
3176 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3177 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3178 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3179 return false;
3180
3181 if (Ty.getSizeInBits() == 32) {
3182 assert(MaskTy.getSizeInBits() == 32 &&
3183 "ptrmask should have been narrowed during legalize");
3184
3185 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3186 .addReg(SrcReg)
3187 .addReg(MaskReg);
3188
3189 if (!IsVGPR)
3190 NewOp.setOperandDead(3); // Dead scc
3191 I.eraseFromParent();
3192 return true;
3193 }
3194
3195 Register HiReg = MRI->createVirtualRegister(&RegRC);
3196 Register LoReg = MRI->createVirtualRegister(&RegRC);
3197
3198 // Extract the subregisters from the source pointer.
3199 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3200 .addReg(SrcReg, 0, AMDGPU::sub0);
3201 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3202 .addReg(SrcReg, 0, AMDGPU::sub1);
3203
3204 Register MaskedLo, MaskedHi;
3205
3206 if (CanCopyLow32) {
3207 // If all the bits in the low half are 1, we only need a copy for it.
3208 MaskedLo = LoReg;
3209 } else {
3210 // Extract the mask subregister and apply the and.
3211 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3212 MaskedLo = MRI->createVirtualRegister(&RegRC);
3213
3214 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3215 .addReg(MaskReg, 0, AMDGPU::sub0);
3216 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3217 .addReg(LoReg)
3218 .addReg(MaskLo);
3219 }
3220
3221 if (CanCopyHi32) {
3222 // If all the bits in the high half are 1, we only need a copy for it.
3223 MaskedHi = HiReg;
3224 } else {
3225 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3226 MaskedHi = MRI->createVirtualRegister(&RegRC);
3227
3228 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3229 .addReg(MaskReg, 0, AMDGPU::sub1);
3230 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3231 .addReg(HiReg)
3232 .addReg(MaskHi);
3233 }
3234
3235 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3236 .addReg(MaskedLo)
3237 .addImm(AMDGPU::sub0)
3238 .addReg(MaskedHi)
3239 .addImm(AMDGPU::sub1);
3240 I.eraseFromParent();
3241 return true;
3242}
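// A minimal self-contained sketch (not code from this selector) of the
// half-copy idea used by selectG_PTRMASK above: when one 32-bit half of the
// mask is all ones, that half of the result is a plain copy of the source
// half, so only the remaining half needs an AND. All names are local to the
// sketch.
#include <cstdint>

static constexpr uint64_t ptrmaskByHalves(uint64_t Ptr, uint64_t Mask) {
  const uint32_t LoMask = static_cast<uint32_t>(Mask);
  const uint32_t HiMask = static_cast<uint32_t>(Mask >> 32);
  const uint32_t Lo = LoMask == 0xffffffffu
                          ? static_cast<uint32_t>(Ptr) // copy low half
                          : (static_cast<uint32_t>(Ptr) & LoMask);
  const uint32_t Hi = HiMask == 0xffffffffu
                          ? static_cast<uint32_t>(Ptr >> 32) // copy high half
                          : (static_cast<uint32_t>(Ptr >> 32) & HiMask);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

// The per-half form agrees with the plain 64-bit AND it replaces.
static_assert(ptrmaskByHalves(0x123456789abcdef0ull, 0xffffffff00000000ull) ==
                  (0x123456789abcdef0ull & 0xffffffff00000000ull),
              "high half copied, low half masked to zero");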
3243
3244/// Return the register to use for the index value, and the subregister to use
3245/// for the indirectly accessed register.
3246static std::pair<Register, unsigned>
3247computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3248 const TargetRegisterClass *SuperRC, Register IdxReg,
3249 unsigned EltSize, GISelValueTracking &ValueTracking) {
3250 Register IdxBaseReg;
3251 int Offset;
3252
3253 std::tie(IdxBaseReg, Offset) =
3254 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3255 if (IdxBaseReg == AMDGPU::NoRegister) {
3256 // This will happen if the index is a known constant. This should ordinarily
3257 // be legalized out, but handle it as a register just in case.
3258 assert(Offset == 0);
3259 IdxBaseReg = IdxReg;
3260 }
3261
3262 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3263
3264 // Skip out of bounds offsets, or else we would end up using an undefined
3265 // register.
3266 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3267 return std::pair(IdxReg, SubRegs[0]);
3268 return std::pair(IdxBaseReg, SubRegs[Offset]);
3269}
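// A minimal self-contained sketch (not code from this selector) of what
// computeIndirectRegIndex above produces: a base index register plus a
// compile-time subregister slot for any constant part of the index, with
// out-of-bounds constants falling back to the original index and the first
// slot so no undefined subregister is ever named. Names are local to the
// sketch.
#include <cstddef>
#include <utility>
#include <vector>

static std::pair<int, unsigned>
pickSubRegisterSlot(int IdxReg, int IdxBaseReg, int ConstOffset,
                    const std::vector<unsigned> &SubRegSlots) {
  if (ConstOffset < 0 ||
      static_cast<std::size_t>(ConstOffset) >= SubRegSlots.size())
    return {IdxReg, SubRegSlots[0]};
  // e.g. an index of (base + 2) into a 4-element vector selects slot 2
  // relative to base.
  return {IdxBaseReg, SubRegSlots[ConstOffset]};
}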
3270
3271bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3272 MachineInstr &MI) const {
3273 Register DstReg = MI.getOperand(0).getReg();
3274 Register SrcReg = MI.getOperand(1).getReg();
3275 Register IdxReg = MI.getOperand(2).getReg();
3276
3277 LLT DstTy = MRI->getType(DstReg);
3278 LLT SrcTy = MRI->getType(SrcReg);
3279
3280 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3281 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3282 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3283
3284  // The index must be scalar. If it wasn't, RegBankSelect should have moved
3285  // this into a waterfall loop.
3286 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3287 return false;
3288
3289 const TargetRegisterClass *SrcRC =
3290 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3291 const TargetRegisterClass *DstRC =
3292 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3293 if (!SrcRC || !DstRC)
3294 return false;
3295 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3296 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3297 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3298 return false;
3299
3300 MachineBasicBlock *BB = MI.getParent();
3301 const DebugLoc &DL = MI.getDebugLoc();
3302 const bool Is64 = DstTy.getSizeInBits() == 64;
3303
3304 unsigned SubReg;
3305 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3306 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3307
3308 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3309 if (DstTy.getSizeInBits() != 32 && !Is64)
3310 return false;
3311
3312 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3313 .addReg(IdxReg);
3314
3315 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3316 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3317 .addReg(SrcReg, 0, SubReg)
3318 .addReg(SrcReg, RegState::Implicit);
3319 MI.eraseFromParent();
3320 return true;
3321 }
3322
3323 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3324 return false;
3325
3326 if (!STI.useVGPRIndexMode()) {
3327 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3328 .addReg(IdxReg);
3329 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3330 .addReg(SrcReg, 0, SubReg)
3331 .addReg(SrcReg, RegState::Implicit);
3332 MI.eraseFromParent();
3333 return true;
3334 }
3335
3336 const MCInstrDesc &GPRIDXDesc =
3337 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3338 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3339 .addReg(SrcReg)
3340 .addReg(IdxReg)
3341 .addImm(SubReg);
3342
3343 MI.eraseFromParent();
3344 return true;
3345}
3346
3347// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3348bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3349 MachineInstr &MI) const {
3350 Register DstReg = MI.getOperand(0).getReg();
3351 Register VecReg = MI.getOperand(1).getReg();
3352 Register ValReg = MI.getOperand(2).getReg();
3353 Register IdxReg = MI.getOperand(3).getReg();
3354
3355 LLT VecTy = MRI->getType(DstReg);
3356 LLT ValTy = MRI->getType(ValReg);
3357 unsigned VecSize = VecTy.getSizeInBits();
3358 unsigned ValSize = ValTy.getSizeInBits();
3359
3360 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3361 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3362 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3363
3364 assert(VecTy.getElementType() == ValTy);
3365
3366  // The index must be scalar. If it wasn't, RegBankSelect should have moved
3367  // this into a waterfall loop.
3368 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3369 return false;
3370
3371 const TargetRegisterClass *VecRC =
3372 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3373 const TargetRegisterClass *ValRC =
3374 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3375
3376 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3377 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3378 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3379 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3380 return false;
3381
3382 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3383 return false;
3384
3385 unsigned SubReg;
3386 std::tie(IdxReg, SubReg) =
3387 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3388
3389 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3390 STI.useVGPRIndexMode();
3391
3392 MachineBasicBlock *BB = MI.getParent();
3393 const DebugLoc &DL = MI.getDebugLoc();
3394
3395 if (!IndexMode) {
3396 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3397 .addReg(IdxReg);
3398
3399 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3400 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3401 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3402 .addReg(VecReg)
3403 .addReg(ValReg)
3404 .addImm(SubReg);
3405 MI.eraseFromParent();
3406 return true;
3407 }
3408
3409 const MCInstrDesc &GPRIDXDesc =
3410 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3411 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3412 .addReg(VecReg)
3413 .addReg(ValReg)
3414 .addReg(IdxReg)
3415 .addImm(SubReg);
3416
3417 MI.eraseFromParent();
3418 return true;
3419}
3420
3421bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3422 if (!Subtarget->hasVMemToLDSLoad())
3423 return false;
3424 unsigned Opc;
3425 unsigned Size = MI.getOperand(3).getImm();
3426
3427 // The struct intrinsic variants add one additional operand over raw.
3428 const bool HasVIndex = MI.getNumOperands() == 9;
3429 Register VIndex;
3430 int OpOffset = 0;
3431 if (HasVIndex) {
3432 VIndex = MI.getOperand(4).getReg();
3433 OpOffset = 1;
3434 }
3435
3436 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3437  std::optional<ValueAndVReg> MaybeVOffset =
3438      getIConstantVRegValWithLookThrough(VOffset, *MRI);
3439 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3440
3441 switch (Size) {
3442 default:
3443 return false;
3444 case 1:
3445 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3446 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3447 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3448 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3449 break;
3450 case 2:
3451 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3452 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3453 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3454 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3455 break;
3456 case 4:
3457 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3458 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3459 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3460 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3461 break;
3462 case 12:
3463 if (!Subtarget->hasLDSLoadB96_B128())
3464 return false;
3465
3466 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3467 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3468 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3469 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3470 break;
3471 case 16:
3472 if (!Subtarget->hasLDSLoadB96_B128())
3473 return false;
3474
3475 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3476 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3477 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3478 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3479 break;
3480 }
3481
3482 MachineBasicBlock *MBB = MI.getParent();
3483 const DebugLoc &DL = MI.getDebugLoc();
3484 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3485 .add(MI.getOperand(2));
3486
3487 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3488
3489 if (HasVIndex && HasVOffset) {
3490 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3491 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3492 .addReg(VIndex)
3493 .addImm(AMDGPU::sub0)
3494 .addReg(VOffset)
3495 .addImm(AMDGPU::sub1);
3496
3497 MIB.addReg(IdxReg);
3498 } else if (HasVIndex) {
3499 MIB.addReg(VIndex);
3500 } else if (HasVOffset) {
3501 MIB.addReg(VOffset);
3502 }
3503
3504 MIB.add(MI.getOperand(1)); // rsrc
3505 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3506 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3507 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3508 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3509 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3510 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3511 MIB.addImm(
3512 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3513 ? 1
3514 : 0); // swz
3515
3516 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3517 // Don't set the offset value here because the pointer points to the base of
3518 // the buffer.
3519 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3520
3521 MachinePointerInfo StorePtrI = LoadPtrI;
3522 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3526
3527 auto F = LoadMMO->getFlags() &
3529 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3530 Size, LoadMMO->getBaseAlign());
3531
3532 MachineMemOperand *StoreMMO =
3533 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3534 sizeof(int32_t), LoadMMO->getBaseAlign());
3535
3536 MIB.setMemRefs({LoadMMO, StoreMMO});
3537
3538 MI.eraseFromParent();
3539 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3540}
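// A minimal self-contained sketch (not code from this selector) of the two
// decisions made above: the addressing mode depends only on whether a VGPR
// index and/or a non-zero VGPR offset is present, and the cache-policy and
// swizzle fields are both carved out of the single aux immediate with masks.
// The enum and mask parameters below are local to the sketch.
#include <cstdint>

enum class BufAddrMode { BothEn, IdxEn, OffEn, Offset };

static constexpr BufAddrMode pickBufAddrMode(bool HasVIndex, bool HasVOffset) {
  return HasVIndex ? (HasVOffset ? BufAddrMode::BothEn : BufAddrMode::IdxEn)
                   : (HasVOffset ? BufAddrMode::OffEn : BufAddrMode::Offset);
}
static_assert(pickBufAddrMode(true, false) == BufAddrMode::IdxEn, "");

static constexpr uint32_t auxToCPol(uint32_t Aux, uint32_t CPolMask) {
  return Aux & CPolMask; // everything outside the mask is dropped
}
static constexpr bool auxToSwz(uint32_t Aux, uint32_t SwzBit) {
  return (Aux & SwzBit) != 0;
}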
3541
3542/// Match a zero extend from a 32-bit value to 64-bits.
3543Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3544 Register ZExtSrc;
3545 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3546 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3547
3548 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3549 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3550 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3551 return Register();
3552
3553 assert(Def->getNumOperands() == 3 &&
3554 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3555 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3556 return Def->getOperand(1).getReg();
3557 }
3558
3559 return Register();
3560}
3561
3562/// Match a sign extend from a 32-bit value to 64-bits.
3563Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3564 Register SExtSrc;
3565 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3566 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3567
3568  // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (s32 G_ASHR %x, 31)
3569 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3570 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3571 return Register();
3572
3573 assert(Def->getNumOperands() == 3 &&
3574 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3575 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3576 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3577 m_SpecificICst(31))))
3578 return Def->getOperand(1).getReg();
3579
3580 if (VT->signBitIsZero(Reg))
3581 return matchZeroExtendFromS32(Reg);
3582
3583 return Register();
3584}
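// A minimal self-contained sketch (not code from this selector) of why the
// G_MERGE_VALUES forms matched above are extensions: merging makes the first
// operand the low 32 bits and the second operand the high 32 bits, so a zero
// high word is a zero extend and a sign-fill high word is a sign extend.
#include <cstdint>

static constexpr uint64_t merge32(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}
static_assert(merge32(0xdeadbeefu, 0u) == 0x00000000deadbeefull,
              "merge with a zero high word is a zero extend");
static_assert(merge32(0x80000000u, 0xffffffffu) == 0xffffffff80000000ull,
              "merge with the sign-fill word (arithmetic x >> 31) is a sign "
              "extend");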
3585
3586/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3587/// is 32-bit.
3588Register
3589AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3590 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3591 : matchZeroExtendFromS32(Reg);
3592}
3593
3594/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3595/// is 32-bit.
3596Register
3597AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3598 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3599 : matchSignExtendFromS32(Reg);
3600}
3601
3602Register
3603AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3604 bool IsSigned) const {
3605 if (IsSigned)
3606 return matchSignExtendFromS32OrS32(Reg);
3607
3608 return matchZeroExtendFromS32OrS32(Reg);
3609}
3610
3611Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3612 Register AnyExtSrc;
3613 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3614 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3615
3616  // Match legalized form %anyext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3617 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3618 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3619 return Register();
3620
3621 assert(Def->getNumOperands() == 3 &&
3622 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3623
3624 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3625 return Def->getOperand(1).getReg();
3626
3627 return Register();
3628}
3629
3630bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3631 if (!Subtarget->hasVMemToLDSLoad())
3632 return false;
3633
3634 unsigned Opc;
3635 unsigned Size = MI.getOperand(3).getImm();
3636
3637 switch (Size) {
3638 default:
3639 return false;
3640 case 1:
3641 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3642 break;
3643 case 2:
3644 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3645 break;
3646 case 4:
3647 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3648 break;
3649 case 12:
3650 if (!Subtarget->hasLDSLoadB96_B128())
3651 return false;
3652 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3653 break;
3654 case 16:
3655 if (!Subtarget->hasLDSLoadB96_B128())
3656 return false;
3657 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3658 break;
3659 }
3660
3661 MachineBasicBlock *MBB = MI.getParent();
3662 const DebugLoc &DL = MI.getDebugLoc();
3663 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3664 .add(MI.getOperand(2));
3665
3666 Register Addr = MI.getOperand(1).getReg();
3667 Register VOffset;
3668 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3669 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3670 if (!isSGPR(Addr)) {
3671 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3672 if (isSGPR(AddrDef->Reg)) {
3673 Addr = AddrDef->Reg;
3674 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3675 Register SAddr =
3676 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3677 if (isSGPR(SAddr)) {
3678 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3679 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3680 Addr = SAddr;
3681 VOffset = Off;
3682 }
3683 }
3684 }
3685 }
3686
3687  if (isSGPR(Addr)) {
3688    Opc = AMDGPU::getGlobalSaddrOp(Opc);
3689 if (!VOffset) {
3690 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3691 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3692 .addImm(0);
3693 }
3694 }
3695
3696 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3697 .addReg(Addr);
3698
3699 if (isSGPR(Addr))
3700 MIB.addReg(VOffset);
3701
3702 MIB.add(MI.getOperand(4)); // offset
3703
3704 unsigned Aux = MI.getOperand(5).getImm();
3705 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3706
3707 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3708 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3709 LoadPtrI.Offset = MI.getOperand(4).getImm();
3710 MachinePointerInfo StorePtrI = LoadPtrI;
3711 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3715 auto F = LoadMMO->getFlags() &
3717 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3718 Size, LoadMMO->getBaseAlign());
3719 MachineMemOperand *StoreMMO =
3720 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3721 sizeof(int32_t), Align(4));
3722
3723 MIB.setMemRefs({LoadMMO, StoreMMO});
3724
3725 MI.eraseFromParent();
3726 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3727}
3728
3729bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3730 MachineInstr &MI) const {
3731 unsigned OpcodeOpIdx =
3732 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3733 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3734 MI.removeOperand(OpcodeOpIdx);
3735 MI.addImplicitDefUseOperands(*MI.getMF());
3736 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3737}
3738
3739// FIXME: This should be removed and let the patterns select. We just need the
3740// AGPR/VGPR combination versions.
3741bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3742 unsigned Opc;
3743 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3744 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3745 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3746 break;
3747 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3748 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3749 break;
3750 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3751 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3752 break;
3753 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3754 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3755 break;
3756 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3757 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3758 break;
3759 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3760 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3761 break;
3762 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3763 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3764 break;
3765 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3766 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3767 break;
3768 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3769 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3770 break;
3771 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3772 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3773 break;
3774 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3775 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3776 break;
3777 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3778 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3779 break;
3780 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3781 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3782 break;
3783 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3784 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3785 break;
3786 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3787 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3788 break;
3789 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3790 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3791 break;
3792 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3793 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3794 break;
3795 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3796 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3797 break;
3798 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3799 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3800 break;
3801 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3802 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3803 break;
3804 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3805 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3806 break;
3807 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3808 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3809 break;
3810 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3811 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3812 break;
3813 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3814 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3815 break;
3816 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3817 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3818 break;
3819 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3820 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3821 break;
3822 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3823 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3824 break;
3825 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3826 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3827 break;
3828 default:
3829 llvm_unreachable("unhandled smfmac intrinsic");
3830 }
3831
3832 auto VDst_In = MI.getOperand(4);
3833
3834 MI.setDesc(TII.get(Opc));
3835 MI.removeOperand(4); // VDst_In
3836 MI.removeOperand(1); // Intrinsic ID
3837 MI.addOperand(VDst_In); // Readd VDst_In to the end
3838 MI.addImplicitDefUseOperands(*MI.getMF());
3839 const MCInstrDesc &MCID = MI.getDesc();
3840 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
3841 MI.getOperand(0).setIsEarlyClobber(true);
3842 }
3843 return true;
3844}
3845
3846bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3847 MachineInstr &MI, Intrinsic::ID IntrID) const {
3848 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3849 !Subtarget->hasPermlane16Swap())
3850 return false;
3851 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3852 !Subtarget->hasPermlane32Swap())
3853 return false;
3854
3855 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3856 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3857 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3858
3859 MI.removeOperand(2);
3860 MI.setDesc(TII.get(Opcode));
3861 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3862
3863 MachineOperand &FI = MI.getOperand(4);
3864  FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
3865
3866 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3867}
3868
3869bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3870 Register DstReg = MI.getOperand(0).getReg();
3871 Register SrcReg = MI.getOperand(1).getReg();
3872 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3873 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3874 MachineBasicBlock *MBB = MI.getParent();
3875 const DebugLoc &DL = MI.getDebugLoc();
3876
3877 if (IsVALU) {
3878 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3879 .addImm(Subtarget->getWavefrontSizeLog2())
3880 .addReg(SrcReg);
3881 } else {
3882 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3883 .addReg(SrcReg)
3884 .addImm(Subtarget->getWavefrontSizeLog2())
3885 .setOperandDead(3); // Dead scc
3886 }
3887
3888 const TargetRegisterClass &RC =
3889 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3890 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3891 return false;
3892
3893 MI.eraseFromParent();
3894 return true;
3895}
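// A minimal self-contained sketch (not code from this selector): the
// conversion implemented above is just a right shift of the byte value by
// log2(wavefront size); the sketch only illustrates that arithmetic for a
// wave64 configuration.
#include <cstdint>

static constexpr uint32_t shiftToWaveAddress(uint32_t ByteValue,
                                             unsigned WavefrontSizeLog2) {
  return ByteValue >> WavefrontSizeLog2;
}
static_assert(shiftToWaveAddress(256u, 6u) == 4u, "wave64: 256 >> 6 == 4");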
3896
3897// Match a BITOP3 operation and return the number of matched instructions plus
3898// the truth table.
3899static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3900                                              SmallVectorImpl<Register> &Src,
3901 const MachineRegisterInfo &MRI) {
3902 unsigned NumOpcodes = 0;
3903 uint8_t LHSBits, RHSBits;
3904
3905 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3906 // Define truth table given Src0, Src1, Src2 bits permutations:
3907 // 0 0 0
3908 // 0 0 1
3909 // 0 1 0
3910 // 0 1 1
3911 // 1 0 0
3912 // 1 0 1
3913 // 1 1 0
3914 // 1 1 1
3915 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3916
3917 if (mi_match(Op, MRI, m_AllOnesInt())) {
3918 Bits = 0xff;
3919 return true;
3920 }
3921 if (mi_match(Op, MRI, m_ZeroInt())) {
3922 Bits = 0;
3923 return true;
3924 }
3925
3926 for (unsigned I = 0; I < Src.size(); ++I) {
3927 // Try to find existing reused operand
3928 if (Src[I] == Op) {
3929 Bits = SrcBits[I];
3930 return true;
3931 }
3932 // Try to replace parent operator
3933 if (Src[I] == R) {
3934 Bits = SrcBits[I];
3935 Src[I] = Op;
3936 return true;
3937 }
3938 }
3939
3940 if (Src.size() == 3) {
3941      // No room left for operands. Try one last time; there can be a 'not' of
3942      // one of our source operands. In this case we can compute the bits
3943      // without growing the Src vector.
3944 Register LHS;
3945      if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3946        LHS = getSrcRegIgnoringCopies(LHS, MRI);
3947 for (unsigned I = 0; I < Src.size(); ++I) {
3948 if (Src[I] == LHS) {
3949 Bits = ~SrcBits[I];
3950 return true;
3951 }
3952 }
3953 }
3954
3955 return false;
3956 }
3957
3958 Bits = SrcBits[Src.size()];
3959 Src.push_back(Op);
3960 return true;
3961 };
3962
3963 MachineInstr *MI = MRI.getVRegDef(R);
3964 switch (MI->getOpcode()) {
3965 case TargetOpcode::G_AND:
3966 case TargetOpcode::G_OR:
3967 case TargetOpcode::G_XOR: {
3968 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3969 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3970
3971 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3972 if (!getOperandBits(LHS, LHSBits) ||
3973 !getOperandBits(RHS, RHSBits)) {
3974 Src = Backup;
3975 return std::make_pair(0, 0);
3976 }
3977
3978 // Recursion is naturally limited by the size of the operand vector.
3979 auto Op = BitOp3_Op(LHS, Src, MRI);
3980 if (Op.first) {
3981 NumOpcodes += Op.first;
3982 LHSBits = Op.second;
3983 }
3984
3985 Op = BitOp3_Op(RHS, Src, MRI);
3986 if (Op.first) {
3987 NumOpcodes += Op.first;
3988 RHSBits = Op.second;
3989 }
3990 break;
3991 }
3992 default:
3993 return std::make_pair(0, 0);
3994 }
3995
3996 uint8_t TTbl;
3997 switch (MI->getOpcode()) {
3998 case TargetOpcode::G_AND:
3999 TTbl = LHSBits & RHSBits;
4000 break;
4001 case TargetOpcode::G_OR:
4002 TTbl = LHSBits | RHSBits;
4003 break;
4004 case TargetOpcode::G_XOR:
4005 TTbl = LHSBits ^ RHSBits;
4006 break;
4007 default:
4008 break;
4009 }
4010
4011 return std::make_pair(NumOpcodes + 1, TTbl);
4012}
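// A minimal self-contained sketch (not code from this selector) of the truth
// table encoding used by BitOp3_Op above. Bit ((a << 2) | (b << 1) | c) of the
// 8-bit table holds f(a, b, c); 0xf0, 0xcc and 0xaa are the tables of the
// three inputs themselves, so the table of any and/or/xor tree is obtained by
// applying the same bitwise operators to its operands' tables.
#include <cstdint>

static constexpr uint8_t SketchTblA = 0xf0, SketchTblB = 0xcc, SketchTblC = 0xaa;

static constexpr bool evalTruthTable(uint8_t Tbl, bool A, bool B, bool C) {
  return (Tbl >> ((A ? 4 : 0) | (B ? 2 : 0) | (C ? 1 : 0))) & 1;
}

// Table of (a & b) ^ c, folded the same way the recursion above folds it.
static constexpr uint8_t SketchTblAndXor =
    static_cast<uint8_t>((SketchTblA & SketchTblB) ^ SketchTblC);
static_assert(evalTruthTable(SketchTblAndXor, true, true, false) == true,
              "(1 & 1) ^ 0 == 1");
static_assert(evalTruthTable(SketchTblAndXor, true, false, true) == true,
              "(1 & 0) ^ 1 == 1");
static_assert(evalTruthTable(SketchTblAndXor, true, true, true) == false,
              "(1 & 1) ^ 1 == 0");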
4013
4014bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4015 if (!Subtarget->hasBitOp3Insts())
4016 return false;
4017
4018 Register DstReg = MI.getOperand(0).getReg();
4019 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4020 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4021 if (!IsVALU)
4022 return false;
4023
4024  SmallVector<Register, 3> Src;
4025 uint8_t TTbl;
4026 unsigned NumOpcodes;
4027
4028 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
4029
4030  // The Src.empty() case can happen if all operands are all zeros or all ones.
4031  // Normally this is optimized out before reaching this point.
4032 if (NumOpcodes < 2 || Src.empty())
4033 return false;
4034
4035 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
4036 if (NumOpcodes == 2 && IsB32) {
4037    // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster, but it
4038    // makes the asm more readable. This cannot be modeled with AddedComplexity
4039    // because the selector does not know how many operations we matched.
4040 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
4041 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4042 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4043 return false;
4044 } else if (NumOpcodes < 4) {
4045    // For a uniform case the threshold should be higher to account for moves
4046    // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two
4047    // can be in SGPRs, with a readfirstlane afterwards.
4048 return false;
4049 }
4050
4051 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4052 if (!IsB32 && STI.hasTrue16BitInsts())
4053 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4054 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4055 unsigned CBL = STI.getConstantBusLimit(Opc);
4056 MachineBasicBlock *MBB = MI.getParent();
4057 const DebugLoc &DL = MI.getDebugLoc();
4058
4059 for (unsigned I = 0; I < Src.size(); ++I) {
4060 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4061 if (RB->getID() != AMDGPU::SGPRRegBankID)
4062 continue;
4063 if (CBL > 0) {
4064 --CBL;
4065 continue;
4066 }
4067 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4068 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4069 .addReg(Src[I]);
4070 Src[I] = NewReg;
4071 }
4072
4073  // The last operand can be ignored, turning a ternary operation into a binary
4074  // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4075  // 'c' with 'a' here without changing the answer. In some pathological
4076  // cases it should even be possible to get an operation with a single operand
4077  // if the optimizer does not catch it first.
4078 while (Src.size() < 3)
4079 Src.push_back(Src[0]);
4080
4081 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4082 if (!IsB32)
4083 MIB.addImm(0); // src_mod0
4084 MIB.addReg(Src[0]);
4085 if (!IsB32)
4086 MIB.addImm(0); // src_mod1
4087 MIB.addReg(Src[1]);
4088 if (!IsB32)
4089 MIB.addImm(0); // src_mod2
4090 MIB.addReg(Src[2])
4091 .addImm(TTbl);
4092 if (!IsB32)
4093 MIB.addImm(0); // op_sel
4094
4095 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4096 MI.eraseFromParent();
4097
4098 return true;
4099}
4100
4101bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4102 Register SrcReg = MI.getOperand(0).getReg();
4103 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4104 return false;
4105
4106 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4107 Register SP =
4108 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4109 Register WaveAddr = getWaveAddress(DefMI);
4110 MachineBasicBlock *MBB = MI.getParent();
4111 const DebugLoc &DL = MI.getDebugLoc();
4112
4113 if (!WaveAddr) {
4114 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4115 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4116 .addReg(SrcReg)
4117 .addImm(Subtarget->getWavefrontSizeLog2())
4118 .setOperandDead(3); // Dead scc
4119 }
4120
4121 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4122 .addReg(WaveAddr);
4123
4124 MI.eraseFromParent();
4125 return true;
4126}
4127
4128bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4129
4130 if (!I.isPreISelOpcode()) {
4131 if (I.isCopy())
4132 return selectCOPY(I);
4133 return true;
4134 }
4135
4136 switch (I.getOpcode()) {
4137 case TargetOpcode::G_AND:
4138 case TargetOpcode::G_OR:
4139 case TargetOpcode::G_XOR:
4140 if (selectBITOP3(I))
4141 return true;
4142 if (selectImpl(I, *CoverageInfo))
4143 return true;
4144 return selectG_AND_OR_XOR(I);
4145 case TargetOpcode::G_ADD:
4146 case TargetOpcode::G_SUB:
4147 case TargetOpcode::G_PTR_ADD:
4148 if (selectImpl(I, *CoverageInfo))
4149 return true;
4150 return selectG_ADD_SUB(I);
4151 case TargetOpcode::G_UADDO:
4152 case TargetOpcode::G_USUBO:
4153 case TargetOpcode::G_UADDE:
4154 case TargetOpcode::G_USUBE:
4155 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4156 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4157 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4158 return selectG_AMDGPU_MAD_64_32(I);
4159 case TargetOpcode::G_INTTOPTR:
4160 case TargetOpcode::G_BITCAST:
4161 case TargetOpcode::G_PTRTOINT:
4162 case TargetOpcode::G_FREEZE:
4163 return selectCOPY(I);
4164 case TargetOpcode::G_FNEG:
4165 if (selectImpl(I, *CoverageInfo))
4166 return true;
4167 return selectG_FNEG(I);
4168 case TargetOpcode::G_FABS:
4169 if (selectImpl(I, *CoverageInfo))
4170 return true;
4171 return selectG_FABS(I);
4172 case TargetOpcode::G_EXTRACT:
4173 return selectG_EXTRACT(I);
4174 case TargetOpcode::G_MERGE_VALUES:
4175 case TargetOpcode::G_CONCAT_VECTORS:
4176 return selectG_MERGE_VALUES(I);
4177 case TargetOpcode::G_UNMERGE_VALUES:
4178 return selectG_UNMERGE_VALUES(I);
4179 case TargetOpcode::G_BUILD_VECTOR:
4180 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4181 return selectG_BUILD_VECTOR(I);
4182 case TargetOpcode::G_IMPLICIT_DEF:
4183 return selectG_IMPLICIT_DEF(I);
4184 case TargetOpcode::G_INSERT:
4185 return selectG_INSERT(I);
4186 case TargetOpcode::G_INTRINSIC:
4187 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4188 return selectG_INTRINSIC(I);
4189 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4190 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4191 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4192 case TargetOpcode::G_ICMP:
4193 case TargetOpcode::G_FCMP:
4194 if (selectG_ICMP_or_FCMP(I))
4195 return true;
4196 return selectImpl(I, *CoverageInfo);
4197 case TargetOpcode::G_LOAD:
4198 case TargetOpcode::G_ZEXTLOAD:
4199 case TargetOpcode::G_SEXTLOAD:
4200 case TargetOpcode::G_STORE:
4201 case TargetOpcode::G_ATOMIC_CMPXCHG:
4202 case TargetOpcode::G_ATOMICRMW_XCHG:
4203 case TargetOpcode::G_ATOMICRMW_ADD:
4204 case TargetOpcode::G_ATOMICRMW_SUB:
4205 case TargetOpcode::G_ATOMICRMW_AND:
4206 case TargetOpcode::G_ATOMICRMW_OR:
4207 case TargetOpcode::G_ATOMICRMW_XOR:
4208 case TargetOpcode::G_ATOMICRMW_MIN:
4209 case TargetOpcode::G_ATOMICRMW_MAX:
4210 case TargetOpcode::G_ATOMICRMW_UMIN:
4211 case TargetOpcode::G_ATOMICRMW_UMAX:
4212 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4213 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4214 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4215 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4216 case TargetOpcode::G_ATOMICRMW_FADD:
4217 case TargetOpcode::G_ATOMICRMW_FMIN:
4218 case TargetOpcode::G_ATOMICRMW_FMAX:
4219 return selectG_LOAD_STORE_ATOMICRMW(I);
4220 case TargetOpcode::G_SELECT:
4221 return selectG_SELECT(I);
4222 case TargetOpcode::G_TRUNC:
4223 return selectG_TRUNC(I);
4224 case TargetOpcode::G_SEXT:
4225 case TargetOpcode::G_ZEXT:
4226 case TargetOpcode::G_ANYEXT:
4227 case TargetOpcode::G_SEXT_INREG:
4228    // This is a workaround. For an extension from type i1, `selectImpl()` uses
4229    // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
4230    // type i1 can only be held in an SGPR class.
4231 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4232 selectImpl(I, *CoverageInfo))
4233 return true;
4234 return selectG_SZA_EXT(I);
4235 case TargetOpcode::G_FPEXT:
4236 if (selectG_FPEXT(I))
4237 return true;
4238 return selectImpl(I, *CoverageInfo);
4239 case TargetOpcode::G_BRCOND:
4240 return selectG_BRCOND(I);
4241 case TargetOpcode::G_GLOBAL_VALUE:
4242 return selectG_GLOBAL_VALUE(I);
4243 case TargetOpcode::G_PTRMASK:
4244 return selectG_PTRMASK(I);
4245 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4246 return selectG_EXTRACT_VECTOR_ELT(I);
4247 case TargetOpcode::G_INSERT_VECTOR_ELT:
4248 return selectG_INSERT_VECTOR_ELT(I);
4249 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4250 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4251 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4252 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4253 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4254    const AMDGPU::ImageDimIntrinsicInfo *Intr =
4255        AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4256 assert(Intr && "not an image intrinsic with image pseudo");
4257 return selectImageIntrinsic(I, Intr);
4258 }
4259 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4260 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4261 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4262 return selectBVHIntersectRayIntrinsic(I);
4263 case AMDGPU::G_SBFX:
4264 case AMDGPU::G_UBFX:
4265 return selectG_SBFX_UBFX(I);
4266 case AMDGPU::G_SI_CALL:
4267 I.setDesc(TII.get(AMDGPU::SI_CALL));
4268 return true;
4269 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4270 return selectWaveAddress(I);
4271 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4272 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4273 return true;
4274 }
4275 case AMDGPU::G_STACKRESTORE:
4276 return selectStackRestore(I);
4277 case AMDGPU::G_PHI:
4278 return selectPHI(I);
4279 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4280 return selectCOPY_SCC_VCC(I);
4281 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4282 return selectCOPY_VCC_SCC(I);
4283 case AMDGPU::G_AMDGPU_READANYLANE:
4284 return selectReadAnyLane(I);
4285 case TargetOpcode::G_CONSTANT:
4286 case TargetOpcode::G_FCONSTANT:
4287 default:
4288 return selectImpl(I, *CoverageInfo);
4289 }
4290 return false;
4291}
4292
4293InstructionSelector::ComplexRendererFns
4294AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4295 return {{
4296 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4297 }};
4298
4299}
4300
4301std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4302 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4303 unsigned Mods = 0;
4304 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4305
4306 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4307 Src = MI->getOperand(1).getReg();
4308 Mods |= SISrcMods::NEG;
4309 MI = getDefIgnoringCopies(Src, *MRI);
4310 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4311 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4312 // denormal mode, but we're implicitly canonicalizing in a source operand.
4313 const ConstantFP *LHS =
4314 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4315 if (LHS && LHS->isZero()) {
4316 Mods |= SISrcMods::NEG;
4317 Src = MI->getOperand(2).getReg();
4318 }
4319 }
4320
4321 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4322 Src = MI->getOperand(1).getReg();
4323 Mods |= SISrcMods::ABS;
4324 }
4325
4326 if (OpSel)
4327 Mods |= SISrcMods::OP_SEL_0;
4328
4329 return std::pair(Src, Mods);
4330}
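// A minimal self-contained sketch (not code from this selector) of what the
// NEG/ABS bits folded above mean when the operand is read: ABS is applied
// first, then NEG, so the combination encodes -|x|. The struct below is local
// to the sketch.
#include <cmath>

struct SrcModsSketch {
  bool Neg = false;
  bool Abs = false;
};

static float applySrcModsSketch(float V, SrcModsSketch M) {
  if (M.Abs)
    V = std::fabs(V);
  if (M.Neg)
    V = -V;
  return V;
}
// e.g. applySrcModsSketch(-2.0f, {/*Neg=*/true, /*Abs=*/true}) == -2.0f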
4331
4332Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4333 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4334 bool ForceVGPR) const {
4335 if ((Mods != 0 || ForceVGPR) &&
4336 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4337
4338 // If we looked through copies to find source modifiers on an SGPR operand,
4339 // we now have an SGPR register source. To avoid potentially violating the
4340 // constant bus restriction, we need to insert a copy to a VGPR.
4341 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4342 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4343 TII.get(AMDGPU::COPY), VGPRSrc)
4344 .addReg(Src);
4345 Src = VGPRSrc;
4346 }
4347
4348 return Src;
4349}
4350
4351///
4352/// This will select either an SGPR or VGPR operand and will save us from
4353/// having to write an extra tablegen pattern.
4354InstructionSelector::ComplexRendererFns
4355AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4356 return {{
4357 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4358 }};
4359}
4360
4361InstructionSelector::ComplexRendererFns
4362AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4363 Register Src;
4364 unsigned Mods;
4365 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4366
4367 return {{
4368 [=](MachineInstrBuilder &MIB) {
4369 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4370 },
4371 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4372 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4373 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4374 }};
4375}
4376
4377InstructionSelector::ComplexRendererFns
4378AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4379 Register Src;
4380 unsigned Mods;
4381 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4382 /*IsCanonicalizing=*/true,
4383 /*AllowAbs=*/false);
4384
4385 return {{
4386 [=](MachineInstrBuilder &MIB) {
4387 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4388 },
4389 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4390 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4391 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4392 }};
4393}
4394
4395InstructionSelector::ComplexRendererFns
4396AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4397 return {{
4398 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4399 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4400 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4401 }};
4402}
4403
4404InstructionSelector::ComplexRendererFns
4405AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4406 Register Src;
4407 unsigned Mods;
4408 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4409
4410 return {{
4411 [=](MachineInstrBuilder &MIB) {
4412 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4413 },
4414 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4415 }};
4416}
4417
4418InstructionSelector::ComplexRendererFns
4419AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4420 MachineOperand &Root) const {
4421 Register Src;
4422 unsigned Mods;
4423 std::tie(Src, Mods) =
4424 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4425
4426 return {{
4427 [=](MachineInstrBuilder &MIB) {
4428 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4429 },
4430 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4431 }};
4432}
4433
4434InstructionSelector::ComplexRendererFns
4435AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4436 Register Src;
4437 unsigned Mods;
4438 std::tie(Src, Mods) =
4439 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4440 /*AllowAbs=*/false);
4441
4442 return {{
4443 [=](MachineInstrBuilder &MIB) {
4444 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4445 },
4446 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4447 }};
4448}
4449
4450InstructionSelector::ComplexRendererFns
4451AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4452 Register Reg = Root.getReg();
4453 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4454 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4455 return {};
4456 return {{
4457 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4458 }};
4459}
4460
4461enum class SrcStatus {
4466 // This means current op = [op_upper, op_lower] and src = -op_lower.
4469 // This means current op = [op_upper, op_lower] and src = [op_upper,
4470 // -op_lower].
4478};
4479/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4480static bool isTruncHalf(const MachineInstr *MI,
4481 const MachineRegisterInfo &MRI) {
4482 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4483 return false;
4484
4485 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4486 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4487 return DstSize * 2 == SrcSize;
4488}
4489
4490/// Test if the MI is a logical shift right by half the bit width,
4491/// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`
4492static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4493 if (MI->getOpcode() != AMDGPU::G_LSHR)
4494 return false;
4495
4496 Register ShiftSrc;
4497 std::optional<ValueAndVReg> ShiftAmt;
4498 if (mi_match(MI->getOperand(0).getReg(), MRI,
4499 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4500 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4501 unsigned Shift = ShiftAmt->Value.getZExtValue();
4502 return Shift * 2 == SrcSize;
4503 }
4504 return false;
4505}
4506
4507/// Test if the MI is a shift left by half the bit width,
4508/// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`
4509static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4510 if (MI->getOpcode() != AMDGPU::G_SHL)
4511 return false;
4512
4513 Register ShiftSrc;
4514 std::optional<ValueAndVReg> ShiftAmt;
4515 if (mi_match(MI->getOperand(0).getReg(), MRI,
4516 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4517 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4518 unsigned Shift = ShiftAmt->Value.getZExtValue();
4519 return Shift * 2 == SrcSize;
4520 }
4521 return false;
4522}
4523
4524/// Test if the MI is of the form `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4525static bool isUnmergeHalf(const MachineInstr *MI,
4526 const MachineRegisterInfo &MRI) {
4527 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4528 return false;
4529 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4530 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4531}
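// A minimal self-contained sketch (not code from this selector) of the
// half-access patterns recognized above, shown for a 32-bit value holding two
// 16-bit halves: G_TRUNC and the first G_UNMERGE_VALUES result read the low
// half, while a logical shift right by 16 (or a shift left by 16 viewed from
// the top) moves the other half into place.
#include <cstdint>

static constexpr uint16_t lowHalfOf(uint32_t V) {
  return static_cast<uint16_t>(V);
}
static constexpr uint16_t highHalfOf(uint32_t V) {
  return static_cast<uint16_t>(V >> 16);
}
static_assert(lowHalfOf(0xabcd1234u) == 0x1234u, "");
static_assert(highHalfOf(0xabcd1234u) == 0xabcdu, "");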
4532
4534
4536 const MachineRegisterInfo &MRI) {
4537 LLT OpTy = MRI.getType(Reg);
4538 if (OpTy.isScalar())
4539 return TypeClass::SCALAR;
4540 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4543}
4544
4545static SrcStatus getNegStatus(Register Reg, SrcStatus S,
4546 const MachineRegisterInfo &MRI) {
4547  TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4548  if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4549 return SrcStatus::INVALID;
4550
4551 switch (S) {
4552 case SrcStatus::IS_SAME:
4553 if (NegType == TypeClass::VECTOR_OF_TWO) {
4554 // Vector of 2:
4555 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4556 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4557 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4558 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4559      return SrcStatus::IS_BOTH_NEG;
4560    }
4561 if (NegType == TypeClass::SCALAR) {
4562 // Scalar:
4563 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4564 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4565 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4566 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4567 return SrcStatus::IS_HI_NEG;
4568 }
4569 break;
4570  case SrcStatus::IS_HI_NEG:
4571    if (NegType == TypeClass::VECTOR_OF_TWO) {
4572 // Vector of 2:
4573 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4574 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4575 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4576 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4577 return SrcStatus::IS_LO_NEG;
4578 }
4579 if (NegType == TypeClass::SCALAR) {
4580 // Scalar:
4581 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4582 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4583 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4584 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4585 return SrcStatus::IS_SAME;
4586 }
4587 break;
4588  case SrcStatus::IS_LO_NEG:
4589    if (NegType == TypeClass::VECTOR_OF_TWO) {
4590 // Vector of 2:
4591 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4592 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4593 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4594 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4595 return SrcStatus::IS_HI_NEG;
4596 }
4597 if (NegType == TypeClass::SCALAR) {
4598 // Scalar:
4599 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4600 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4601 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4602 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4603      return SrcStatus::IS_BOTH_NEG;
4604    }
4605 break;
4606  case SrcStatus::IS_BOTH_NEG:
4607    if (NegType == TypeClass::VECTOR_OF_TWO) {
4608 // Vector of 2:
4609 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4610 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4611 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4612 // [SrcHi, SrcLo] = [OpHi, OpLo]
4613 return SrcStatus::IS_SAME;
4614 }
4615 if (NegType == TypeClass::SCALAR) {
4616 // Scalar:
4617 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4618 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4619 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4620 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4621 return SrcStatus::IS_LO_NEG;
4622 }
4623 break;
4624  case SrcStatus::IS_UPPER_HALF:
4625    // Vector of 2:
4626 // Src = CurrUpper
4627 // Curr = [CurrUpper, CurrLower]
4628 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4629 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4630 // Src = -OpUpper
4631 //
4632 // Scalar:
4633 // Src = CurrUpper
4634 // Curr = [CurrUpper, CurrLower]
4635 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4636 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4637 // Src = -OpUpper
4638    return SrcStatus::IS_UPPER_HALF_NEG;
4639  case SrcStatus::IS_LOWER_HALF:
4640    if (NegType == TypeClass::VECTOR_OF_TWO) {
4641 // Vector of 2:
4642 // Src = CurrLower
4643 // Curr = [CurrUpper, CurrLower]
4644 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4645 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4646 // Src = -OpLower
4647      return SrcStatus::IS_LOWER_HALF_NEG;
4648    }
4649 if (NegType == TypeClass::SCALAR) {
4650 // Scalar:
4651 // Src = CurrLower
4652 // Curr = [CurrUpper, CurrLower]
4653 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4654 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4655 // Src = OpLower
4656      return SrcStatus::IS_LOWER_HALF;
4657    }
4658 break;
4659  case SrcStatus::IS_UPPER_HALF_NEG:
4660    // Vector of 2:
4661 // Src = -CurrUpper
4662 // Curr = [CurrUpper, CurrLower]
4663 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4664 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4665 // Src = -(-OpUpper) = OpUpper
4666 //
4667 // Scalar:
4668 // Src = -CurrUpper
4669 // Curr = [CurrUpper, CurrLower]
4670 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4671 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4672 // Src = -(-OpUpper) = OpUpper
4673    return SrcStatus::IS_UPPER_HALF;
4674  case SrcStatus::IS_LOWER_HALF_NEG:
4675    if (NegType == TypeClass::VECTOR_OF_TWO) {
4676 // Vector of 2:
4677 // Src = -CurrLower
4678 // Curr = [CurrUpper, CurrLower]
4679 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4680 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4681 // Src = -(-OpLower) = OpLower
4682      return SrcStatus::IS_LOWER_HALF;
4683    }
4684 if (NegType == TypeClass::SCALAR) {
4685 // Scalar:
4686 // Src = -CurrLower
4687 // Curr = [CurrUpper, CurrLower]
4688 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4689 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4690 // Src = -OpLower
4691      return SrcStatus::IS_LOWER_HALF_NEG;
4692    }
4693 break;
4694 default:
4695 break;
4696 }
4697 llvm_unreachable("unexpected SrcStatus & NegType combination");
4698}
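// A minimal self-contained sketch (not code from this selector) of why the
// same G_FNEG is interpreted differently above depending on the type: an fneg
// of <2 x half> toggles the sign bit of both 16-bit lanes, while an fneg of
// the 32-bit scalar holding that pair only toggles bit 31, i.e. the sign of
// the high lane.
#include <cstdint>

static constexpr uint32_t fnegV2F16Bits(uint32_t Bits) {
  return Bits ^ 0x80008000u; // both lane sign bits
}
static constexpr uint32_t fnegF32Bits(uint32_t Bits) {
  return Bits ^ 0x80000000u; // only the high lane's sign bit
}
static_assert(fnegV2F16Bits(0x3c003c00u) == 0xbc00bc00u,
              "(+1.0, +1.0) -> (-1.0, -1.0)");
static_assert(fnegF32Bits(0x3c003c00u) == 0xbc003c00u,
              "only the high half changes sign");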
4699
4700static std::optional<std::pair<Register, SrcStatus>>
4701calcNextStatus(std::pair<Register, SrcStatus> Curr,
4702 const MachineRegisterInfo &MRI) {
4703 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4704
4705 unsigned Opc = MI->getOpcode();
4706
4707 // Handle general Opc cases.
4708 switch (Opc) {
4709 case AMDGPU::G_BITCAST:
4710 return std::optional<std::pair<Register, SrcStatus>>(
4711 {MI->getOperand(1).getReg(), Curr.second});
4712 case AMDGPU::COPY:
4713 if (MI->getOperand(1).getReg().isPhysical())
4714 return std::nullopt;
4715 return std::optional<std::pair<Register, SrcStatus>>(
4716 {MI->getOperand(1).getReg(), Curr.second});
4717 case AMDGPU::G_FNEG: {
4718 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4719 if (Stat == SrcStatus::INVALID)
4720 return std::nullopt;
4721 return std::optional<std::pair<Register, SrcStatus>>(
4722 {MI->getOperand(1).getReg(), Stat});
4723 }
4724 default:
4725 break;
4726 }
4727
4728 // Calc next Stat from current Stat.
4729 switch (Curr.second) {
4730 case SrcStatus::IS_SAME:
4731 if (isTruncHalf(MI, MRI))
4732 return std::optional<std::pair<Register, SrcStatus>>(
4733 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4734 else if (isUnmergeHalf(MI, MRI)) {
4735 if (Curr.first == MI->getOperand(0).getReg())
4736 return std::optional<std::pair<Register, SrcStatus>>(
4737 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4738 return std::optional<std::pair<Register, SrcStatus>>(
4739 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4740 }
4741 break;
4742  case SrcStatus::IS_HI_NEG:
4743    if (isTruncHalf(MI, MRI)) {
4744 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4745 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4746 // = [OpLowerHi, OpLowerLo]
4747 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4748 // = [-OpLowerHi, OpLowerLo]
4749 // = -OpLower
4750 return std::optional<std::pair<Register, SrcStatus>>(
4751 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4752 }
4753 if (isUnmergeHalf(MI, MRI)) {
4754 if (Curr.first == MI->getOperand(0).getReg())
4755 return std::optional<std::pair<Register, SrcStatus>>(
4756 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4757 return std::optional<std::pair<Register, SrcStatus>>(
4758 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4759 }
4760 break;
4761  case SrcStatus::IS_UPPER_HALF:
4762    if (isShlHalf(MI, MRI))
4763 return std::optional<std::pair<Register, SrcStatus>>(
4764 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4765 break;
4766  case SrcStatus::IS_LOWER_HALF:
4767    if (isLshrHalf(MI, MRI))
4768 return std::optional<std::pair<Register, SrcStatus>>(
4769 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
4770 break;
4771  case SrcStatus::IS_UPPER_HALF_NEG:
4772    if (isShlHalf(MI, MRI))
4773 return std::optional<std::pair<Register, SrcStatus>>(
4774 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4775 break;
4776  case SrcStatus::IS_LOWER_HALF_NEG:
4777    if (isLshrHalf(MI, MRI))
4778 return std::optional<std::pair<Register, SrcStatus>>(
4779 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4780 break;
4781 default:
4782 break;
4783 }
4784 return std::nullopt;
4785}
4786
4787/// This is used to control which source statuses the current MI supports. For
4788/// example, a non-floating-point intrinsic such as @llvm.amdgcn.sdot2 does not
4789/// support the NEG bit on VOP3P.
4790/// The class can be further extended to recognize support for the SEL, NEG and
4791/// ABS bits for different MIs on different architectures.
4792class SearchOptions {
4793private:
4794 bool HasNeg = false;
4795  // Assume all VOP3P complex patterns have op_sel.
4796 bool HasOpsel = true;
4797
4798public:
4799  SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
4800 const MachineInstr *MI = MRI.getVRegDef(Reg);
4801 unsigned Opc = MI->getOpcode();
4802
4803 if (Opc < TargetOpcode::GENERIC_OP_END) {
4804 // Keep same for generic op.
4805 HasNeg = true;
4806 } else if (Opc == TargetOpcode::G_INTRINSIC) {
4807 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
4808      // Only floating-point intrinsics have neg & neg_hi bits.
4809 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
4810 HasNeg = true;
4811 }
4812 }
4813 bool checkOptions(SrcStatus Stat) const {
4814 if (!HasNeg &&
4815 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
4816 return false;
4817 }
4818 if (!HasOpsel &&
4819 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
4820 return false;
4821 }
4822 return true;
4823 }
4824};
4825
4828 int MaxDepth = 3) {
4829 int Depth = 0;
4830 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
4831  SmallVector<std::pair<Register, SrcStatus>> Statlist;
4832
4833 while (Depth <= MaxDepth && Curr.has_value()) {
4834 Depth++;
4835 if (SO.checkOptions(Curr.value().second))
4836 Statlist.push_back(Curr.value());
4837 Curr = calcNextStatus(Curr.value(), MRI);
4838 }
4839
4840 return Statlist;
4841}
4842
4843static std::pair<Register, SrcStatus>
4844getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
4845 int MaxDepth = 3) {
4846 int Depth = 0;
4847 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
4848 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
4849
4850 while (Depth <= MaxDepth && Curr.has_value()) {
4851 Depth++;
4852 SrcStatus Stat = Curr.value().second;
4853 if (SO.checkOptions(Stat)) {
4854      if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
4855          Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
4856 LastSameOrNeg = Curr.value();
4857 }
4858 Curr = calcNextStatus(Curr.value(), MRI);
4859 }
4860
4861 return LastSameOrNeg;
4862}
4863
4864static bool isSameBitWidth(Register Reg1, Register Reg2,
4865 const MachineRegisterInfo &MRI) {
4866 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
4867 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
4868 return Width1 == Width2;
4869}
4870
4871static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
4872  // SrcStatus::IS_LOWER_HALF leaves the modifiers at 0.
4873 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
4874 Mods ^= SISrcMods::NEG_HI;
4875 Mods |= SISrcMods::OP_SEL_1;
4876 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
4877 Mods |= SISrcMods::OP_SEL_1;
4878 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
4879 Mods ^= SISrcMods::NEG_HI;
4880 else if (HiStat == SrcStatus::IS_HI_NEG)
4881 Mods ^= SISrcMods::NEG_HI;
4882
4883 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
4884 Mods ^= SISrcMods::NEG;
4885 Mods |= SISrcMods::OP_SEL_0;
4886 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
4887 Mods |= SISrcMods::OP_SEL_0;
4888 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
4889 Mods |= SISrcMods::NEG;
4890 else if (LoStat == SrcStatus::IS_HI_NEG)
4891 Mods ^= SISrcMods::NEG;
4892
4893 return Mods;
4894}
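// A minimal self-contained sketch (not code from this selector) of what the
// composed bits mean for one packed 32-bit source: the op_sel bits pick which
// 16-bit half feeds the low and high lanes, and the neg bits flip the fp16
// sign of the value fed to each lane. The struct and field names are local to
// the sketch.
#include <cstdint>

struct PackedModsSketch {
  bool OpSelLo = false; // low lane reads the source's high half
  bool OpSelHi = false; // high lane reads the source's high half
  bool NegLo = false;
  bool NegHi = false;
};

static void readPackedSrcSketch(uint32_t SrcBits, PackedModsSketch M,
                                uint16_t &LaneLo, uint16_t &LaneHi) {
  const uint16_t L = static_cast<uint16_t>(SrcBits);
  const uint16_t H = static_cast<uint16_t>(SrcBits >> 16);
  LaneLo = M.OpSelLo ? H : L;
  LaneHi = M.OpSelHi ? H : L;
  if (M.NegLo)
    LaneLo ^= 0x8000u; // flip the fp16 sign bit
  if (M.NegHi)
    LaneHi ^= 0x8000u;
}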
4895
4896static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
4897 Register RootReg, const SIInstrInfo &TII,
4898 const MachineRegisterInfo &MRI) {
4899  auto IsHalfState = [](SrcStatus S) {
4900    return S == SrcStatus::IS_UPPER_HALF || S == SrcStatus::IS_UPPER_HALF_NEG ||
4901           S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
4902 };
4903 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
4904 IsHalfState(HiStat);
4905}
4906
4907std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
4908 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
4909 unsigned Mods = 0;
4910  // No modification if the Root type is not of the form <2 x Type>.
4911 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
4912 Mods |= SISrcMods::OP_SEL_1;
4913 return {RootReg, Mods};
4914 }
4915
4916 SearchOptions SO(RootReg, MRI);
4917
4918 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
4919
4920 if (Stat.second == SrcStatus::IS_BOTH_NEG)
4921 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4922 else if (Stat.second == SrcStatus::IS_HI_NEG)
4923 Mods ^= SISrcMods::NEG_HI;
4924 else if (Stat.second == SrcStatus::IS_LO_NEG)
4925 Mods ^= SISrcMods::NEG;
4926
4927 MachineInstr *MI = MRI.getVRegDef(Stat.first);
4928
4929 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
4930 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
4931 Mods |= SISrcMods::OP_SEL_1;
4932 return {Stat.first, Mods};
4933 }
4934
4935 SmallVector<std::pair<Register, SrcStatus>> StatlistHi =
4936 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
4937
4938 if (StatlistHi.empty()) {
4939 Mods |= SISrcMods::OP_SEL_1;
4940 return {Stat.first, Mods};
4941 }
4942
4943 SmallVector<std::pair<Register, SrcStatus>> StatlistLo =
4944 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
4945
4946 if (StatlistLo.empty()) {
4947 Mods |= SISrcMods::OP_SEL_1;
4948 return {Stat.first, Mods};
4949 }
4950
4951 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
4952 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
4953 if (StatlistHi[I].first == StatlistLo[J].first &&
4954 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
4955 StatlistHi[I].first, RootReg, TII, MRI))
4956 return {StatlistHi[I].first,
4957 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
4958 }
4959 }
4960 // Packed instructions do not have abs modifiers.
4961 Mods |= SISrcMods::OP_SEL_1;
4962
4963 return {Stat.first, Mods};
4964}
4965
4967
4968static bool checkRB(Register Reg, unsigned int RBNo,
4969 const AMDGPURegisterBankInfo &RBI,
4970 const MachineRegisterInfo &MRI,
4971 const TargetRegisterInfo &TRI) {
4972 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
4973 return RB->getID() == RBNo;
4974}
4975
4976// Get a register with the correct register bank for the returned reg.
4977// Assume:
4978// 1. VOP3P is always legal for VGPR.
4979// 2. RootOp's regbank is legal.
4980// Thus
4981// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
4982// 2. If RootOp is VGPR, then NewOp must be VGPR.
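// For example, if RootReg is in the VGPR bank and NewReg is an SGPR that is
// not simply the source of the COPY producing RootReg, a COPY of NewReg into a
// fresh VGPR is inserted below and that VGPR is returned instead.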
4983static Register getLegalRegBank(Register NewReg, Register RootReg,
4984 const AMDGPURegisterBankInfo &RBI,
4985 MachineRegisterInfo &MRI,
4986 const TargetRegisterInfo &TRI,
4987 const SIInstrInfo &TII) {
4988 // RootOp can only be VGPR or SGPR (some hand-written cases such as
4989 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
4990 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
4991 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
4992 return NewReg;
4993
4994 MachineInstr *MI = MRI.getVRegDef(RootReg);
4995 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
4996 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
4997 return RootReg;
4998 }
4999
5000 MachineBasicBlock *BB = MI->getParent();
5001 Register DstReg = MRI.cloneVirtualRegister(RootReg);
5002
5003 MachineInstrBuilder MIB =
5004 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
5005 .addReg(NewReg);
5006
5007 // Only accept VGPR.
5008 return MIB->getOperand(0).getReg();
5009}
5010
5012AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5013 bool IsDOT) const {
5014 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5015 Register Reg;
5016 unsigned Mods;
5017 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
5018
5019 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
5020 return {{
5021 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5022 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5023 }};
5024}
5025
5027AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5028
5029 return selectVOP3PRetHelper(Root);
5030}
5031
5033AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5034
5035 return selectVOP3PRetHelper(Root, true);
5036}
5037
5039AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5040 MachineOperand &Root) const {
5041 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5042 "expected i1 value");
5043 unsigned Mods = SISrcMods::OP_SEL_1;
5044 if (Root.getImm() != 0)
5045 Mods |= SISrcMods::OP_SEL_0;
5046
5047 return {{
5048 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5049 }};
5050}
5051
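// Build a REG_SEQUENCE from Elts (2, 4 or 8 32-bit elements, giving a
// VReg_64/128/256 result) at InsertPt and return its result register.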
5052static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
5053 MachineInstr *InsertPt,
5054 MachineRegisterInfo &MRI) {
5055 const TargetRegisterClass *DstRegClass;
5056 switch (Elts.size()) {
5057 case 8:
5058 DstRegClass = &AMDGPU::VReg_256RegClass;
5059 break;
5060 case 4:
5061 DstRegClass = &AMDGPU::VReg_128RegClass;
5062 break;
5063 case 2:
5064 DstRegClass = &AMDGPU::VReg_64RegClass;
5065 break;
5066 default:
5067 llvm_unreachable("unhandled Reg sequence size");
5068 }
5069
5070 MachineIRBuilder B(*InsertPt);
5071 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5072 .addDef(MRI.createVirtualRegister(DstRegClass));
5073 for (unsigned i = 0; i < Elts.size(); ++i) {
5074 MIB.addReg(Elts[i]);
5075 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
5076 }
5077 return MIB->getOperand(0).getReg();
5078}
5079
5080static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5081 SmallVectorImpl<Register> &Elts, Register &Src,
5082 MachineInstr *InsertPt,
5083 MachineRegisterInfo &MRI) {
5084 if (ModOpcode == TargetOpcode::G_FNEG) {
5085 Mods |= SISrcMods::NEG;
5086 // Check if all elements also have abs modifier
5087 SmallVector<Register, 8> NegAbsElts;
5088 for (auto El : Elts) {
5089 Register FabsSrc;
5090 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5091 break;
5092 NegAbsElts.push_back(FabsSrc);
5093 }
5094 if (Elts.size() != NegAbsElts.size()) {
5095 // Neg
5096 Src = buildRegSequence(Elts, InsertPt, MRI);
5097 } else {
5098 // Neg and Abs
5099 Mods |= SISrcMods::NEG_HI;
5100 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5101 }
5102 } else {
5103 assert(ModOpcode == TargetOpcode::G_FABS);
5104 // Abs
5105 Mods |= SISrcMods::NEG_HI;
5106 Src = buildRegSequence(Elts, InsertPt, MRI);
5107 }
5108}
5109
5111AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5112 Register Src = Root.getReg();
5113 unsigned Mods = SISrcMods::OP_SEL_1;
5114 SmallVector<Register, 8> EltsF32;
5115
5116 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5117 assert(BV->getNumSources() > 0);
5118 // Based on first element decide which mod we match, neg or abs
5119 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5120 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5121 ? AMDGPU::G_FNEG
5122 : AMDGPU::G_FABS;
5123 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5124 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5125 if (ElF32->getOpcode() != ModOpcode)
5126 break;
5127 EltsF32.push_back(ElF32->getOperand(1).getReg());
5128 }
5129
5130 // All elements had ModOpcode modifier
5131 if (BV->getNumSources() == EltsF32.size()) {
5132 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5133 *MRI);
5134 }
5135 }
5136
5137 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5138 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5139}
5140
5142AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5143 Register Src = Root.getReg();
5144 unsigned Mods = SISrcMods::OP_SEL_1;
5145 SmallVector<Register, 8> EltsV2F16;
5146
5147 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5148 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5149 Register FNegSrc;
5150 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5151 break;
5152 EltsV2F16.push_back(FNegSrc);
5153 }
5154
5155 // All elements had ModOpcode modifier
5156 if (CV->getNumSources() == EltsV2F16.size()) {
5157 Mods |= SISrcMods::NEG;
5158 Mods |= SISrcMods::NEG_HI;
5159 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5160 }
5161 }
5162
5163 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5164 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5165}
5166
5168AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5169 Register Src = Root.getReg();
5170 unsigned Mods = SISrcMods::OP_SEL_1;
5171 SmallVector<Register, 8> EltsV2F16;
5172
5173 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5174 assert(CV->getNumSources() > 0);
5175 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5176 // Based on first element decide which mod we match, neg or abs
5177 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5178 ? AMDGPU::G_FNEG
5179 : AMDGPU::G_FABS;
5180
5181 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5182 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5183 if (ElV2F16->getOpcode() != ModOpcode)
5184 break;
5185 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5186 }
5187
5188 // All elements had ModOpcode modifier
5189 if (CV->getNumSources() == EltsV2F16.size()) {
5190 MachineIRBuilder B(*Root.getParent());
5191 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5192 *MRI);
5193 }
5194 }
5195
5196 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5197 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5198}
5199
5201AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5202 std::optional<FPValueAndVReg> FPValReg;
5203 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5204 if (TII.isInlineConstant(FPValReg->Value)) {
5205 return {{[=](MachineInstrBuilder &MIB) {
5206 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5207 }}};
5208 }
5209 // Non-inlineable splat floats should not fall through to the integer
5210 // immediate checks.
5211 return {};
5212 }
5213
5214 APInt ICst;
5215 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5216 if (TII.isInlineConstant(ICst)) {
5217 return {
5218 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5219 }
5220 }
5221
5222 return {};
5223}
5224
5226AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5227 Register Src =
5228 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5229 unsigned Key = 0;
5230
5231 Register ShiftSrc;
5232 std::optional<ValueAndVReg> ShiftAmt;
5233 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5234 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5235 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5236 Key = ShiftAmt->Value.getZExtValue() / 8;
5237 Src = ShiftSrc;
5238 }
5239
5240 return {{
5241 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5242 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5243 }};
5244}
5245
5247AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5248
5249 Register Src =
5250 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5251 unsigned Key = 0;
5252
5253 Register ShiftSrc;
5254 std::optional<ValueAndVReg> ShiftAmt;
5255 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5256 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5257 ShiftAmt->Value.getZExtValue() == 16) {
5258 Src = ShiftSrc;
5259 Key = 1;
5260 }
5261
5262 return {{
5263 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5264 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5265 }};
5266}
5267
5269AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5270 Register Src =
5271 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5272 unsigned Key = 0;
5273
5274 Register S32 = matchZeroExtendFromS32(Src);
5275 if (!S32)
5276 S32 = matchAnyExtendFromS32(Src);
5277
5278 if (S32) {
5279 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5280 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5281 assert(Def->getNumOperands() == 3);
5282 Register DstReg1 = Def->getOperand(1).getReg();
5283 if (mi_match(S32, *MRI,
5284 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5285 Src = Def->getOperand(2).getReg();
5286 Key = 1;
5287 }
5288 }
5289 }
5290
5291 return {{
5292 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5293 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5294 }};
5295}
5296
5298AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5299 Register Src;
5300 unsigned Mods;
5301 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5302
5303 // FIXME: Handle op_sel
5304 return {{
5305 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5306 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5307 }};
5308}
5309
5310// FIXME-TRUE16 remove when fake16 is removed
5312AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5313 Register Src;
5314 unsigned Mods;
5315 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5316 /*IsCanonicalizing=*/true,
5317 /*AllowAbs=*/false,
5318 /*OpSel=*/false);
5319
5320 return {{
5321 [=](MachineInstrBuilder &MIB) {
5322 MIB.addReg(
5323 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5324 },
5325 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5326 }};
5327}
5328
5330AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5331 Register Src;
5332 unsigned Mods;
5333 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5334 /*IsCanonicalizing=*/true,
5335 /*AllowAbs=*/false,
5336 /*OpSel=*/true);
5337
5338 return {{
5339 [=](MachineInstrBuilder &MIB) {
5340 MIB.addReg(
5341 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5342 },
5343 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5344 }};
5345}
5346
5347// Given \p Offset and the load specified by the \p Root operand, check if
5348// \p Offset is a multiple of the load byte size. If it is, update \p Offset to a
5349// pre-scaled value and return true.
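// For example, a 4-byte load whose offset operand is (idx << 2) or (idx * 4)
// can use idx directly as the offset; the callers then set the SCAL cpol bit.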
5350bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5351 Register &Offset,
5352 bool IsSigned) const {
5353 if (!Subtarget->hasScaleOffset())
5354 return false;
5355
5356 const MachineInstr &MI = *Root.getParent();
5357 MachineMemOperand *MMO = *MI.memoperands_begin();
5358
5359 if (!MMO->getSize().hasValue())
5360 return false;
5361
5362 uint64_t Size = MMO->getSize().getValue();
5363
5364 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5365 if (!OffsetReg)
5366 OffsetReg = Offset;
5367
5368 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5369 OffsetReg = Def->Reg;
5370
5371 Register Op0;
5372 MachineInstr *Mul;
5373 bool ScaleOffset =
5374 (isPowerOf2_64(Size) &&
5375 mi_match(OffsetReg, *MRI,
5376 m_GShl(m_Reg(Op0),
5377 m_any_of(m_SpecificICst(Log2_64(Size)),
5378 m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
5379 mi_match(OffsetReg, *MRI,
5380 m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
5381 m_Copy(m_SpecificICst(Size))))) ||
5382 mi_match(
5383 OffsetReg, *MRI,
5384 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5385 m_Reg(Op0), m_SpecificICst(Size))) ||
5386 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5387 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5388 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5389 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5390 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5391 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5392 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5393 mi_match(Mul->getOperand(3).getReg(), *MRI,
5395 m_Copy(m_SpecificICst(Size))))) &&
5396 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5397
5398 if (ScaleOffset)
5399 Offset = Op0;
5400
5401 return ScaleOffset;
5402}
5403
5404bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5405 Register &Base,
5406 Register *SOffset,
5407 int64_t *Offset,
5408 bool *ScaleOffset) const {
5409 MachineInstr *MI = Root.getParent();
5410 MachineBasicBlock *MBB = MI->getParent();
5411
5412 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5413 // then we can select all ptr + 32-bit offsets.
5414 SmallVector<GEPInfo, 4> AddrInfo;
5415 getAddrModeInfo(*MI, *MRI, AddrInfo);
5416
5417 if (AddrInfo.empty())
5418 return false;
5419
5420 const GEPInfo &GEPI = AddrInfo[0];
5421 std::optional<int64_t> EncodedImm;
5422
5423 if (ScaleOffset)
5424 *ScaleOffset = false;
5425
5426 if (SOffset && Offset) {
5427 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5428 /*HasSOffset=*/true);
5429 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5430 AddrInfo.size() > 1) {
5431 const GEPInfo &GEPI2 = AddrInfo[1];
5432 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5433 Register OffsetReg = GEPI2.SgprParts[1];
5434 if (ScaleOffset)
5435 *ScaleOffset =
5436 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5437 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5438 if (OffsetReg) {
5439 Base = GEPI2.SgprParts[0];
5440 *SOffset = OffsetReg;
5441 *Offset = *EncodedImm;
5442 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5443 return true;
5444
5445 // For unbuffered smem loads, it is illegal for the Immediate Offset
5446 // to be negative if the resulting (Offset + (M0 or SOffset or zero))
5447 // is negative. Handle the case where the Immediate Offset + SOffset
5448 // is negative.
5449 auto SKnown = VT->getKnownBits(*SOffset);
5450 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5451 return false;
5452
5453 return true;
5454 }
5455 }
5456 }
5457 return false;
5458 }
5459
5460 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5461 /*HasSOffset=*/false);
5462 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5463 Base = GEPI.SgprParts[0];
5464 *Offset = *EncodedImm;
5465 return true;
5466 }
5467
5468 // SGPR offset is unsigned.
5469 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5470 GEPI.Imm != 0) {
5471 // If we make it this far we have a load with a 32-bit immediate offset.
5472 // It is OK to select this using an SGPR offset, because we have already
5473 // failed trying to select this load into one of the _IMM variants since
5474 // the _IMM Patterns are considered before the _SGPR patterns.
5475 Base = GEPI.SgprParts[0];
5476 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5477 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5478 .addImm(GEPI.Imm);
5479 return true;
5480 }
5481
5482 if (SOffset && GEPI.SgprParts.size() == 2 && GEPI.Imm == 0) {
5483 Register OffsetReg = GEPI.SgprParts[1];
5484 if (ScaleOffset)
5485 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5486 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5487 if (OffsetReg) {
5488 Base = GEPI.SgprParts[0];
5489 *SOffset = OffsetReg;
5490 return true;
5491 }
5492 }
5493
5494 return false;
5495}
5496
5498AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5499 Register Base;
5500 int64_t Offset;
5501 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5502 /* ScaleOffset */ nullptr))
5503 return std::nullopt;
5504
5505 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5506 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5507}
5508
5510AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5511 SmallVector<GEPInfo, 4> AddrInfo;
5512 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5513
5514 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5515 return std::nullopt;
5516
5517 const GEPInfo &GEPInfo = AddrInfo[0];
5518 Register PtrReg = GEPInfo.SgprParts[0];
5519 std::optional<int64_t> EncodedImm =
5520 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5521 if (!EncodedImm)
5522 return std::nullopt;
5523
5524 return {{
5525 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5526 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5527 }};
5528}
5529
5531AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5532 Register Base, SOffset;
5533 bool ScaleOffset;
5534 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5535 &ScaleOffset))
5536 return std::nullopt;
5537
5538 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5539 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5540 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5541 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5542}
5543
5545AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5546 Register Base, SOffset;
5547 int64_t Offset;
5548 bool ScaleOffset;
5549 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5550 return std::nullopt;
5551
5552 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5553 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5554 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5555 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5556 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5557}
5558
5559std::pair<Register, int>
5560AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5561 uint64_t FlatVariant) const {
5562 MachineInstr *MI = Root.getParent();
5563
5564 auto Default = std::pair(Root.getReg(), 0);
5565
5566 if (!STI.hasFlatInstOffsets())
5567 return Default;
5568
5569 Register PtrBase;
5570 int64_t ConstOffset;
5571 bool IsInBounds;
5572 std::tie(PtrBase, ConstOffset, IsInBounds) =
5573 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5574
5575 // Adding the offset to the base address with an immediate in a FLAT
5576 // instruction must not change the memory aperture in which the address falls.
5577 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5578 // instructions.
5579 if (ConstOffset == 0 ||
5580 (FlatVariant == SIInstrFlags::FlatScratch &&
5581 !isFlatScratchBaseLegal(Root.getReg())) ||
5582 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5583 return Default;
5584
5585 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5586 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5587 return Default;
5588
5589 return std::pair(PtrBase, ConstOffset);
5590}
5591
5593AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5594 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5595
5596 return {{
5597 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5598 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5599 }};
5600}
5601
5603AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5604 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5605
5606 return {{
5607 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5608 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5609 }};
5610}
5611
5613AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5614 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5615
5616 return {{
5617 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5618 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5619 }};
5620}
5621
5622// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5624AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5625 unsigned CPolBits,
5626 bool NeedIOffset) const {
5627 Register Addr = Root.getReg();
5628 Register PtrBase;
5629 int64_t ConstOffset;
5630 int64_t ImmOffset = 0;
5631
5632 // Match the immediate offset first, which canonically is moved as low as
5633 // possible.
5634 std::tie(PtrBase, ConstOffset, std::ignore) =
5635 getPtrBaseWithConstantOffset(Addr, *MRI);
5636
5637 if (ConstOffset != 0) {
5638 if (NeedIOffset &&
5639 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5640 SIInstrFlags::FlatGlobal)) {
5641 Addr = PtrBase;
5642 ImmOffset = ConstOffset;
5643 } else {
5644 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5645 if (isSGPR(PtrBaseDef->Reg)) {
5646 if (ConstOffset > 0) {
5647 // Offset is too large.
5648 //
5649 // saddr + large_offset -> saddr +
5650 // (voffset = large_offset & ~MaxOffset) +
5651 // (large_offset & MaxOffset);
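// For illustration, if the immediate field allowed offsets up to 0xFFF, a
// ConstOffset of 0x12345 would become a materialized voffset of 0x12000 plus
// an immediate offset of 0x345.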
5652 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5653 if (NeedIOffset) {
5654 std::tie(SplitImmOffset, RemainderOffset) =
5655 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5656 SIInstrFlags::FlatGlobal);
5657 }
5658
5659 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5660 : isUInt<32>(RemainderOffset)) {
5661 MachineInstr *MI = Root.getParent();
5662 MachineBasicBlock *MBB = MI->getParent();
5663 Register HighBits =
5664 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5665
5666 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5667 HighBits)
5668 .addImm(RemainderOffset);
5669
5670 if (NeedIOffset)
5671 return {{
5672 [=](MachineInstrBuilder &MIB) {
5673 MIB.addReg(PtrBase);
5674 }, // saddr
5675 [=](MachineInstrBuilder &MIB) {
5676 MIB.addReg(HighBits);
5677 }, // voffset
5678 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5679 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5680 }};
5681 return {{
5682 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5683 [=](MachineInstrBuilder &MIB) {
5684 MIB.addReg(HighBits);
5685 }, // voffset
5686 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5687 }};
5688 }
5689 }
5690
5691 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
5692 // is 1 we would need to perform 1 or 2 extra moves for each half of
5693 // the constant and it is better to do a scalar add and then issue a
5694 // single VALU instruction to materialize zero. Otherwise it is less
5695 // instructions to perform VALU adds with immediates or inline literals.
5696 unsigned NumLiterals =
5697 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5698 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5699 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5700 return std::nullopt;
5701 }
5702 }
5703 }
5704
5705 // Match the variable offset.
5706 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5707 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5708 // Look through the SGPR->VGPR copy.
5709 Register SAddr =
5710 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
5711
5712 if (isSGPR(SAddr)) {
5713 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
5714
5715 // It's possible voffset is an SGPR here, but the copy to VGPR will be
5716 // inserted later.
5717 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
5718 Subtarget->hasSignedGVSOffset());
5719 if (Register VOffset = matchExtendFromS32OrS32(
5720 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
5721 if (NeedIOffset)
5722 return {{[=](MachineInstrBuilder &MIB) { // saddr
5723 MIB.addReg(SAddr);
5724 },
5725 [=](MachineInstrBuilder &MIB) { // voffset
5726 MIB.addReg(VOffset);
5727 },
5728 [=](MachineInstrBuilder &MIB) { // offset
5729 MIB.addImm(ImmOffset);
5730 },
5731 [=](MachineInstrBuilder &MIB) { // cpol
5732 MIB.addImm(CPolBits |
5733 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5734 }}};
5735 return {{[=](MachineInstrBuilder &MIB) { // saddr
5736 MIB.addReg(SAddr);
5737 },
5738 [=](MachineInstrBuilder &MIB) { // voffset
5739 MIB.addReg(VOffset);
5740 },
5741 [=](MachineInstrBuilder &MIB) { // cpol
5742 MIB.addImm(CPolBits |
5743 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5744 }}};
5745 }
5746 }
5747 }
5748
5749 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
5750 // drop this.
5751 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
5752 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
5753 return std::nullopt;
5754
5755 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
5756 // moves required to copy a 64-bit SGPR to VGPR.
5757 MachineInstr *MI = Root.getParent();
5758 MachineBasicBlock *MBB = MI->getParent();
5759 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5760
5761 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
5762 .addImm(0);
5763
5764 if (NeedIOffset)
5765 return {{
5766 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5767 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5768 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5769 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5770 }};
5771 return {{
5772 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5773 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5774 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5775 }};
5776}
5777
5779AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
5780 return selectGlobalSAddr(Root, 0);
5781}
5782
5784AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
5785 const MachineInstr &I = *Root.getParent();
5786
5787 // We are assuming CPol is always the last operand of the intrinsic.
5788 auto PassedCPol =
5789 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5790 return selectGlobalSAddr(Root, PassedCPol);
5791}
5792
5794AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
5795 const MachineInstr &I = *Root.getParent();
5796
5797 // We are assuming CPol is second from last operand of the intrinsic.
5798 auto PassedCPol =
5799 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5800 return selectGlobalSAddr(Root, PassedCPol);
5801}
5802
5804AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
5805 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
5806}
5807
5809AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
5810 MachineOperand &Root) const {
5811 const MachineInstr &I = *Root.getParent();
5812
5813 // We are assuming CPol is always the last operand of the intrinsic.
5814 auto PassedCPol =
5815 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5816 return selectGlobalSAddr(Root, PassedCPol, false);
5817}
5818
5820AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
5821 MachineOperand &Root) const {
5822 const MachineInstr &I = *Root.getParent();
5823
5824 // We are assuming CPol is second from last operand of the intrinsic.
5825 auto PassedCPol =
5826 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5827 return selectGlobalSAddr(Root, PassedCPol, false);
5828}
5829
5831AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
5832 Register Addr = Root.getReg();
5833 Register PtrBase;
5834 int64_t ConstOffset;
5835 int64_t ImmOffset = 0;
5836
5837 // Match the immediate offset first, which canonically is moved as low as
5838 // possible.
5839 std::tie(PtrBase, ConstOffset, std::ignore) =
5840 getPtrBaseWithConstantOffset(Addr, *MRI);
5841
5842 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
5843 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5844 SIInstrFlags::FlatScratch)) {
5845 Addr = PtrBase;
5846 ImmOffset = ConstOffset;
5847 }
5848
5849 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5850 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5851 int FI = AddrDef->MI->getOperand(1).getIndex();
5852 return {{
5853 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5854 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5855 }};
5856 }
5857
5858 Register SAddr = AddrDef->Reg;
5859
5860 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5861 Register LHS = AddrDef->MI->getOperand(1).getReg();
5862 Register RHS = AddrDef->MI->getOperand(2).getReg();
5863 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5864 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
5865
5866 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
5867 isSGPR(RHSDef->Reg)) {
5868 int FI = LHSDef->MI->getOperand(1).getIndex();
5869 MachineInstr &I = *Root.getParent();
5870 MachineBasicBlock *BB = I.getParent();
5871 const DebugLoc &DL = I.getDebugLoc();
5872 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5873
5874 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
5875 .addFrameIndex(FI)
5876 .addReg(RHSDef->Reg)
5877 .setOperandDead(3); // Dead scc
5878 }
5879 }
5880
5881 if (!isSGPR(SAddr))
5882 return std::nullopt;
5883
5884 return {{
5885 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
5886 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5887 }};
5888}
5889
5890// Check whether the flat scratch SVS swizzle bug affects this access.
5891bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
5892 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
5893 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
5894 return false;
5895
5896 // The bug affects the swizzling of SVS accesses if there is any carry out
5897 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
5898 // voffset to (soffset + inst_offset).
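 // For example, if the known maxima of the two low bits are 2 for vaddr and 3
 // for (saddr + inst_offset), then 2 + 3 >= 4, so a carry into bit 2 cannot be
 // ruled out and this returns true.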
5899 auto VKnown = VT->getKnownBits(VAddr);
5900 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
5901 KnownBits::makeConstant(APInt(32, ImmOffset)));
5902 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5903 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
5904 return (VMax & 3) + (SMax & 3) >= 4;
5905}
5906
5908AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
5909 Register Addr = Root.getReg();
5910 Register PtrBase;
5911 int64_t ConstOffset;
5912 int64_t ImmOffset = 0;
5913
5914 // Match the immediate offset first, which canonically is moved as low as
5915 // possible.
5916 std::tie(PtrBase, ConstOffset, std::ignore) =
5917 getPtrBaseWithConstantOffset(Addr, *MRI);
5918
5919 Register OrigAddr = Addr;
5920 if (ConstOffset != 0 &&
5921 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5922 SIInstrFlags::FlatScratch)) {
5923 Addr = PtrBase;
5924 ImmOffset = ConstOffset;
5925 }
5926
5927 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5928 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
5929 return std::nullopt;
5930
5931 Register RHS = AddrDef->MI->getOperand(2).getReg();
5932 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
5933 return std::nullopt;
5934
5935 Register LHS = AddrDef->MI->getOperand(1).getReg();
5936 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5937
5938 if (OrigAddr != Addr) {
5939 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
5940 return std::nullopt;
5941 } else {
5942 if (!isFlatScratchBaseLegalSV(OrigAddr))
5943 return std::nullopt;
5944 }
5945
5946 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
5947 return std::nullopt;
5948
5949 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
5950 ? AMDGPU::CPol::SCAL
5951 : 0;
5952
5953 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5954 int FI = LHSDef->MI->getOperand(1).getIndex();
5955 return {{
5956 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5957 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5958 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5959 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5960 }};
5961 }
5962
5963 if (!isSGPR(LHS))
5964 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
5965 LHS = Def->Reg;
5966
5967 if (!isSGPR(LHS))
5968 return std::nullopt;
5969
5970 return {{
5971 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5972 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
5973 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5974 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5975 }};
5976}
5977
5979AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
5980 MachineInstr *MI = Root.getParent();
5981 MachineBasicBlock *MBB = MI->getParent();
5982 MachineFunction *MF = MBB->getParent();
5983 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5984
5985 int64_t Offset = 0;
5986 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
5987 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
5988 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5989
5990 // TODO: Should this be inside the render function? The iterator seems to
5991 // move.
5992 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
5993 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5994 HighBits)
5995 .addImm(Offset & ~MaxOffset);
5996
5997 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5998 MIB.addReg(Info->getScratchRSrcReg());
5999 },
6000 [=](MachineInstrBuilder &MIB) { // vaddr
6001 MIB.addReg(HighBits);
6002 },
6003 [=](MachineInstrBuilder &MIB) { // soffset
6004 // Use constant zero for soffset and rely on eliminateFrameIndex
6005 // to choose the appropriate frame register if need be.
6006 MIB.addImm(0);
6007 },
6008 [=](MachineInstrBuilder &MIB) { // offset
6009 MIB.addImm(Offset & MaxOffset);
6010 }}};
6011 }
6012
6013 assert(Offset == 0 || Offset == -1);
6014
6015 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6016 // offsets.
6017 std::optional<int> FI;
6018 Register VAddr = Root.getReg();
6019
6020 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6021 Register PtrBase;
6022 int64_t ConstOffset;
6023 std::tie(PtrBase, ConstOffset, std::ignore) =
6024 getPtrBaseWithConstantOffset(VAddr, *MRI);
6025 if (ConstOffset != 0) {
6026 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6027 (!STI.privateMemoryResourceIsRangeChecked() ||
6028 VT->signBitIsZero(PtrBase))) {
6029 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6030 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6031 FI = PtrBaseDef->getOperand(1).getIndex();
6032 else
6033 VAddr = PtrBase;
6034 Offset = ConstOffset;
6035 }
6036 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6037 FI = RootDef->getOperand(1).getIndex();
6038 }
6039
6040 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6041 MIB.addReg(Info->getScratchRSrcReg());
6042 },
6043 [=](MachineInstrBuilder &MIB) { // vaddr
6044 if (FI)
6045 MIB.addFrameIndex(*FI);
6046 else
6047 MIB.addReg(VAddr);
6048 },
6049 [=](MachineInstrBuilder &MIB) { // soffset
6050 // Use constant zero for soffset and rely on eliminateFrameIndex
6051 // to choose the appropriate frame register if need be.
6052 MIB.addImm(0);
6053 },
6054 [=](MachineInstrBuilder &MIB) { // offset
6055 MIB.addImm(Offset);
6056 }}};
6057}
6058
6059bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6060 int64_t Offset) const {
6061 if (!isUInt<16>(Offset))
6062 return false;
6063
6064 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6065 return true;
6066
6067 // On Southern Islands, instructions with a negative base value and an offset
6068 // don't seem to work.
6069 return VT->signBitIsZero(Base);
6070}
6071
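// Check the offset pair for a two-address DS access. For example, with
// Size == 4, byte offsets 0 and 1020 are encodable (0 and 255 after dividing
// by Size), while 1024 is not (256 does not fit in 8 bits).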
6072bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6073 int64_t Offset1,
6074 unsigned Size) const {
6075 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6076 return false;
6077 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6078 return false;
6079
6080 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6081 return true;
6082
6083 // On Southern Islands, instructions with a negative base value and an offset
6084 // don't seem to work.
6085 return VT->signBitIsZero(Base);
6086}
6087
6088// Return whether the operation has NoUnsignedWrap property.
6089static bool isNoUnsignedWrap(MachineInstr *Addr) {
6090 return Addr->getOpcode() == TargetOpcode::G_OR ||
6091 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6092 Addr->getFlag(MachineInstr::NoUWrap));
6093}
6094
6095// Check that the base address of a flat scratch load/store in the form of
6096// `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned, per the
6097// hardware requirement). We always treat the first operand as the base address here.
6098bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6099 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6100
6101 if (isNoUnsignedWrap(AddrMI))
6102 return true;
6103
6104 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6105 // values.
6106 if (STI.hasSignedScratchOffsets())
6107 return true;
6108
6109 Register LHS = AddrMI->getOperand(1).getReg();
6110 Register RHS = AddrMI->getOperand(2).getReg();
6111
6112 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6113 std::optional<ValueAndVReg> RhsValReg =
6114 getIConstantVRegValWithLookThrough(RHS, *MRI);
6115 // If the immediate offset is negative and within certain range, the base
6116 // address cannot also be negative. If the base is also negative, the sum
6117 // would be either negative or much larger than the valid range of scratch
6118 // memory a thread can access.
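 // For example, a constant offset of -16 can only yield a valid scratch
 // address if the base is non-negative, so the pair is accepted here without
 // checking the sign bit of the base.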
6119 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6120 RhsValReg->Value.getSExtValue() > -0x40000000)
6121 return true;
6122 }
6123
6124 return VT->signBitIsZero(LHS);
6125}
6126
6127// Check that the address values in the SGPR/VGPR are legal for flat scratch in
6128// the form: SGPR + VGPR.
6129bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6130 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6131
6132 if (isNoUnsignedWrap(AddrMI))
6133 return true;
6134
6135 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6136 // values.
6137 if (STI.hasSignedScratchOffsets())
6138 return true;
6139
6140 Register LHS = AddrMI->getOperand(1).getReg();
6141 Register RHS = AddrMI->getOperand(2).getReg();
6142 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6143}
6144
6145// Check that the address values in the SGPR/VGPR are legal for flat scratch in
6146// the form: SGPR + VGPR + Imm.
6147bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6148 Register Addr) const {
6149 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6150 // values.
6151 if (STI.hasSignedScratchOffsets())
6152 return true;
6153
6154 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6155 Register Base = AddrMI->getOperand(1).getReg();
6156 std::optional<DefinitionAndSourceRegister> BaseDef =
6157 getDefSrcRegIgnoringCopies(Base, *MRI);
6158 std::optional<ValueAndVReg> RHSOffset =
6159 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
6160 assert(RHSOffset);
6161
6162 // If the immediate offset is negative and within certain range, the base
6163 // address cannot also be negative. If the base is also negative, the sum
6164 // would be either negative or much larger than the valid range of scratch
6165 // memory a thread can access.
6166 if (isNoUnsignedWrap(BaseDef->MI) &&
6167 (isNoUnsignedWrap(AddrMI) ||
6168 (RHSOffset->Value.getSExtValue() < 0 &&
6169 RHSOffset->Value.getSExtValue() > -0x40000000)))
6170 return true;
6171
6172 Register LHS = BaseDef->MI->getOperand(1).getReg();
6173 Register RHS = BaseDef->MI->getOperand(2).getReg();
6174 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6175}
6176
6177bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6178 unsigned ShAmtBits) const {
6179 assert(MI.getOpcode() == TargetOpcode::G_AND);
6180
6181 std::optional<APInt> RHS =
6182 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6183 if (!RHS)
6184 return false;
6185
6186 if (RHS->countr_one() >= ShAmtBits)
6187 return true;
6188
6189 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6190 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6191}
6192
6194AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6195 MachineOperand &Root) const {
6196 Register Reg = Root.getReg();
6197 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6198
6199 std::optional<DefinitionAndSourceRegister> Def =
6200 getDefSrcRegIgnoringCopies(Reg, *MRI);
6201 assert(Def && "this shouldn't be an optional result");
6202 Reg = Def->Reg;
6203
6204 if (Register WaveBase = getWaveAddress(Def->MI)) {
6205 return {{
6206 [=](MachineInstrBuilder &MIB) { // rsrc
6207 MIB.addReg(Info->getScratchRSrcReg());
6208 },
6209 [=](MachineInstrBuilder &MIB) { // soffset
6210 MIB.addReg(WaveBase);
6211 },
6212 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6213 }};
6214 }
6215
6216 int64_t Offset = 0;
6217
6218 // FIXME: Copy check is a hack
6219 Register BasePtr;
6220 if (mi_match(Reg, *MRI,
6221 m_GPtrAdd(m_Reg(BasePtr),
6222 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
6223 if (!TII.isLegalMUBUFImmOffset(Offset))
6224 return {};
6225 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6226 Register WaveBase = getWaveAddress(BasePtrDef);
6227 if (!WaveBase)
6228 return {};
6229
6230 return {{
6231 [=](MachineInstrBuilder &MIB) { // rsrc
6232 MIB.addReg(Info->getScratchRSrcReg());
6233 },
6234 [=](MachineInstrBuilder &MIB) { // soffset
6235 MIB.addReg(WaveBase);
6236 },
6237 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6238 }};
6239 }
6240
6241 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6242 !TII.isLegalMUBUFImmOffset(Offset))
6243 return {};
6244
6245 return {{
6246 [=](MachineInstrBuilder &MIB) { // rsrc
6247 MIB.addReg(Info->getScratchRSrcReg());
6248 },
6249 [=](MachineInstrBuilder &MIB) { // soffset
6250 MIB.addImm(0);
6251 },
6252 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6253 }};
6254}
6255
6256std::pair<Register, unsigned>
6257AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6258 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6259 int64_t ConstAddr = 0;
6260
6261 Register PtrBase;
6262 int64_t Offset;
6263 std::tie(PtrBase, Offset, std::ignore) =
6264 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6265
6266 if (Offset) {
6267 if (isDSOffsetLegal(PtrBase, Offset)) {
6268 // (add n0, c0)
6269 return std::pair(PtrBase, Offset);
6270 }
6271 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6272 // TODO
6273
6274
6275 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6276 // TODO
6277
6278 }
6279
6280 return std::pair(Root.getReg(), 0);
6281}
6282
6284AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6285 Register Reg;
6286 unsigned Offset;
6287 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6288 return {{
6289 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6290 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6291 }};
6292}
6293
6295AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6296 return selectDSReadWrite2(Root, 4);
6297}
6298
6300AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6301 return selectDSReadWrite2(Root, 8);
6302}
6303
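// Used by the 4-byte and 8-byte aligned DS selectors above: the two rendered
// offsets are in units of Size (offset0 = byte offset / Size, and
// offset1 = offset0 + 1).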
6305AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6306 unsigned Size) const {
6307 Register Reg;
6308 unsigned Offset;
6309 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6310 return {{
6311 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6312 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6313 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6314 }};
6315}
6316
6317std::pair<Register, unsigned>
6318AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6319 unsigned Size) const {
6320 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6321 int64_t ConstAddr = 0;
6322
6323 Register PtrBase;
6324 int64_t Offset;
6325 std::tie(PtrBase, Offset, std::ignore) =
6326 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6327
6328 if (Offset) {
6329 int64_t OffsetValue0 = Offset;
6330 int64_t OffsetValue1 = Offset + Size;
6331 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6332 // (add n0, c0)
6333 return std::pair(PtrBase, OffsetValue0 / Size);
6334 }
6335 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6336 // TODO
6337
6338 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6339 // TODO
6340
6341 }
6342
6343 return std::pair(Root.getReg(), 0);
6344}
6345
6346/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6347/// the base value with the constant offset, and whether the offset computation is
6348/// known to be inbounds. There may be intervening copies between \p Root and
6349/// the identified constant. Returns \p Root, 0, false if this does not match
6350/// the pattern.
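/// For example, an inbounds (G_PTR_ADD %base, 16), possibly behind copies,
/// yields {%base, 16, true}.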
6351std::tuple<Register, int64_t, bool>
6352AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6353 Register Root, const MachineRegisterInfo &MRI) const {
6354 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6355 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6356 return {Root, 0, false};
6357
6358 MachineOperand &RHS = RootI->getOperand(2);
6359 std::optional<ValueAndVReg> MaybeOffset =
6360 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
6361 if (!MaybeOffset)
6362 return {Root, 0, false};
6363 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6364 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6365 IsInBounds};
6366}
6367
6368static void addZeroImm(MachineInstrBuilder &MIB) {
6369 MIB.addImm(0);
6370}
6371
6372/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6373/// BasePtr is not valid, a null base pointer will be used.
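/// The resulting SGPR_128 descriptor is laid out as \p BasePtr (or 0) in
/// sub0_sub1, \p FormatLo in sub2 and \p FormatHi in sub3.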
6374static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6375 uint32_t FormatLo, uint32_t FormatHi,
6376 Register BasePtr) {
6377 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6378 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6379 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6380 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6381
6382 B.buildInstr(AMDGPU::S_MOV_B32)
6383 .addDef(RSrc2)
6384 .addImm(FormatLo);
6385 B.buildInstr(AMDGPU::S_MOV_B32)
6386 .addDef(RSrc3)
6387 .addImm(FormatHi);
6388
6389 // Build the half of the subregister with the constants before building the
6390 // full 128-bit register. If we are building multiple resource descriptors,
6391 // this will allow CSEing of the 2-component register.
6392 B.buildInstr(AMDGPU::REG_SEQUENCE)
6393 .addDef(RSrcHi)
6394 .addReg(RSrc2)
6395 .addImm(AMDGPU::sub0)
6396 .addReg(RSrc3)
6397 .addImm(AMDGPU::sub1);
6398
6399 Register RSrcLo = BasePtr;
6400 if (!BasePtr) {
6401 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6402 B.buildInstr(AMDGPU::S_MOV_B64)
6403 .addDef(RSrcLo)
6404 .addImm(0);
6405 }
6406
6407 B.buildInstr(AMDGPU::REG_SEQUENCE)
6408 .addDef(RSrc)
6409 .addReg(RSrcLo)
6410 .addImm(AMDGPU::sub0_sub1)
6411 .addReg(RSrcHi)
6412 .addImm(AMDGPU::sub2_sub3);
6413
6414 return RSrc;
6415}
6416
6417static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6418 const SIInstrInfo &TII, Register BasePtr) {
6419 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6420
6421 // FIXME: Why are half the "default" bits ignored based on the addressing
6422 // mode?
6423 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6424}
6425
6426static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6427 const SIInstrInfo &TII, Register BasePtr) {
6428 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6429
6430 // FIXME: Why are half the "default" bits ignored based on the addressing
6431 // mode?
6432 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6433}
6434
6435AMDGPUInstructionSelector::MUBUFAddressData
6436AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6437 MUBUFAddressData Data;
6438 Data.N0 = Src;
6439
6440 Register PtrBase;
6441 int64_t Offset;
6442
6443 std::tie(PtrBase, Offset, std::ignore) =
6444 getPtrBaseWithConstantOffset(Src, *MRI);
6445 if (isUInt<32>(Offset)) {
6446 Data.N0 = PtrBase;
6447 Data.Offset = Offset;
6448 }
6449
6450 if (MachineInstr *InputAdd
6451 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6452 Data.N2 = InputAdd->getOperand(1).getReg();
6453 Data.N3 = InputAdd->getOperand(2).getReg();
6454
6455 // FIXME: Need to fix extra SGPR->VGPR copies inserted
6456 // FIXME: We don't know that this was defined by operand 0
6457 //
6458 // TODO: Remove this when we have copy folding optimizations after
6459 // RegBankSelect.
6460 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6461 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6462 }
6463
6464 return Data;
6465}
6466
6467/// Return whether the addr64 MUBUF mode should be used for the given address.
6468bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6469 // (ptr_add N2, N3) -> addr64, or
6470 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6471 if (Addr.N2)
6472 return true;
6473
6474 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6475 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6476}
6477
6478/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6479/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6480/// component.
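/// For example, an offset that does not fit the MUBUF immediate field is moved
/// entirely into a freshly materialized \p SOffset register and \p ImmOffset is
/// reset to 0.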
6481void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6482 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6483 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6484 return;
6485
6486 // Illegal offset, store it in soffset.
6487 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6488 B.buildInstr(AMDGPU::S_MOV_B32)
6489 .addDef(SOffset)
6490 .addImm(ImmOffset);
6491 ImmOffset = 0;
6492}
6493
6494bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6495 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6496 Register &SOffset, int64_t &Offset) const {
6497 // FIXME: Predicates should stop this from reaching here.
6498 // addr64 bit was removed for volcanic islands.
6499 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6500 return false;
6501
6502 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6503 if (!shouldUseAddr64(AddrData))
6504 return false;
6505
6506 Register N0 = AddrData.N0;
6507 Register N2 = AddrData.N2;
6508 Register N3 = AddrData.N3;
6509 Offset = AddrData.Offset;
6510
6511 // Base pointer for the SRD.
6512 Register SRDPtr;
6513
6514 if (N2) {
6515 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6516 assert(N3);
6517 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6518 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6519 // addr64, and construct the default resource from a 0 address.
6520 VAddr = N0;
6521 } else {
6522 SRDPtr = N3;
6523 VAddr = N2;
6524 }
6525 } else {
6526 // N2 is not divergent.
6527 SRDPtr = N2;
6528 VAddr = N3;
6529 }
6530 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6531 // Use the default null pointer in the resource
6532 VAddr = N0;
6533 } else {
6534 // N0 -> offset, or
6535 // (N0 + C1) -> offset
6536 SRDPtr = N0;
6537 }
6538
6539 MachineIRBuilder B(*Root.getParent());
6540 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6541 splitIllegalMUBUFOffset(B, SOffset, Offset);
6542 return true;
6543}
6544
6545bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6546 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6547 int64_t &Offset) const {
6548
6549 // FIXME: Pattern should not reach here.
6550 if (STI.useFlatForGlobal())
6551 return false;
6552
6553 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6554 if (shouldUseAddr64(AddrData))
6555 return false;
6556
6557 // N0 -> offset, or
6558 // (N0 + C1) -> offset
6559 Register SRDPtr = AddrData.N0;
6560 Offset = AddrData.Offset;
6561
6562 // TODO: Look through extensions for 32-bit soffset.
6563 MachineIRBuilder B(*Root.getParent());
6564
6565 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6566 splitIllegalMUBUFOffset(B, SOffset, Offset);
6567 return true;
6568}
6569
6571AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6572 Register VAddr;
6573 Register RSrcReg;
6574 Register SOffset;
6575 int64_t Offset = 0;
6576
6577 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6578 return {};
6579
6580 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6581 // pattern.
6582 return {{
6583 [=](MachineInstrBuilder &MIB) { // rsrc
6584 MIB.addReg(RSrcReg);
6585 },
6586 [=](MachineInstrBuilder &MIB) { // vaddr
6587 MIB.addReg(VAddr);
6588 },
6589 [=](MachineInstrBuilder &MIB) { // soffset
6590 if (SOffset)
6591 MIB.addReg(SOffset);
6592 else if (STI.hasRestrictedSOffset())
6593 MIB.addReg(AMDGPU::SGPR_NULL);
6594 else
6595 MIB.addImm(0);
6596 },
6597 [=](MachineInstrBuilder &MIB) { // offset
6598 MIB.addImm(Offset);
6599 },
6600 addZeroImm, // cpol
6601 addZeroImm, // tfe
6602 addZeroImm // swz
6603 }};
6604}
6605
6607AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6608 Register RSrcReg;
6609 Register SOffset;
6610 int64_t Offset = 0;
6611
6612 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6613 return {};
6614
6615 return {{
6616 [=](MachineInstrBuilder &MIB) { // rsrc
6617 MIB.addReg(RSrcReg);
6618 },
6619 [=](MachineInstrBuilder &MIB) { // soffset
6620 if (SOffset)
6621 MIB.addReg(SOffset);
6622 else if (STI.hasRestrictedSOffset())
6623 MIB.addReg(AMDGPU::SGPR_NULL);
6624 else
6625 MIB.addImm(0);
6626 },
6627 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6628 addZeroImm, // cpol
6629 addZeroImm, // tfe
6630 addZeroImm, // swz
6631 }};
6632}
6633
6635AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6636
6637 Register SOffset = Root.getReg();
6638
6639 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6640 SOffset = AMDGPU::SGPR_NULL;
6641
6642 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
6643}
6644
6645/// Get an immediate that must fit in 32 bits and is treated as zero extended.
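/// For example, a 64-bit G_CONSTANT of -4 is returned as 0xFFFFFFFC, while
/// 0x100000000 does not fit in 32 bits and yields std::nullopt.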
6646static std::optional<uint64_t>
6648 // getIConstantVRegVal sexts any values, so see if that matters.
6649 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
6650 if (!OffsetVal || !isInt<32>(*OffsetVal))
6651 return std::nullopt;
6652 return Lo_32(*OffsetVal);
6653}
6654
6655InstructionSelector::ComplexRendererFns
6656AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6657 std::optional<uint64_t> OffsetVal =
6658 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
6659 if (!OffsetVal)
6660 return {};
6661
6662 std::optional<int64_t> EncodedImm =
6663 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
6664 if (!EncodedImm)
6665 return {};
6666
6667 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6668}
6669
6670InstructionSelector::ComplexRendererFns
6671AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6672 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6673
6674 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
6675 if (!OffsetVal)
6676 return {};
6677
6678 std::optional<int64_t> EncodedImm =
6679      AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
6680  if (!EncodedImm)
6681 return {};
6682
6683 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6684}
6685
6686InstructionSelector::ComplexRendererFns
6687AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6688 // Match the (soffset + offset) pair as a 32-bit register base and
6689 // an immediate offset.
6690 Register SOffset;
6691 unsigned Offset;
6692 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
6693 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
6694 if (!SOffset)
6695 return std::nullopt;
6696
6697 std::optional<int64_t> EncodedOffset =
6698 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
6699 if (!EncodedOffset)
6700 return std::nullopt;
6701
6702 assert(MRI->getType(SOffset) == LLT::scalar(32));
6703 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6704 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
6705}
6706
6707std::pair<Register, unsigned>
6708AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6709 bool &Matched) const {
6710 Matched = false;
6711
6712 Register Src;
6713 unsigned Mods;
6714 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
6715
6716 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
6717 assert(MRI->getType(Src) == LLT::scalar(16));
6718
6719    // Only change Src if a source modifier could be gained. In such cases the
6720    // new Src could be an SGPR, but this does not violate the constant bus
6721    // restriction for the instruction being selected.
6722 Src = stripBitCast(Src, *MRI);
6723
6724 const auto CheckAbsNeg = [&]() {
6725 // Be careful about folding modifiers if we already have an abs. fneg is
6726 // applied last, so we don't want to apply an earlier fneg.
6727 if ((Mods & SISrcMods::ABS) == 0) {
6728 unsigned ModsTmp;
6729 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
6730
6731 if ((ModsTmp & SISrcMods::NEG) != 0)
6732 Mods ^= SISrcMods::NEG;
6733
6734 if ((ModsTmp & SISrcMods::ABS) != 0)
6735 Mods |= SISrcMods::ABS;
6736 }
6737 };
6738
6739 CheckAbsNeg();
6740
6741 // op_sel/op_sel_hi decide the source type and source.
6742 // If the source's op_sel_hi is set, it indicates to do a conversion from
6743  // fp16. If the source's op_sel is set, it picks the high half of the
6744 // source register.
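  // For instance (worked example, not in the source): a source of the form
  // fpext(extract_hi(<2 x s16> reg)) ends up with both OP_SEL_1 (treat the
  // operand as fp16 and convert) and OP_SEL_0 (use the high half) set below.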
6745
6746 Mods |= SISrcMods::OP_SEL_1;
6747
6748 if (isExtractHiElt(*MRI, Src, Src)) {
6749 Mods |= SISrcMods::OP_SEL_0;
6750 CheckAbsNeg();
6751 }
6752
6753 Matched = true;
6754 }
6755
6756 return {Src, Mods};
6757}
6758
6759InstructionSelector::ComplexRendererFns
6760AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
6761 MachineOperand &Root) const {
6762 Register Src;
6763 unsigned Mods;
6764 bool Matched;
6765 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6766 if (!Matched)
6767 return {};
6768
6769 return {{
6770 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6771 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6772 }};
6773}
6774
6775InstructionSelector::ComplexRendererFns
6776AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
6777 Register Src;
6778 unsigned Mods;
6779 bool Matched;
6780 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6781
6782 return {{
6783 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6784 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6785 }};
6786}
6787
6788bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
6789 MachineInstr &I, Intrinsic::ID IntrID) const {
6790 MachineBasicBlock *MBB = I.getParent();
6791 const DebugLoc &DL = I.getDebugLoc();
6792 Register CCReg = I.getOperand(0).getReg();
6793
6794 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6795 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
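  // Comparing 0 == 0 always writes 1 to SCC, so the copy of SCC below reads
  // "true" unless the barrier instruction overwrites it (explanatory note,
  // not present in the source).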
6796
6797 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
6798 .addImm(I.getOperand(2).getImm());
6799
6800 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
6801
6802 I.eraseFromParent();
6803 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
6804 *MRI);
6805}
6806
6807bool AMDGPUInstructionSelector::selectSGetBarrierState(
6808 MachineInstr &I, Intrinsic::ID IntrID) const {
6809 MachineBasicBlock *MBB = I.getParent();
6810 const DebugLoc &DL = I.getDebugLoc();
6811 const MachineOperand &BarOp = I.getOperand(2);
6812 std::optional<int64_t> BarValImm =
6813 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6814
6815 if (!BarValImm) {
6816 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6817 .addReg(BarOp.getReg());
6818 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6819 }
6820 MachineInstrBuilder MIB;
6821 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
6822 : AMDGPU::S_GET_BARRIER_STATE_M0;
6823 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6824
6825 auto DstReg = I.getOperand(0).getReg();
6826 const TargetRegisterClass *DstRC =
6827 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6828 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6829 return false;
6830 MIB.addDef(DstReg);
6831 if (BarValImm) {
6832 MIB.addImm(*BarValImm);
6833 }
6834 I.eraseFromParent();
6835 return true;
6836}
6837
6838unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
6839 if (HasInlineConst) {
6840 switch (IntrID) {
6841 default:
6842 llvm_unreachable("not a named barrier op");
6843 case Intrinsic::amdgcn_s_barrier_join:
6844 return AMDGPU::S_BARRIER_JOIN_IMM;
6845 case Intrinsic::amdgcn_s_wakeup_barrier:
6846 return AMDGPU::S_WAKEUP_BARRIER_IMM;
6847 case Intrinsic::amdgcn_s_get_named_barrier_state:
6848 return AMDGPU::S_GET_BARRIER_STATE_IMM;
6849 };
6850 } else {
6851 switch (IntrID) {
6852 default:
6853 llvm_unreachable("not a named barrier op");
6854 case Intrinsic::amdgcn_s_barrier_join:
6855 return AMDGPU::S_BARRIER_JOIN_M0;
6856 case Intrinsic::amdgcn_s_wakeup_barrier:
6857 return AMDGPU::S_WAKEUP_BARRIER_M0;
6858 case Intrinsic::amdgcn_s_get_named_barrier_state:
6859 return AMDGPU::S_GET_BARRIER_STATE_M0;
6860 };
6861 }
6862}
6863
6864bool AMDGPUInstructionSelector::selectNamedBarrierInit(
6865 MachineInstr &I, Intrinsic::ID IntrID) const {
6866 MachineBasicBlock *MBB = I.getParent();
6867 const DebugLoc &DL = I.getDebugLoc();
6868 const MachineOperand &BarOp = I.getOperand(1);
6869 const MachineOperand &CntOp = I.getOperand(2);
6870
6871 // BarID = (BarOp >> 4) & 0x3F
6872 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6873 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6874 .add(BarOp)
6875 .addImm(4u)
6876 .setOperandDead(3); // Dead scc
6877
6878 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6879 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6880 .addReg(TmpReg0)
6881 .addImm(0x3F)
6882 .setOperandDead(3); // Dead scc
6883
6884 // MO = ((CntOp & 0x3F) << shAmt) | BarID
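  // Worked example (not in the source): BarOp = 0x230 and CntOp = 5 give
  // BarID = (0x230 >> 4) & 0x3F = 0x23 and MO = (5 << 16) | 0x23 = 0x50023.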
6885 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6886 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
6887 .add(CntOp)
6888 .addImm(0x3F)
6889 .setOperandDead(3); // Dead scc
6890
6891 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6892 constexpr unsigned ShAmt = 16;
6893 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
6894 .addReg(TmpReg2)
6895 .addImm(ShAmt)
6896 .setOperandDead(3); // Dead scc
6897
6898 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6899 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
6900 .addReg(TmpReg1)
6901 .addReg(TmpReg3)
6902 .setOperandDead(3); // Dead scc;
6903
6904 auto CopyMIB =
6905 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
6906 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6907
6908 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
6909 ? AMDGPU::S_BARRIER_INIT_M0
6910 : AMDGPU::S_BARRIER_SIGNAL_M0;
6911 MachineInstrBuilder MIB;
6912 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6913
6914 I.eraseFromParent();
6915 return true;
6916}
6917
6918bool AMDGPUInstructionSelector::selectNamedBarrierInst(
6919 MachineInstr &I, Intrinsic::ID IntrID) const {
6920 MachineBasicBlock *MBB = I.getParent();
6921 const DebugLoc &DL = I.getDebugLoc();
6922 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
6923 ? I.getOperand(2)
6924 : I.getOperand(1);
6925 std::optional<int64_t> BarValImm =
6926 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6927
6928 if (!BarValImm) {
6929 // BarID = (BarOp >> 4) & 0x3F
6930 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6931 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6932 .addReg(BarOp.getReg())
6933 .addImm(4u)
6934 .setOperandDead(3); // Dead scc;
6935
6936 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6937 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6938 .addReg(TmpReg0)
6939 .addImm(0x3F)
6940 .setOperandDead(3); // Dead scc;
6941
6942 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6943 .addReg(TmpReg1);
6944 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6945 }
6946
6947 MachineInstrBuilder MIB;
6948 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
6949 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6950
6951 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
6952 auto DstReg = I.getOperand(0).getReg();
6953 const TargetRegisterClass *DstRC =
6954 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6955 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6956 return false;
6957 MIB.addDef(DstReg);
6958 }
6959
6960 if (BarValImm) {
6961 auto BarId = ((*BarValImm) >> 4) & 0x3F;
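    // Mirrors the non-immediate path above: the barrier ID lives in bits
    // [9:4] of the operand, so the same shift-and-mask is applied to the
    // constant before it is emitted as the _IMM form's operand.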
6962 MIB.addImm(BarId);
6963 }
6964
6965 I.eraseFromParent();
6966 return true;
6967}
6968
6969void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
6970 const MachineInstr &MI,
6971 int OpIdx) const {
6972 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6973 "Expected G_CONSTANT");
6974 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
6975}
6976
6977void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
6978 const MachineInstr &MI,
6979 int OpIdx) const {
6980 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6981 "Expected G_CONSTANT");
6982 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
6983}
6984
6985void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
6986 const MachineInstr &MI,
6987 int OpIdx) const {
6988 const MachineOperand &Op = MI.getOperand(1);
6989 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
6990 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
6991}
6992
6993void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
6994 const MachineInstr &MI,
6995 int OpIdx) const {
6996 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6997 "Expected G_CONSTANT");
6998 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
6999}
7000
7001/// This only really exists to satisfy DAG type checking machinery, so is a
7002/// no-op here.
7003void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7004 const MachineInstr &MI,
7005 int OpIdx) const {
7006 const MachineOperand &Op = MI.getOperand(OpIdx);
7007 int64_t Imm;
7008 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
7009 MIB.addImm(Imm);
7010 else
7011 MIB.addImm(Op.getImm());
7012}
7013
7014void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7015 const MachineInstr &MI,
7016 int OpIdx) const {
7017 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7018}
7019
7020void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7021 const MachineInstr &MI,
7022 int OpIdx) const {
7023 assert(OpIdx >= 0 && "expected to match an immediate operand");
7024 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7025}
7026
7027void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7028 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7029 assert(OpIdx >= 0 && "expected to match an immediate operand");
7030 MIB.addImm(
7031 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7032}
7033
7034void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7035 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7036 assert(OpIdx >= 0 && "expected to match an immediate operand");
7037 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
7038                 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
7039                 : (int64_t)SISrcMods::DST_OP_SEL);
7040}
7041
7042void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7043 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7044 assert(OpIdx >= 0 && "expected to match an immediate operand");
7045 MIB.addImm(
7046 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7047}
7048
7049void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7050 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7051 assert(OpIdx >= 0 && "expected to match an immediate operand");
7052 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7053 ? (int64_t)(SISrcMods::OP_SEL_0)
7054 : 0);
7055}
7056
7057void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7058 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7059 assert(OpIdx >= 0 && "expected to match an immediate operand");
7060 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7061 : 0);
7062}
7063
7064void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7065 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7066 assert(OpIdx >= 0 && "expected to match an immediate operand");
7067 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7068 : 0);
7069}
7070
7071void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7072 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7073 assert(OpIdx >= 0 && "expected to match an immediate operand");
7074 MIB.addImm(
7075 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7076}
7077
7078void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7079 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7080 assert(OpIdx >= 0 && "expected to match an immediate operand");
7081 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7082 ? (int64_t)SISrcMods::DST_OP_SEL
7083 : 0);
7084}
7085
7086void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7087 const MachineInstr &MI,
7088 int OpIdx) const {
7089 assert(OpIdx >= 0 && "expected to match an immediate operand");
7090 MIB.addImm(MI.getOperand(OpIdx).getImm() &
7091             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7092                                       : AMDGPU::CPol::ALL_pregfx12));
7093}
7094
7095void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7096 const MachineInstr &MI,
7097 int OpIdx) const {
7098 assert(OpIdx >= 0 && "expected to match an immediate operand");
7099 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7100                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
7101                                                 : AMDGPU::CPol::SWZ_pregfx12);
7102  MIB.addImm(Swizzle);
7103}
7104
7105void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7106 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7107 assert(OpIdx >= 0 && "expected to match an immediate operand");
7108 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7109                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7110                                                  : AMDGPU::CPol::ALL_pregfx12);
7111  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7112}
7113
7114void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7115 const MachineInstr &MI,
7116 int OpIdx) const {
7117 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7118}
7119
7120void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7121 const MachineInstr &MI,
7122 int OpIdx) const {
7123 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7124 int ExpVal = APF.getExactLog2Abs();
7125 assert(ExpVal != INT_MIN);
7126 MIB.addImm(ExpVal);
7127}
7128
7129void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7130 const MachineInstr &MI,
7131 int OpIdx) const {
7132 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7133 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7134 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7135 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7136 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7137}
7138
7139void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7140 const MachineInstr &MI,
7141 int OpIdx) const {
7142 unsigned Mods = SISrcMods::OP_SEL_1;
7143 if (MI.getOperand(OpIdx).getImm())
7144 Mods ^= SISrcMods::NEG;
7145 MIB.addImm((int64_t)Mods);
7146}
7147
7148void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7149 const MachineInstr &MI,
7150 int OpIdx) const {
7151 unsigned Mods = SISrcMods::OP_SEL_1;
7152 if (MI.getOperand(OpIdx).getImm())
7153    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7154  MIB.addImm((int64_t)Mods);
7155}
7156
7157void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7158 const MachineInstr &MI,
7159 int OpIdx) const {
7160 unsigned Val = MI.getOperand(OpIdx).getImm();
7161 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7162 if (Val == 1) // neg
7163 Mods ^= SISrcMods::NEG;
7164 if (Val == 2) // abs
7165 Mods ^= SISrcMods::ABS;
7166 if (Val == 3) // neg and abs
7167 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7168 MIB.addImm((int64_t)Mods);
7169}
7170
7171void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7172 const MachineInstr &MI,
7173 int OpIdx) const {
7174 uint32_t V = MI.getOperand(2).getImm();
7177 if (!Subtarget->hasSafeCUPrefetch())
7178 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7179 MIB.addImm(V);
7180}
7181
7182/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7183void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7184 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7185 unsigned Val = MI.getOperand(OpIdx).getImm();
7186 unsigned New = 0;
7187 if (Val & 0x1)
7188    New |= SISrcMods::OP_SEL_0;
7189  if (Val & 0x2)
7190    New |= SISrcMods::OP_SEL_1;
7191  MIB.addImm(New);
7192}
7193
7194bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7195 return TII.isInlineConstant(Imm);
7196}
7197
7198bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7199 return TII.isInlineConstant(Imm);
7200}