1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
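// The GET_GLOBALISEL_IMPL block pulls in the TableGen-generated selector
// tables and match routines; AMDGPUSubtarget is temporarily #defined to
// GCNSubtarget so the generated code binds to the GCN subtarget class.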
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 dyn_cast_if_present<const TargetRegisterClass *>(RegClassOrBank);
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
120 !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
121 return false;
122 const MCInstrDesc &MCID = MI.getDesc();
123 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
124 MI.getOperand(0).setIsEarlyClobber(true);
125 }
126 return true;
127}
128
129bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
130 const DebugLoc &DL = I.getDebugLoc();
131 MachineBasicBlock *BB = I.getParent();
132 I.setDesc(TII.get(TargetOpcode::COPY));
133
134 const MachineOperand &Src = I.getOperand(1);
135 MachineOperand &Dst = I.getOperand(0);
136 Register DstReg = Dst.getReg();
137 Register SrcReg = Src.getReg();
138
139 if (isVCC(DstReg, *MRI)) {
140 if (SrcReg == AMDGPU::SCC) {
141 const TargetRegisterClass *RC
142 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
143 if (!RC)
144 return true;
145 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
146 }
147
148 if (!isVCC(SrcReg, *MRI)) {
149 // TODO: Should probably leave the copy and let copyPhysReg expand it.
150 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
151 return false;
152
153 const TargetRegisterClass *SrcRC
154 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
155
156 std::optional<ValueAndVReg> ConstVal =
157 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
158 if (ConstVal) {
159 unsigned MovOpc =
160 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
161 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
162 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
163 } else {
164 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
165
166 // We can't trust the high bits at this point, so clear them.
167
168 // TODO: Skip masking high bits if def is known boolean.
169
170 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
171 assert(Subtarget->useRealTrue16Insts());
172 const int64_t NoMods = 0;
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
174 .addImm(NoMods)
175 .addImm(1)
176 .addImm(NoMods)
177 .addReg(SrcReg)
178 .addImm(NoMods);
179 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
180 .addImm(NoMods)
181 .addImm(0)
182 .addImm(NoMods)
183 .addReg(MaskedReg)
184 .addImm(NoMods);
185 } else {
186 bool IsSGPR = TRI.isSGPRClass(SrcRC);
187 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
188 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
189 .addImm(1)
190 .addReg(SrcReg);
191 if (IsSGPR)
192 And.setOperandDead(3); // Dead scc
193
194 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
195 .addImm(0)
196 .addReg(MaskedReg);
197 }
198 }
199
200 if (!MRI->getRegClassOrNull(SrcReg))
201 MRI->setRegClass(SrcReg, SrcRC);
202 I.eraseFromParent();
203 return true;
204 }
205
206 const TargetRegisterClass *RC =
207 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
208 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
209 return false;
210
211 return true;
212 }
213
214 for (const MachineOperand &MO : I.operands()) {
215 if (MO.getReg().isPhysical())
216 continue;
217
218 const TargetRegisterClass *RC =
219 TRI.getConstrainedRegClassForOperand(MO, *MRI);
220 if (!RC)
221 continue;
222 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
223 }
224 return true;
225}
226
227bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
228 const DebugLoc &DL = I.getDebugLoc();
229 MachineBasicBlock *BB = I.getParent();
230 Register VCCReg = I.getOperand(1).getReg();
231 MachineInstr *Cmp;
232
233 // Set SCC as a side effect with S_CMP or S_OR.
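// e.g. on wave64: S_CMP_LG_U64 vcc, 0 (or S_OR_B64 dead, vcc, vcc when 64-bit
// scalar compares are unavailable), followed by dst = COPY scc.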
234 if (STI.hasScalarCompareEq64()) {
235 unsigned CmpOpc =
236 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
237 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
238 } else {
239 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
240 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
241 .addReg(VCCReg)
242 .addReg(VCCReg);
243 }
244
245 if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
246 return false;
247
248 Register DstReg = I.getOperand(0).getReg();
249 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
250
251 I.eraseFromParent();
252 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
253}
254
255bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
256 const DebugLoc &DL = I.getDebugLoc();
257 MachineBasicBlock *BB = I.getParent();
258
259 Register DstReg = I.getOperand(0).getReg();
260 Register SrcReg = I.getOperand(1).getReg();
261 std::optional<ValueAndVReg> Arg =
262 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
263
264 if (Arg) {
265 const int64_t Value = Arg->Value.getZExtValue();
266 if (Value == 0) {
267 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
268 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
269 } else {
270 assert(Value == 1);
271 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
272 }
273 I.eraseFromParent();
274 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
275 }
276
277 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
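// Emitted sequence: scc = COPY src, then dst = S_CSELECT_B64/_B32 exec, 0,
// i.e. dst becomes the full exec mask when src is true and 0 otherwise.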
278 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
279
280 unsigned SelectOpcode =
281 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
282 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
283 .addReg(TRI.getExec())
284 .addImm(0);
285
286 I.eraseFromParent();
287 return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
288}
289
290bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 Register SrcReg = I.getOperand(1).getReg();
293
294 const DebugLoc &DL = I.getDebugLoc();
295 MachineBasicBlock *BB = I.getParent();
296
297 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
298 .addReg(SrcReg);
299
300 I.eraseFromParent();
301 return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
302}
303
304bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
305 const Register DefReg = I.getOperand(0).getReg();
306 const LLT DefTy = MRI->getType(DefReg);
307
308 // S1 G_PHIs should not be selected in instruction-select, instead:
309 // - divergent S1 G_PHI should go through lane mask merging algorithm
310 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
311 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
312 if (DefTy == LLT::scalar(1))
313 return false;
314
315 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
316
317 const RegClassOrRegBank &RegClassOrBank =
318 MRI->getRegClassOrRegBank(DefReg);
319
320 const TargetRegisterClass *DefRC =
321 dyn_cast_if_present<const TargetRegisterClass *>(RegClassOrBank);
322 if (!DefRC) {
323 if (!DefTy.isValid()) {
324 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
325 return false;
326 }
327
328 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
329 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
330 if (!DefRC) {
331 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
332 return false;
333 }
334 }
335
336 // If inputs have register bank, assign corresponding reg class.
337 // Note: registers don't need to have the same reg bank.
338 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
339 const Register SrcReg = I.getOperand(i).getReg();
340
341 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
342 if (RB) {
343 const LLT SrcTy = MRI->getType(SrcReg);
344 const TargetRegisterClass *SrcRC =
345 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
346 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
347 return false;
348 }
349 }
350
351 I.setDesc(TII.get(TargetOpcode::PHI));
352 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
353}
354
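// Return the low (sub0) or high (sub1) 32-bit half of a 64-bit operand:
// register operands get a subregister COPY into a fresh vreg of SubRC,
// immediate operands are split arithmetically.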
355MachineOperand
356AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
357 const TargetRegisterClass &SubRC,
358 unsigned SubIdx) const {
359
360 MachineInstr *MI = MO.getParent();
361 MachineBasicBlock *BB = MO.getParent()->getParent();
362 Register DstReg = MRI->createVirtualRegister(&SubRC);
363
364 if (MO.isReg()) {
365 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
366 Register Reg = MO.getReg();
367 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
368 .addReg(Reg, 0, ComposedSubIdx);
369
370 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
371 MO.isKill(), MO.isDead(), MO.isUndef(),
372 MO.isEarlyClobber(), 0, MO.isDebug(),
373 MO.isInternalRead());
374 }
375
376 assert(MO.isImm());
377
378 APInt Imm(64, MO.getImm());
379
380 switch (SubIdx) {
381 default:
382 llvm_unreachable("do not know how to split immediate with this sub index.");
383 case AMDGPU::sub0:
384 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
385 case AMDGPU::sub1:
386 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
387 }
388}
389
390static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
391 switch (Opc) {
392 case AMDGPU::G_AND:
393 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
394 case AMDGPU::G_OR:
395 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
396 case AMDGPU::G_XOR:
397 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
398 default:
399 llvm_unreachable("not a bit op");
400 }
401}
402
403bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
404 Register DstReg = I.getOperand(0).getReg();
405 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
406
407 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
408 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
409 DstRB->getID() != AMDGPU::VCCRegBankID)
410 return false;
411
412 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
413 STI.isWave64());
414 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
415
416 // Dead implicit-def of scc
417 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
418 true, // isImp
419 false, // isKill
420 true)); // isDead
421 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
422}
423
424bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
425 MachineBasicBlock *BB = I.getParent();
426 MachineFunction *MF = BB->getParent();
427 Register DstReg = I.getOperand(0).getReg();
428 const DebugLoc &DL = I.getDebugLoc();
429 LLT Ty = MRI->getType(DstReg);
430 if (Ty.isVector())
431 return false;
432
433 unsigned Size = Ty.getSizeInBits();
434 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
435 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
436 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
437
438 if (Size == 32) {
439 if (IsSALU) {
440 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
441 MachineInstr *Add =
442 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
443 .add(I.getOperand(1))
444 .add(I.getOperand(2))
445 .setOperandDead(3); // Dead scc
446 I.eraseFromParent();
447 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
448 }
449
450 if (STI.hasAddNoCarry()) {
451 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
452 I.setDesc(TII.get(Opc));
453 I.addOperand(*MF, MachineOperand::CreateImm(0));
454 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
455 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
456 }
457
458 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
459
460 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
461 MachineInstr *Add
462 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
463 .addDef(UnusedCarry, RegState::Dead)
464 .add(I.getOperand(1))
465 .add(I.getOperand(2))
466 .addImm(0);
467 I.eraseFromParent();
468 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
469 }
470
471 assert(!Sub && "illegal sub should not reach here");
472
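// 64-bit add: split both sources into 32-bit halves, add the low halves
// producing a carry, add the high halves consuming it, and rebuild the
// 64-bit result with a REG_SEQUENCE.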
473 const TargetRegisterClass &RC
474 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
475 const TargetRegisterClass &HalfRC
476 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
477
478 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
479 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
480 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
481 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
482
483 Register DstLo = MRI->createVirtualRegister(&HalfRC);
484 Register DstHi = MRI->createVirtualRegister(&HalfRC);
485
486 if (IsSALU) {
487 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
488 .add(Lo1)
489 .add(Lo2);
490 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
491 .add(Hi1)
492 .add(Hi2)
493 .setOperandDead(3); // Dead scc
494 } else {
495 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
496 Register CarryReg = MRI->createVirtualRegister(CarryRC);
497 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
498 .addDef(CarryReg)
499 .add(Lo1)
500 .add(Lo2)
501 .addImm(0);
502 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
503 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
504 .add(Hi1)
505 .add(Hi2)
506 .addReg(CarryReg, RegState::Kill)
507 .addImm(0);
508
509 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
510 return false;
511 }
512
513 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
514 .addReg(DstLo)
515 .addImm(AMDGPU::sub0)
516 .addReg(DstHi)
517 .addImm(AMDGPU::sub1);
518
519
520 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
521 return false;
522
523 I.eraseFromParent();
524 return true;
525}
526
527bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
528 MachineInstr &I) const {
529 MachineBasicBlock *BB = I.getParent();
530 MachineFunction *MF = BB->getParent();
531 const DebugLoc &DL = I.getDebugLoc();
532 Register Dst0Reg = I.getOperand(0).getReg();
533 Register Dst1Reg = I.getOperand(1).getReg();
534 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
535 I.getOpcode() == AMDGPU::G_UADDE;
536 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
537 I.getOpcode() == AMDGPU::G_USUBE;
538
539 if (isVCC(Dst1Reg, *MRI)) {
540 unsigned NoCarryOpc =
541 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
542 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
543 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
544 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
545 I.addOperand(*MF, MachineOperand::CreateImm(0));
546 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
547 }
548
549 Register Src0Reg = I.getOperand(2).getReg();
550 Register Src1Reg = I.getOperand(3).getReg();
551
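// SALU path: feed any carry-in through SCC, use the S_ADDC/S_SUBB (or plain
// S_ADD/S_SUB) form, and read the carry-out back out of SCC when it is used.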
552 if (HasCarryIn) {
553 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
554 .addReg(I.getOperand(4).getReg());
555 }
556
557 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
558 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
559
560 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
561 .add(I.getOperand(2))
562 .add(I.getOperand(3));
563
564 if (MRI->use_nodbg_empty(Dst1Reg)) {
565 CarryInst.setOperandDead(3); // Dead scc
566 } else {
567 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
568 .addReg(AMDGPU::SCC);
569 if (!MRI->getRegClassOrNull(Dst1Reg))
570 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
571 }
572
573 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
574 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
575 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
576 return false;
577
578 if (HasCarryIn &&
579 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
580 AMDGPU::SReg_32RegClass, *MRI))
581 return false;
582
583 I.eraseFromParent();
584 return true;
585}
586
587bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
588 MachineInstr &I) const {
589 MachineBasicBlock *BB = I.getParent();
590 MachineFunction *MF = BB->getParent();
591 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
592 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
593 MRI->use_nodbg_empty(I.getOperand(1).getReg());
594
595 unsigned Opc;
596 if (Subtarget->hasMADIntraFwdBug())
597 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
598 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
599 else if (UseNoCarry)
600 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
601 : AMDGPU::V_MAD_NC_I64_I32_e64;
602 else
603 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
604
605 if (UseNoCarry)
606 I.removeOperand(1);
607
608 I.setDesc(TII.get(Opc));
609 I.addOperand(*MF, MachineOperand::CreateImm(0));
610 I.addImplicitDefUseOperands(*MF);
611 I.getOperand(0).setIsEarlyClobber(true);
612 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
613}
614
615// TODO: We should probably legalize these to only using 32-bit results.
616bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
617 MachineBasicBlock *BB = I.getParent();
618 Register DstReg = I.getOperand(0).getReg();
619 Register SrcReg = I.getOperand(1).getReg();
620 LLT DstTy = MRI->getType(DstReg);
621 LLT SrcTy = MRI->getType(SrcReg);
622 const unsigned SrcSize = SrcTy.getSizeInBits();
623 unsigned DstSize = DstTy.getSizeInBits();
624
625 // TODO: Should handle any multiple of 32 offset.
626 unsigned Offset = I.getOperand(2).getImm();
627 if (Offset % 32 != 0 || DstSize > 128)
628 return false;
629
630 // 16-bit operations really use 32-bit registers.
631 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
632 if (DstSize == 16)
633 DstSize = 32;
634
635 const TargetRegisterClass *DstRC =
636 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
637 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
638 return false;
639
640 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
641 const TargetRegisterClass *SrcRC =
642 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
643 if (!SrcRC)
644 return false;
645 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
646 DstSize / 32);
647 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
648 if (!SrcRC)
649 return false;
650
651 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
652 *SrcRC, I.getOperand(1));
653 const DebugLoc &DL = I.getDebugLoc();
654 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
655 .addReg(SrcReg, 0, SubReg);
656
657 I.eraseFromParent();
658 return true;
659}
660
661bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
662 MachineBasicBlock *BB = MI.getParent();
663 Register DstReg = MI.getOperand(0).getReg();
664 LLT DstTy = MRI->getType(DstReg);
665 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
666
667 const unsigned SrcSize = SrcTy.getSizeInBits();
668 if (SrcSize < 32)
669 return selectImpl(MI, *CoverageInfo);
670
671 const DebugLoc &DL = MI.getDebugLoc();
672 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
673 const unsigned DstSize = DstTy.getSizeInBits();
674 const TargetRegisterClass *DstRC =
675 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
676 if (!DstRC)
677 return false;
678
679 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
680 MachineInstrBuilder MIB =
681 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
682 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
683 MachineOperand &Src = MI.getOperand(I + 1);
684 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
685 MIB.addImm(SubRegs[I]);
686
687 const TargetRegisterClass *SrcRC
688 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
689 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
690 return false;
691 }
692
693 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
694 return false;
695
696 MI.eraseFromParent();
697 return true;
698}
699
700bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
701 MachineBasicBlock *BB = MI.getParent();
702 const int NumDst = MI.getNumOperands() - 1;
703
704 MachineOperand &Src = MI.getOperand(NumDst);
705
706 Register SrcReg = Src.getReg();
707 Register DstReg0 = MI.getOperand(0).getReg();
708 LLT DstTy = MRI->getType(DstReg0);
709 LLT SrcTy = MRI->getType(SrcReg);
710
711 const unsigned DstSize = DstTy.getSizeInBits();
712 const unsigned SrcSize = SrcTy.getSizeInBits();
713 const DebugLoc &DL = MI.getDebugLoc();
714 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
715
716 const TargetRegisterClass *SrcRC =
717 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
718 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
719 return false;
720
721 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
722 // source, and this relies on the fact that the same subregister indices are
723 // used for both.
724 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
725 for (int I = 0, E = NumDst; I != E; ++I) {
726 MachineOperand &Dst = MI.getOperand(I);
727 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
728 .addReg(SrcReg, 0, SubRegs[I]);
729
730 // Make sure the subregister index is valid for the source register.
731 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
732 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
733 return false;
734
735 const TargetRegisterClass *DstRC =
736 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
737 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
738 return false;
739 }
740
741 MI.eraseFromParent();
742 return true;
743}
744
745bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
746 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
747 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
748
749 Register Src0 = MI.getOperand(1).getReg();
750 Register Src1 = MI.getOperand(2).getReg();
751 LLT SrcTy = MRI->getType(Src0);
752 const unsigned SrcSize = SrcTy.getSizeInBits();
753
754 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
755 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
756 return selectG_MERGE_VALUES(MI);
757 }
758
759 // Selection logic below is for V2S16 only.
760 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
761 Register Dst = MI.getOperand(0).getReg();
762 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
763 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
764 SrcTy != LLT::scalar(32)))
765 return selectImpl(MI, *CoverageInfo);
766
767 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
768 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
769 return false;
770
771 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
772 DstBank->getID() == AMDGPU::VGPRRegBankID);
773 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
774
775 const DebugLoc &DL = MI.getDebugLoc();
776 MachineBasicBlock *BB = MI.getParent();
777
778 // First, before trying TableGen patterns, check if both sources are
779 // constants. In those cases, we can trivially compute the final constant
780 // and emit a simple move.
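// e.g. (build_vector_trunc 0x0001, 0xffff8002) folds to S_MOV_B32 0x80020001:
// the low 16 bits come from src0 and the high 16 bits from src1.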
781 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
782 if (ConstSrc1) {
783 auto ConstSrc0 =
784 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
785 if (ConstSrc0) {
786 const int64_t K0 = ConstSrc0->Value.getSExtValue();
787 const int64_t K1 = ConstSrc1->Value.getSExtValue();
788 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
789 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
790 uint32_t Imm = Lo16 | (Hi16 << 16);
791
792 // VALU
793 if (IsVector) {
794 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
795 MI.eraseFromParent();
796 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
797 }
798
799 // SALU
800 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
801 MI.eraseFromParent();
802 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
803 }
804 }
805
806 // Now try TableGen patterns.
807 if (selectImpl(MI, *CoverageInfo))
808 return true;
809
810 // TODO: This should probably be a combine somewhere
811 // (build_vector $src0, undef) -> copy $src0
812 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
813 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
814 MI.setDesc(TII.get(AMDGPU::COPY));
815 MI.removeOperand(2);
816 const auto &RC =
817 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
818 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
819 RBI.constrainGenericRegister(Src0, RC, *MRI);
820 }
821
822 // TODO: Can be improved?
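// VGPR path: mask the low half and shift/or the high half in, i.e.
// tmp = V_AND_B32 0xffff, src0; dst = V_LSHL_OR_B32 src1, 16, tmp.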
823 if (IsVector) {
824 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
825 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
826 .addImm(0xFFFF)
827 .addReg(Src0);
828 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
829 return false;
830
831 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
832 .addReg(Src1)
833 .addImm(16)
834 .addReg(TmpReg);
835 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
836 return false;
837
838 MI.eraseFromParent();
839 return true;
840 }
841
842 Register ShiftSrc0;
843 Register ShiftSrc1;
844
845 // With multiple uses of the shift, this will duplicate the shift and
846 // increase register pressure.
847 //
848 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
849 // => (S_PACK_HH_B32_B16 $src0, $src1)
850 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
851 // => (S_PACK_HL_B32_B16 $src0, $src1)
852 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
853 // => (S_PACK_LH_B32_B16 $src0, $src1)
854 // (build_vector $src0, $src1)
855 // => (S_PACK_LL_B32_B16 $src0, $src1)
856
857 bool Shift0 = mi_match(
858 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
859
860 bool Shift1 = mi_match(
861 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
862
863 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
864 if (Shift0 && Shift1) {
865 Opc = AMDGPU::S_PACK_HH_B32_B16;
866 MI.getOperand(1).setReg(ShiftSrc0);
867 MI.getOperand(2).setReg(ShiftSrc1);
868 } else if (Shift1) {
869 Opc = AMDGPU::S_PACK_LH_B32_B16;
870 MI.getOperand(2).setReg(ShiftSrc1);
871 } else if (Shift0) {
872 auto ConstSrc1 =
873 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
874 if (ConstSrc1 && ConstSrc1->Value == 0) {
875 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
876 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
877 .addReg(ShiftSrc0)
878 .addImm(16)
879 .setOperandDead(3); // Dead scc
880
881 MI.eraseFromParent();
882 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
883 }
884 if (STI.hasSPackHL()) {
885 Opc = AMDGPU::S_PACK_HL_B32_B16;
886 MI.getOperand(1).setReg(ShiftSrc0);
887 }
888 }
889
890 MI.setDesc(TII.get(Opc));
891 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
892}
893
894bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
895 const MachineOperand &MO = I.getOperand(0);
896
897 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
898 // regbank check here is to know why getConstrainedRegClassForOperand failed.
899 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
900 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
901 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
902 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
903 return true;
904 }
905
906 return false;
907}
908
909bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
910 MachineBasicBlock *BB = I.getParent();
911
912 Register DstReg = I.getOperand(0).getReg();
913 Register Src0Reg = I.getOperand(1).getReg();
914 Register Src1Reg = I.getOperand(2).getReg();
915 LLT Src1Ty = MRI->getType(Src1Reg);
916
917 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
918 unsigned InsSize = Src1Ty.getSizeInBits();
919
920 int64_t Offset = I.getOperand(3).getImm();
921
922 // FIXME: These cases should have been illegal and unnecessary to check here.
923 if (Offset % 32 != 0 || InsSize % 32 != 0)
924 return false;
925
926 // Currently not handled by getSubRegFromChannel.
927 if (InsSize > 128)
928 return false;
929
930 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
931 if (SubReg == AMDGPU::NoSubRegister)
932 return false;
933
934 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
935 const TargetRegisterClass *DstRC =
936 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
937 if (!DstRC)
938 return false;
939
940 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
941 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
942 const TargetRegisterClass *Src0RC =
943 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
944 const TargetRegisterClass *Src1RC =
945 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
946
947 // Deal with weird cases where the class only partially supports the subreg
948 // index.
949 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
950 if (!Src0RC || !Src1RC)
951 return false;
952
953 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
954 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
955 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
956 return false;
957
958 const DebugLoc &DL = I.getDebugLoc();
959 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
960 .addReg(Src0Reg)
961 .addReg(Src1Reg)
962 .addImm(SubReg);
963
964 I.eraseFromParent();
965 return true;
966}
967
968bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
969 Register DstReg = MI.getOperand(0).getReg();
970 Register SrcReg = MI.getOperand(1).getReg();
971 Register OffsetReg = MI.getOperand(2).getReg();
972 Register WidthReg = MI.getOperand(3).getReg();
973
974 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
975 "scalar BFX instructions are expanded in regbankselect");
976 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
977 "64-bit vector BFX instructions are expanded in regbankselect");
978
979 const DebugLoc &DL = MI.getDebugLoc();
980 MachineBasicBlock *MBB = MI.getParent();
981
982 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
983 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
984 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
985 .addReg(SrcReg)
986 .addReg(OffsetReg)
987 .addReg(WidthReg);
988 MI.eraseFromParent();
989 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
990}
991
992bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
993 if (STI.getLDSBankCount() != 16)
994 return selectImpl(MI, *CoverageInfo);
995
996 Register Dst = MI.getOperand(0).getReg();
997 Register Src0 = MI.getOperand(2).getReg();
998 Register M0Val = MI.getOperand(6).getReg();
999 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
1000 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
1001 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
1002 return false;
1003
1004 // This requires 2 instructions. It is possible to write a pattern to support
1005 // this, but the generated isel emitter doesn't correctly deal with multiple
1006 // output instructions using the same physical register input. The copy to m0
1007 // is incorrectly placed before the second instruction.
1008 //
1009 // TODO: Match source modifiers.
1010
1011 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1012 const DebugLoc &DL = MI.getDebugLoc();
1013 MachineBasicBlock *MBB = MI.getParent();
1014
1015 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1016 .addReg(M0Val);
1017 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1018 .addImm(2)
1019 .addImm(MI.getOperand(4).getImm()) // $attr
1020 .addImm(MI.getOperand(3).getImm()); // $attrchan
1021
1022 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1023 .addImm(0) // $src0_modifiers
1024 .addReg(Src0) // $src0
1025 .addImm(MI.getOperand(4).getImm()) // $attr
1026 .addImm(MI.getOperand(3).getImm()) // $attrchan
1027 .addImm(0) // $src2_modifiers
1028 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1029 .addImm(MI.getOperand(5).getImm()) // $high
1030 .addImm(0) // $clamp
1031 .addImm(0); // $omod
1032
1033 MI.eraseFromParent();
1034 return true;
1035}
1036
1037// Writelane is special in that it can use SGPR and M0 (which would normally
1038// count as using the constant bus twice - but in this case it is allowed since
1039// the lane selector doesn't count as a use of the constant bus). However, it is
1040// still required to abide by the 1 SGPR rule. Fix this up if we might have
1041// multiple SGPRs.
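// Resolution: a constant lane select becomes an inline immediate; otherwise,
// if the written value is not an inline immediate either, the lane select is
// routed through M0 so only a single SGPR is read.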
1042bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1043 // With a constant bus limit of at least 2, there's no issue.
1044 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1045 return selectImpl(MI, *CoverageInfo);
1046
1047 MachineBasicBlock *MBB = MI.getParent();
1048 const DebugLoc &DL = MI.getDebugLoc();
1049 Register VDst = MI.getOperand(0).getReg();
1050 Register Val = MI.getOperand(2).getReg();
1051 Register LaneSelect = MI.getOperand(3).getReg();
1052 Register VDstIn = MI.getOperand(4).getReg();
1053
1054 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1055
1056 std::optional<ValueAndVReg> ConstSelect =
1057 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1058 if (ConstSelect) {
1059 // The selector has to be an inline immediate, so we can use whatever for
1060 // the other operands.
1061 MIB.addReg(Val);
1062 MIB.addImm(ConstSelect->Value.getSExtValue() &
1063 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1064 } else {
1065 std::optional<ValueAndVReg> ConstVal =
1066 getIConstantVRegValWithLookThrough(Val, *MRI);
1067
1068 // If the value written is an inline immediate, we can get away without a
1069 // copy to m0.
1070 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1071 STI.hasInv2PiInlineImm())) {
1072 MIB.addImm(ConstVal->Value.getSExtValue());
1073 MIB.addReg(LaneSelect);
1074 } else {
1075 MIB.addReg(Val);
1076
1077 // If the lane selector was originally in a VGPR and copied with
1078 // readfirstlane, there's a hazard to read the same SGPR from the
1079 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1080 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1081
1082 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1083 .addReg(LaneSelect);
1084 MIB.addReg(AMDGPU::M0);
1085 }
1086 }
1087
1088 MIB.addReg(VDstIn);
1089
1090 MI.eraseFromParent();
1091 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1092}
1093
1094// We need to handle this here because tablegen doesn't support matching
1095// instructions with multiple outputs.
1096bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1097 Register Dst0 = MI.getOperand(0).getReg();
1098 Register Dst1 = MI.getOperand(1).getReg();
1099
1100 LLT Ty = MRI->getType(Dst0);
1101 unsigned Opc;
1102 if (Ty == LLT::scalar(32))
1103 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1104 else if (Ty == LLT::scalar(64))
1105 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1106 else
1107 return false;
1108
1109 // TODO: Match source modifiers.
1110
1111 const DebugLoc &DL = MI.getDebugLoc();
1112 MachineBasicBlock *MBB = MI.getParent();
1113
1114 Register Numer = MI.getOperand(3).getReg();
1115 Register Denom = MI.getOperand(4).getReg();
1116 unsigned ChooseDenom = MI.getOperand(5).getImm();
1117
1118 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1119
1120 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1121 .addDef(Dst1)
1122 .addImm(0) // $src0_modifiers
1123 .addUse(Src0) // $src0
1124 .addImm(0) // $src1_modifiers
1125 .addUse(Denom) // $src1
1126 .addImm(0) // $src2_modifiers
1127 .addUse(Numer) // $src2
1128 .addImm(0) // $clamp
1129 .addImm(0); // $omod
1130
1131 MI.eraseFromParent();
1132 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1133}
1134
1135bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1136 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1137 switch (IntrinsicID) {
1138 case Intrinsic::amdgcn_if_break: {
1139 MachineBasicBlock *BB = I.getParent();
1140
1141 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1142 // SelectionDAG uses for wave32 vs wave64.
1143 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1144 .add(I.getOperand(0))
1145 .add(I.getOperand(2))
1146 .add(I.getOperand(3));
1147
1148 Register DstReg = I.getOperand(0).getReg();
1149 Register Src0Reg = I.getOperand(2).getReg();
1150 Register Src1Reg = I.getOperand(3).getReg();
1151
1152 I.eraseFromParent();
1153
1154 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1155 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1156
1157 return true;
1158 }
1159 case Intrinsic::amdgcn_interp_p1_f16:
1160 return selectInterpP1F16(I);
1161 case Intrinsic::amdgcn_wqm:
1162 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1163 case Intrinsic::amdgcn_softwqm:
1164 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1165 case Intrinsic::amdgcn_strict_wwm:
1166 case Intrinsic::amdgcn_wwm:
1167 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1168 case Intrinsic::amdgcn_strict_wqm:
1169 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1170 case Intrinsic::amdgcn_writelane:
1171 return selectWritelane(I);
1172 case Intrinsic::amdgcn_div_scale:
1173 return selectDivScale(I);
1174 case Intrinsic::amdgcn_icmp:
1175 case Intrinsic::amdgcn_fcmp:
1176 if (selectImpl(I, *CoverageInfo))
1177 return true;
1178 return selectIntrinsicCmp(I);
1179 case Intrinsic::amdgcn_ballot:
1180 return selectBallot(I);
1181 case Intrinsic::amdgcn_reloc_constant:
1182 return selectRelocConstant(I);
1183 case Intrinsic::amdgcn_groupstaticsize:
1184 return selectGroupStaticSize(I);
1185 case Intrinsic::returnaddress:
1186 return selectReturnAddress(I);
1187 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1188 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1189 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1190 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1191 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1192 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1193 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1194 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1195 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1196 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1197 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1198 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1199 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1200 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1201 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1202 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1203 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1204 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1205 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1206 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1207 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1208 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1209 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1210 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1211 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1212 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1213 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1214 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1215 return selectSMFMACIntrin(I);
1216 case Intrinsic::amdgcn_permlane16_swap:
1217 case Intrinsic::amdgcn_permlane32_swap:
1218 return selectPermlaneSwapIntrin(I, IntrinsicID);
1219 default:
1220 return selectImpl(I, *CoverageInfo);
1221 }
1222}
1223
1224static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1225 const GCNSubtarget &ST) {
1226 if (Size != 16 && Size != 32 && Size != 64)
1227 return -1;
1228
1229 if (Size == 16 && !ST.has16BitInsts())
1230 return -1;
1231
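// Select picks the 16-, 32- or 64-bit flavor of a VOPC compare; for 16-bit
// compares it prefers the true16 or fake16 encoding when True16 instructions
// are available.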
1232 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1233 unsigned FakeS16Opc, unsigned S32Opc,
1234 unsigned S64Opc) {
1235 if (Size == 16)
1236 return ST.hasTrue16BitInsts()
1237 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1238 : S16Opc;
1239 if (Size == 32)
1240 return S32Opc;
1241 return S64Opc;
1242 };
1243
1244 switch (P) {
1245 default:
1246 llvm_unreachable("Unknown condition code!");
1247 case CmpInst::ICMP_NE:
1248 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1249 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1250 AMDGPU::V_CMP_NE_U64_e64);
1251 case CmpInst::ICMP_EQ:
1252 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1253 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1254 AMDGPU::V_CMP_EQ_U64_e64);
1255 case CmpInst::ICMP_SGT:
1256 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1257 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1258 AMDGPU::V_CMP_GT_I64_e64);
1259 case CmpInst::ICMP_SGE:
1260 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1261 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1262 AMDGPU::V_CMP_GE_I64_e64);
1263 case CmpInst::ICMP_SLT:
1264 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1265 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1266 AMDGPU::V_CMP_LT_I64_e64);
1267 case CmpInst::ICMP_SLE:
1268 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1269 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1270 AMDGPU::V_CMP_LE_I64_e64);
1271 case CmpInst::ICMP_UGT:
1272 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1273 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1274 AMDGPU::V_CMP_GT_U64_e64);
1275 case CmpInst::ICMP_UGE:
1276 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1277 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1278 AMDGPU::V_CMP_GE_U64_e64);
1279 case CmpInst::ICMP_ULT:
1280 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1281 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1282 AMDGPU::V_CMP_LT_U64_e64);
1283 case CmpInst::ICMP_ULE:
1284 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1285 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1286 AMDGPU::V_CMP_LE_U64_e64);
1287
1288 case CmpInst::FCMP_OEQ:
1289 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1290 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1291 AMDGPU::V_CMP_EQ_F64_e64);
1292 case CmpInst::FCMP_OGT:
1293 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1294 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1295 AMDGPU::V_CMP_GT_F64_e64);
1296 case CmpInst::FCMP_OGE:
1297 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1298 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1299 AMDGPU::V_CMP_GE_F64_e64);
1300 case CmpInst::FCMP_OLT:
1301 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1302 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1303 AMDGPU::V_CMP_LT_F64_e64);
1304 case CmpInst::FCMP_OLE:
1305 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1306 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1307 AMDGPU::V_CMP_LE_F64_e64);
1308 case CmpInst::FCMP_ONE:
1309 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1310 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1311 AMDGPU::V_CMP_NEQ_F64_e64);
1312 case CmpInst::FCMP_ORD:
1313 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1314 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1315 AMDGPU::V_CMP_O_F64_e64);
1316 case CmpInst::FCMP_UNO:
1317 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1318 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1319 AMDGPU::V_CMP_U_F64_e64);
1320 case CmpInst::FCMP_UEQ:
1321 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1322 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1323 AMDGPU::V_CMP_NLG_F64_e64);
1324 case CmpInst::FCMP_UGT:
1325 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1326 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1327 AMDGPU::V_CMP_NLE_F64_e64);
1328 case CmpInst::FCMP_UGE:
1329 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1330 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1331 AMDGPU::V_CMP_NLT_F64_e64);
1332 case CmpInst::FCMP_ULT:
1333 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1334 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1335 AMDGPU::V_CMP_NGE_F64_e64);
1336 case CmpInst::FCMP_ULE:
1337 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1338 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1339 AMDGPU::V_CMP_NGT_F64_e64);
1340 case CmpInst::FCMP_UNE:
1341 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1342 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1343 AMDGPU::V_CMP_NEQ_F64_e64);
1344 case CmpInst::FCMP_TRUE:
1345 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1346 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1347 AMDGPU::V_CMP_TRU_F64_e64);
1348 case CmpInst::FCMP_FALSE:
1349 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1350 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1351 AMDGPU::V_CMP_F_F64_e64);
1352 }
1353}
1354
1355int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1356 unsigned Size) const {
1357 if (Size == 64) {
1358 if (!STI.hasScalarCompareEq64())
1359 return -1;
1360
1361 switch (P) {
1362 case CmpInst::ICMP_NE:
1363 return AMDGPU::S_CMP_LG_U64;
1364 case CmpInst::ICMP_EQ:
1365 return AMDGPU::S_CMP_EQ_U64;
1366 default:
1367 return -1;
1368 }
1369 }
1370
1371 if (Size == 32) {
1372 switch (P) {
1373 case CmpInst::ICMP_NE:
1374 return AMDGPU::S_CMP_LG_U32;
1375 case CmpInst::ICMP_EQ:
1376 return AMDGPU::S_CMP_EQ_U32;
1377 case CmpInst::ICMP_SGT:
1378 return AMDGPU::S_CMP_GT_I32;
1379 case CmpInst::ICMP_SGE:
1380 return AMDGPU::S_CMP_GE_I32;
1381 case CmpInst::ICMP_SLT:
1382 return AMDGPU::S_CMP_LT_I32;
1383 case CmpInst::ICMP_SLE:
1384 return AMDGPU::S_CMP_LE_I32;
1385 case CmpInst::ICMP_UGT:
1386 return AMDGPU::S_CMP_GT_U32;
1387 case CmpInst::ICMP_UGE:
1388 return AMDGPU::S_CMP_GE_U32;
1389 case CmpInst::ICMP_ULT:
1390 return AMDGPU::S_CMP_LT_U32;
1391 case CmpInst::ICMP_ULE:
1392 return AMDGPU::S_CMP_LE_U32;
1393 case CmpInst::FCMP_OEQ:
1394 return AMDGPU::S_CMP_EQ_F32;
1395 case CmpInst::FCMP_OGT:
1396 return AMDGPU::S_CMP_GT_F32;
1397 case CmpInst::FCMP_OGE:
1398 return AMDGPU::S_CMP_GE_F32;
1399 case CmpInst::FCMP_OLT:
1400 return AMDGPU::S_CMP_LT_F32;
1401 case CmpInst::FCMP_OLE:
1402 return AMDGPU::S_CMP_LE_F32;
1403 case CmpInst::FCMP_ONE:
1404 return AMDGPU::S_CMP_LG_F32;
1405 case CmpInst::FCMP_ORD:
1406 return AMDGPU::S_CMP_O_F32;
1407 case CmpInst::FCMP_UNO:
1408 return AMDGPU::S_CMP_U_F32;
1409 case CmpInst::FCMP_UEQ:
1410 return AMDGPU::S_CMP_NLG_F32;
1411 case CmpInst::FCMP_UGT:
1412 return AMDGPU::S_CMP_NLE_F32;
1413 case CmpInst::FCMP_UGE:
1414 return AMDGPU::S_CMP_NLT_F32;
1415 case CmpInst::FCMP_ULT:
1416 return AMDGPU::S_CMP_NGE_F32;
1417 case CmpInst::FCMP_ULE:
1418 return AMDGPU::S_CMP_NGT_F32;
1419 case CmpInst::FCMP_UNE:
1420 return AMDGPU::S_CMP_NEQ_F32;
1421 default:
1422 llvm_unreachable("Unknown condition code!");
1423 }
1424 }
1425
1426 if (Size == 16) {
1427 if (!STI.hasSALUFloatInsts())
1428 return -1;
1429
1430 switch (P) {
1431 case CmpInst::FCMP_OEQ:
1432 return AMDGPU::S_CMP_EQ_F16;
1433 case CmpInst::FCMP_OGT:
1434 return AMDGPU::S_CMP_GT_F16;
1435 case CmpInst::FCMP_OGE:
1436 return AMDGPU::S_CMP_GE_F16;
1437 case CmpInst::FCMP_OLT:
1438 return AMDGPU::S_CMP_LT_F16;
1439 case CmpInst::FCMP_OLE:
1440 return AMDGPU::S_CMP_LE_F16;
1441 case CmpInst::FCMP_ONE:
1442 return AMDGPU::S_CMP_LG_F16;
1443 case CmpInst::FCMP_ORD:
1444 return AMDGPU::S_CMP_O_F16;
1445 case CmpInst::FCMP_UNO:
1446 return AMDGPU::S_CMP_U_F16;
1447 case CmpInst::FCMP_UEQ:
1448 return AMDGPU::S_CMP_NLG_F16;
1449 case CmpInst::FCMP_UGT:
1450 return AMDGPU::S_CMP_NLE_F16;
1451 case CmpInst::FCMP_UGE:
1452 return AMDGPU::S_CMP_NLT_F16;
1453 case CmpInst::FCMP_ULT:
1454 return AMDGPU::S_CMP_NGE_F16;
1455 case CmpInst::FCMP_ULE:
1456 return AMDGPU::S_CMP_NGT_F16;
1457 case CmpInst::FCMP_UNE:
1458 return AMDGPU::S_CMP_NEQ_F16;
1459 default:
1460 llvm_unreachable("Unknown condition code!");
1461 }
1462 }
1463
1464 return -1;
1465}
1466
1467bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1468
1469 MachineBasicBlock *BB = I.getParent();
1470 const DebugLoc &DL = I.getDebugLoc();
1471
1472 Register SrcReg = I.getOperand(2).getReg();
1473 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1474
1475 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1476
1477 Register CCReg = I.getOperand(0).getReg();
1478 if (!isVCC(CCReg, *MRI)) {
1479 int Opcode = getS_CMPOpcode(Pred, Size);
1480 if (Opcode == -1)
1481 return false;
1482 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1483 .add(I.getOperand(2))
1484 .add(I.getOperand(3));
1485 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1486 .addReg(AMDGPU::SCC);
1487 bool Ret =
1488 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1489 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1490 I.eraseFromParent();
1491 return Ret;
1492 }
1493
1494 if (I.getOpcode() == AMDGPU::G_FCMP)
1495 return false;
1496
1497 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1498 if (Opcode == -1)
1499 return false;
1500
1501 MachineInstrBuilder ICmp;
1502 // t16 instructions
1503 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1504 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1505 .addImm(0)
1506 .add(I.getOperand(2))
1507 .addImm(0)
1508 .add(I.getOperand(3))
1509 .addImm(0); // op_sel
1510 } else {
1511 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1512 .add(I.getOperand(2))
1513 .add(I.getOperand(3));
1514 }
1515
1516 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1517 *TRI.getBoolRC(), *MRI);
1518 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1519 I.eraseFromParent();
1520 return Ret;
1521}
1522
1523bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1524 Register Dst = I.getOperand(0).getReg();
1525 if (isVCC(Dst, *MRI))
1526 return false;
1527
1528 LLT DstTy = MRI->getType(Dst);
1529 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1530 return false;
1531
1532 MachineBasicBlock *BB = I.getParent();
1533 const DebugLoc &DL = I.getDebugLoc();
1534 Register SrcReg = I.getOperand(2).getReg();
1535 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1536
1537 // i1 inputs are not supported in GlobalISel.
1538 if (Size == 1)
1539 return false;
1540
1541 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1542 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1543 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1544 I.eraseFromParent();
1545 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1546 }
1547
1548 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1549 if (Opcode == -1)
1550 return false;
1551
1552 MachineInstrBuilder SelectedMI;
1553 MachineOperand &LHS = I.getOperand(2);
1554 MachineOperand &RHS = I.getOperand(3);
1555 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1556 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1557 Register Src0Reg =
1558 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1559 Register Src1Reg =
1560 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1561 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1562 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1563 SelectedMI.addImm(Src0Mods);
1564 SelectedMI.addReg(Src0Reg);
1565 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1566 SelectedMI.addImm(Src1Mods);
1567 SelectedMI.addReg(Src1Reg);
1568 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1569 SelectedMI.addImm(0); // clamp
1570 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1571 SelectedMI.addImm(0); // op_sel
1572
1573 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1574 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1575 return false;
1576
1577 I.eraseFromParent();
1578 return true;
1579}
1580
1581// Ballot has to zero the bits of the input lane-mask that are zero in the
1582// current exec; this is done as an AND with exec. For inputs produced by an
1583// instruction that implicitly uses the same exec (for example a compare in
1584// the same basic block, or an SCC-to-VCC copy), a plain copy suffices.
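// e.g. ballot of a compare from the same block (or of an SCC-to-VCC copy)
// lowers to a plain COPY of the lane mask; other non-constant inputs are
// ANDed with exec.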
1585static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1586 MachineBasicBlock *MBB) {
1587 MachineInstr *MI = MRI.getVRegDef(Reg);
1588 if (MI->getParent() != MBB)
1589 return false;
1590
1591 // Lane mask generated by SCC to VCC copy.
1592 if (MI->getOpcode() == AMDGPU::COPY) {
1593 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1594 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1595 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1596 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1597 return true;
1598 }
1599
1600 // Lane mask generated using compare with same exec.
1601 if (isa<GAnyCmp>(MI))
1602 return true;
1603
1604 Register LHS, RHS;
1605 // Look through AND.
1606 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1607 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1608           isLaneMaskFromSameBlock(RHS, MRI, MBB);
1609
1610 return false;
1611}
1612
1613bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1614 MachineBasicBlock *BB = I.getParent();
1615 const DebugLoc &DL = I.getDebugLoc();
1616 Register DstReg = I.getOperand(0).getReg();
1617 Register SrcReg = I.getOperand(2).getReg();
1618 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1619 const unsigned WaveSize = STI.getWavefrontSize();
1620
1621 // In the common case, the return type matches the wave size.
1622  // However, we also support emitting i64 ballots in wave32 mode.
1623 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1624 return false;
1625
1626 std::optional<ValueAndVReg> Arg =
1627      getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1628
1629 Register Dst = DstReg;
1630  // i64 ballot on wave32: build the ballot into a new wave-size (i32) Dst first.
1631 if (BallotSize != WaveSize) {
1632 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1633 }
1634
1635 if (Arg) {
1636 const int64_t Value = Arg->Value.getZExtValue();
1637 if (Value == 0) {
1638 // Dst = S_MOV 0
1639 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1640 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1641 } else {
1642 // Dst = COPY EXEC
1643 assert(Value == 1);
1644 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1645 }
1646 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1647 return false;
1648 } else {
1649 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1650 // Dst = COPY SrcReg
1651 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1652 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1653 return false;
1654 } else {
1655 // Dst = S_AND SrcReg, EXEC
1656 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1657 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1658 .addReg(SrcReg)
1659 .addReg(TRI.getExec())
1660 .setOperandDead(3); // Dead scc
1661 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1662 return false;
1663 }
1664 }
1665
1666 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1667 if (BallotSize != WaveSize) {
1668 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1669 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1670 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1671 .addReg(Dst)
1672 .addImm(AMDGPU::sub0)
1673 .addReg(HiReg)
1674 .addImm(AMDGPU::sub1);
1675 }
1676
1677 I.eraseFromParent();
1678 return true;
1679}
1680
1681bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1682 Register DstReg = I.getOperand(0).getReg();
1683 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1684 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1685 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1686 return false;
1687
1688 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1689
1690 Module *M = MF->getFunction().getParent();
1691 const MDNode *Metadata = I.getOperand(2).getMetadata();
1692 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1693 auto *RelocSymbol = cast<GlobalVariable>(
1694 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1695
1696 MachineBasicBlock *BB = I.getParent();
1697 BuildMI(*BB, &I, I.getDebugLoc(),
1698 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1699    .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1700
1701 I.eraseFromParent();
1702 return true;
1703}
1704
1705bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1706 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1707
1708 Register DstReg = I.getOperand(0).getReg();
1709 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1710 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1711 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1712
1713 MachineBasicBlock *MBB = I.getParent();
1714 const DebugLoc &DL = I.getDebugLoc();
1715
1716 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1717
1718 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1719 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1720 MIB.addImm(MFI->getLDSSize());
1721 } else {
1722 Module *M = MF->getFunction().getParent();
1723 const GlobalValue *GV =
1724 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1725    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1726  }
1727
1728 I.eraseFromParent();
1729 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1730}
1731
1732bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1733 MachineBasicBlock *MBB = I.getParent();
1734 MachineFunction &MF = *MBB->getParent();
1735 const DebugLoc &DL = I.getDebugLoc();
1736
1737 MachineOperand &Dst = I.getOperand(0);
1738 Register DstReg = Dst.getReg();
1739 unsigned Depth = I.getOperand(2).getImm();
1740
1741 const TargetRegisterClass *RC
1742 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1743 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1744 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1745 return false;
1746
1747 // Check for kernel and shader functions
1748 if (Depth != 0 ||
1749 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1750 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1751 .addImm(0);
1752 I.eraseFromParent();
1753 return true;
1754 }
1755
1756 MachineFrameInfo &MFI = MF.getFrameInfo();
1757 // There is a call to @llvm.returnaddress in this function
1758 MFI.setReturnAddressIsTaken(true);
1759
1760 // Get the return address reg and mark it as an implicit live-in
1761 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1762 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1763 AMDGPU::SReg_64RegClass, DL);
1764 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1765 .addReg(LiveIn);
1766 I.eraseFromParent();
1767 return true;
1768}
1769
1770bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1771 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1772 // SelectionDAG uses for wave32 vs wave64.
1773 MachineBasicBlock *BB = MI.getParent();
1774 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1775 .add(MI.getOperand(1));
1776
1777 Register Reg = MI.getOperand(1).getReg();
1778 MI.eraseFromParent();
1779
1780 if (!MRI->getRegClassOrNull(Reg))
1781 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1782 return true;
1783}
1784
1785bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1786 MachineInstr &MI, Intrinsic::ID IntrID) const {
1787 MachineBasicBlock *MBB = MI.getParent();
1788 MachineFunction *MF = MBB->getParent();
1789 const DebugLoc &DL = MI.getDebugLoc();
1790
1791 unsigned IndexOperand = MI.getOperand(7).getImm();
1792 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1793 bool WaveDone = MI.getOperand(9).getImm() != 0;
1794
1795 if (WaveDone && !WaveRelease) {
1796 // TODO: Move this to IR verifier
1797 const Function &Fn = MF->getFunction();
1798 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1799 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1800 }
1801
1802 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1803 IndexOperand &= ~0x3f;
1804 unsigned CountDw = 0;
1805
1806 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1807 CountDw = (IndexOperand >> 24) & 0xf;
1808 IndexOperand &= ~(0xf << 24);
1809
1810 if (CountDw < 1 || CountDw > 4) {
1811 const Function &Fn = MF->getFunction();
1812 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1813 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1814 CountDw = 1;
1815 }
1816 }
1817
1818 if (IndexOperand) {
1819 const Function &Fn = MF->getFunction();
1820 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1821 Fn, "ds_ordered_count: bad index operand", DL));
1822 }
1823
1824 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1825 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1826
1827 unsigned Offset0 = OrderedCountIndex << 2;
1828 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1829
1830 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1831 Offset1 |= (CountDw - 1) << 6;
1832
1833 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1834 Offset1 |= ShaderType << 2;
1835
1836 unsigned Offset = Offset0 | (Offset1 << 8);
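  // Final offset encoding, as assembled above: bits [7:2] = ordered count
  // index, bit 8 = wave_release, bit 9 = wave_done, bits [11:10] = shader type
  // (pre-GFX11 only), bit 12 = instruction (0 = add, 1 = swap), and
  // bits [15:14] = dword count - 1 on GFX10+.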
1837
1838 Register M0Val = MI.getOperand(2).getReg();
1839 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1840 .addReg(M0Val);
1841
1842 Register DstReg = MI.getOperand(0).getReg();
1843 Register ValReg = MI.getOperand(3).getReg();
1844 MachineInstrBuilder DS =
1845 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1846 .addReg(ValReg)
1847 .addImm(Offset)
1848 .cloneMemRefs(MI);
1849
1850 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1851 return false;
1852
1853 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1854 MI.eraseFromParent();
1855 return Ret;
1856}
1857
1858static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1859 switch (IntrID) {
1860 case Intrinsic::amdgcn_ds_gws_init:
1861 return AMDGPU::DS_GWS_INIT;
1862 case Intrinsic::amdgcn_ds_gws_barrier:
1863 return AMDGPU::DS_GWS_BARRIER;
1864 case Intrinsic::amdgcn_ds_gws_sema_v:
1865 return AMDGPU::DS_GWS_SEMA_V;
1866 case Intrinsic::amdgcn_ds_gws_sema_br:
1867 return AMDGPU::DS_GWS_SEMA_BR;
1868 case Intrinsic::amdgcn_ds_gws_sema_p:
1869 return AMDGPU::DS_GWS_SEMA_P;
1870 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1871 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1872 default:
1873 llvm_unreachable("not a gws intrinsic");
1874 }
1875}
1876
1877bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1878 Intrinsic::ID IID) const {
1879 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1880 !STI.hasGWSSemaReleaseAll()))
1881 return false;
1882
1883 // intrinsic ID, vsrc, offset
1884 const bool HasVSrc = MI.getNumOperands() == 3;
1885 assert(HasVSrc || MI.getNumOperands() == 2);
1886
1887 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1888 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1889 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1890 return false;
1891
1892 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1893 unsigned ImmOffset;
1894
1895 MachineBasicBlock *MBB = MI.getParent();
1896 const DebugLoc &DL = MI.getDebugLoc();
1897
1898 MachineInstr *Readfirstlane = nullptr;
1899
1900 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1901 // incoming offset, in case there's an add of a constant. We'll have to put it
1902 // back later.
1903 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1904 Readfirstlane = OffsetDef;
1905 BaseOffset = OffsetDef->getOperand(1).getReg();
1906 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1907 }
1908
1909 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1910 // If we have a constant offset, try to use the 0 in m0 as the base.
1911 // TODO: Look into changing the default m0 initialization value. If the
1912 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1913 // the immediate offset.
1914
1915 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1916 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1917 .addImm(0);
1918 } else {
1919 std::tie(BaseOffset, ImmOffset) =
1920 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1921
1922 if (Readfirstlane) {
1923 // We have the constant offset now, so put the readfirstlane back on the
1924 // variable component.
1925 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1926 return false;
1927
1928 Readfirstlane->getOperand(1).setReg(BaseOffset);
1929 BaseOffset = Readfirstlane->getOperand(0).getReg();
1930 } else {
1931 if (!RBI.constrainGenericRegister(BaseOffset,
1932 AMDGPU::SReg_32RegClass, *MRI))
1933 return false;
1934 }
1935
1936 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1937 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1938 .addReg(BaseOffset)
1939 .addImm(16)
1940 .setOperandDead(3); // Dead scc
1941
1942 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1943 .addReg(M0Base);
1944 }
1945
1946 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1947 // offset field) % 64. Some versions of the programming guide omit the m0
1948 // part, or claim it's from offset 0.
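  // This is why a variable offset is shifted left by 16 above: it lands in
  // M0[21:16], while any constant component is folded into the instruction's
  // offset field instead.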
1949
1950 unsigned Opc = gwsIntrinToOpcode(IID);
1951 const MCInstrDesc &InstrDesc = TII.get(Opc);
1952
1953 if (HasVSrc) {
1954 Register VSrc = MI.getOperand(1).getReg();
1955
1956 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
1957 const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
1958 const TargetRegisterClass *SubRC =
1959 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
1960
1961 if (!SubRC) {
1962 // 32-bit normal case.
1963 if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
1964 return false;
1965
1966 BuildMI(*MBB, &MI, DL, InstrDesc)
1967 .addReg(VSrc)
1968 .addImm(ImmOffset)
1969 .cloneMemRefs(MI);
1970 } else {
1971 // Requires even register alignment, so create 64-bit value and pad the
1972 // top half with undef.
1973 Register DataReg = MRI->createVirtualRegister(DataRC);
1974 if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
1975 return false;
1976
1977 Register UndefReg = MRI->createVirtualRegister(SubRC);
1978 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1979 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
1980 .addReg(VSrc)
1981 .addImm(AMDGPU::sub0)
1982 .addReg(UndefReg)
1983 .addImm(AMDGPU::sub1);
1984
1985 BuildMI(*MBB, &MI, DL, InstrDesc)
1986 .addReg(DataReg)
1987 .addImm(ImmOffset)
1988 .cloneMemRefs(MI);
1989 }
1990 } else {
1991 BuildMI(*MBB, &MI, DL, InstrDesc)
1992 .addImm(ImmOffset)
1993 .cloneMemRefs(MI);
1994 }
1995
1996 MI.eraseFromParent();
1997 return true;
1998}
1999
2000bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2001 bool IsAppend) const {
2002 Register PtrBase = MI.getOperand(2).getReg();
2003 LLT PtrTy = MRI->getType(PtrBase);
2004 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2005
2006 unsigned Offset;
2007 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
2008
2009 // TODO: Should this try to look through readfirstlane like GWS?
2010 if (!isDSOffsetLegal(PtrBase, Offset)) {
2011 PtrBase = MI.getOperand(2).getReg();
2012 Offset = 0;
2013 }
2014
2015 MachineBasicBlock *MBB = MI.getParent();
2016 const DebugLoc &DL = MI.getDebugLoc();
2017 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2018
2019 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2020 .addReg(PtrBase);
2021 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2022 return false;
2023
2024 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
2025 .addImm(Offset)
2026 .addImm(IsGDS ? -1 : 0)
2027 .cloneMemRefs(MI);
2028 MI.eraseFromParent();
2029 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2030}
2031
2032bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2033 MachineFunction *MF = MI.getMF();
2034 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2035
2036 MFInfo->setInitWholeWave();
2037 return selectImpl(MI, *CoverageInfo);
2038}
2039
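// TexFailCtrl packs two flags: bit 0 enables TFE and bit 1 enables LWE. Any
// other set bit is invalid, in which case this returns false.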
2040static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2041 bool &IsTexFail) {
2042 if (TexFailCtrl)
2043 IsTexFail = true;
2044
2045 TFE = TexFailCtrl & 0x1;
2046 TexFailCtrl &= ~(uint64_t)0x1;
2047 LWE = TexFailCtrl & 0x2;
2048 TexFailCtrl &= ~(uint64_t)0x2;
2049
2050 return TexFailCtrl == 0;
2051}
2052
2053bool AMDGPUInstructionSelector::selectImageIntrinsic(
2054 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2055 MachineBasicBlock *MBB = MI.getParent();
2056 const DebugLoc &DL = MI.getDebugLoc();
2057 unsigned IntrOpcode = Intr->BaseOpcode;
2058
2059 // For image atomic: use no-return opcode if result is unused.
2060 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2061 Register ResultDef = MI.getOperand(0).getReg();
2062 if (MRI->use_nodbg_empty(ResultDef))
2063 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2064 }
2065
2066 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2067      AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
2068
2069 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2070 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2071 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2072 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2073
2074 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2075
2076 Register VDataIn = AMDGPU::NoRegister;
2077 Register VDataOut = AMDGPU::NoRegister;
2078 LLT VDataTy;
2079 int NumVDataDwords = -1;
2080 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2081 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2082
2083 bool Unorm;
2084 if (!BaseOpcode->Sampler)
2085 Unorm = true;
2086 else
2087 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2088
2089 bool TFE;
2090 bool LWE;
2091 bool IsTexFail = false;
2092 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2093 TFE, LWE, IsTexFail))
2094 return false;
2095
2096 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2097 const bool IsA16 = (Flags & 1) != 0;
2098 const bool IsG16 = (Flags & 2) != 0;
2099
2100 // A16 implies 16 bit gradients if subtarget doesn't support G16
2101 if (IsA16 && !STI.hasG16() && !IsG16)
2102 return false;
2103
2104 unsigned DMask = 0;
2105 unsigned DMaskLanes = 0;
2106
2107 if (BaseOpcode->Atomic) {
2108 if (!BaseOpcode->NoReturn)
2109 VDataOut = MI.getOperand(0).getReg();
2110 VDataIn = MI.getOperand(2).getReg();
2111 LLT Ty = MRI->getType(VDataIn);
2112
2113 // Be careful to allow atomic swap on 16-bit element vectors.
2114 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2115 Ty.getSizeInBits() == 128 :
2116 Ty.getSizeInBits() == 64;
2117
2118 if (BaseOpcode->AtomicX2) {
2119 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2120
2121 DMask = Is64Bit ? 0xf : 0x3;
2122 NumVDataDwords = Is64Bit ? 4 : 2;
2123 } else {
2124 DMask = Is64Bit ? 0x3 : 0x1;
2125 NumVDataDwords = Is64Bit ? 2 : 1;
2126 }
2127 } else {
2128 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2129 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2130
2131 if (BaseOpcode->Store) {
2132 VDataIn = MI.getOperand(1).getReg();
2133 VDataTy = MRI->getType(VDataIn);
2134 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2135 } else if (BaseOpcode->NoReturn) {
2136 NumVDataDwords = 0;
2137 } else {
2138 VDataOut = MI.getOperand(0).getReg();
2139 VDataTy = MRI->getType(VDataOut);
2140 NumVDataDwords = DMaskLanes;
2141
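      // With packed D16 (no unpacked D16 VMEM), two 16-bit components share a
      // dword, so round the dword count up.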
2142 if (IsD16 && !STI.hasUnpackedD16VMem())
2143 NumVDataDwords = (DMaskLanes + 1) / 2;
2144 }
2145 }
2146
2147 // Set G16 opcode
2148 if (Subtarget->hasG16() && IsG16) {
2149 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2150        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2151    assert(G16MappingInfo);
2152 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2153 }
2154
2155 // TODO: Check this in verifier.
2156 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2157
2158 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2159 // Keep GLC only when the atomic's result is actually used.
2160 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2161    CPol |= AMDGPU::CPol::GLC;
2162  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2163               AMDGPU::CPol::VOLATILE))
2164    return false;
2165
2166 int NumVAddrRegs = 0;
2167 int NumVAddrDwords = 0;
2168 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2169 // Skip the $noregs and 0s inserted during legalization.
2170 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2171 if (!AddrOp.isReg())
2172 continue; // XXX - Break?
2173
2174 Register Addr = AddrOp.getReg();
2175 if (!Addr)
2176 break;
2177
2178 ++NumVAddrRegs;
2179 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2180 }
2181
2182 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2183 // NSA, these should have been packed into a single value in the first
2184  // address register.
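  // Partial NSA (where supported) still allows the trailing address operand to
  // pack multiple dwords, which is why the check below accepts more dwords
  // than registers in that case.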
2185 const bool UseNSA =
2186 NumVAddrRegs != 1 &&
2187 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2188 : NumVAddrDwords == NumVAddrRegs);
2189 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2190 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2191 return false;
2192 }
2193
2194 if (IsTexFail)
2195 ++NumVDataDwords;
2196
2197 int Opcode = -1;
2198 if (IsGFX12Plus) {
2199 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2200 NumVDataDwords, NumVAddrDwords);
2201 } else if (IsGFX11Plus) {
2202 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2203 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2204 : AMDGPU::MIMGEncGfx11Default,
2205 NumVDataDwords, NumVAddrDwords);
2206 } else if (IsGFX10Plus) {
2207 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2208 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2209 : AMDGPU::MIMGEncGfx10Default,
2210 NumVDataDwords, NumVAddrDwords);
2211 } else {
2212 if (Subtarget->hasGFX90AInsts()) {
2213 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2214 NumVDataDwords, NumVAddrDwords);
2215 if (Opcode == -1) {
2216 LLVM_DEBUG(
2217 dbgs()
2218 << "requested image instruction is not supported on this GPU\n");
2219 return false;
2220 }
2221 }
2222 if (Opcode == -1 &&
2223 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2224 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2225 NumVDataDwords, NumVAddrDwords);
2226 if (Opcode == -1)
2227 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2228 NumVDataDwords, NumVAddrDwords);
2229 }
2230 if (Opcode == -1)
2231 return false;
2232
2233 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2234 .cloneMemRefs(MI);
2235
2236 if (VDataOut) {
2237 if (BaseOpcode->AtomicX2) {
2238 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2239
2240 Register TmpReg = MRI->createVirtualRegister(
2241 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2242 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2243
2244 MIB.addDef(TmpReg);
2245 if (!MRI->use_empty(VDataOut)) {
2246 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2247 .addReg(TmpReg, RegState::Kill, SubReg);
2248 }
2249
2250 } else {
2251 MIB.addDef(VDataOut); // vdata output
2252 }
2253 }
2254
2255 if (VDataIn)
2256 MIB.addReg(VDataIn); // vdata input
2257
2258 for (int I = 0; I != NumVAddrRegs; ++I) {
2259 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2260 if (SrcOp.isReg()) {
2261 assert(SrcOp.getReg() != 0);
2262 MIB.addReg(SrcOp.getReg());
2263 }
2264 }
2265
2266 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2267 if (BaseOpcode->Sampler)
2268 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2269
2270 MIB.addImm(DMask); // dmask
2271
2272 if (IsGFX10Plus)
2273 MIB.addImm(DimInfo->Encoding);
2274 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2275 MIB.addImm(Unorm);
2276
2277 MIB.addImm(CPol);
2278 MIB.addImm(IsA16 && // a16 or r128
2279 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2280 if (IsGFX10Plus)
2281 MIB.addImm(IsA16 ? -1 : 0);
2282
2283 if (!Subtarget->hasGFX90AInsts()) {
2284 MIB.addImm(TFE); // tfe
2285 } else if (TFE) {
2286 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2287 return false;
2288 }
2289
2290 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2291 MIB.addImm(LWE); // lwe
2292 if (!IsGFX10Plus)
2293 MIB.addImm(DimInfo->DA ? -1 : 0);
2294 if (BaseOpcode->HasD16)
2295 MIB.addImm(IsD16 ? -1 : 0);
2296
2297 MI.eraseFromParent();
2298 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2299 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2300 return true;
2301}
2302
2303// We need to handle this here because tablegen doesn't support matching
2304// instructions with multiple outputs.
2305bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2306 MachineInstr &MI) const {
2307 Register Dst0 = MI.getOperand(0).getReg();
2308 Register Dst1 = MI.getOperand(1).getReg();
2309
2310 const DebugLoc &DL = MI.getDebugLoc();
2311 MachineBasicBlock *MBB = MI.getParent();
2312
2313 Register Addr = MI.getOperand(3).getReg();
2314 Register Data0 = MI.getOperand(4).getReg();
2315 Register Data1 = MI.getOperand(5).getReg();
2316 unsigned Offset = MI.getOperand(6).getImm();
2317
2318 unsigned Opc;
2319 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2320 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2321 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2322 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2323 break;
2324 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2325 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2326 break;
2327 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2328 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2329 break;
2330 }
2331
2332 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2333 .addDef(Dst1)
2334 .addUse(Addr)
2335 .addUse(Data0)
2336 .addUse(Data1)
2337 .addImm(Offset)
2338 .cloneMemRefs(MI);
2339
2340 MI.eraseFromParent();
2341 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2342}
2343
2344bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2345 MachineInstr &I) const {
2346 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2347 switch (IntrinsicID) {
2348 case Intrinsic::amdgcn_end_cf:
2349 return selectEndCfIntrinsic(I);
2350 case Intrinsic::amdgcn_ds_ordered_add:
2351 case Intrinsic::amdgcn_ds_ordered_swap:
2352 return selectDSOrderedIntrinsic(I, IntrinsicID);
2353 case Intrinsic::amdgcn_ds_gws_init:
2354 case Intrinsic::amdgcn_ds_gws_barrier:
2355 case Intrinsic::amdgcn_ds_gws_sema_v:
2356 case Intrinsic::amdgcn_ds_gws_sema_br:
2357 case Intrinsic::amdgcn_ds_gws_sema_p:
2358 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2359 return selectDSGWSIntrinsic(I, IntrinsicID);
2360 case Intrinsic::amdgcn_ds_append:
2361 return selectDSAppendConsume(I, true);
2362 case Intrinsic::amdgcn_ds_consume:
2363 return selectDSAppendConsume(I, false);
2364 case Intrinsic::amdgcn_init_whole_wave:
2365 return selectInitWholeWave(I);
2366 case Intrinsic::amdgcn_raw_buffer_load_lds:
2367 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2368 case Intrinsic::amdgcn_struct_buffer_load_lds:
2369 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2370 return selectBufferLoadLds(I);
2371 // Until we can store both the address space of the global and the LDS
2372  // arguments by having two MachineMemOperands on an intrinsic, we just trust
2373  // that the argument is a global pointer (buffer pointers have been handled by
2374  // an LLVM IR-level lowering).
2375 case Intrinsic::amdgcn_load_to_lds:
2376 case Intrinsic::amdgcn_global_load_lds:
2377 return selectGlobalLoadLds(I);
2378 case Intrinsic::amdgcn_exp_compr:
2379 if (!STI.hasCompressedExport()) {
2380 Function &F = I.getMF()->getFunction();
2381 F.getContext().diagnose(
2382 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2383 I.getDebugLoc(), DS_Error));
2384 return false;
2385 }
2386 break;
2387 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2388 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2389 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2390 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2391 return selectDSBvhStackIntrinsic(I);
2392 case Intrinsic::amdgcn_s_barrier_init:
2393 case Intrinsic::amdgcn_s_barrier_signal_var:
2394 return selectNamedBarrierInit(I, IntrinsicID);
2395 case Intrinsic::amdgcn_s_barrier_join:
2396 case Intrinsic::amdgcn_s_get_named_barrier_state:
2397 return selectNamedBarrierInst(I, IntrinsicID);
2398 case Intrinsic::amdgcn_s_get_barrier_state:
2399 return selectSGetBarrierState(I, IntrinsicID);
2400 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2401 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2402 }
2403 return selectImpl(I, *CoverageInfo);
2404}
2405
2406bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2407 if (selectImpl(I, *CoverageInfo))
2408 return true;
2409
2410 MachineBasicBlock *BB = I.getParent();
2411 const DebugLoc &DL = I.getDebugLoc();
2412
2413 Register DstReg = I.getOperand(0).getReg();
2414 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2415 assert(Size <= 32 || Size == 64);
2416 const MachineOperand &CCOp = I.getOperand(1);
2417 Register CCReg = CCOp.getReg();
2418 if (!isVCC(CCReg, *MRI)) {
2419 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2420 AMDGPU::S_CSELECT_B32;
2421 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2422 .addReg(CCReg);
2423
2424    // The generic constrainSelectedInstRegOperands doesn't work for the scc
2425    // register bank, because it does not cover the register class we use to
2426    // represent it. So we need to set the register class manually here.
2427 if (!MRI->getRegClassOrNull(CCReg))
2428 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2429 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2430 .add(I.getOperand(2))
2431 .add(I.getOperand(3));
2432
2433 bool Ret = false;
2434 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2435 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2436 I.eraseFromParent();
2437 return Ret;
2438 }
2439
2440 // Wide VGPR select should have been split in RegBankSelect.
2441 if (Size > 32)
2442 return false;
2443
2444 MachineInstr *Select =
2445 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2446 .addImm(0)
2447 .add(I.getOperand(3))
2448 .addImm(0)
2449 .add(I.getOperand(2))
2450 .add(I.getOperand(1));
2451
2452 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2453 I.eraseFromParent();
2454 return Ret;
2455}
2456
2457bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2458 Register DstReg = I.getOperand(0).getReg();
2459 Register SrcReg = I.getOperand(1).getReg();
2460 const LLT DstTy = MRI->getType(DstReg);
2461 const LLT SrcTy = MRI->getType(SrcReg);
2462 const LLT S1 = LLT::scalar(1);
2463
2464 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2465 const RegisterBank *DstRB;
2466 if (DstTy == S1) {
2467 // This is a special case. We don't treat s1 for legalization artifacts as
2468 // vcc booleans.
2469 DstRB = SrcRB;
2470 } else {
2471 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2472 if (SrcRB != DstRB)
2473 return false;
2474 }
2475
2476 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2477
2478 unsigned DstSize = DstTy.getSizeInBits();
2479 unsigned SrcSize = SrcTy.getSizeInBits();
2480
2481 const TargetRegisterClass *SrcRC =
2482 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2483 const TargetRegisterClass *DstRC =
2484 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2485 if (!SrcRC || !DstRC)
2486 return false;
2487
2488 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2489 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2490 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2491 return false;
2492 }
2493
2494 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2495 assert(STI.useRealTrue16Insts());
2496 const DebugLoc &DL = I.getDebugLoc();
2497 MachineBasicBlock *MBB = I.getParent();
2498 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2499 .addReg(SrcReg, 0, AMDGPU::lo16);
2500 I.eraseFromParent();
2501 return true;
2502 }
2503
2504 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2505 MachineBasicBlock *MBB = I.getParent();
2506 const DebugLoc &DL = I.getDebugLoc();
2507
2508 Register LoReg = MRI->createVirtualRegister(DstRC);
2509 Register HiReg = MRI->createVirtualRegister(DstRC);
2510 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2511 .addReg(SrcReg, 0, AMDGPU::sub0);
2512 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2513 .addReg(SrcReg, 0, AMDGPU::sub1);
2514
2515 if (IsVALU && STI.hasSDWA()) {
2516 // Write the low 16-bits of the high element into the high 16-bits of the
2517 // low element.
2518 MachineInstr *MovSDWA =
2519 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2520 .addImm(0) // $src0_modifiers
2521 .addReg(HiReg) // $src0
2522 .addImm(0) // $clamp
2523 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2524 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2525 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2526 .addReg(LoReg, RegState::Implicit);
2527 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2528 } else {
2529 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2530 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2531 Register ImmReg = MRI->createVirtualRegister(DstRC);
2532 if (IsVALU) {
2533 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2534 .addImm(16)
2535 .addReg(HiReg);
2536 } else {
2537 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2538 .addReg(HiReg)
2539 .addImm(16)
2540 .setOperandDead(3); // Dead scc
2541 }
2542
2543 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2544 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2545 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2546
2547 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2548 .addImm(0xffff);
2549 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2550 .addReg(LoReg)
2551 .addReg(ImmReg);
2552 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2553 .addReg(TmpReg0)
2554 .addReg(TmpReg1);
2555
2556 if (!IsVALU) {
2557 And.setOperandDead(3); // Dead scc
2558 Or.setOperandDead(3); // Dead scc
2559 }
2560 }
2561
2562 I.eraseFromParent();
2563 return true;
2564 }
2565
2566 if (!DstTy.isScalar())
2567 return false;
2568
2569 if (SrcSize > 32) {
2570 unsigned SubRegIdx = DstSize < 32
2571 ? static_cast<unsigned>(AMDGPU::sub0)
2572 : TRI.getSubRegFromChannel(0, DstSize / 32);
2573 if (SubRegIdx == AMDGPU::NoSubRegister)
2574 return false;
2575
2576 // Deal with weird cases where the class only partially supports the subreg
2577 // index.
2578 const TargetRegisterClass *SrcWithSubRC
2579 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2580 if (!SrcWithSubRC)
2581 return false;
2582
2583 if (SrcWithSubRC != SrcRC) {
2584 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2585 return false;
2586 }
2587
2588 I.getOperand(1).setSubReg(SubRegIdx);
2589 }
2590
2591 I.setDesc(TII.get(TargetOpcode::COPY));
2592 return true;
2593}
2594
2595/// \returns true if a bitmask for \p Size bits will be an inline immediate.
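/// For example, a 4-bit mask (0xf = 15) fits in an inline immediate, while a
/// 16-bit mask (0xffff) does not and would require a literal constant.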
2596static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2597  Mask = maskTrailingOnes<unsigned>(Size);
2598  int SignedMask = static_cast<int>(Mask);
2599 return SignedMask >= -16 && SignedMask <= 64;
2600}
2601
2602// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2603const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2604    Register Reg, const MachineRegisterInfo &MRI,
2605    const TargetRegisterInfo &TRI) const {
2606 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2607 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2608 return RB;
2609
2610 // Ignore the type, since we don't use vcc in artifacts.
2611 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2612 return &RBI.getRegBankFromRegClass(*RC, LLT());
2613 return nullptr;
2614}
2615
2616bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2617 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2618 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2619 const DebugLoc &DL = I.getDebugLoc();
2620 MachineBasicBlock &MBB = *I.getParent();
2621 const Register DstReg = I.getOperand(0).getReg();
2622 const Register SrcReg = I.getOperand(1).getReg();
2623
2624 const LLT DstTy = MRI->getType(DstReg);
2625 const LLT SrcTy = MRI->getType(SrcReg);
2626 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2627 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2628 const unsigned DstSize = DstTy.getSizeInBits();
2629 if (!DstTy.isScalar())
2630 return false;
2631
2632 // Artifact casts should never use vcc.
2633 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2634
2635 // FIXME: This should probably be illegal and split earlier.
2636 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2637 if (DstSize <= 32)
2638 return selectCOPY(I);
2639
2640 const TargetRegisterClass *SrcRC =
2641 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2642 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2643 const TargetRegisterClass *DstRC =
2644 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2645
2646 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2647 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2648 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2649 .addReg(SrcReg)
2650 .addImm(AMDGPU::sub0)
2651 .addReg(UndefReg)
2652 .addImm(AMDGPU::sub1);
2653 I.eraseFromParent();
2654
2655 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2656 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2657 }
2658
2659 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2660 // 64-bit should have been split up in RegBankSelect
2661
2662 // Try to use an and with a mask if it will save code size.
2663 unsigned Mask;
2664 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2665 MachineInstr *ExtI =
2666 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2667 .addImm(Mask)
2668 .addReg(SrcReg);
2669 I.eraseFromParent();
2670 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2671 }
2672
2673 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2674 MachineInstr *ExtI =
2675 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2676 .addReg(SrcReg)
2677 .addImm(0) // Offset
2678 .addImm(SrcSize); // Width
2679 I.eraseFromParent();
2680 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2681 }
2682
2683 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2684 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2685 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2686 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2687 return false;
2688
2689 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2690 const unsigned SextOpc = SrcSize == 8 ?
2691 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2692 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2693 .addReg(SrcReg);
2694 I.eraseFromParent();
2695 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2696 }
2697
2698 // Using a single 32-bit SALU to calculate the high half is smaller than
2699 // S_BFE with a literal constant operand.
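    // For example, a 32-to-64-bit sext becomes {src, src >> 31} and a zext
    // becomes {src, 0}, assembled with a REG_SEQUENCE below.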
2700 if (DstSize > 32 && SrcSize == 32) {
2701 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2702 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2703 if (Signed) {
2704 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2705 .addReg(SrcReg, 0, SubReg)
2706 .addImm(31)
2707 .setOperandDead(3); // Dead scc
2708 } else {
2709 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2710 .addImm(0);
2711 }
2712 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2713 .addReg(SrcReg, 0, SubReg)
2714 .addImm(AMDGPU::sub0)
2715 .addReg(HiReg)
2716 .addImm(AMDGPU::sub1);
2717 I.eraseFromParent();
2718 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2719 *MRI);
2720 }
2721
2722 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2723 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2724
2725    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
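    // For example, a sign extension of the low 8 bits uses an immediate of
    // 8 << 16 (offset 0, width 8); that is what the SrcSize << 16 operands
    // below encode.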
2726 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2727 // We need a 64-bit register source, but the high bits don't matter.
2728 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2729 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2730 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2731
2732 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2733 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2734 .addReg(SrcReg, 0, SubReg)
2735 .addImm(AMDGPU::sub0)
2736 .addReg(UndefReg)
2737 .addImm(AMDGPU::sub1);
2738
2739 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2740 .addReg(ExtReg)
2741 .addImm(SrcSize << 16);
2742
2743 I.eraseFromParent();
2744 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2745 }
2746
2747 unsigned Mask;
2748 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2749 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2750 .addReg(SrcReg)
2751 .addImm(Mask)
2752 .setOperandDead(3); // Dead scc
2753 } else {
2754 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2755 .addReg(SrcReg)
2756 .addImm(SrcSize << 16);
2757 }
2758
2759 I.eraseFromParent();
2760 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2761 }
2762
2763 return false;
2764}
2765
2766static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2767  return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2768}
2769
2770static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2771  Register BitcastSrc;
2772 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2773 Reg = BitcastSrc;
2774 return Reg;
2775}
2776
2777static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2778                           Register &Out) {
2779 Register Trunc;
2780 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2781 return false;
2782
2783 Register LShlSrc;
2784 Register Cst;
2785 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2786 Cst = stripCopy(Cst, MRI);
2787 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2788 Out = stripBitCast(LShlSrc, MRI);
2789 return true;
2790 }
2791 }
2792
2793 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2794 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2795 return false;
2796
2797 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2798 LLT::fixed_vector(2, 16));
2799
2800 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2801 assert(Mask.size() == 2);
2802
2803 if (Mask[0] == 1 && Mask[1] <= 1) {
2804 Out = Shuffle->getOperand(0).getReg();
2805 return true;
2806 }
2807
2808 return false;
2809}
2810
2811bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2812 if (!Subtarget->hasSALUFloatInsts())
2813 return false;
2814
2815 Register Dst = I.getOperand(0).getReg();
2816 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2817 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2818 return false;
2819
2820 Register Src = I.getOperand(1).getReg();
2821
2822 if (MRI->getType(Dst) == LLT::scalar(32) &&
2823 MRI->getType(Src) == LLT::scalar(16)) {
2824 if (isExtractHiElt(*MRI, Src, Src)) {
2825 MachineBasicBlock *BB = I.getParent();
2826 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2827 .addUse(Src);
2828 I.eraseFromParent();
2829 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2830 }
2831 }
2832
2833 return false;
2834}
2835
2836bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2837 // Only manually handle the f64 SGPR case.
2838 //
2839 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2840 // the bit ops theoretically have a second result due to the implicit def of
2841 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2842 // that is easy by disabling the check. The result works, but uses a
2843 // nonsensical sreg32orlds_and_sreg_1 regclass.
2844 //
2845 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2846 // the variadic REG_SEQUENCE operands.
2847
2848 Register Dst = MI.getOperand(0).getReg();
2849 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2850 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2851 MRI->getType(Dst) != LLT::scalar(64))
2852 return false;
2853
2854 Register Src = MI.getOperand(1).getReg();
2855 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2856 if (Fabs)
2857 Src = Fabs->getOperand(1).getReg();
2858
2859 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2860 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2861 return false;
2862
2863 MachineBasicBlock *BB = MI.getParent();
2864 const DebugLoc &DL = MI.getDebugLoc();
2865 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2866 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2867 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2868 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2869
2870 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2871 .addReg(Src, 0, AMDGPU::sub0);
2872 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2873 .addReg(Src, 0, AMDGPU::sub1);
2874 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2875 .addImm(0x80000000);
2876
2877 // Set or toggle sign bit.
2878 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2879 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2880 .addReg(HiReg)
2881 .addReg(ConstReg)
2882 .setOperandDead(3); // Dead scc
2883 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2884 .addReg(LoReg)
2885 .addImm(AMDGPU::sub0)
2886 .addReg(OpReg)
2887 .addImm(AMDGPU::sub1);
2888 MI.eraseFromParent();
2889 return true;
2890}
2891
2892// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2893bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2894 Register Dst = MI.getOperand(0).getReg();
2895 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2896 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2897 MRI->getType(Dst) != LLT::scalar(64))
2898 return false;
2899
2900 Register Src = MI.getOperand(1).getReg();
2901 MachineBasicBlock *BB = MI.getParent();
2902 const DebugLoc &DL = MI.getDebugLoc();
2903 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2904 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2905 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2906 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2907
2908 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2909 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2910 return false;
2911
2912 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2913 .addReg(Src, 0, AMDGPU::sub0);
2914 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2915 .addReg(Src, 0, AMDGPU::sub1);
2916 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2917 .addImm(0x7fffffff);
2918
2919 // Clear sign bit.
2920  // TODO: Should this use S_BITSET0_*?
2921 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2922 .addReg(HiReg)
2923 .addReg(ConstReg)
2924 .setOperandDead(3); // Dead scc
2925 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2926 .addReg(LoReg)
2927 .addImm(AMDGPU::sub0)
2928 .addReg(OpReg)
2929 .addImm(AMDGPU::sub1);
2930
2931 MI.eraseFromParent();
2932 return true;
2933}
2934
2935static bool isConstant(const MachineInstr &MI) {
2936 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2937}
2938
2939void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2940 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2941
2942 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2943 const MachineInstr *PtrMI =
2944 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2945
2946 assert(PtrMI);
2947
2948 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2949 return;
2950
2951 GEPInfo GEPInfo;
2952
2953 for (unsigned i = 1; i != 3; ++i) {
2954 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2955 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2956 assert(OpDef);
2957 if (i == 2 && isConstant(*OpDef)) {
2958 // TODO: Could handle constant base + variable offset, but a combine
2959 // probably should have commuted it.
2960 assert(GEPInfo.Imm == 0);
2961 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2962 continue;
2963 }
2964 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2965 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2966 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2967 else
2968 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2969 }
2970
2971 AddrInfo.push_back(GEPInfo);
2972 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2973}
2974
2975bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2976 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2977}
2978
2979bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2980 if (!MI.hasOneMemOperand())
2981 return false;
2982
2983 const MachineMemOperand *MMO = *MI.memoperands_begin();
2984 const Value *Ptr = MMO->getValue();
2985
2986 // UndefValue means this is a load of a kernel input. These are uniform.
2987 // Sometimes LDS instructions have constant pointers.
2988 // If Ptr is null, then that means this mem operand contains a
2989 // PseudoSourceValue like GOT.
2990  if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Ptr))
2991    return true;
2992
2993  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2994    return true;
2995
2996 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2997 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2998 AMDGPU::SGPRRegBankID;
2999
3000 const Instruction *I = dyn_cast<Instruction>(Ptr);
3001 return I && I->getMetadata("amdgpu.uniform");
3002}
3003
3004bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3005 for (const GEPInfo &GEPInfo : AddrInfo) {
3006 if (!GEPInfo.VgprParts.empty())
3007 return true;
3008 }
3009 return false;
3010}
3011
3012void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3013 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
3014 unsigned AS = PtrTy.getAddressSpace();
3015  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
3016      STI.ldsRequiresM0Init()) {
3017 MachineBasicBlock *BB = I.getParent();
3018
3019 // If DS instructions require M0 initialization, insert it before selecting.
3020 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3021 .addImm(-1);
3022 }
3023}
3024
3025bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3026 MachineInstr &I) const {
3027 initM0(I);
3028 return selectImpl(I, *CoverageInfo);
3029}
3030
3031static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3032  if (Reg.isPhysical())
3033 return false;
3034
3035 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
3036 const unsigned Opcode = MI.getOpcode();
3037
3038 if (Opcode == AMDGPU::COPY)
3039 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3040
3041 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3042 Opcode == AMDGPU::G_XOR)
3043 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3044 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3045
3046 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3047 return GI->is(Intrinsic::amdgcn_class);
3048
3049 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3050}
3051
3052bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3053 MachineBasicBlock *BB = I.getParent();
3054 MachineOperand &CondOp = I.getOperand(0);
3055 Register CondReg = CondOp.getReg();
3056 const DebugLoc &DL = I.getDebugLoc();
3057
3058 unsigned BrOpcode;
3059 Register CondPhysReg;
3060 const TargetRegisterClass *ConstrainRC;
3061
3062 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3063 // whether the branch is uniform when selecting the instruction. In
3064 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3065 // RegBankSelect knows what it's doing if the branch condition is scc, even
3066 // though it currently does not.
3067 if (!isVCC(CondReg, *MRI)) {
3068 if (MRI->getType(CondReg) != LLT::scalar(32))
3069 return false;
3070
3071 CondPhysReg = AMDGPU::SCC;
3072 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3073 ConstrainRC = &AMDGPU::SReg_32RegClass;
3074 } else {
3075    // FIXME: Should scc->vcc copies be ANDed with exec?
3076
3077 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3078 // need to insert an and with exec.
3079 if (!isVCmpResult(CondReg, *MRI)) {
3080 const bool Is64 = STI.isWave64();
3081 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3082 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3083
3084 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3085 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3086 .addReg(CondReg)
3087 .addReg(Exec)
3088 .setOperandDead(3); // Dead scc
3089 CondReg = TmpReg;
3090 }
3091
3092 CondPhysReg = TRI.getVCC();
3093 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3094 ConstrainRC = TRI.getBoolRC();
3095 }
3096
3097 if (!MRI->getRegClassOrNull(CondReg))
3098 MRI->setRegClass(CondReg, ConstrainRC);
3099
3100 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3101 .addReg(CondReg);
3102 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3103 .addMBB(I.getOperand(1).getMBB());
3104
3105 I.eraseFromParent();
3106 return true;
3107}
3108
3109bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3110 MachineInstr &I) const {
3111 Register DstReg = I.getOperand(0).getReg();
3112 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3113 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3114 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3115 if (IsVGPR)
3116 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3117
3118 return RBI.constrainGenericRegister(
3119 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3120}
3121
3122bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3123 Register DstReg = I.getOperand(0).getReg();
3124 Register SrcReg = I.getOperand(1).getReg();
3125 Register MaskReg = I.getOperand(2).getReg();
3126 LLT Ty = MRI->getType(DstReg);
3127 LLT MaskTy = MRI->getType(MaskReg);
3128 MachineBasicBlock *BB = I.getParent();
3129 const DebugLoc &DL = I.getDebugLoc();
3130
3131 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3132 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3133 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3134 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3135 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3136 return false;
3137
3138 // Try to avoid emitting a bit operation when we only need to touch half of
3139 // the 64-bit pointer.
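  // For example, an alignment-style mask whose high 32 bits are all ones only
  // needs a 32-bit AND on the low half and a plain copy of the high half.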
3140 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3141 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3142 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3143
3144 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3145 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3146
3147 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3148 !CanCopyLow32 && !CanCopyHi32) {
3149 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3150 .addReg(SrcReg)
3151 .addReg(MaskReg)
3152 .setOperandDead(3); // Dead scc
3153 I.eraseFromParent();
3154 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3155 }
3156
3157 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3158 const TargetRegisterClass &RegRC
3159 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3160
3161 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3162 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3163 const TargetRegisterClass *MaskRC =
3164 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3165
3166 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3167 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3168 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3169 return false;
3170
3171 if (Ty.getSizeInBits() == 32) {
3172 assert(MaskTy.getSizeInBits() == 32 &&
3173 "ptrmask should have been narrowed during legalize");
3174
3175 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3176 .addReg(SrcReg)
3177 .addReg(MaskReg);
3178
3179 if (!IsVGPR)
3180 NewOp.setOperandDead(3); // Dead scc
3181 I.eraseFromParent();
3182 return true;
3183 }
3184
3185 Register HiReg = MRI->createVirtualRegister(&RegRC);
3186 Register LoReg = MRI->createVirtualRegister(&RegRC);
3187
3188 // Extract the subregisters from the source pointer.
3189 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3190 .addReg(SrcReg, 0, AMDGPU::sub0);
3191 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3192 .addReg(SrcReg, 0, AMDGPU::sub1);
3193
3194 Register MaskedLo, MaskedHi;
3195
3196 if (CanCopyLow32) {
3197 // If all the bits in the low half are 1, we only need a copy for it.
3198 MaskedLo = LoReg;
3199 } else {
3200 // Extract the mask subregister and apply the and.
3201 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3202 MaskedLo = MRI->createVirtualRegister(&RegRC);
3203
3204 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3205 .addReg(MaskReg, 0, AMDGPU::sub0);
3206 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3207 .addReg(LoReg)
3208 .addReg(MaskLo);
3209 }
3210
3211 if (CanCopyHi32) {
3212 // If all the bits in the high half are 1, we only need a copy for it.
3213 MaskedHi = HiReg;
3214 } else {
3215 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3216 MaskedHi = MRI->createVirtualRegister(&RegRC);
3217
3218 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3219 .addReg(MaskReg, 0, AMDGPU::sub1);
3220 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3221 .addReg(HiReg)
3222 .addReg(MaskHi);
3223 }
3224
3225 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3226 .addReg(MaskedLo)
3227 .addImm(AMDGPU::sub0)
3228 .addReg(MaskedHi)
3229 .addImm(AMDGPU::sub1);
3230 I.eraseFromParent();
3231 return true;
3232}
3233
3234/// Return the register to use for the index value, and the subregister to use
3235/// for the indirectly accessed register.
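/// For example, indexing an 8-element vector of 32-bit values with (%base + 2)
/// yields {%base, sub2}: the constant part of the index is folded into the
/// subregister and only the variable part is left for M0 / index mode.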
3236static std::pair<Register, unsigned>
3237computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3238                        const TargetRegisterClass *SuperRC, Register IdxReg,
3239 unsigned EltSize, GISelValueTracking &ValueTracking) {
3240 Register IdxBaseReg;
3241 int Offset;
3242
3243 std::tie(IdxBaseReg, Offset) =
3244 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3245 if (IdxBaseReg == AMDGPU::NoRegister) {
3246 // This will happen if the index is a known constant. This should ordinarily
3247 // be legalized out, but handle it as a register just in case.
3248 assert(Offset == 0);
3249 IdxBaseReg = IdxReg;
3250 }
3251
3252 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3253
3254 // Skip out of bounds offsets, or else we would end up using an undefined
3255 // register.
3256 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3257 return std::pair(IdxReg, SubRegs[0]);
3258 return std::pair(IdxBaseReg, SubRegs[Offset]);
3259}
3260
3261bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3262 MachineInstr &MI) const {
3263 Register DstReg = MI.getOperand(0).getReg();
3264 Register SrcReg = MI.getOperand(1).getReg();
3265 Register IdxReg = MI.getOperand(2).getReg();
3266
3267 LLT DstTy = MRI->getType(DstReg);
3268 LLT SrcTy = MRI->getType(SrcReg);
3269
3270 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3271 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3272 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3273
3274  // The index must be scalar. If it weren't, RegBankSelect should have moved
3275  // this into a waterfall loop.
3276 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3277 return false;
3278
3279 const TargetRegisterClass *SrcRC =
3280 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3281 const TargetRegisterClass *DstRC =
3282 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3283 if (!SrcRC || !DstRC)
3284 return false;
3285 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3286 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3287 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3288 return false;
3289
3290 MachineBasicBlock *BB = MI.getParent();
3291 const DebugLoc &DL = MI.getDebugLoc();
3292 const bool Is64 = DstTy.getSizeInBits() == 64;
3293
3294 unsigned SubReg;
3295 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3296 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3297
3298 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3299 if (DstTy.getSizeInBits() != 32 && !Is64)
3300 return false;
3301
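    // S_MOVRELS reads the dynamic part of the index from M0 and applies the
    // constant part through the source subregister selected above.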
3302 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3303 .addReg(IdxReg);
3304
3305 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3306 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3307 .addReg(SrcReg, 0, SubReg)
3308 .addReg(SrcReg, RegState::Implicit);
3309 MI.eraseFromParent();
3310 return true;
3311 }
3312
3313 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3314 return false;
3315
3316 if (!STI.useVGPRIndexMode()) {
3317 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3318 .addReg(IdxReg);
3319 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3320 .addReg(SrcReg, 0, SubReg)
3321 .addReg(SrcReg, RegState::Implicit);
3322 MI.eraseFromParent();
3323 return true;
3324 }
3325
3326 const MCInstrDesc &GPRIDXDesc =
3327 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3328 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3329 .addReg(SrcReg)
3330 .addReg(IdxReg)
3331 .addImm(SubReg);
3332
3333 MI.eraseFromParent();
3334 return true;
3335}
3336
3337// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3338bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3339 MachineInstr &MI) const {
3340 Register DstReg = MI.getOperand(0).getReg();
3341 Register VecReg = MI.getOperand(1).getReg();
3342 Register ValReg = MI.getOperand(2).getReg();
3343 Register IdxReg = MI.getOperand(3).getReg();
3344
3345 LLT VecTy = MRI->getType(DstReg);
3346 LLT ValTy = MRI->getType(ValReg);
3347 unsigned VecSize = VecTy.getSizeInBits();
3348 unsigned ValSize = ValTy.getSizeInBits();
3349
3350 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3351 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3352 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3353
3354 assert(VecTy.getElementType() == ValTy);
3355
3356  // The index must be scalar. If it weren't, RegBankSelect should have moved
3357  // this into a waterfall loop.
3358 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3359 return false;
3360
3361 const TargetRegisterClass *VecRC =
3362 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3363 const TargetRegisterClass *ValRC =
3364 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3365
3366 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3367 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3368 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3369 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3370 return false;
3371
3372 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3373 return false;
3374
3375 unsigned SubReg;
3376 std::tie(IdxReg, SubReg) =
3377 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3378
3379 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3380 STI.useVGPRIndexMode();
3381
3382 MachineBasicBlock *BB = MI.getParent();
3383 const DebugLoc &DL = MI.getDebugLoc();
3384
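  // Without VGPR index mode, the indirect write goes through a movrel-style
  // pseudo that reads the dynamic index from M0; otherwise a GPR-index pseudo
  // carries the index register as an explicit operand.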
3385 if (!IndexMode) {
3386 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3387 .addReg(IdxReg);
3388
3389 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3390 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3391 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3392 .addReg(VecReg)
3393 .addReg(ValReg)
3394 .addImm(SubReg);
3395 MI.eraseFromParent();
3396 return true;
3397 }
3398
3399 const MCInstrDesc &GPRIDXDesc =
3400 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3401 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3402 .addReg(VecReg)
3403 .addReg(ValReg)
3404 .addReg(IdxReg)
3405 .addImm(SubReg);
3406
3407 MI.eraseFromParent();
3408 return true;
3409}
3410
3411bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3412 if (!Subtarget->hasVMemToLDSLoad())
3413 return false;
3414 unsigned Opc;
3415 unsigned Size = MI.getOperand(3).getImm();
3416
3417 // The struct intrinsic variants add one additional operand over raw.
3418 const bool HasVIndex = MI.getNumOperands() == 9;
3419 Register VIndex;
3420 int OpOffset = 0;
3421 if (HasVIndex) {
3422 VIndex = MI.getOperand(4).getReg();
3423 OpOffset = 1;
3424 }
3425
3426 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3427  std::optional<ValueAndVReg> MaybeVOffset =
3428      getIConstantVRegValWithLookThrough(VOffset, *MRI);
3429  const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3430
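  // The opcode suffix encodes which VGPR address operands are present:
  // BOTHEN = vindex + voffset, IDXEN = vindex only, OFFEN = voffset only,
  // OFFSET = neither.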
3431 switch (Size) {
3432 default:
3433 return false;
3434 case 1:
3435 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3436 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3437 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3438 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3439 break;
3440 case 2:
3441 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3442 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3443 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3444 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3445 break;
3446 case 4:
3447 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3448 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3449 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3450 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3451 break;
3452 case 12:
3453 if (!Subtarget->hasLDSLoadB96_B128())
3454 return false;
3455
3456 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3457 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3458 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3459 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3460 break;
3461 case 16:
3462 if (!Subtarget->hasLDSLoadB96_B128())
3463 return false;
3464
3465 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3466 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3467 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3468 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3469 break;
3470 }
3471
3472 MachineBasicBlock *MBB = MI.getParent();
3473 const DebugLoc &DL = MI.getDebugLoc();
3474 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3475 .add(MI.getOperand(2));
3476
3477 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3478
3479 if (HasVIndex && HasVOffset) {
3480 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3481 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3482 .addReg(VIndex)
3483 .addImm(AMDGPU::sub0)
3484 .addReg(VOffset)
3485 .addImm(AMDGPU::sub1);
3486
3487 MIB.addReg(IdxReg);
3488 } else if (HasVIndex) {
3489 MIB.addReg(VIndex);
3490 } else if (HasVOffset) {
3491 MIB.addReg(VOffset);
3492 }
3493
3494 MIB.add(MI.getOperand(1)); // rsrc
3495 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3496 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3497 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3498 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3499 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3500 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3501 MIB.addImm(
3502 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3503 ? 1
3504 : 0); // swz
3505
3506 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3507 // Don't set the offset value here because the pointer points to the base of
3508 // the buffer.
3509 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3510
3511 MachinePointerInfo StorePtrI = LoadPtrI;
3512 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3516
3517 auto F = LoadMMO->getFlags() &
3519 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3520 Size, LoadMMO->getBaseAlign());
3521
3522 MachineMemOperand *StoreMMO =
3523 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3524 sizeof(int32_t), LoadMMO->getBaseAlign());
3525
3526 MIB.setMemRefs({LoadMMO, StoreMMO});
3527
3528 MI.eraseFromParent();
3529 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3530}
3531
3532/// Match a zero extend from a 32-bit value to 64-bits.
3533Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3534 Register ZExtSrc;
3535 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3536 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3537
3538 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3539 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3540 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3541 return Register();
3542
3543 assert(Def->getNumOperands() == 3 &&
3544 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3545 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3546 return Def->getOperand(1).getReg();
3547 }
3548
3549 return Register();
3550}
3551
3552/// Match a sign extend from a 32-bit value to 64-bits.
3553Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3554 Register SExtSrc;
3555 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3556 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3557
3558  // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (G_ASHR %x, 31)
3559 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3560 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3561 return Register();
3562
3563 assert(Def->getNumOperands() == 3 &&
3564 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3565 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3566 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3567 m_SpecificICst(31))))
3568 return Def->getOperand(1).getReg();
3569
3570 if (VT->signBitIsZero(Reg))
3571 return matchZeroExtendFromS32(Reg);
3572
3573 return Register();
3574}
3575
3576/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3577/// is 32-bit.
3579AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3580 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3581 : matchZeroExtendFromS32(Reg);
3582}
3583
3584/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3585/// is 32-bit.
3587AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3588 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3589 : matchSignExtendFromS32(Reg);
3590}
3591
3593AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3594 bool IsSigned) const {
3595 if (IsSigned)
3596 return matchSignExtendFromS32OrS32(Reg);
3597
3598 return matchZeroExtendFromS32OrS32(Reg);
3599}
3600
3601Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3602 Register AnyExtSrc;
3603 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3604 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3605
3606  // Match legalized form %anyext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3607 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3608 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3609 return Register();
3610
3611 assert(Def->getNumOperands() == 3 &&
3612 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3613
3614 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3615 return Def->getOperand(1).getReg();
3616
3617 return Register();
3618}
3619
3620bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3621 if (!Subtarget->hasVMemToLDSLoad())
3622 return false;
3623
3624 unsigned Opc;
3625 unsigned Size = MI.getOperand(3).getImm();
3626
3627 switch (Size) {
3628 default:
3629 return false;
3630 case 1:
3631 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3632 break;
3633 case 2:
3634 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3635 break;
3636 case 4:
3637 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3638 break;
3639 case 12:
3640 if (!Subtarget->hasLDSLoadB96_B128())
3641 return false;
3642 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3643 break;
3644 case 16:
3645 if (!Subtarget->hasLDSLoadB96_B128())
3646 return false;
3647 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3648 break;
3649 }
3650
3651 MachineBasicBlock *MBB = MI.getParent();
3652 const DebugLoc &DL = MI.getDebugLoc();
3653 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3654 .add(MI.getOperand(2));
3655
3656 Register Addr = MI.getOperand(1).getReg();
3657 Register VOffset;
3658 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3659 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
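  // For example, %addr = G_PTR_ADD %sgpr_base, (zext %voff32) selects the SADDR
  // form with %sgpr_base as the scalar address and %voff32 as the 32-bit VGPR
  // offset.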
3660 if (!isSGPR(Addr)) {
3661 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3662 if (isSGPR(AddrDef->Reg)) {
3663 Addr = AddrDef->Reg;
3664 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3665 Register SAddr =
3666 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3667 if (isSGPR(SAddr)) {
3668 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3669 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3670 Addr = SAddr;
3671 VOffset = Off;
3672 }
3673 }
3674 }
3675 }
3676
3677  if (isSGPR(Addr)) {
3678    Opc = AMDGPU::getGlobalSaddrOp(Opc);
3679    if (!VOffset) {
3680 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3681 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3682 .addImm(0);
3683 }
3684 }
3685
3686 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3687 .addReg(Addr);
3688
3689 if (isSGPR(Addr))
3690 MIB.addReg(VOffset);
3691
3692 MIB.add(MI.getOperand(4)); // offset
3693
3694 unsigned Aux = MI.getOperand(5).getImm();
3695 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3696
3697 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3698 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3699 LoadPtrI.Offset = MI.getOperand(4).getImm();
3700 MachinePointerInfo StorePtrI = LoadPtrI;
3701 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3705 auto F = LoadMMO->getFlags() &
3707 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3708 Size, LoadMMO->getBaseAlign());
3709 MachineMemOperand *StoreMMO =
3710 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3711 sizeof(int32_t), Align(4));
3712
3713 MIB.setMemRefs({LoadMMO, StoreMMO});
3714
3715 MI.eraseFromParent();
3716 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3717}
3718
3719bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3720 MachineInstr &MI) const {
3721 unsigned OpcodeOpIdx =
3722 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3723 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3724 MI.removeOperand(OpcodeOpIdx);
3725 MI.addImplicitDefUseOperands(*MI.getMF());
3726 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3727}
3728
3729// FIXME: This should be removed, letting the patterns do the selection. We
3730// just need the AGPR/VGPR combination versions.
3731bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3732 unsigned Opc;
3733 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3734 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3735 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3736 break;
3737 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3738 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3739 break;
3740 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3741 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3742 break;
3743 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3744 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3745 break;
3746 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3747 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3748 break;
3749 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3750 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3751 break;
3752 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3753 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3754 break;
3755 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3756 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3757 break;
3758 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3759 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3760 break;
3761 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3762 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3763 break;
3764 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3765 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3766 break;
3767 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3768 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3769 break;
3770 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3771 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3772 break;
3773 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3774 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3775 break;
3776 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3777 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3778 break;
3779 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3780 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3781 break;
3782 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3783 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3784 break;
3785 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3786 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3787 break;
3788 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3789 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3790 break;
3791 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3792 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3793 break;
3794 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3795 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3796 break;
3797 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3798 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3799 break;
3800 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3801 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3802 break;
3803 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3804 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3805 break;
3806 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3807 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3808 break;
3809 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3810 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3811 break;
3812 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3813 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3814 break;
3815 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3816 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3817 break;
3818 default:
3819 llvm_unreachable("unhandled smfmac intrinsic");
3820 }
3821
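  // Rewrite the intrinsic into the SMFMAC pseudo: drop the intrinsic ID and
  // move the accumulator input (VDst_In) to the end of the operand list so it
  // matches the pseudo's operand order.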
3822 auto VDst_In = MI.getOperand(4);
3823
3824 MI.setDesc(TII.get(Opc));
3825 MI.removeOperand(4); // VDst_In
3826 MI.removeOperand(1); // Intrinsic ID
3827  MI.addOperand(VDst_In); // Re-add VDst_In at the end
3828 MI.addImplicitDefUseOperands(*MI.getMF());
3829 const MCInstrDesc &MCID = MI.getDesc();
3830 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
3831 MI.getOperand(0).setIsEarlyClobber(true);
3832 }
3833 return true;
3834}
3835
3836bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3837 MachineInstr &MI, Intrinsic::ID IntrID) const {
3838 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3839 !Subtarget->hasPermlane16Swap())
3840 return false;
3841 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3842 !Subtarget->hasPermlane32Swap())
3843 return false;
3844
3845 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3846 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3847 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3848
3849 MI.removeOperand(2);
3850 MI.setDesc(TII.get(Opcode));
3851 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3852
3853 MachineOperand &FI = MI.getOperand(4);
3855
3856 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3857}
3858
3859bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3860 Register DstReg = MI.getOperand(0).getReg();
3861 Register SrcReg = MI.getOperand(1).getReg();
3862 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3863 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3864 MachineBasicBlock *MBB = MI.getParent();
3865 const DebugLoc &DL = MI.getDebugLoc();
3866
3867 if (IsVALU) {
3868 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3869 .addImm(Subtarget->getWavefrontSizeLog2())
3870 .addReg(SrcReg);
3871 } else {
3872 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3873 .addReg(SrcReg)
3874 .addImm(Subtarget->getWavefrontSizeLog2())
3875 .setOperandDead(3); // Dead scc
3876 }
3877
3878 const TargetRegisterClass &RC =
3879 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3880 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3881 return false;
3882
3883 MI.eraseFromParent();
3884 return true;
3885}
3886
3887// Match a BITOP3 operation and return the number of matched instructions plus
3888// the truth table.
3889static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3890                                              SmallVectorImpl<Register> &Src,
3891                                              const MachineRegisterInfo &MRI) {
3892 unsigned NumOpcodes = 0;
3893 uint8_t LHSBits, RHSBits;
3894
3895 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3896 // Define truth table given Src0, Src1, Src2 bits permutations:
3897 // 0 0 0
3898 // 0 0 1
3899 // 0 1 0
3900 // 0 1 1
3901 // 1 0 0
3902 // 1 0 1
3903 // 1 1 0
3904 // 1 1 1
3905 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
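    // For example, (src0 & src1) | src2 produces the truth table
    // (0xf0 & 0xcc) | 0xaa = 0xea.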
3906
3907 if (mi_match(Op, MRI, m_AllOnesInt())) {
3908 Bits = 0xff;
3909 return true;
3910 }
3911 if (mi_match(Op, MRI, m_ZeroInt())) {
3912 Bits = 0;
3913 return true;
3914 }
3915
3916 for (unsigned I = 0; I < Src.size(); ++I) {
3917 // Try to find existing reused operand
3918 if (Src[I] == Op) {
3919 Bits = SrcBits[I];
3920 return true;
3921 }
3922 // Try to replace parent operator
3923 if (Src[I] == R) {
3924 Bits = SrcBits[I];
3925 Src[I] = Op;
3926 return true;
3927 }
3928 }
3929
3930 if (Src.size() == 3) {
3931      // No room left for operands. Try one last time; there can be a 'not' of
3932      // one of our source operands, in which case we can compute the bits
3933      // without growing the Src vector.
3934 Register LHS;
3935 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3937 for (unsigned I = 0; I < Src.size(); ++I) {
3938 if (Src[I] == LHS) {
3939 Bits = ~SrcBits[I];
3940 return true;
3941 }
3942 }
3943 }
3944
3945 return false;
3946 }
3947
3948 Bits = SrcBits[Src.size()];
3949 Src.push_back(Op);
3950 return true;
3951 };
3952
3953 MachineInstr *MI = MRI.getVRegDef(R);
3954 switch (MI->getOpcode()) {
3955 case TargetOpcode::G_AND:
3956 case TargetOpcode::G_OR:
3957 case TargetOpcode::G_XOR: {
3958 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3959 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3960
3961 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3962 if (!getOperandBits(LHS, LHSBits) ||
3963 !getOperandBits(RHS, RHSBits)) {
3964 Src = Backup;
3965 return std::make_pair(0, 0);
3966 }
3967
3968 // Recursion is naturally limited by the size of the operand vector.
3969 auto Op = BitOp3_Op(LHS, Src, MRI);
3970 if (Op.first) {
3971 NumOpcodes += Op.first;
3972 LHSBits = Op.second;
3973 }
3974
3975 Op = BitOp3_Op(RHS, Src, MRI);
3976 if (Op.first) {
3977 NumOpcodes += Op.first;
3978 RHSBits = Op.second;
3979 }
3980 break;
3981 }
3982 default:
3983 return std::make_pair(0, 0);
3984 }
3985
3986 uint8_t TTbl;
3987 switch (MI->getOpcode()) {
3988 case TargetOpcode::G_AND:
3989 TTbl = LHSBits & RHSBits;
3990 break;
3991 case TargetOpcode::G_OR:
3992 TTbl = LHSBits | RHSBits;
3993 break;
3994 case TargetOpcode::G_XOR:
3995 TTbl = LHSBits ^ RHSBits;
3996 break;
3997 default:
3998 break;
3999 }
4000
4001 return std::make_pair(NumOpcodes + 1, TTbl);
4002}
4003
4004bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4005 if (!Subtarget->hasBitOp3Insts())
4006 return false;
4007
4008 Register DstReg = MI.getOperand(0).getReg();
4009 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4010 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4011 if (!IsVALU)
4012 return false;
4013
4015 uint8_t TTbl;
4016 unsigned NumOpcodes;
4017
4018 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
4019
4020  // The Src.empty() case can happen if the operands are all zeros or all ones.
4021  // Normally this is optimized out before we get here.
4022 if (NumOpcodes < 2 || Src.empty())
4023 return false;
4024
4025 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
4026 if (NumOpcodes == 2 && IsB32) {
4027    // Avoid using BITOP3 for OR3, XOR3, and AND_OR. This is not faster, but it
4028    // makes the asm more readable. This cannot be modeled with AddedComplexity
4029    // because the selector does not know how many operations we matched.
4030 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
4031 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4032 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4033 return false;
4034 } else if (NumOpcodes < 4) {
4035    // For the uniform case the threshold should be higher to account for moves
4036    // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two can
4037    // be in SGPRs, with a readfirstlane afterwards.
4038 return false;
4039 }
4040
4041 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4042 if (!IsB32 && STI.hasTrue16BitInsts())
4043 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4044 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4045 unsigned CBL = STI.getConstantBusLimit(Opc);
4046 MachineBasicBlock *MBB = MI.getParent();
4047 const DebugLoc &DL = MI.getDebugLoc();
4048
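  // Only as many SGPR sources as the constant bus limit allows can be used
  // directly; any remaining SGPR sources are copied to VGPRs first.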
4049 for (unsigned I = 0; I < Src.size(); ++I) {
4050 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4051 if (RB->getID() != AMDGPU::SGPRRegBankID)
4052 continue;
4053 if (CBL > 0) {
4054 --CBL;
4055 continue;
4056 }
4057 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4058 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4059 .addReg(Src[I]);
4060 Src[I] = NewReg;
4061 }
4062
4063  // The last operand can be ignored, turning a ternary operation into a binary
4064  // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4065  // 'c' with 'a' here without changing the answer. In some pathological cases
4066  // it is even possible to end up with a single-operand operation, if the
4067  // optimizer did not catch it.
4068 while (Src.size() < 3)
4069 Src.push_back(Src[0]);
4070
4071 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4072 if (!IsB32)
4073 MIB.addImm(0); // src_mod0
4074 MIB.addReg(Src[0]);
4075 if (!IsB32)
4076 MIB.addImm(0); // src_mod1
4077 MIB.addReg(Src[1]);
4078 if (!IsB32)
4079 MIB.addImm(0); // src_mod2
4080 MIB.addReg(Src[2])
4081 .addImm(TTbl);
4082 if (!IsB32)
4083 MIB.addImm(0); // op_sel
4084
4085 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4086 MI.eraseFromParent();
4087
4088 return true;
4089}
4090
4091bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4092 Register SrcReg = MI.getOperand(0).getReg();
4093 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4094 return false;
4095
4096 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4097 Register SP =
4098 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4099 Register WaveAddr = getWaveAddress(DefMI);
4100 MachineBasicBlock *MBB = MI.getParent();
4101 const DebugLoc &DL = MI.getDebugLoc();
4102
4103 if (!WaveAddr) {
4104 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4105 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4106 .addReg(SrcReg)
4107 .addImm(Subtarget->getWavefrontSizeLog2())
4108 .setOperandDead(3); // Dead scc
4109 }
4110
4111 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4112 .addReg(WaveAddr);
4113
4114 MI.eraseFromParent();
4115 return true;
4116}
4117
4118bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4119
4120 if (!I.isPreISelOpcode()) {
4121 if (I.isCopy())
4122 return selectCOPY(I);
4123 return true;
4124 }
4125
4126 switch (I.getOpcode()) {
4127 case TargetOpcode::G_AND:
4128 case TargetOpcode::G_OR:
4129 case TargetOpcode::G_XOR:
4130 if (selectBITOP3(I))
4131 return true;
4132 if (selectImpl(I, *CoverageInfo))
4133 return true;
4134 return selectG_AND_OR_XOR(I);
4135 case TargetOpcode::G_ADD:
4136 case TargetOpcode::G_SUB:
4137 case TargetOpcode::G_PTR_ADD:
4138 if (selectImpl(I, *CoverageInfo))
4139 return true;
4140 return selectG_ADD_SUB(I);
4141 case TargetOpcode::G_UADDO:
4142 case TargetOpcode::G_USUBO:
4143 case TargetOpcode::G_UADDE:
4144 case TargetOpcode::G_USUBE:
4145 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4146 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4147 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4148 return selectG_AMDGPU_MAD_64_32(I);
4149 case TargetOpcode::G_INTTOPTR:
4150 case TargetOpcode::G_BITCAST:
4151 case TargetOpcode::G_PTRTOINT:
4152 case TargetOpcode::G_FREEZE:
4153 return selectCOPY(I);
4154 case TargetOpcode::G_FNEG:
4155 if (selectImpl(I, *CoverageInfo))
4156 return true;
4157 return selectG_FNEG(I);
4158 case TargetOpcode::G_FABS:
4159 if (selectImpl(I, *CoverageInfo))
4160 return true;
4161 return selectG_FABS(I);
4162 case TargetOpcode::G_EXTRACT:
4163 return selectG_EXTRACT(I);
4164 case TargetOpcode::G_MERGE_VALUES:
4165 case TargetOpcode::G_CONCAT_VECTORS:
4166 return selectG_MERGE_VALUES(I);
4167 case TargetOpcode::G_UNMERGE_VALUES:
4168 return selectG_UNMERGE_VALUES(I);
4169 case TargetOpcode::G_BUILD_VECTOR:
4170 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4171 return selectG_BUILD_VECTOR(I);
4172 case TargetOpcode::G_IMPLICIT_DEF:
4173 return selectG_IMPLICIT_DEF(I);
4174 case TargetOpcode::G_INSERT:
4175 return selectG_INSERT(I);
4176 case TargetOpcode::G_INTRINSIC:
4177 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4178 return selectG_INTRINSIC(I);
4179 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4180 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4181 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4182 case TargetOpcode::G_ICMP:
4183 case TargetOpcode::G_FCMP:
4184 if (selectG_ICMP_or_FCMP(I))
4185 return true;
4186 return selectImpl(I, *CoverageInfo);
4187 case TargetOpcode::G_LOAD:
4188 case TargetOpcode::G_ZEXTLOAD:
4189 case TargetOpcode::G_SEXTLOAD:
4190 case TargetOpcode::G_STORE:
4191 case TargetOpcode::G_ATOMIC_CMPXCHG:
4192 case TargetOpcode::G_ATOMICRMW_XCHG:
4193 case TargetOpcode::G_ATOMICRMW_ADD:
4194 case TargetOpcode::G_ATOMICRMW_SUB:
4195 case TargetOpcode::G_ATOMICRMW_AND:
4196 case TargetOpcode::G_ATOMICRMW_OR:
4197 case TargetOpcode::G_ATOMICRMW_XOR:
4198 case TargetOpcode::G_ATOMICRMW_MIN:
4199 case TargetOpcode::G_ATOMICRMW_MAX:
4200 case TargetOpcode::G_ATOMICRMW_UMIN:
4201 case TargetOpcode::G_ATOMICRMW_UMAX:
4202 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4203 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4204 case TargetOpcode::G_ATOMICRMW_FADD:
4205 case TargetOpcode::G_ATOMICRMW_FMIN:
4206 case TargetOpcode::G_ATOMICRMW_FMAX:
4207 return selectG_LOAD_STORE_ATOMICRMW(I);
4208 case TargetOpcode::G_SELECT:
4209 return selectG_SELECT(I);
4210 case TargetOpcode::G_TRUNC:
4211 return selectG_TRUNC(I);
4212 case TargetOpcode::G_SEXT:
4213 case TargetOpcode::G_ZEXT:
4214 case TargetOpcode::G_ANYEXT:
4215 case TargetOpcode::G_SEXT_INREG:
4216    // This is a workaround. For extension from type i1, `selectImpl()` uses
4217    // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY,
4218    // since type i1 can only be held in an SGPR class.
4219 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4220 selectImpl(I, *CoverageInfo))
4221 return true;
4222 return selectG_SZA_EXT(I);
4223 case TargetOpcode::G_FPEXT:
4224 if (selectG_FPEXT(I))
4225 return true;
4226 return selectImpl(I, *CoverageInfo);
4227 case TargetOpcode::G_BRCOND:
4228 return selectG_BRCOND(I);
4229 case TargetOpcode::G_GLOBAL_VALUE:
4230 return selectG_GLOBAL_VALUE(I);
4231 case TargetOpcode::G_PTRMASK:
4232 return selectG_PTRMASK(I);
4233 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4234 return selectG_EXTRACT_VECTOR_ELT(I);
4235 case TargetOpcode::G_INSERT_VECTOR_ELT:
4236 return selectG_INSERT_VECTOR_ELT(I);
4237 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4238 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4239 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4240 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4241 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4242 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4244 assert(Intr && "not an image intrinsic with image pseudo");
4245 return selectImageIntrinsic(I, Intr);
4246 }
4247 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4248 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4249 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4250 return selectBVHIntersectRayIntrinsic(I);
4251 case AMDGPU::G_SBFX:
4252 case AMDGPU::G_UBFX:
4253 return selectG_SBFX_UBFX(I);
4254 case AMDGPU::G_SI_CALL:
4255 I.setDesc(TII.get(AMDGPU::SI_CALL));
4256 return true;
4257 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4258 return selectWaveAddress(I);
4259 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4260 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4261 return true;
4262 }
4263 case AMDGPU::G_STACKRESTORE:
4264 return selectStackRestore(I);
4265 case AMDGPU::G_PHI:
4266 return selectPHI(I);
4267 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4268 return selectCOPY_SCC_VCC(I);
4269 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4270 return selectCOPY_VCC_SCC(I);
4271 case AMDGPU::G_AMDGPU_READANYLANE:
4272 return selectReadAnyLane(I);
4273 case TargetOpcode::G_CONSTANT:
4274 case TargetOpcode::G_FCONSTANT:
4275 default:
4276 return selectImpl(I, *CoverageInfo);
4277 }
4278 return false;
4279}
4280
4282AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4283 return {{
4284 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4285 }};
4286
4287}
4288
4289std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4290 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4291 unsigned Mods = 0;
4292 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4293
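  // Peel source-modifier wrappers off the operand: e.g. (fneg (fabs x)) folds
  // to x with the NEG and ABS bits set in Mods (when abs is allowed).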
4294 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4295 Src = MI->getOperand(1).getReg();
4296 Mods |= SISrcMods::NEG;
4297 MI = getDefIgnoringCopies(Src, *MRI);
4298 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4299    // Fold fsub [+-]0 into fneg. This may not have been folded depending on the
4300    // denormal mode, but we're implicitly canonicalizing in a source operand.
4301 const ConstantFP *LHS =
4302 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4303 if (LHS && LHS->isZero()) {
4304 Mods |= SISrcMods::NEG;
4305 Src = MI->getOperand(2).getReg();
4306 }
4307 }
4308
4309 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4310 Src = MI->getOperand(1).getReg();
4311 Mods |= SISrcMods::ABS;
4312 }
4313
4314 if (OpSel)
4315 Mods |= SISrcMods::OP_SEL_0;
4316
4317 return std::pair(Src, Mods);
4318}
4319
4320Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4321 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4322 bool ForceVGPR) const {
4323 if ((Mods != 0 || ForceVGPR) &&
4324 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4325
4326 // If we looked through copies to find source modifiers on an SGPR operand,
4327 // we now have an SGPR register source. To avoid potentially violating the
4328 // constant bus restriction, we need to insert a copy to a VGPR.
4329 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4330 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4331 TII.get(AMDGPU::COPY), VGPRSrc)
4332 .addReg(Src);
4333 Src = VGPRSrc;
4334 }
4335
4336 return Src;
4337}
4338
4339///
4340/// This will select either an SGPR or VGPR operand and will save us from
4341/// having to write an extra tablegen pattern.
4343AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4344 return {{
4345 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4346 }};
4347}
4348
4350AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4351 Register Src;
4352 unsigned Mods;
4353 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4354
4355 return {{
4356 [=](MachineInstrBuilder &MIB) {
4357 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4358 },
4359 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4360 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4361 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4362 }};
4363}
4364
4366AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4367 Register Src;
4368 unsigned Mods;
4369 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4370 /*IsCanonicalizing=*/true,
4371 /*AllowAbs=*/false);
4372
4373 return {{
4374 [=](MachineInstrBuilder &MIB) {
4375 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4376 },
4377 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4378 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4379 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4380 }};
4381}
4382
4384AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4385 return {{
4386 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4387 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4388 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4389 }};
4390}
4391
4393AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4394 Register Src;
4395 unsigned Mods;
4396 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4397
4398 return {{
4399 [=](MachineInstrBuilder &MIB) {
4400 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4401 },
4402 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4403 }};
4404}
4405
4407AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4408 MachineOperand &Root) const {
4409 Register Src;
4410 unsigned Mods;
4411 std::tie(Src, Mods) =
4412 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4413
4414 return {{
4415 [=](MachineInstrBuilder &MIB) {
4416 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4417 },
4418 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4419 }};
4420}
4421
4423AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4424 Register Src;
4425 unsigned Mods;
4426 std::tie(Src, Mods) =
4427 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4428 /*AllowAbs=*/false);
4429
4430 return {{
4431 [=](MachineInstrBuilder &MIB) {
4432 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4433 },
4434 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4435 }};
4436}
4437
4439AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4440 Register Reg = Root.getReg();
4441 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4442 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4443 return {};
4444 return {{
4445 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4446 }};
4447}
4448
4449enum class SrcStatus {
4454 // This means current op = [op_upper, op_lower] and src = -op_lower.
4457 // This means current op = [op_upper, op_lower] and src = [op_upper,
4458 // -op_lower].
4466};
4467/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4468static bool isTruncHalf(const MachineInstr *MI,
4469 const MachineRegisterInfo &MRI) {
4470 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4471 return false;
4472
4473 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4474 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4475 return DstSize * 2 == SrcSize;
4476}
4477
4478/// Test if the MI is a logical shift right by half the bit width,
4479/// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`
4480static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4481 if (MI->getOpcode() != AMDGPU::G_LSHR)
4482 return false;
4483
4484 Register ShiftSrc;
4485 std::optional<ValueAndVReg> ShiftAmt;
4486 if (mi_match(MI->getOperand(0).getReg(), MRI,
4487 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4488 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4489 unsigned Shift = ShiftAmt->Value.getZExtValue();
4490 return Shift * 2 == SrcSize;
4491 }
4492 return false;
4493}
4494
4495/// Test if the MI is a shift left by half the bit width,
4496/// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`
4497static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4498 if (MI->getOpcode() != AMDGPU::G_SHL)
4499 return false;
4500
4501 Register ShiftSrc;
4502 std::optional<ValueAndVReg> ShiftAmt;
4503 if (mi_match(MI->getOperand(0).getReg(), MRI,
4504 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4505 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4506 unsigned Shift = ShiftAmt->Value.getZExtValue();
4507 return Shift * 2 == SrcSize;
4508 }
4509 return false;
4510}
4511
4512/// Test if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4513static bool isUnmergeHalf(const MachineInstr *MI,
4514 const MachineRegisterInfo &MRI) {
4515 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4516 return false;
4517 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4518 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4519}
4520
4522
4524 const MachineRegisterInfo &MRI) {
4525 LLT OpTy = MRI.getType(Reg);
4526 if (OpTy.isScalar())
4527 return TypeClass::SCALAR;
4528 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4531}
4532
4534 const MachineRegisterInfo &MRI) {
4536 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4537 return SrcStatus::INVALID;
4538
4539 switch (S) {
4540 case SrcStatus::IS_SAME:
4541 if (NegType == TypeClass::VECTOR_OF_TWO) {
4542 // Vector of 2:
4543 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4544 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4545 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4546 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4548 }
4549 if (NegType == TypeClass::SCALAR) {
4550 // Scalar:
4551 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4552 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4553 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4554 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4555 return SrcStatus::IS_HI_NEG;
4556 }
4557 break;
4559 if (NegType == TypeClass::VECTOR_OF_TWO) {
4560 // Vector of 2:
4561 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4562 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4563 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4564 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4565 return SrcStatus::IS_LO_NEG;
4566 }
4567 if (NegType == TypeClass::SCALAR) {
4568 // Scalar:
4569 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4570 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4571 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4572 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4573 return SrcStatus::IS_SAME;
4574 }
4575 break;
4577 if (NegType == TypeClass::VECTOR_OF_TWO) {
4578 // Vector of 2:
4579 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4580 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4581 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4582 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4583 return SrcStatus::IS_HI_NEG;
4584 }
4585 if (NegType == TypeClass::SCALAR) {
4586 // Scalar:
4587 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4588 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4589 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4590 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4592 }
4593 break;
4595 if (NegType == TypeClass::VECTOR_OF_TWO) {
4596 // Vector of 2:
4597 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4598 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4599 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4600 // [SrcHi, SrcLo] = [OpHi, OpLo]
4601 return SrcStatus::IS_SAME;
4602 }
4603 if (NegType == TypeClass::SCALAR) {
4604 // Scalar:
4605 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4606 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4607 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4608 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4609 return SrcStatus::IS_LO_NEG;
4610 }
4611 break;
4613 // Vector of 2:
4614 // Src = CurrUpper
4615 // Curr = [CurrUpper, CurrLower]
4616 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4617 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4618 // Src = -OpUpper
4619 //
4620 // Scalar:
4621 // Src = CurrUpper
4622 // Curr = [CurrUpper, CurrLower]
4623 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4624 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4625 // Src = -OpUpper
4628 if (NegType == TypeClass::VECTOR_OF_TWO) {
4629 // Vector of 2:
4630 // Src = CurrLower
4631 // Curr = [CurrUpper, CurrLower]
4632 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4633 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4634 // Src = -OpLower
4636 }
4637 if (NegType == TypeClass::SCALAR) {
4638 // Scalar:
4639 // Src = CurrLower
4640 // Curr = [CurrUpper, CurrLower]
4641 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4642 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4643 // Src = OpLower
4645 }
4646 break;
4648 // Vector of 2:
4649 // Src = -CurrUpper
4650 // Curr = [CurrUpper, CurrLower]
4651 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4652 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4653 // Src = -(-OpUpper) = OpUpper
4654 //
4655 // Scalar:
4656 // Src = -CurrUpper
4657 // Curr = [CurrUpper, CurrLower]
4658 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4659 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4660 // Src = -(-OpUpper) = OpUpper
4663 if (NegType == TypeClass::VECTOR_OF_TWO) {
4664 // Vector of 2:
4665 // Src = -CurrLower
4666 // Curr = [CurrUpper, CurrLower]
4667 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4668 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4669 // Src = -(-OpLower) = OpLower
4671 }
4672 if (NegType == TypeClass::SCALAR) {
4673 // Scalar:
4674 // Src = -CurrLower
4675 // Curr = [CurrUpper, CurrLower]
4676 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4677 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4678 // Src = -OpLower
4680 }
4681 break;
4682 default:
4683 break;
4684 }
4685 llvm_unreachable("unexpected SrcStatus & NegType combination");
4686}
4687
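// Step one instruction up the def chain of Curr.first, propagating the
// SrcStatus across bitcasts, copies and fnegs, and recognizing trunc/shift/
// unmerge patterns that select the upper or lower half of a wider value.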
4688static std::optional<std::pair<Register, SrcStatus>>
4689calcNextStatus(std::pair<Register, SrcStatus> Curr,
4690 const MachineRegisterInfo &MRI) {
4691 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4692
4693 unsigned Opc = MI->getOpcode();
4694
4695 // Handle general Opc cases.
4696 switch (Opc) {
4697 case AMDGPU::G_BITCAST:
4698 return std::optional<std::pair<Register, SrcStatus>>(
4699 {MI->getOperand(1).getReg(), Curr.second});
4700 case AMDGPU::COPY:
4701 if (MI->getOperand(1).getReg().isPhysical())
4702 return std::nullopt;
4703 return std::optional<std::pair<Register, SrcStatus>>(
4704 {MI->getOperand(1).getReg(), Curr.second});
4705 case AMDGPU::G_FNEG: {
4706 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4707 if (Stat == SrcStatus::INVALID)
4708 return std::nullopt;
4709 return std::optional<std::pair<Register, SrcStatus>>(
4710 {MI->getOperand(1).getReg(), Stat});
4711 }
4712 default:
4713 break;
4714 }
4715
4716 // Calc next Stat from current Stat.
4717 switch (Curr.second) {
4718 case SrcStatus::IS_SAME:
4719 if (isTruncHalf(MI, MRI))
4720 return std::optional<std::pair<Register, SrcStatus>>(
4721 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4722 else if (isUnmergeHalf(MI, MRI)) {
4723 if (Curr.first == MI->getOperand(0).getReg())
4724 return std::optional<std::pair<Register, SrcStatus>>(
4725 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4726 return std::optional<std::pair<Register, SrcStatus>>(
4727 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4728 }
4729 break;
4731 if (isTruncHalf(MI, MRI)) {
4732 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4733 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4734 // = [OpLowerHi, OpLowerLo]
4735 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4736 // = [-OpLowerHi, OpLowerLo]
4737 // = -OpLower
4738 return std::optional<std::pair<Register, SrcStatus>>(
4739 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4740 }
4741 if (isUnmergeHalf(MI, MRI)) {
4742 if (Curr.first == MI->getOperand(0).getReg())
4743 return std::optional<std::pair<Register, SrcStatus>>(
4744 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4745 return std::optional<std::pair<Register, SrcStatus>>(
4746 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4747 }
4748 break;
4750 if (isShlHalf(MI, MRI))
4751 return std::optional<std::pair<Register, SrcStatus>>(
4752 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4753 break;
4755 if (isLshrHalf(MI, MRI))
4756 return std::optional<std::pair<Register, SrcStatus>>(
4757 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
4758 break;
4760 if (isShlHalf(MI, MRI))
4761 return std::optional<std::pair<Register, SrcStatus>>(
4762 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4763 break;
4765 if (isLshrHalf(MI, MRI))
4766 return std::optional<std::pair<Register, SrcStatus>>(
4767 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4768 break;
4769 default:
4770 break;
4771 }
4772 return std::nullopt;
4773}
4774
4775/// This is used to control which statuses the current MI supports. For example,
4776/// a non-floating-point intrinsic such as @llvm.amdgcn.sdot2 does not support
4777/// the NEG bit on VOP3P.
4778/// The class can be further extended to recognize support for the SEL, NEG, and
4779/// ABS bits for different MIs on different architectures.
4781private:
4782 bool HasNeg = false;
4783  // Assume all VOP3P complex patterns have opsel.
4784 bool HasOpsel = true;
4785
4786public:
4787  SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
4788    const MachineInstr *MI = MRI.getVRegDef(Reg);
4789 unsigned Opc = MI->getOpcode();
4790
4791 if (Opc < TargetOpcode::GENERIC_OP_END) {
4792 // Keep same for generic op.
4793 HasNeg = true;
4794 } else if (Opc == TargetOpcode::G_INTRINSIC) {
4795 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
4796      // Only floating-point intrinsics have neg & neg_hi bits.
4797 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
4798 HasNeg = true;
4799 }
4800 }
4801 bool checkOptions(SrcStatus Stat) const {
4802 if (!HasNeg &&
4803 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
4804 return false;
4805 }
4806 if (!HasOpsel &&
4807 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
4808 return false;
4809 }
4810 return true;
4811 }
4812};
4813
4816 int MaxDepth = 3) {
4817 int Depth = 0;
4818 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
4820
4821 while (Depth <= MaxDepth && Curr.has_value()) {
4822 Depth++;
4823 if (SO.checkOptions(Curr.value().second))
4824 Statlist.push_back(Curr.value());
4825 Curr = calcNextStatus(Curr.value(), MRI);
4826 }
4827
4828 return Statlist;
4829}
4830
4831static std::pair<Register, SrcStatus>
4833 int MaxDepth = 3) {
4834 int Depth = 0;
4835 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
4836 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
4837
4838 while (Depth <= MaxDepth && Curr.has_value()) {
4839 Depth++;
4840 SrcStatus Stat = Curr.value().second;
4841 if (SO.checkOptions(Stat)) {
4842 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
4844 LastSameOrNeg = Curr.value();
4845 }
4846 Curr = calcNextStatus(Curr.value(), MRI);
4847 }
4848
4849 return LastSameOrNeg;
4850}
4851
4852static bool isSameBitWidth(Register Reg1, Register Reg2,
4853 const MachineRegisterInfo &MRI) {
4854 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
4855 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
4856 return Width1 == Width2;
4857}
4858
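// Translate where the packed operand's high and low halves come from
// (upper/lower half of the source, possibly negated) into the
// OP_SEL_0/OP_SEL_1 and NEG/NEG_HI source modifier bits.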
4859static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
4860  // SrcStatus::IS_LOWER_HALF needs no modifier bits and leaves Mods unchanged.
4861 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
4862 Mods ^= SISrcMods::NEG_HI;
4863 Mods |= SISrcMods::OP_SEL_1;
4864 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
4865 Mods |= SISrcMods::OP_SEL_1;
4866 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
4867 Mods ^= SISrcMods::NEG_HI;
4868 else if (HiStat == SrcStatus::IS_HI_NEG)
4869 Mods ^= SISrcMods::NEG_HI;
4870
4871 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
4872 Mods ^= SISrcMods::NEG;
4873 Mods |= SISrcMods::OP_SEL_0;
4874 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
4875 Mods |= SISrcMods::OP_SEL_0;
4876 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
4877 Mods |= SISrcMods::NEG;
4878 else if (LoStat == SrcStatus::IS_HI_NEG)
4879 Mods ^= SISrcMods::NEG;
4880
4881 return Mods;
4882}
4883
4884static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
4885 Register RootReg, const SIInstrInfo &TII,
4886 const MachineRegisterInfo &MRI) {
4887 auto IsHalfState = [](SrcStatus S) {
4890 };
4891 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
4892 IsHalfState(HiStat);
4893}
4894
4895std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
4896 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
4897 unsigned Mods = 0;
4898  // No modification if the Root type is not of the form <2 x Type>.
4899 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
4900 Mods |= SISrcMods::OP_SEL_1;
4901 return {RootReg, Mods};
4902 }
4903
4904 SearchOptions SO(RootReg, MRI);
4905
4906 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
4907
4908 if (Stat.second == SrcStatus::IS_BOTH_NEG)
4910 else if (Stat.second == SrcStatus::IS_HI_NEG)
4911 Mods ^= SISrcMods::NEG_HI;
4912 else if (Stat.second == SrcStatus::IS_LO_NEG)
4913 Mods ^= SISrcMods::NEG;
4914
4915 MachineInstr *MI = MRI.getVRegDef(Stat.first);
4916
4917 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
4918 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
4919 Mods |= SISrcMods::OP_SEL_1;
4920 return {Stat.first, Mods};
4921 }
4922
4923  SmallVector<std::pair<Register, SrcStatus>> StatlistHi =
4924 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
4925
4926 if (StatlistHi.empty()) {
4927 Mods |= SISrcMods::OP_SEL_1;
4928 return {Stat.first, Mods};
4929 }
4930
4931  SmallVector<std::pair<Register, SrcStatus>> StatlistLo =
4932 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
4933
4934 if (StatlistLo.empty()) {
4935 Mods |= SISrcMods::OP_SEL_1;
4936 return {Stat.first, Mods};
4937 }
4938
4939 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
4940 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
4941 if (StatlistHi[I].first == StatlistLo[J].first &&
4942 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
4943 StatlistHi[I].first, RootReg, TII, MRI))
4944 return {StatlistHi[I].first,
4945 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
4946 }
4947 }
4948 // Packed instructions do not have abs modifiers.
4949 Mods |= SISrcMods::OP_SEL_1;
4950
4951 return {Stat.first, Mods};
4952}
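// Illustrative summary of the search above (register names assumed): when
// RootReg is a G_BUILD_VECTOR of two halves that both trace back to one 32-bit
// register, that common register is returned and the neg/op_sel bits are
// encoded through updateMods, so the vector construction itself does not need
// to be selected.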
4953
4954// Removed unused function `getAllKindImm` to eliminate dead code.
4955
4956static bool checkRB(Register Reg, unsigned int RBNo,
4957 const AMDGPURegisterBankInfo &RBI,
4958 const MachineRegisterInfo &MRI,
4959 const TargetRegisterInfo &TRI) {
4960 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
4961 return RB->getID() == RBNo;
4962}
4963
4964// This function is used to get the correct register bank for the returned reg.
4965// Assume:
4966// 1. VOP3P is always legal for VGPR.
4967// 2. RootOp's register bank is legal.
4968// Thus:
4969// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
4970// 2. If RootOp is VGPR, then NewOp must be VGPR.
4971static Register getLegalRegBank(Register NewReg, Register RootReg,
4972                                const AMDGPURegisterBankInfo &RBI,
4973                                MachineRegisterInfo &MRI,
4974                                const TargetRegisterInfo &TRI,
4975 const SIInstrInfo &TII) {
4976  // RootOp can only be VGPR or SGPR (some hand-written cases such as
4977  // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
4978 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
4979 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
4980 return NewReg;
4981
4982 MachineInstr *MI = MRI.getVRegDef(RootReg);
4983 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
4984 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
4985 return RootReg;
4986 }
4987
4988 MachineBasicBlock *BB = MI->getParent();
4989 Register DstReg = MRI.cloneVirtualRegister(RootReg);
4990
4992 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
4993 .addReg(NewReg);
4994
4995 // Only accept VGPR.
4996 return MIB->getOperand(0).getReg();
4997}
4998
4999InstructionSelector::ComplexRendererFns
5000AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5001 bool IsDOT) const {
5002 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5003 Register Reg;
5004 unsigned Mods;
5005 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
5006
5007 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
5008 return {{
5009 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5010 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5011 }};
5012}
5013
5014InstructionSelector::ComplexRendererFns
5015AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5016
5017 return selectVOP3PRetHelper(Root);
5018}
5019
5020InstructionSelector::ComplexRendererFns
5021AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5022
5023 return selectVOP3PRetHelper(Root, true);
5024}
5025
5026InstructionSelector::ComplexRendererFns
5027AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5028 MachineOperand &Root) const {
5029 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5030 "expected i1 value");
5031 unsigned Mods = SISrcMods::OP_SEL_1;
5032 if (Root.getImm() != 0)
5033 Mods |= SISrcMods::OP_SEL_0;
5034
5035 return {{
5036 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5037 }};
5038}
5039
5040static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
5041                                 MachineInstr *InsertPt,
5042                                 MachineRegisterInfo &MRI) {
5043 const TargetRegisterClass *DstRegClass;
5044 switch (Elts.size()) {
5045 case 8:
5046 DstRegClass = &AMDGPU::VReg_256RegClass;
5047 break;
5048 case 4:
5049 DstRegClass = &AMDGPU::VReg_128RegClass;
5050 break;
5051 case 2:
5052 DstRegClass = &AMDGPU::VReg_64RegClass;
5053 break;
5054 default:
5055 llvm_unreachable("unhandled Reg sequence size");
5056 }
5057
5058 MachineIRBuilder B(*InsertPt);
5059 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5060 .addDef(MRI.createVirtualRegister(DstRegClass));
5061 for (unsigned i = 0; i < Elts.size(); ++i) {
5062 MIB.addReg(Elts[i]);
5063    MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
5064 }
5065 return MIB->getOperand(0).getReg();
5066}
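// Usage sketch: two 32-bit elements produce a VReg_64 REG_SEQUENCE, four
// produce VReg_128, and eight produce VReg_256; any other element count is
// unreachable here.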
5067
5068static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5069                                 SmallVectorImpl<Register> &Elts, Register &Src,
5070                                 MachineInstr *InsertPt,
5071                                 MachineRegisterInfo &MRI) {
5072 if (ModOpcode == TargetOpcode::G_FNEG) {
5073 Mods |= SISrcMods::NEG;
5074 // Check if all elements also have abs modifier
5075 SmallVector<Register, 8> NegAbsElts;
5076 for (auto El : Elts) {
5077 Register FabsSrc;
5078 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5079 break;
5080 NegAbsElts.push_back(FabsSrc);
5081 }
5082 if (Elts.size() != NegAbsElts.size()) {
5083 // Neg
5084 Src = buildRegSequence(Elts, InsertPt, MRI);
5085 } else {
5086 // Neg and Abs
5087 Mods |= SISrcMods::NEG_HI;
5088 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5089 }
5090 } else {
5091 assert(ModOpcode == TargetOpcode::G_FABS);
5092 // Abs
5093 Mods |= SISrcMods::NEG_HI;
5094 Src = buildRegSequence(Elts, InsertPt, MRI);
5095 }
5096}
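// Illustrative example: if every element is G_FNEG(G_FABS(x)), Mods gains
// SISrcMods::NEG | SISrcMods::NEG_HI and the source is rebuilt from the fabs
// inputs; if only some elements carry a fabs, only NEG is set and the fneg
// source operands are packed unchanged.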
5097
5098InstructionSelector::ComplexRendererFns
5099AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5100 Register Src = Root.getReg();
5101 unsigned Mods = SISrcMods::OP_SEL_1;
5102  SmallVector<Register, 8> EltsF32;
5103
5104 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5105 assert(BV->getNumSources() > 0);
5106 // Based on first element decide which mod we match, neg or abs
5107 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5108 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5109 ? AMDGPU::G_FNEG
5110 : AMDGPU::G_FABS;
5111 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5112 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5113 if (ElF32->getOpcode() != ModOpcode)
5114 break;
5115 EltsF32.push_back(ElF32->getOperand(1).getReg());
5116 }
5117
5118 // All elements had ModOpcode modifier
5119 if (BV->getNumSources() == EltsF32.size()) {
5120 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5121 *MRI);
5122 }
5123 }
5124
5125 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5126 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5127}
5128
5129InstructionSelector::ComplexRendererFns
5130AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5131 Register Src = Root.getReg();
5132 unsigned Mods = SISrcMods::OP_SEL_1;
5133 SmallVector<Register, 8> EltsV2F16;
5134
5135 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5136 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5137 Register FNegSrc;
5138 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5139 break;
5140 EltsV2F16.push_back(FNegSrc);
5141 }
5142
5143 // All elements had ModOpcode modifier
5144 if (CV->getNumSources() == EltsV2F16.size()) {
5145 Mods |= SISrcMods::NEG;
5146 Mods |= SISrcMods::NEG_HI;
5147 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5148 }
5149 }
5150
5151 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5152 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5153}
5154
5155InstructionSelector::ComplexRendererFns
5156AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5157 Register Src = Root.getReg();
5158 unsigned Mods = SISrcMods::OP_SEL_1;
5159 SmallVector<Register, 8> EltsV2F16;
5160
5161 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5162 assert(CV->getNumSources() > 0);
5163 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5164 // Based on first element decide which mod we match, neg or abs
5165 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5166 ? AMDGPU::G_FNEG
5167 : AMDGPU::G_FABS;
5168
5169 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5170 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5171 if (ElV2F16->getOpcode() != ModOpcode)
5172 break;
5173 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5174 }
5175
5176 // All elements had ModOpcode modifier
5177 if (CV->getNumSources() == EltsV2F16.size()) {
5178 MachineIRBuilder B(*Root.getParent());
5179 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5180 *MRI);
5181 }
5182 }
5183
5184 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5185 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5186}
5187
5188InstructionSelector::ComplexRendererFns
5189AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5190 std::optional<FPValueAndVReg> FPValReg;
5191 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5192 if (TII.isInlineConstant(FPValReg->Value)) {
5193 return {{[=](MachineInstrBuilder &MIB) {
5194 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5195 }}};
5196 }
5197    // Non-inlineable splat floats should not fall through to the integer
5198    // immediate checks below.
5199 return {};
5200 }
5201
5202 APInt ICst;
5203 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5204 if (TII.isInlineConstant(ICst)) {
5205 return {
5206 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5207 }
5208 }
5209
5210 return {};
5211}
5212
5213InstructionSelector::ComplexRendererFns
5214AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5215 Register Src =
5216 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5217 unsigned Key = 0;
5218
5219 Register ShiftSrc;
5220 std::optional<ValueAndVReg> ShiftAmt;
5221 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5222 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5223 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5224 Key = ShiftAmt->Value.getZExtValue() / 8;
5225 Src = ShiftSrc;
5226 }
5227
5228 return {{
5229 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5230 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5231 }};
5232}
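// Illustrative example (assumed 32-bit vreg %x): an index defined as
// (G_LSHR %x, 24) selects byte 3, so Src becomes %x and index_key is 3;
// without such a shift the key stays 0 and the operand is used as-is.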
5233
5234InstructionSelector::ComplexRendererFns
5235AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5236
5237 Register Src =
5238 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5239 unsigned Key = 0;
5240
5241 Register ShiftSrc;
5242 std::optional<ValueAndVReg> ShiftAmt;
5243 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5244 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5245 ShiftAmt->Value.getZExtValue() == 16) {
5246 Src = ShiftSrc;
5247 Key = 1;
5248 }
5249
5250 return {{
5251 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5252 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5253 }};
5254}
5255
5256InstructionSelector::ComplexRendererFns
5257AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5258 Register Src =
5259 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5260 unsigned Key = 0;
5261
5262 Register S32 = matchZeroExtendFromS32(Src);
5263 if (!S32)
5264 S32 = matchAnyExtendFromS32(Src);
5265
5266 if (S32) {
5267 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5268 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5269 assert(Def->getNumOperands() == 3);
5270 Register DstReg1 = Def->getOperand(1).getReg();
5271 if (mi_match(S32, *MRI,
5272 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5273 Src = Def->getOperand(2).getReg();
5274 Key = 1;
5275 }
5276 }
5277 }
5278
5279 return {{
5280 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5281 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5282 }};
5283}
5284
5285InstructionSelector::ComplexRendererFns
5286AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5287 Register Src;
5288 unsigned Mods;
5289 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5290
5291 // FIXME: Handle op_sel
5292 return {{
5293 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5294 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5295 }};
5296}
5297
5298// FIXME-TRUE16 remove when fake16 is removed
5299InstructionSelector::ComplexRendererFns
5300AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5301 Register Src;
5302 unsigned Mods;
5303 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5304 /*IsCanonicalizing=*/true,
5305 /*AllowAbs=*/false,
5306 /*OpSel=*/false);
5307
5308 return {{
5309 [=](MachineInstrBuilder &MIB) {
5310 MIB.addReg(
5311 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5312 },
5313 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5314 }};
5315}
5316
5317InstructionSelector::ComplexRendererFns
5318AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5319 Register Src;
5320 unsigned Mods;
5321 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5322 /*IsCanonicalizing=*/true,
5323 /*AllowAbs=*/false,
5324 /*OpSel=*/true);
5325
5326 return {{
5327 [=](MachineInstrBuilder &MIB) {
5328 MIB.addReg(
5329 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5330 },
5331 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5332 }};
5333}
5334
5335// Given \p Offset and the load specified by the \p Root operand, check if
5336// \p Offset is a multiple of the load byte size. If it is, update \p Offset to
5337// the pre-scaled value and return true.
5338bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5339                                                  Register &Offset,
5340 bool IsSigned) const {
5341 if (!Subtarget->hasScaleOffset())
5342 return false;
5343
5344 const MachineInstr &MI = *Root.getParent();
5345 MachineMemOperand *MMO = *MI.memoperands_begin();
5346
5347 if (!MMO->getSize().hasValue())
5348 return false;
5349
5350 uint64_t Size = MMO->getSize().getValue();
5351
5352 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5353 if (!OffsetReg)
5354 OffsetReg = Offset;
5355
5356 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5357 OffsetReg = Def->Reg;
5358
5359 Register Op0;
5360 MachineInstr *Mul;
5361 bool ScaleOffset =
5362 (isPowerOf2_64(Size) &&
5363 mi_match(OffsetReg, *MRI,
5364 m_GShl(m_Reg(Op0),
5365                       m_any_of(m_SpecificICst(Log2_64(Size)),
5366                                m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
5367 mi_match(OffsetReg, *MRI,
5368               m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
5369 m_Copy(m_SpecificICst(Size))))) ||
5370 mi_match(
5371 OffsetReg, *MRI,
5372 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5373 m_Reg(Op0), m_SpecificICst(Size))) ||
5374 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5375 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5376 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5377 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5378 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5379 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5380 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5381 mi_match(Mul->getOperand(3).getReg(), *MRI,
5383 m_Copy(m_SpecificICst(Size))))) &&
5384 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5385
5386 if (ScaleOffset)
5387 Offset = Op0;
5388
5389 return ScaleOffset;
5390}
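// Illustrative example (assumed values): for a 4-byte access whose offset is
// computed as S_MUL_U64 %idx, 4, the multiply is stripped, Offset is rewritten
// to %idx, and the caller adds AMDGPU::CPol::SCAL so the hardware scales the
// offset by the access size instead.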
5391
5392bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5393 Register &Base,
5394 Register *SOffset,
5395 int64_t *Offset,
5396 bool *ScaleOffset) const {
5397 MachineInstr *MI = Root.getParent();
5398 MachineBasicBlock *MBB = MI->getParent();
5399
5400 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5401 // then we can select all ptr + 32-bit offsets.
5402 SmallVector<GEPInfo, 4> AddrInfo;
5403 getAddrModeInfo(*MI, *MRI, AddrInfo);
5404
5405 if (AddrInfo.empty())
5406 return false;
5407
5408 const GEPInfo &GEPI = AddrInfo[0];
5409 std::optional<int64_t> EncodedImm;
5410
5411 if (ScaleOffset)
5412 *ScaleOffset = false;
5413
5414 if (SOffset && Offset) {
5415 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5416 /*HasSOffset=*/true);
5417 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5418 AddrInfo.size() > 1) {
5419 const GEPInfo &GEPI2 = AddrInfo[1];
5420 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5421 Register OffsetReg = GEPI2.SgprParts[1];
5422 if (ScaleOffset)
5423 *ScaleOffset =
5424 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5425 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5426 if (OffsetReg) {
5427 Base = GEPI2.SgprParts[0];
5428 *SOffset = OffsetReg;
5429 *Offset = *EncodedImm;
5430 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5431 return true;
5432
5433          // For unbuffered smem loads, it is illegal for the immediate offset
5434          // to be negative if the resulting (Offset + (M0 or SOffset or zero))
5435          // is negative. Handle the case where the immediate offset + SOffset
5436          // is negative.
5437 auto SKnown = VT->getKnownBits(*SOffset);
5438 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5439 return false;
5440
5441 return true;
5442 }
5443 }
5444 }
5445 return false;
5446 }
5447
5448 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5449 /*HasSOffset=*/false);
5450 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5451 Base = GEPI.SgprParts[0];
5452 *Offset = *EncodedImm;
5453 return true;
5454 }
5455
5456 // SGPR offset is unsigned.
5457 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5458 GEPI.Imm != 0) {
5459    // If we make it this far we have a load with a 32-bit immediate offset.
5460 // It is OK to select this using a sgpr offset, because we have already
5461 // failed trying to select this load into one of the _IMM variants since
5462 // the _IMM Patterns are considered before the _SGPR patterns.
5463 Base = GEPI.SgprParts[0];
5464 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5465 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5466 .addImm(GEPI.Imm);
5467 return true;
5468 }
5469
5470  if (SOffset && GEPI.SgprParts.size() == 2 && GEPI.Imm == 0) {
5471 Register OffsetReg = GEPI.SgprParts[1];
5472 if (ScaleOffset)
5473 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5474 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5475 if (OffsetReg) {
5476 Base = GEPI.SgprParts[0];
5477 *SOffset = OffsetReg;
5478 return true;
5479 }
5480 }
5481
5482 return false;
5483}
5484
5485InstructionSelector::ComplexRendererFns
5486AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5487 Register Base;
5488 int64_t Offset;
5489 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5490 /* ScaleOffset */ nullptr))
5491 return std::nullopt;
5492
5493 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5494 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5495}
5496
5497InstructionSelector::ComplexRendererFns
5498AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5499 SmallVector<GEPInfo, 4> AddrInfo;
5500 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5501
5502 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5503 return std::nullopt;
5504
5505 const GEPInfo &GEPInfo = AddrInfo[0];
5506 Register PtrReg = GEPInfo.SgprParts[0];
5507 std::optional<int64_t> EncodedImm =
5508 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5509 if (!EncodedImm)
5510 return std::nullopt;
5511
5512 return {{
5513 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5514 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5515 }};
5516}
5517
5518InstructionSelector::ComplexRendererFns
5519AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5520 Register Base, SOffset;
5521 bool ScaleOffset;
5522 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5523 &ScaleOffset))
5524 return std::nullopt;
5525
5526 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5527 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5528 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5529 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5530}
5531
5532InstructionSelector::ComplexRendererFns
5533AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5534 Register Base, SOffset;
5535 int64_t Offset;
5536 bool ScaleOffset;
5537 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5538 return std::nullopt;
5539
5540 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5541 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5542 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5543 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5544 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5545}
5546
5547std::pair<Register, int>
5548AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5549 uint64_t FlatVariant) const {
5550 MachineInstr *MI = Root.getParent();
5551
5552 auto Default = std::pair(Root.getReg(), 0);
5553
5554 if (!STI.hasFlatInstOffsets())
5555 return Default;
5556
5557 Register PtrBase;
5558 int64_t ConstOffset;
5559 bool IsInBounds;
5560 std::tie(PtrBase, ConstOffset, IsInBounds) =
5561 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5562
5563 // Adding the offset to the base address with an immediate in a FLAT
5564 // instruction must not change the memory aperture in which the address falls.
5565 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5566 // instructions.
5567 if (ConstOffset == 0 ||
5568 (FlatVariant == SIInstrFlags::FlatScratch &&
5569 !isFlatScratchBaseLegal(Root.getReg())) ||
5570 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5571 return Default;
5572
5573 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5574 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5575 return Default;
5576
5577 return std::pair(PtrBase, ConstOffset);
5578}
5579
5580InstructionSelector::ComplexRendererFns
5581AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5582 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5583
5584 return {{
5585 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5586 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5587 }};
5588}
5589
5590InstructionSelector::ComplexRendererFns
5591AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5592 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5593
5594 return {{
5595 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5596 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5597 }};
5598}
5599
5600InstructionSelector::ComplexRendererFns
5601AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5602 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5603
5604 return {{
5605 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5606 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5607 }};
5608}
5609
5610// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5611InstructionSelector::ComplexRendererFns
5612AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5613 unsigned CPolBits,
5614 bool NeedIOffset) const {
5615 Register Addr = Root.getReg();
5616 Register PtrBase;
5617 int64_t ConstOffset;
5618 int64_t ImmOffset = 0;
5619
5620 // Match the immediate offset first, which canonically is moved as low as
5621 // possible.
5622 std::tie(PtrBase, ConstOffset, std::ignore) =
5623 getPtrBaseWithConstantOffset(Addr, *MRI);
5624
5625 if (ConstOffset != 0) {
5626 if (NeedIOffset &&
5627 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5628                            SIInstrFlags::FlatGlobal)) {
5629 Addr = PtrBase;
5630 ImmOffset = ConstOffset;
5631 } else {
5632 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5633 if (isSGPR(PtrBaseDef->Reg)) {
5634 if (ConstOffset > 0) {
5635 // Offset is too large.
5636 //
5637 // saddr + large_offset -> saddr +
5638 // (voffset = large_offset & ~MaxOffset) +
5639 // (large_offset & MaxOffset);
5640 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5641 if (NeedIOffset) {
5642 std::tie(SplitImmOffset, RemainderOffset) =
5643 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5644                                      SIInstrFlags::FlatGlobal);
5645 }
5646
5647 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5648 : isUInt<32>(RemainderOffset)) {
5649 MachineInstr *MI = Root.getParent();
5650 MachineBasicBlock *MBB = MI->getParent();
5651 Register HighBits =
5652 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5653
5654 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5655 HighBits)
5656 .addImm(RemainderOffset);
5657
5658 if (NeedIOffset)
5659 return {{
5660 [=](MachineInstrBuilder &MIB) {
5661 MIB.addReg(PtrBase);
5662 }, // saddr
5663 [=](MachineInstrBuilder &MIB) {
5664 MIB.addReg(HighBits);
5665 }, // voffset
5666 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5667 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5668 }};
5669 return {{
5670 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5671 [=](MachineInstrBuilder &MIB) {
5672 MIB.addReg(HighBits);
5673 }, // voffset
5674 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5675 }};
5676 }
5677 }
5678
5679 // We are adding a 64 bit SGPR and a constant. If constant bus limit
5680 // is 1 we would need to perform 1 or 2 extra moves for each half of
5681 // the constant and it is better to do a scalar add and then issue a
5682 // single VALU instruction to materialize zero. Otherwise it is less
5683 // instructions to perform VALU adds with immediates or inline literals.
5684 unsigned NumLiterals =
5685 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5686 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5687 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5688 return std::nullopt;
5689 }
5690 }
5691 }
5692
5693 // Match the variable offset.
5694 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5695 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5696 // Look through the SGPR->VGPR copy.
5697 Register SAddr =
5698 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
5699
5700 if (isSGPR(SAddr)) {
5701 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
5702
5703 // It's possible voffset is an SGPR here, but the copy to VGPR will be
5704 // inserted later.
5705 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
5706 Subtarget->hasSignedGVSOffset());
5707 if (Register VOffset = matchExtendFromS32OrS32(
5708 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
5709 if (NeedIOffset)
5710 return {{[=](MachineInstrBuilder &MIB) { // saddr
5711 MIB.addReg(SAddr);
5712 },
5713 [=](MachineInstrBuilder &MIB) { // voffset
5714 MIB.addReg(VOffset);
5715 },
5716 [=](MachineInstrBuilder &MIB) { // offset
5717 MIB.addImm(ImmOffset);
5718 },
5719 [=](MachineInstrBuilder &MIB) { // cpol
5720 MIB.addImm(CPolBits |
5721 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5722 }}};
5723 return {{[=](MachineInstrBuilder &MIB) { // saddr
5724 MIB.addReg(SAddr);
5725 },
5726 [=](MachineInstrBuilder &MIB) { // voffset
5727 MIB.addReg(VOffset);
5728 },
5729 [=](MachineInstrBuilder &MIB) { // cpol
5730 MIB.addImm(CPolBits |
5731 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5732 }}};
5733 }
5734 }
5735 }
5736
5737 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
5738 // drop this.
5739 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
5740 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
5741 return std::nullopt;
5742
5743 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
5744 // moves required to copy a 64-bit SGPR to VGPR.
5745 MachineInstr *MI = Root.getParent();
5746 MachineBasicBlock *MBB = MI->getParent();
5747 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5748
5749 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
5750 .addImm(0);
5751
5752 if (NeedIOffset)
5753 return {{
5754 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5755 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5756 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5757 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5758 }};
5759 return {{
5760 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5761 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5762 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5763 }};
5764}
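// Worked example for the large-offset split above (illustrative, assuming a
// 12-bit unsigned immediate field; the actual width is subtarget dependent):
// ConstOffset = 0x12345 splits into SplitImmOffset = 0x345, kept as the
// immediate, and RemainderOffset = 0x12000, materialized into the voffset
// VGPR with V_MOV_B32.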
5765
5766InstructionSelector::ComplexRendererFns
5767AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
5768 return selectGlobalSAddr(Root, 0);
5769}
5770
5771InstructionSelector::ComplexRendererFns
5772AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
5773 const MachineInstr &I = *Root.getParent();
5774
5775 // We are assuming CPol is always the last operand of the intrinsic.
5776 auto PassedCPol =
5777 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5778 return selectGlobalSAddr(Root, PassedCPol);
5779}
5780
5781InstructionSelector::ComplexRendererFns
5782AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
5783 const MachineInstr &I = *Root.getParent();
5784
5785 // We are assuming CPol is second from last operand of the intrinsic.
5786 auto PassedCPol =
5787 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5788 return selectGlobalSAddr(Root, PassedCPol);
5789}
5790
5791InstructionSelector::ComplexRendererFns
5792AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
5793 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
5794}
5795
5796InstructionSelector::ComplexRendererFns
5797AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
5798 MachineOperand &Root) const {
5799 const MachineInstr &I = *Root.getParent();
5800
5801 // We are assuming CPol is always the last operand of the intrinsic.
5802 auto PassedCPol =
5803 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5804 return selectGlobalSAddr(Root, PassedCPol, false);
5805}
5806
5807InstructionSelector::ComplexRendererFns
5808AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
5809 MachineOperand &Root) const {
5810 const MachineInstr &I = *Root.getParent();
5811
5812 // We are assuming CPol is second from last operand of the intrinsic.
5813 auto PassedCPol =
5814 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5815 return selectGlobalSAddr(Root, PassedCPol, false);
5816}
5817
5818InstructionSelector::ComplexRendererFns
5819AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
5820 Register Addr = Root.getReg();
5821 Register PtrBase;
5822 int64_t ConstOffset;
5823 int64_t ImmOffset = 0;
5824
5825 // Match the immediate offset first, which canonically is moved as low as
5826 // possible.
5827 std::tie(PtrBase, ConstOffset, std::ignore) =
5828 getPtrBaseWithConstantOffset(Addr, *MRI);
5829
5830 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
5831 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5832                            SIInstrFlags::FlatScratch)) {
5833 Addr = PtrBase;
5834 ImmOffset = ConstOffset;
5835 }
5836
5837 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5838 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5839 int FI = AddrDef->MI->getOperand(1).getIndex();
5840 return {{
5841 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5842 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5843 }};
5844 }
5845
5846 Register SAddr = AddrDef->Reg;
5847
5848 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5849 Register LHS = AddrDef->MI->getOperand(1).getReg();
5850 Register RHS = AddrDef->MI->getOperand(2).getReg();
5851 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5852 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
5853
5854 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
5855 isSGPR(RHSDef->Reg)) {
5856 int FI = LHSDef->MI->getOperand(1).getIndex();
5857 MachineInstr &I = *Root.getParent();
5858 MachineBasicBlock *BB = I.getParent();
5859 const DebugLoc &DL = I.getDebugLoc();
5860 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5861
5862 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
5863 .addFrameIndex(FI)
5864 .addReg(RHSDef->Reg)
5865 .setOperandDead(3); // Dead scc
5866 }
5867 }
5868
5869 if (!isSGPR(SAddr))
5870 return std::nullopt;
5871
5872 return {{
5873 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
5874 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5875 }};
5876}
5877
5878// Check whether the flat scratch SVS swizzle bug affects this access.
5879bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
5880 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
5881 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
5882 return false;
5883
5884 // The bug affects the swizzling of SVS accesses if there is any carry out
5885 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
5886 // voffset to (soffset + inst_offset).
5887 auto VKnown = VT->getKnownBits(VAddr);
5888 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
5889 KnownBits::makeConstant(APInt(32, ImmOffset)));
5890 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5891 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
5892 return (VMax & 3) + (SMax & 3) >= 4;
5893}
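// Illustrative example: if the known bits give (VMax & 3) == 2 and
// (SMax & 3) == 3, the two low bits can carry out (2 + 3 >= 4), so the SVS
// form is rejected and selection falls back to another addressing mode.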
5894
5895InstructionSelector::ComplexRendererFns
5896AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
5897 Register Addr = Root.getReg();
5898 Register PtrBase;
5899 int64_t ConstOffset;
5900 int64_t ImmOffset = 0;
5901
5902 // Match the immediate offset first, which canonically is moved as low as
5903 // possible.
5904 std::tie(PtrBase, ConstOffset, std::ignore) =
5905 getPtrBaseWithConstantOffset(Addr, *MRI);
5906
5907 Register OrigAddr = Addr;
5908 if (ConstOffset != 0 &&
5909 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5910                            SIInstrFlags::FlatScratch)) {
5911 Addr = PtrBase;
5912 ImmOffset = ConstOffset;
5913 }
5914
5915 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5916 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
5917 return std::nullopt;
5918
5919 Register RHS = AddrDef->MI->getOperand(2).getReg();
5920 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
5921 return std::nullopt;
5922
5923 Register LHS = AddrDef->MI->getOperand(1).getReg();
5924 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5925
5926 if (OrigAddr != Addr) {
5927 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
5928 return std::nullopt;
5929 } else {
5930 if (!isFlatScratchBaseLegalSV(OrigAddr))
5931 return std::nullopt;
5932 }
5933
5934 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
5935 return std::nullopt;
5936
5937 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
5938                       ? AMDGPU::CPol::SCAL
5939 : 0;
5940
5941 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5942 int FI = LHSDef->MI->getOperand(1).getIndex();
5943 return {{
5944 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5945 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5946 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5947 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5948 }};
5949 }
5950
5951 if (!isSGPR(LHS))
5952 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
5953 LHS = Def->Reg;
5954
5955 if (!isSGPR(LHS))
5956 return std::nullopt;
5957
5958 return {{
5959 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5960 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
5961 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5962 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5963 }};
5964}
5965
5966InstructionSelector::ComplexRendererFns
5967AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
5968 MachineInstr *MI = Root.getParent();
5969 MachineBasicBlock *MBB = MI->getParent();
5970 MachineFunction *MF = MBB->getParent();
5971 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5972
5973 int64_t Offset = 0;
5974 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
5975 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
5976 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5977
5978 // TODO: Should this be inside the render function? The iterator seems to
5979 // move.
5980 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
5981 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5982 HighBits)
5983 .addImm(Offset & ~MaxOffset);
5984
5985 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5986 MIB.addReg(Info->getScratchRSrcReg());
5987 },
5988 [=](MachineInstrBuilder &MIB) { // vaddr
5989 MIB.addReg(HighBits);
5990 },
5991 [=](MachineInstrBuilder &MIB) { // soffset
5992 // Use constant zero for soffset and rely on eliminateFrameIndex
5993 // to choose the appropriate frame register if need be.
5994 MIB.addImm(0);
5995 },
5996 [=](MachineInstrBuilder &MIB) { // offset
5997 MIB.addImm(Offset & MaxOffset);
5998 }}};
5999 }
6000
6001 assert(Offset == 0 || Offset == -1);
6002
6003 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6004 // offsets.
6005 std::optional<int> FI;
6006 Register VAddr = Root.getReg();
6007
6008 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6009 Register PtrBase;
6010 int64_t ConstOffset;
6011 std::tie(PtrBase, ConstOffset, std::ignore) =
6012 getPtrBaseWithConstantOffset(VAddr, *MRI);
6013 if (ConstOffset != 0) {
6014 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6015 (!STI.privateMemoryResourceIsRangeChecked() ||
6016 VT->signBitIsZero(PtrBase))) {
6017 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6018 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6019 FI = PtrBaseDef->getOperand(1).getIndex();
6020 else
6021 VAddr = PtrBase;
6022 Offset = ConstOffset;
6023 }
6024 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6025 FI = RootDef->getOperand(1).getIndex();
6026 }
6027
6028 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6029 MIB.addReg(Info->getScratchRSrcReg());
6030 },
6031 [=](MachineInstrBuilder &MIB) { // vaddr
6032 if (FI)
6033 MIB.addFrameIndex(*FI);
6034 else
6035 MIB.addReg(VAddr);
6036 },
6037 [=](MachineInstrBuilder &MIB) { // soffset
6038 // Use constant zero for soffset and rely on eliminateFrameIndex
6039 // to choose the appropriate frame register if need be.
6040 MIB.addImm(0);
6041 },
6042 [=](MachineInstrBuilder &MIB) { // offset
6043 MIB.addImm(Offset);
6044 }}};
6045}
6046
6047bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6048 int64_t Offset) const {
6049 if (!isUInt<16>(Offset))
6050 return false;
6051
6052 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6053 return true;
6054
6055  // On Southern Islands, instructions with a negative base value and an offset
6056  // don't seem to work.
6057 return VT->signBitIsZero(Base);
6058}
6059
6060bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6061 int64_t Offset1,
6062 unsigned Size) const {
6063 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6064 return false;
6065 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6066 return false;
6067
6068 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6069 return true;
6070
6071  // On Southern Islands, instructions with a negative base value and an offset
6072  // don't seem to work.
6073 return VT->signBitIsZero(Base);
6074}
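// Illustrative example: with Size == 4, offsets 0 and 4 encode as 0 and 1 and
// fit the 8-bit offset fields, while offsets 1024 and 1028 encode as 256 and
// 257 and are rejected.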
6075
6076// Return whether the operation has NoUnsignedWrap property.
6077static bool isNoUnsignedWrap(MachineInstr *Addr) {
6078 return Addr->getOpcode() == TargetOpcode::G_OR ||
6079 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6080          Addr->getFlag(MachineInstr::NoUWrap));
6081}
6082
6083// Check that the base address of a flat scratch load/store in the form of
6084// `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned, per the
6085// hardware requirement). We always treat the first operand as the base here.
6086bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6087 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6088
6089 if (isNoUnsignedWrap(AddrMI))
6090 return true;
6091
6092 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6093 // values.
6094 if (STI.hasSignedScratchOffsets())
6095 return true;
6096
6097 Register LHS = AddrMI->getOperand(1).getReg();
6098 Register RHS = AddrMI->getOperand(2).getReg();
6099
6100 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6101 std::optional<ValueAndVReg> RhsValReg =
6102        getIConstantVRegValWithLookThrough(RHS, *MRI);
6103 // If the immediate offset is negative and within certain range, the base
6104 // address cannot also be negative. If the base is also negative, the sum
6105 // would be either negative or much larger than the valid range of scratch
6106 // memory a thread can access.
6107 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6108 RhsValReg->Value.getSExtValue() > -0x40000000)
6109 return true;
6110 }
6111
6112 return VT->signBitIsZero(LHS);
6113}
6114
6115// Check that the address values in the SGPR/VGPR are legal for flat scratch in
6116// the form: SGPR + VGPR.
6117bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6118 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6119
6120 if (isNoUnsignedWrap(AddrMI))
6121 return true;
6122
6123 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6124 // values.
6125 if (STI.hasSignedScratchOffsets())
6126 return true;
6127
6128 Register LHS = AddrMI->getOperand(1).getReg();
6129 Register RHS = AddrMI->getOperand(2).getReg();
6130 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6131}
6132
6133// Check that the address values in the SGPR/VGPR are legal for flat scratch in
6134// the form: SGPR + VGPR + Imm.
6135bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6136 Register Addr) const {
6137 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6138 // values.
6139 if (STI.hasSignedScratchOffsets())
6140 return true;
6141
6142 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6143 Register Base = AddrMI->getOperand(1).getReg();
6144 std::optional<DefinitionAndSourceRegister> BaseDef =
6145      getDefSrcRegIgnoringCopies(Base, *MRI);
6146 std::optional<ValueAndVReg> RHSOffset =
6147      getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
6148 assert(RHSOffset);
6149
6150 // If the immediate offset is negative and within certain range, the base
6151 // address cannot also be negative. If the base is also negative, the sum
6152 // would be either negative or much larger than the valid range of scratch
6153 // memory a thread can access.
6154 if (isNoUnsignedWrap(BaseDef->MI) &&
6155 (isNoUnsignedWrap(AddrMI) ||
6156 (RHSOffset->Value.getSExtValue() < 0 &&
6157 RHSOffset->Value.getSExtValue() > -0x40000000)))
6158 return true;
6159
6160 Register LHS = BaseDef->MI->getOperand(1).getReg();
6161 Register RHS = BaseDef->MI->getOperand(2).getReg();
6162 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6163}
6164
6165bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6166 unsigned ShAmtBits) const {
6167 assert(MI.getOpcode() == TargetOpcode::G_AND);
6168
6169 std::optional<APInt> RHS =
6170 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6171 if (!RHS)
6172 return false;
6173
6174 if (RHS->countr_one() >= ShAmtBits)
6175 return true;
6176
6177 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6178 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6179}
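// Illustrative example: for a 32-bit shift (ShAmtBits == 5) a mask of
// (G_AND %amt, 31) is unneeded because 31 has five trailing ones; a mask of 15
// is only unneeded if the known-zero bits of %amt also cover bit 4.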
6180
6181InstructionSelector::ComplexRendererFns
6182AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6183 MachineOperand &Root) const {
6184 Register Reg = Root.getReg();
6185 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6186
6187 std::optional<DefinitionAndSourceRegister> Def =
6188      getDefSrcRegIgnoringCopies(Reg, *MRI);
6189 assert(Def && "this shouldn't be an optional result");
6190 Reg = Def->Reg;
6191
6192 if (Register WaveBase = getWaveAddress(Def->MI)) {
6193 return {{
6194 [=](MachineInstrBuilder &MIB) { // rsrc
6195 MIB.addReg(Info->getScratchRSrcReg());
6196 },
6197 [=](MachineInstrBuilder &MIB) { // soffset
6198 MIB.addReg(WaveBase);
6199 },
6200 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6201 }};
6202 }
6203
6204 int64_t Offset = 0;
6205
6206 // FIXME: Copy check is a hack
6207  Register BasePtr;
6208 if (mi_match(Reg, *MRI,
6209 m_GPtrAdd(m_Reg(BasePtr),
6211 if (!TII.isLegalMUBUFImmOffset(Offset))
6212 return {};
6213 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6214 Register WaveBase = getWaveAddress(BasePtrDef);
6215 if (!WaveBase)
6216 return {};
6217
6218 return {{
6219 [=](MachineInstrBuilder &MIB) { // rsrc
6220 MIB.addReg(Info->getScratchRSrcReg());
6221 },
6222 [=](MachineInstrBuilder &MIB) { // soffset
6223 MIB.addReg(WaveBase);
6224 },
6225 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6226 }};
6227 }
6228
6229 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6230 !TII.isLegalMUBUFImmOffset(Offset))
6231 return {};
6232
6233 return {{
6234 [=](MachineInstrBuilder &MIB) { // rsrc
6235 MIB.addReg(Info->getScratchRSrcReg());
6236 },
6237 [=](MachineInstrBuilder &MIB) { // soffset
6238 MIB.addImm(0);
6239 },
6240 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6241 }};
6242}
6243
6244std::pair<Register, unsigned>
6245AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6246 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6247 int64_t ConstAddr = 0;
6248
6249 Register PtrBase;
6250 int64_t Offset;
6251 std::tie(PtrBase, Offset, std::ignore) =
6252 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6253
6254 if (Offset) {
6255 if (isDSOffsetLegal(PtrBase, Offset)) {
6256 // (add n0, c0)
6257 return std::pair(PtrBase, Offset);
6258 }
6259 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6260 // TODO
6261
6262
6263 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6264 // TODO
6265
6266 }
6267
6268 return std::pair(Root.getReg(), 0);
6269}
6270
6271InstructionSelector::ComplexRendererFns
6272AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6273 Register Reg;
6274 unsigned Offset;
6275 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6276 return {{
6277 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6278 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6279 }};
6280}
6281
6282InstructionSelector::ComplexRendererFns
6283AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6284 return selectDSReadWrite2(Root, 4);
6285}
6286
6287InstructionSelector::ComplexRendererFns
6288AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6289 return selectDSReadWrite2(Root, 8);
6290}
6291
6292InstructionSelector::ComplexRendererFns
6293AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6294 unsigned Size) const {
6295 Register Reg;
6296 unsigned Offset;
6297 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6298 return {{
6299 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6300 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6301 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6302 }};
6303}
6304
6305std::pair<Register, unsigned>
6306AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6307 unsigned Size) const {
6308 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6309 int64_t ConstAddr = 0;
6310
6311 Register PtrBase;
6312 int64_t Offset;
6313 std::tie(PtrBase, Offset, std::ignore) =
6314 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6315
6316 if (Offset) {
6317 int64_t OffsetValue0 = Offset;
6318 int64_t OffsetValue1 = Offset + Size;
6319 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6320 // (add n0, c0)
6321 return std::pair(PtrBase, OffsetValue0 / Size);
6322 }
6323 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6324 // TODO
6325
6326 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6327 // TODO
6328
6329 }
6330
6331 return std::pair(Root.getReg(), 0);
6332}
6333
6334/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6335/// the base value with the constant offset, and whether the offset computation
6336/// is known to be inbounds. There may be intervening copies between \p Root and
6337/// the identified constant. Returns \p Root, 0, false if this does not match
6338/// the pattern.
6339std::tuple<Register, int64_t, bool>
6340AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6341 Register Root, const MachineRegisterInfo &MRI) const {
6342 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6343 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6344 return {Root, 0, false};
6345
6346 MachineOperand &RHS = RootI->getOperand(2);
6347 std::optional<ValueAndVReg> MaybeOffset =
6348      getIConstantVRegValWithLookThrough(RHS.getReg(), *MRI);
6349 if (!MaybeOffset)
6350 return {Root, 0, false};
6351 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6352 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6353 IsInBounds};
6354}
6355
6356static void addZeroImm(MachineInstrBuilder &MIB) {
6357 MIB.addImm(0);
6358}
6359
6360/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6361/// BasePtr is not valid, a null base pointer will be used.
6362static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6363 uint32_t FormatLo, uint32_t FormatHi,
6364 Register BasePtr) {
6365 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6366 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6367 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6368 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6369
6370 B.buildInstr(AMDGPU::S_MOV_B32)
6371 .addDef(RSrc2)
6372 .addImm(FormatLo);
6373 B.buildInstr(AMDGPU::S_MOV_B32)
6374 .addDef(RSrc3)
6375 .addImm(FormatHi);
6376
6377 // Build the half of the subregister with the constants before building the
6378 // full 128-bit register. If we are building multiple resource descriptors,
6379 // this will allow CSEing of the 2-component register.
6380 B.buildInstr(AMDGPU::REG_SEQUENCE)
6381 .addDef(RSrcHi)
6382 .addReg(RSrc2)
6383 .addImm(AMDGPU::sub0)
6384 .addReg(RSrc3)
6385 .addImm(AMDGPU::sub1);
6386
6387 Register RSrcLo = BasePtr;
6388 if (!BasePtr) {
6389 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6390 B.buildInstr(AMDGPU::S_MOV_B64)
6391 .addDef(RSrcLo)
6392 .addImm(0);
6393 }
6394
6395 B.buildInstr(AMDGPU::REG_SEQUENCE)
6396 .addDef(RSrc)
6397 .addReg(RSrcLo)
6398 .addImm(AMDGPU::sub0_sub1)
6399 .addReg(RSrcHi)
6400 .addImm(AMDGPU::sub2_sub3);
6401
6402 return RSrc;
6403}
6404
6405static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6406 const SIInstrInfo &TII, Register BasePtr) {
6407 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6408
6409 // FIXME: Why are half the "default" bits ignored based on the addressing
6410 // mode?
6411 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6412}
6413
6414static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6415 const SIInstrInfo &TII, Register BasePtr) {
6416 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6417
6418 // FIXME: Why are half the "default" bits ignored based on the addressing
6419 // mode?
6420 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6421}
6422
6423AMDGPUInstructionSelector::MUBUFAddressData
6424AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6425 MUBUFAddressData Data;
6426 Data.N0 = Src;
6427
6428 Register PtrBase;
6429 int64_t Offset;
6430
6431 std::tie(PtrBase, Offset, std::ignore) =
6432 getPtrBaseWithConstantOffset(Src, *MRI);
6433 if (isUInt<32>(Offset)) {
6434 Data.N0 = PtrBase;
6435 Data.Offset = Offset;
6436 }
6437
6438 if (MachineInstr *InputAdd
6439 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6440 Data.N2 = InputAdd->getOperand(1).getReg();
6441 Data.N3 = InputAdd->getOperand(2).getReg();
6442
6443    // FIXME: Need to fix extra SGPR->VGPR copies inserted
6444 // FIXME: Don't know this was defined by operand 0
6445 //
6446 // TODO: Remove this when we have copy folding optimizations after
6447 // RegBankSelect.
6448 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6449 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6450 }
6451
6452 return Data;
6453}
6454
6455/// Return whether the addr64 MUBUF mode should be used for the given address.
6456bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6457 // (ptr_add N2, N3) -> addr64, or
6458 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6459 if (Addr.N2)
6460 return true;
6461
6462 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6463 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6464}
6465
6466/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6467/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6468/// component.
6469void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6470 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6471 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6472 return;
6473
6474 // Illegal offset, store it in soffset.
6475 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6476 B.buildInstr(AMDGPU::S_MOV_B32)
6477 .addDef(SOffset)
6478 .addImm(ImmOffset);
6479 ImmOffset = 0;
6480}
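// Illustrative example (assumed value): an immediate of 1 << 20 does not fit
// the MUBUF offset field, so it is moved into an SGPR with S_MOV_B32 and used
// as soffset while the instruction's immediate offset becomes 0.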
6481
6482bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6483 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6484 Register &SOffset, int64_t &Offset) const {
6485 // FIXME: Predicates should stop this from reaching here.
6486 // addr64 bit was removed for volcanic islands.
6487 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6488 return false;
6489
6490 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6491 if (!shouldUseAddr64(AddrData))
6492 return false;
6493
6494 Register N0 = AddrData.N0;
6495 Register N2 = AddrData.N2;
6496 Register N3 = AddrData.N3;
6497 Offset = AddrData.Offset;
6498
6499 // Base pointer for the SRD.
6500 Register SRDPtr;
6501
6502 if (N2) {
6503 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6504 assert(N3);
6505 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6506 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6507 // addr64, and construct the default resource from a 0 address.
6508 VAddr = N0;
6509 } else {
6510 SRDPtr = N3;
6511 VAddr = N2;
6512 }
6513 } else {
6514 // N2 is not divergent.
6515 SRDPtr = N2;
6516 VAddr = N3;
6517 }
6518 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6519 // Use the default null pointer in the resource
6520 VAddr = N0;
6521 } else {
6522 // N0 -> offset, or
6523 // (N0 + C1) -> offset
6524 SRDPtr = N0;
6525 }
6526
6527 MachineIRBuilder B(*Root.getParent());
6528 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6529 splitIllegalMUBUFOffset(B, SOffset, Offset);
6530 return true;
6531}
6532
6533bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6534 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6535 int64_t &Offset) const {
6536
6537 // FIXME: Pattern should not reach here.
6538 if (STI.useFlatForGlobal())
6539 return false;
6540
6541 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6542 if (shouldUseAddr64(AddrData))
6543 return false;
6544
6545 // N0 -> offset, or
6546 // (N0 + C1) -> offset
6547 Register SRDPtr = AddrData.N0;
6548 Offset = AddrData.Offset;
6549
6550 // TODO: Look through extensions for 32-bit soffset.
6551 MachineIRBuilder B(*Root.getParent());
6552
6553 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6554 splitIllegalMUBUFOffset(B, SOffset, Offset);
6555 return true;
6556}
6557
6558InstructionSelector::ComplexRendererFns
6559AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6560 Register VAddr;
6561 Register RSrcReg;
6562 Register SOffset;
6563 int64_t Offset = 0;
6564
6565 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6566 return {};
6567
6568 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6569 // pattern.
6570 return {{
6571 [=](MachineInstrBuilder &MIB) { // rsrc
6572 MIB.addReg(RSrcReg);
6573 },
6574 [=](MachineInstrBuilder &MIB) { // vaddr
6575 MIB.addReg(VAddr);
6576 },
6577 [=](MachineInstrBuilder &MIB) { // soffset
6578 if (SOffset)
6579 MIB.addReg(SOffset);
6580 else if (STI.hasRestrictedSOffset())
6581 MIB.addReg(AMDGPU::SGPR_NULL);
6582 else
6583 MIB.addImm(0);
6584 },
6585 [=](MachineInstrBuilder &MIB) { // offset
6586 MIB.addImm(Offset);
6587 },
6588 addZeroImm, // cpol
6589 addZeroImm, // tfe
6590 addZeroImm // swz
6591 }};
6592}
6593
6594InstructionSelector::ComplexRendererFns
6595AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6596 Register RSrcReg;
6597 Register SOffset;
6598 int64_t Offset = 0;
6599
6600 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6601 return {};
6602
6603 return {{
6604 [=](MachineInstrBuilder &MIB) { // rsrc
6605 MIB.addReg(RSrcReg);
6606 },
6607 [=](MachineInstrBuilder &MIB) { // soffset
6608 if (SOffset)
6609 MIB.addReg(SOffset);
6610 else if (STI.hasRestrictedSOffset())
6611 MIB.addReg(AMDGPU::SGPR_NULL);
6612 else
6613 MIB.addImm(0);
6614 },
6615 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6616 addZeroImm, // cpol
6617 addZeroImm, // tfe
6618 addZeroImm, // swz
6619 }};
6620}
6621
6622InstructionSelector::ComplexRendererFns
6623AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6624
6625 Register SOffset = Root.getReg();
6626
6627 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6628 SOffset = AMDGPU::SGPR_NULL;
6629
6630 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
6631}
6632
6633/// Get an immediate that must be 32-bits, and treated as zero extended.
6634static std::optional<uint64_t>
6635getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
6636 // getIConstantVRegVal sexts any values, so see if that matters.
6637 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
6638 if (!OffsetVal || !isInt<32>(*OffsetVal))
6639 return std::nullopt;
6640 return Lo_32(*OffsetVal);
6641}
6642
6643InstructionSelector::ComplexRendererFns
6644AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6645 std::optional<uint64_t> OffsetVal =
6646 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
6647 if (!OffsetVal)
6648 return {};
6649
6650 std::optional<int64_t> EncodedImm =
6651 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
6652 if (!EncodedImm)
6653 return {};
6654
6655 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6656}
6657
6658InstructionSelector::ComplexRendererFns
6659AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6660 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6661
6662 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
6663 if (!OffsetVal)
6664 return {};
6665
6666 std::optional<int64_t> EncodedImm =
6667      AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
6668  if (!EncodedImm)
6669 return {};
6670
6671 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6672}
6673
6674InstructionSelector::ComplexRendererFns
6675AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6676 // Match the (soffset + offset) pair as a 32-bit register base and
6677 // an immediate offset.
6678 Register SOffset;
6679 unsigned Offset;
6680 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
6681 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
6682 if (!SOffset)
6683 return std::nullopt;
6684
6685 std::optional<int64_t> EncodedOffset =
6686 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
6687 if (!EncodedOffset)
6688 return std::nullopt;
6689
6690 assert(MRI->getType(SOffset) == LLT::scalar(32));
6691 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6692 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
6693}
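// Illustrative sketch, not from the original file: a root computed as a
// no-unsigned-wrap G_ADD of an SGPR %soff and a G_CONSTANT 16 splits into
// SOffset = %soff and Offset = 16; the 16 is then re-encoded by
// AMDGPU::getSMRDEncodedOffset for the buffer form, so the pair renders as a
// 32-bit SGPR base plus an encoded immediate.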
6694
6695std::pair<Register, unsigned>
6696AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6697 bool &Matched) const {
6698 Matched = false;
6699
6700 Register Src;
6701 unsigned Mods;
6702 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
6703
6704 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
6705 assert(MRI->getType(Src) == LLT::scalar(16));
6706
6707    // Only change Src if a source modifier could be gained. In such cases the
6708    // new Src could be an SGPR, but this does not violate the constant bus
6709    // restriction for the instruction that is being selected.
6710 Src = stripBitCast(Src, *MRI);
6711
6712 const auto CheckAbsNeg = [&]() {
6713 // Be careful about folding modifiers if we already have an abs. fneg is
6714 // applied last, so we don't want to apply an earlier fneg.
6715 if ((Mods & SISrcMods::ABS) == 0) {
6716 unsigned ModsTmp;
6717 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
6718
6719 if ((ModsTmp & SISrcMods::NEG) != 0)
6720 Mods ^= SISrcMods::NEG;
6721
6722 if ((ModsTmp & SISrcMods::ABS) != 0)
6723 Mods |= SISrcMods::ABS;
6724 }
6725 };
6726
6727 CheckAbsNeg();
6728
6729 // op_sel/op_sel_hi decide the source type and source.
6730    // If the source's op_sel_hi is set, it indicates a conversion from
6731    // fp16. If the source's op_sel is set, it picks the high half of the
6732 // source register.
6733
6734 Mods |= SISrcMods::OP_SEL_1;
6735
6736 if (isExtractHiElt(*MRI, Src, Src)) {
6737 Mods |= SISrcMods::OP_SEL_0;
6738 CheckAbsNeg();
6739 }
6740
6741 Matched = true;
6742 }
6743
6744 return {Src, Mods};
6745}
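// Illustrative sketch, not from the original file: the mix-mode match fires
// when the f32 operand is an extension of an f16 value, e.g.
//
//   %h:vgpr(s16) = ...
//   %f:vgpr(s32) = G_FPEXT %h
//   %n:vgpr(s32) = G_FNEG %f      ; root operand
//
// Here Src ends up as %h, Mods carries NEG from the outer fneg plus OP_SEL_1
// to mark the operand as fp16, and OP_SEL_0 is added as well if %h was
// extracted from the high half of a 32-bit register.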
6746
6747InstructionSelector::ComplexRendererFns
6748AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
6749 MachineOperand &Root) const {
6750 Register Src;
6751 unsigned Mods;
6752 bool Matched;
6753 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6754 if (!Matched)
6755 return {};
6756
6757 return {{
6758 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6759 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6760 }};
6761}
6762
6763InstructionSelector::ComplexRendererFns
6764AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
6765 Register Src;
6766 unsigned Mods;
6767 bool Matched;
6768 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6769
6770 return {{
6771 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6772 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6773 }};
6774}
6775
6776bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
6777 MachineInstr &I, Intrinsic::ID IntrID) const {
6778 MachineBasicBlock *MBB = I.getParent();
6779 const DebugLoc &DL = I.getDebugLoc();
6780 Register CCReg = I.getOperand(0).getReg();
6781
6782 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6783 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
6784
6785 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
6786 .addImm(I.getOperand(2).getImm());
6787
6788 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
6789
6790 I.eraseFromParent();
6791 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
6792 *MRI);
6793}
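// Illustrative sketch, not from the original file: the selected sequence is
// roughly
//
//   S_CMP_EQ_U32 0, 0                    ; SCC := 1
//   S_BARRIER_SIGNAL_ISFIRST_IMM <imm>   ; writes the "is first" result to SCC
//   %cc:sreg_32_xm0_xexec = COPY $scc
//
// so the copied value still reads as true if the barrier is later relaxed to
// a no-op that never updates SCC.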
6794
6795bool AMDGPUInstructionSelector::selectSGetBarrierState(
6796 MachineInstr &I, Intrinsic::ID IntrID) const {
6797 MachineBasicBlock *MBB = I.getParent();
6798 const DebugLoc &DL = I.getDebugLoc();
6799 const MachineOperand &BarOp = I.getOperand(2);
6800 std::optional<int64_t> BarValImm =
6801 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6802
6803 if (!BarValImm) {
6804 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6805 .addReg(BarOp.getReg());
6806 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6807 }
6808 MachineInstrBuilder MIB;
6809 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
6810 : AMDGPU::S_GET_BARRIER_STATE_M0;
6811 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6812
6813 auto DstReg = I.getOperand(0).getReg();
6814 const TargetRegisterClass *DstRC =
6815 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6816 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6817 return false;
6818 MIB.addDef(DstReg);
6819 if (BarValImm) {
6820 MIB.addImm(*BarValImm);
6821 }
6822 I.eraseFromParent();
6823 return true;
6824}
6825
6826unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
6827 if (HasInlineConst) {
6828 switch (IntrID) {
6829 default:
6830 llvm_unreachable("not a named barrier op");
6831 case Intrinsic::amdgcn_s_barrier_join:
6832 return AMDGPU::S_BARRIER_JOIN_IMM;
6833 case Intrinsic::amdgcn_s_get_named_barrier_state:
6834 return AMDGPU::S_GET_BARRIER_STATE_IMM;
6835 };
6836 } else {
6837 switch (IntrID) {
6838 default:
6839 llvm_unreachable("not a named barrier op");
6840 case Intrinsic::amdgcn_s_barrier_join:
6841 return AMDGPU::S_BARRIER_JOIN_M0;
6842 case Intrinsic::amdgcn_s_get_named_barrier_state:
6843 return AMDGPU::S_GET_BARRIER_STATE_M0;
6844 };
6845 }
6846}
6847
6848bool AMDGPUInstructionSelector::selectNamedBarrierInit(
6849 MachineInstr &I, Intrinsic::ID IntrID) const {
6850 MachineBasicBlock *MBB = I.getParent();
6851 const DebugLoc &DL = I.getDebugLoc();
6852 const MachineOperand &BarOp = I.getOperand(1);
6853 const MachineOperand &CntOp = I.getOperand(2);
6854
6855 // BarID = (BarOp >> 4) & 0x3F
6856 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6857 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6858 .add(BarOp)
6859 .addImm(4u)
6860 .setOperandDead(3); // Dead scc
6861
6862 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6863 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6864 .addReg(TmpReg0)
6865 .addImm(0x3F)
6866 .setOperandDead(3); // Dead scc
6867
6868 // MO = ((CntOp & 0x3F) << shAmt) | BarID
6869 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6870 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
6871 .add(CntOp)
6872 .addImm(0x3F)
6873 .setOperandDead(3); // Dead scc
6874
6875 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6876 constexpr unsigned ShAmt = 16;
6877 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
6878 .addReg(TmpReg2)
6879 .addImm(ShAmt)
6880 .setOperandDead(3); // Dead scc
6881
6882 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6883 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
6884 .addReg(TmpReg1)
6885 .addReg(TmpReg3)
6886 .setOperandDead(3); // Dead scc;
6887
6888 auto CopyMIB =
6889 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
6890 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6891
6892 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
6893 ? AMDGPU::S_BARRIER_INIT_M0
6894 : AMDGPU::S_BARRIER_SIGNAL_M0;
6895 MachineInstrBuilder MIB;
6896 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6897
6898 I.eraseFromParent();
6899 return true;
6900}
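// Worked example, added for illustration: with BarOp = 0x230 and CntOp = 7,
// BarID = (0x230 >> 4) & 0x3F = 0x23 and the value copied into M0 is
// ((7 & 0x3F) << 16) | 0x23 = 0x70023, i.e. the member count in bits [21:16]
// and the barrier ID in the low six bits.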
6901
6902bool AMDGPUInstructionSelector::selectNamedBarrierInst(
6903 MachineInstr &I, Intrinsic::ID IntrID) const {
6904 MachineBasicBlock *MBB = I.getParent();
6905 const DebugLoc &DL = I.getDebugLoc();
6906 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
6907 ? I.getOperand(2)
6908 : I.getOperand(1);
6909 std::optional<int64_t> BarValImm =
6910 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6911
6912 if (!BarValImm) {
6913 // BarID = (BarOp >> 4) & 0x3F
6914 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6915 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6916 .addReg(BarOp.getReg())
6917 .addImm(4u)
6918 .setOperandDead(3); // Dead scc;
6919
6920 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6921 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6922 .addReg(TmpReg0)
6923 .addImm(0x3F)
6924 .setOperandDead(3); // Dead scc;
6925
6926 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6927 .addReg(TmpReg1);
6928 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6929 }
6930
6931 MachineInstrBuilder MIB;
6932 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
6933 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6934
6935 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
6936 auto DstReg = I.getOperand(0).getReg();
6937 const TargetRegisterClass *DstRC =
6938 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6939 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6940 return false;
6941 MIB.addDef(DstReg);
6942 }
6943
6944 if (BarValImm) {
6945 auto BarId = ((*BarValImm) >> 4) & 0x3F;
6946 MIB.addImm(BarId);
6947 }
6948
6949 I.eraseFromParent();
6950 return true;
6951}
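// Worked example, added for illustration: a constant barrier operand of
// 0x150 takes the immediate form with BarId = (0x150 >> 4) & 0x3F = 0x15;
// a non-constant operand instead materializes the same shift-and-mask into
// M0 and selects the _M0 flavor of the opcode.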
6952
6953void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
6954 const MachineInstr &MI,
6955 int OpIdx) const {
6956 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6957 "Expected G_CONSTANT");
6958 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
6959}
6960
6961void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
6962 const MachineInstr &MI,
6963 int OpIdx) const {
6964 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6965 "Expected G_CONSTANT");
6966 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
6967}
6968
6969void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
6970 const MachineInstr &MI,
6971 int OpIdx) const {
6972 const MachineOperand &Op = MI.getOperand(1);
6973 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
6974 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
6975}
6976
6977void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
6978 const MachineInstr &MI,
6979 int OpIdx) const {
6980 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6981 "Expected G_CONSTANT");
6982 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
6983}
6984
6985/// This only really exists to satisfy DAG type checking machinery, so is a
6986/// no-op here.
6987void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
6988 const MachineInstr &MI,
6989 int OpIdx) const {
6990 const MachineOperand &Op = MI.getOperand(OpIdx);
6991 int64_t Imm;
6992 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
6993 MIB.addImm(Imm);
6994 else
6995 MIB.addImm(Op.getImm());
6996}
6997
6998void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
6999 const MachineInstr &MI,
7000 int OpIdx) const {
7001 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7002}
7003
7004void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7005 const MachineInstr &MI,
7006 int OpIdx) const {
7007 assert(OpIdx >= 0 && "expected to match an immediate operand");
7008 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7009}
7010
7011void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7012 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7013 assert(OpIdx >= 0 && "expected to match an immediate operand");
7014 MIB.addImm(
7015 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7016}
7017
7018void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7019 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7020 assert(OpIdx >= 0 && "expected to match an immediate operand");
7021 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
7022                 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
7023                 : (int64_t)SISrcMods::DST_OP_SEL);
7024}
7025
7026void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7027 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7028 assert(OpIdx >= 0 && "expected to match an immediate operand");
7029 MIB.addImm(
7030 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7031}
7032
7033void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7034 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7035 assert(OpIdx >= 0 && "expected to match an immediate operand");
7036 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7037 ? (int64_t)(SISrcMods::OP_SEL_0)
7038 : 0);
7039}
7040
7041void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7042 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7043 assert(OpIdx >= 0 && "expected to match an immediate operand");
7044 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7045 : 0);
7046}
7047
7048void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7049 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7050 assert(OpIdx >= 0 && "expected to match an immediate operand");
7051 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7052 : 0);
7053}
7054
7055void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7056 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7057 assert(OpIdx >= 0 && "expected to match an immediate operand");
7058 MIB.addImm(
7059 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7060}
7061
7062void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7063 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7064 assert(OpIdx >= 0 && "expected to match an immediate operand");
7065 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7066 ? (int64_t)SISrcMods::DST_OP_SEL
7067 : 0);
7068}
7069
7070void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7071 const MachineInstr &MI,
7072 int OpIdx) const {
7073 assert(OpIdx >= 0 && "expected to match an immediate operand");
7074 MIB.addImm(MI.getOperand(OpIdx).getImm() &
7075             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7076                                       : AMDGPU::CPol::ALL_pregfx12));
7077}
7078
7079void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7080 const MachineInstr &MI,
7081 int OpIdx) const {
7082 assert(OpIdx >= 0 && "expected to match an immediate operand");
7083 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7084                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
7085                                                 : AMDGPU::CPol::SWZ_pregfx12);
7086  MIB.addImm(Swizzle);
7087}
7088
7089void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7090 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7091 assert(OpIdx >= 0 && "expected to match an immediate operand");
7092 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7093                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7094                                                  : AMDGPU::CPol::ALL_pregfx12);
7095  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7096}
7097
7098void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7099 const MachineInstr &MI,
7100 int OpIdx) const {
7101 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7102}
7103
7104void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7105 const MachineInstr &MI,
7106 int OpIdx) const {
7107 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7108 int ExpVal = APF.getExactLog2Abs();
7109 assert(ExpVal != INT_MIN);
7110 MIB.addImm(ExpVal);
7111}
7112
7113void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7114 const MachineInstr &MI,
7115 int OpIdx) const {
7116 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7117 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7118 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7119 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7120 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7121}
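// Worked example, added for illustration: the (Imm + 3) % 4 remapping yields
// 0 -> 3, 1 -> 0, 2 -> 1 and 3 -> 2, which is exactly the right-hand column
// of the table above (e.g. TowardZero, operand value 0, becomes hardware
// mode 3).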
7122
7123void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7124 const MachineInstr &MI,
7125 int OpIdx) const {
7126 unsigned Mods = SISrcMods::OP_SEL_1;
7127 if (MI.getOperand(OpIdx).getImm())
7128 Mods ^= SISrcMods::NEG;
7129 MIB.addImm((int64_t)Mods);
7130}
7131
7132void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7133 const MachineInstr &MI,
7134 int OpIdx) const {
7135 unsigned Mods = SISrcMods::OP_SEL_1;
7136 if (MI.getOperand(OpIdx).getImm())
7137    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7138  MIB.addImm((int64_t)Mods);
7139}
7140
7141void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7142 const MachineInstr &MI,
7143 int OpIdx) const {
7144 unsigned Val = MI.getOperand(OpIdx).getImm();
7145 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7146 if (Val == 1) // neg
7147 Mods ^= SISrcMods::NEG;
7148 if (Val == 2) // abs
7149 Mods ^= SISrcMods::ABS;
7150 if (Val == 3) // neg and abs
7151 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7152 MIB.addImm((int64_t)Mods);
7153}
7154
7155void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7156 const MachineInstr &MI,
7157 int OpIdx) const {
7158 uint32_t V = MI.getOperand(2).getImm();
7161 if (!Subtarget->hasSafeCUPrefetch())
7162 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7163 MIB.addImm(V);
7164}
7165
7166/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7167void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7168 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7169 unsigned Val = MI.getOperand(OpIdx).getImm();
7170 unsigned New = 0;
7171 if (Val & 0x1)
7173 if (Val & 0x2)
7175 MIB.addImm(New);
7176}
7177
7178bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7179 return TII.isInlineConstant(Imm);
7180}
7181
7182bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7183 return TII.isInlineConstant(Imm);
7184}