1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
49#define GET_GLOBALISEL_PREDICATES_INIT
50#include "AMDGPUGenGlobalISel.inc"
51#undef GET_GLOBALISEL_PREDICATES_INIT
52#define GET_GLOBALISEL_TEMPORARIES_INIT
53#include "AMDGPUGenGlobalISel.inc"
54#undef GET_GLOBALISEL_TEMPORARIES_INIT
55{
56}
57
58const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
59
60void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
61 CodeGenCoverage *CoverageInfo,
62 ProfileSummaryInfo *PSI,
63 BlockFrequencyInfo *BFI) {
64 MRI = &MF.getRegInfo();
65 Subtarget = &MF.getSubtarget<GCNSubtarget>();
66 Subtarget->checkSubtargetFeatures(MF.getFunction());
67 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
68}
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
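// Return true if Reg is a 1-bit virtual register that is treated as a wave
// lane mask (VCC register bank / boolean register class) rather than an
// SGPR/VGPR holding a zero-extended bit.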
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
120 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
121}
122
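// Select a generic COPY. Copies into a wave-mask (vcc) destination need extra
// care: an SCC source is simply constrained, while a non-vcc 1-bit source is
// masked to bit 0 and compared against zero (or moved as 0/-1 for constants)
// so that every lane of the mask is well defined.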
123bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
124 const DebugLoc &DL = I.getDebugLoc();
125 MachineBasicBlock *BB = I.getParent();
126 I.setDesc(TII.get(TargetOpcode::COPY));
127
128 const MachineOperand &Src = I.getOperand(1);
129 MachineOperand &Dst = I.getOperand(0);
130 Register DstReg = Dst.getReg();
131 Register SrcReg = Src.getReg();
132
133 if (isVCC(DstReg, *MRI)) {
134 if (SrcReg == AMDGPU::SCC) {
135 const TargetRegisterClass *RC
136 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
137 if (!RC)
138 return true;
139 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
140 }
141
142 if (!isVCC(SrcReg, *MRI)) {
143 // TODO: Should probably leave the copy and let copyPhysReg expand it.
144 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
145 return false;
146
147 const TargetRegisterClass *SrcRC
148 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
149
150 std::optional<ValueAndVReg> ConstVal =
151 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
152 if (ConstVal) {
153 unsigned MovOpc =
154 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
155 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
156 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
157 } else {
158 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
159
160 // We can't trust the high bits at this point, so clear them.
161
162 // TODO: Skip masking high bits if def is known boolean.
163
164 bool IsSGPR = TRI.isSGPRClass(SrcRC);
165 unsigned AndOpc =
166 IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
167 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
168 .addImm(1)
169 .addReg(SrcReg);
170 if (IsSGPR)
171 And.setOperandDead(3); // Dead scc
172
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
174 .addImm(0)
175 .addReg(MaskedReg);
176 }
177
178 if (!MRI->getRegClassOrNull(SrcReg))
179 MRI->setRegClass(SrcReg, SrcRC);
180 I.eraseFromParent();
181 return true;
182 }
183
184 const TargetRegisterClass *RC =
185 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
186 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
187 return false;
188
189 return true;
190 }
191
192 for (const MachineOperand &MO : I.operands()) {
193 if (MO.getReg().isPhysical())
194 continue;
195
196 const TargetRegisterClass *RC =
197 TRI.getConstrainedRegClassForOperand(MO, *MRI);
198 if (!RC)
199 continue;
200 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
201 }
202 return true;
203}
204
205bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
206 const Register DefReg = I.getOperand(0).getReg();
207 const LLT DefTy = MRI->getType(DefReg);
208
209 // S1 G_PHIs should not be selected in instruction-select, instead:
210 // - divergent S1 G_PHI should go through lane mask merging algorithm
211 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
212 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
213 if (DefTy == LLT::scalar(1))
214 return false;
215
216 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
217
218 const RegClassOrRegBank &RegClassOrBank =
219 MRI->getRegClassOrRegBank(DefReg);
220
221 const TargetRegisterClass *DefRC
222 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
223 if (!DefRC) {
224 if (!DefTy.isValid()) {
225 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
226 return false;
227 }
228
229 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
230 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
231 if (!DefRC) {
232 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
233 return false;
234 }
235 }
236
237 // TODO: Verify that all registers have the same bank
238 I.setDesc(TII.get(TargetOpcode::PHI));
239 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
240}
241
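// Return an operand referring to the requested 32-bit half (sub0/sub1) of a
// 64-bit register or immediate operand, emitting a subregister copy for the
// register case.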
242MachineOperand
243AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
244 const TargetRegisterClass &SubRC,
245 unsigned SubIdx) const {
246
247 MachineInstr *MI = MO.getParent();
248 MachineBasicBlock *BB = MI->getParent();
249 Register DstReg = MRI->createVirtualRegister(&SubRC);
250
251 if (MO.isReg()) {
252 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
253 Register Reg = MO.getReg();
254 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
255 .addReg(Reg, 0, ComposedSubIdx);
256
257 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
258 MO.isKill(), MO.isDead(), MO.isUndef(),
259 MO.isEarlyClobber(), 0, MO.isDebug(),
260 MO.isInternalRead());
261 }
262
263 assert(MO.isImm());
264
265 APInt Imm(64, MO.getImm());
266
267 switch (SubIdx) {
268 default:
269 llvm_unreachable("do not know to split immediate with this sub index.");
270 case AMDGPU::sub0:
271 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
272 case AMDGPU::sub1:
273 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
274 }
275}
276
277static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
278 switch (Opc) {
279 case AMDGPU::G_AND:
280 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
281 case AMDGPU::G_OR:
282 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
283 case AMDGPU::G_XOR:
284 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
285 default:
286 llvm_unreachable("not a bit op");
287 }
288}
289
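// Select scalar G_AND/G_OR/G_XOR to S_AND/S_OR/S_XOR, using the 64-bit forms
// for 64-bit values and for wave-mask (vcc) results on wave64.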
290bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
293
294 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
295 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
296 DstRB->getID() != AMDGPU::VCCRegBankID)
297 return false;
298
299 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
300 STI.isWave64());
301 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
302
303 // Dead implicit-def of scc
304 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
305 true, // isImp
306 false, // isKill
307 true)); // isDead
308 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
309}
310
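// Select scalar G_ADD/G_SUB. 32-bit cases map directly to SALU/VALU add/sub;
// 64-bit adds are split into low/high halves chained through carry and then
// recombined with a REG_SEQUENCE.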
311bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
312 MachineBasicBlock *BB = I.getParent();
313 MachineFunction *MF = BB->getParent();
314 Register DstReg = I.getOperand(0).getReg();
315 const DebugLoc &DL = I.getDebugLoc();
316 LLT Ty = MRI->getType(DstReg);
317 if (Ty.isVector())
318 return false;
319
320 unsigned Size = Ty.getSizeInBits();
321 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
322 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
323 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
324
325 if (Size == 32) {
326 if (IsSALU) {
327 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
328 MachineInstr *Add =
329 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
330 .add(I.getOperand(1))
331 .add(I.getOperand(2))
332 .setOperandDead(3); // Dead scc
333 I.eraseFromParent();
334 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
335 }
336
337 if (STI.hasAddNoCarry()) {
338 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
339 I.setDesc(TII.get(Opc));
340 I.addOperand(*MF, MachineOperand::CreateImm(0));
341 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
342 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
343 }
344
345 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
346
347 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
348 MachineInstr *Add
349 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
350 .addDef(UnusedCarry, RegState::Dead)
351 .add(I.getOperand(1))
352 .add(I.getOperand(2))
353 .addImm(0);
354 I.eraseFromParent();
355 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
356 }
357
358 assert(!Sub && "illegal sub should not reach here");
359
360 const TargetRegisterClass &RC
361 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
362 const TargetRegisterClass &HalfRC
363 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
364
365 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
366 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
367 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
368 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
369
370 Register DstLo = MRI->createVirtualRegister(&HalfRC);
371 Register DstHi = MRI->createVirtualRegister(&HalfRC);
372
373 if (IsSALU) {
374 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
375 .add(Lo1)
376 .add(Lo2);
377 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
378 .add(Hi1)
379 .add(Hi2)
380 .setOperandDead(3); // Dead scc
381 } else {
382 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
383 Register CarryReg = MRI->createVirtualRegister(CarryRC);
384 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
385 .addDef(CarryReg)
386 .add(Lo1)
387 .add(Lo2)
388 .addImm(0);
389 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
390 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
391 .add(Hi1)
392 .add(Hi2)
393 .addReg(CarryReg, RegState::Kill)
394 .addImm(0);
395
396 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
397 return false;
398 }
399
400 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
401 .addReg(DstLo)
402 .addImm(AMDGPU::sub0)
403 .addReg(DstHi)
404 .addImm(AMDGPU::sub1);
405
406
407 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
408 return false;
409
410 I.eraseFromParent();
411 return true;
412}
413
414bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
415 MachineInstr &I) const {
416 MachineBasicBlock *BB = I.getParent();
417 MachineFunction *MF = BB->getParent();
418 const DebugLoc &DL = I.getDebugLoc();
419 Register Dst0Reg = I.getOperand(0).getReg();
420 Register Dst1Reg = I.getOperand(1).getReg();
421 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
422 I.getOpcode() == AMDGPU::G_UADDE;
423 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
424 I.getOpcode() == AMDGPU::G_USUBE;
425
426 if (isVCC(Dst1Reg, *MRI)) {
427 unsigned NoCarryOpc =
428 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
429 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
430 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
431 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
432 I.addOperand(*MF, MachineOperand::CreateImm(0));
433 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
434 }
435
436 Register Src0Reg = I.getOperand(2).getReg();
437 Register Src1Reg = I.getOperand(3).getReg();
438
439 if (HasCarryIn) {
440 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
441 .addReg(I.getOperand(4).getReg());
442 }
443
444 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
445 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
446
447 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
448 .add(I.getOperand(2))
449 .add(I.getOperand(3));
450
451 if (MRI->use_nodbg_empty(Dst1Reg)) {
452 CarryInst.setOperandDead(3); // Dead scc
453 } else {
454 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
455 .addReg(AMDGPU::SCC);
456 if (!MRI->getRegClassOrNull(Dst1Reg))
457 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
458 }
459
460 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
461 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
462 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
463 return false;
464
465 if (HasCarryIn &&
466 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
467 AMDGPU::SReg_32RegClass, *MRI))
468 return false;
469
470 I.eraseFromParent();
471 return true;
472}
473
474bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
475 MachineInstr &I) const {
476 MachineBasicBlock *BB = I.getParent();
477 MachineFunction *MF = BB->getParent();
478 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
479
480 unsigned Opc;
481 if (Subtarget->hasMADIntraFwdBug())
482 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
483 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
484 else
485 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
486 I.setDesc(TII.get(Opc));
487 I.addOperand(*MF, MachineOperand::CreateImm(0));
488 I.addImplicitDefUseOperands(*MF);
489 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
490}
491
492// TODO: We should probably legalize these to only using 32-bit results.
493bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
494 MachineBasicBlock *BB = I.getParent();
495 Register DstReg = I.getOperand(0).getReg();
496 Register SrcReg = I.getOperand(1).getReg();
497 LLT DstTy = MRI->getType(DstReg);
498 LLT SrcTy = MRI->getType(SrcReg);
499 const unsigned SrcSize = SrcTy.getSizeInBits();
500 unsigned DstSize = DstTy.getSizeInBits();
501
502 // TODO: Should handle any multiple of 32 offset.
503 unsigned Offset = I.getOperand(2).getImm();
504 if (Offset % 32 != 0 || DstSize > 128)
505 return false;
506
507 // 16-bit operations really use 32-bit registers.
508 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
509 if (DstSize == 16)
510 DstSize = 32;
511
512 const TargetRegisterClass *DstRC =
513 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
514 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
515 return false;
516
517 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
518 const TargetRegisterClass *SrcRC =
519 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
520 if (!SrcRC)
521 return false;
522 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
523 DstSize / 32);
524 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
525 if (!SrcRC)
526 return false;
527
528 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
529 *SrcRC, I.getOperand(1));
530 const DebugLoc &DL = I.getDebugLoc();
531 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
532 .addReg(SrcReg, 0, SubReg);
533
534 I.eraseFromParent();
535 return true;
536}
537
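// Select G_MERGE_VALUES of 32-bit-or-wider sources by assembling the pieces
// into a REG_SEQUENCE; narrower sources fall back to the imported TableGen
// patterns.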
538bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
539 MachineBasicBlock *BB = MI.getParent();
540 Register DstReg = MI.getOperand(0).getReg();
541 LLT DstTy = MRI->getType(DstReg);
542 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
543
544 const unsigned SrcSize = SrcTy.getSizeInBits();
545 if (SrcSize < 32)
546 return selectImpl(MI, *CoverageInfo);
547
548 const DebugLoc &DL = MI.getDebugLoc();
549 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
550 const unsigned DstSize = DstTy.getSizeInBits();
551 const TargetRegisterClass *DstRC =
552 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
553 if (!DstRC)
554 return false;
555
556 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
557 MachineInstrBuilder MIB =
558 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
559 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
560 MachineOperand &Src = MI.getOperand(I + 1);
561 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
562 MIB.addImm(SubRegs[I]);
563
564 const TargetRegisterClass *SrcRC
565 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
566 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
567 return false;
568 }
569
570 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
571 return false;
572
573 MI.eraseFromParent();
574 return true;
575}
576
577bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
578 MachineBasicBlock *BB = MI.getParent();
579 const int NumDst = MI.getNumOperands() - 1;
580
581 MachineOperand &Src = MI.getOperand(NumDst);
582
583 Register SrcReg = Src.getReg();
584 Register DstReg0 = MI.getOperand(0).getReg();
585 LLT DstTy = MRI->getType(DstReg0);
586 LLT SrcTy = MRI->getType(SrcReg);
587
588 const unsigned DstSize = DstTy.getSizeInBits();
589 const unsigned SrcSize = SrcTy.getSizeInBits();
590 const DebugLoc &DL = MI.getDebugLoc();
591 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
592
593 const TargetRegisterClass *SrcRC =
594 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
595 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
596 return false;
597
598 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
599 // source, and this relies on the fact that the same subregister indices are
600 // used for both.
601 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
602 for (int I = 0, E = NumDst; I != E; ++I) {
603 MachineOperand &Dst = MI.getOperand(I);
604 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
605 .addReg(SrcReg, 0, SubRegs[I]);
606
607 // Make sure the subregister index is valid for the source register.
608 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
609 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
610 return false;
611
612 const TargetRegisterClass *DstRC =
613 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
614 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
615 return false;
616 }
617
618 MI.eraseFromParent();
619 return true;
620}
621
622bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
623 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
624 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
625
626 Register Src0 = MI.getOperand(1).getReg();
627 Register Src1 = MI.getOperand(2).getReg();
628 LLT SrcTy = MRI->getType(Src0);
629 const unsigned SrcSize = SrcTy.getSizeInBits();
630
631 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
632 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
633 return selectG_MERGE_VALUES(MI);
634 }
635
636 // Selection logic below is for V2S16 only.
637 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
638 Register Dst = MI.getOperand(0).getReg();
639 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
640 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
641 SrcTy != LLT::scalar(32)))
642 return selectImpl(MI, *CoverageInfo);
643
644 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
645 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
646 return false;
647
648 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
649 DstBank->getID() == AMDGPU::VGPRRegBankID);
650 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
651
652 const DebugLoc &DL = MI.getDebugLoc();
653 MachineBasicBlock *BB = MI.getParent();
654
655 // First, before trying TableGen patterns, check if both sources are
656 // constants. In those cases, we can trivially compute the final constant
657 // and emit a simple move.
658 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
659 if (ConstSrc1) {
660 auto ConstSrc0 =
661 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
662 if (ConstSrc0) {
663 const int64_t K0 = ConstSrc0->Value.getSExtValue();
664 const int64_t K1 = ConstSrc1->Value.getSExtValue();
665 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
666 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
667 uint32_t Imm = Lo16 | (Hi16 << 16);
668
669 // VALU
670 if (IsVector) {
671 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
672 MI.eraseFromParent();
673 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
674 }
675
676 // SALU
677 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
678 MI.eraseFromParent();
679 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
680 }
681 }
682
683 // Now try TableGen patterns.
684 if (selectImpl(MI, *CoverageInfo))
685 return true;
686
687 // TODO: This should probably be a combine somewhere
688 // (build_vector $src0, undef) -> copy $src0
689 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
690 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
691 MI.setDesc(TII.get(AMDGPU::COPY));
692 MI.removeOperand(2);
693 const auto &RC =
694 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
695 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
696 RBI.constrainGenericRegister(Src0, RC, *MRI);
697 }
698
699 // TODO: Can be improved?
700 if (IsVector) {
701 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
702 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
703 .addImm(0xFFFF)
704 .addReg(Src0);
705 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
706 return false;
707
708 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
709 .addReg(Src1)
710 .addImm(16)
711 .addReg(TmpReg);
712 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
713 return false;
714
715 MI.eraseFromParent();
716 return true;
717 }
718
719 Register ShiftSrc0;
720 Register ShiftSrc1;
721
722 // With multiple uses of the shift, this will duplicate the shift and
723 // increase register pressure.
724 //
725 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
726 // => (S_PACK_HH_B32_B16 $src0, $src1)
727 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
728 // => (S_PACK_HL_B32_B16 $src0, $src1)
729 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
730 // => (S_PACK_LH_B32_B16 $src0, $src1)
731 // (build_vector $src0, $src1)
732 // => (S_PACK_LL_B32_B16 $src0, $src1)
733
734 bool Shift0 = mi_match(
735 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
736
737 bool Shift1 = mi_match(
738 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
739
740 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
741 if (Shift0 && Shift1) {
742 Opc = AMDGPU::S_PACK_HH_B32_B16;
743 MI.getOperand(1).setReg(ShiftSrc0);
744 MI.getOperand(2).setReg(ShiftSrc1);
745 } else if (Shift1) {
746 Opc = AMDGPU::S_PACK_LH_B32_B16;
747 MI.getOperand(2).setReg(ShiftSrc1);
748 } else if (Shift0) {
749 auto ConstSrc1 =
750 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
751 if (ConstSrc1 && ConstSrc1->Value == 0) {
752 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
753 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
754 .addReg(ShiftSrc0)
755 .addImm(16)
756 .setOperandDead(3); // Dead scc
757
758 MI.eraseFromParent();
759 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
760 }
761 if (STI.hasSPackHL()) {
762 Opc = AMDGPU::S_PACK_HL_B32_B16;
763 MI.getOperand(1).setReg(ShiftSrc0);
764 }
765 }
766
767 MI.setDesc(TII.get(Opc));
768 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
769}
770
771bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
772 const MachineOperand &MO = I.getOperand(0);
773
774 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
775 // regbank check here is to know why getConstrainedRegClassForOperand failed.
776 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
777 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
778 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
779 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
780 return true;
781 }
782
783 return false;
784}
785
786bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
787 MachineBasicBlock *BB = I.getParent();
788
789 Register DstReg = I.getOperand(0).getReg();
790 Register Src0Reg = I.getOperand(1).getReg();
791 Register Src1Reg = I.getOperand(2).getReg();
792 LLT Src1Ty = MRI->getType(Src1Reg);
793
794 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
795 unsigned InsSize = Src1Ty.getSizeInBits();
796
797 int64_t Offset = I.getOperand(3).getImm();
798
799 // FIXME: These cases should have been illegal and unnecessary to check here.
800 if (Offset % 32 != 0 || InsSize % 32 != 0)
801 return false;
802
803 // Currently not handled by getSubRegFromChannel.
804 if (InsSize > 128)
805 return false;
806
807 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
808 if (SubReg == AMDGPU::NoSubRegister)
809 return false;
810
811 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
812 const TargetRegisterClass *DstRC =
813 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
814 if (!DstRC)
815 return false;
816
817 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
818 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
819 const TargetRegisterClass *Src0RC =
820 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
821 const TargetRegisterClass *Src1RC =
822 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
823
824 // Deal with weird cases where the class only partially supports the subreg
825 // index.
826 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
827 if (!Src0RC || !Src1RC)
828 return false;
829
830 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
831 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
832 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
833 return false;
834
835 const DebugLoc &DL = I.getDebugLoc();
836 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
837 .addReg(Src0Reg)
838 .addReg(Src1Reg)
839 .addImm(SubReg);
840
841 I.eraseFromParent();
842 return true;
843}
844
845bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
846 Register DstReg = MI.getOperand(0).getReg();
847 Register SrcReg = MI.getOperand(1).getReg();
848 Register OffsetReg = MI.getOperand(2).getReg();
849 Register WidthReg = MI.getOperand(3).getReg();
850
851 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
852 "scalar BFX instructions are expanded in regbankselect");
853 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
854 "64-bit vector BFX instructions are expanded in regbankselect");
855
856 const DebugLoc &DL = MI.getDebugLoc();
857 MachineBasicBlock *MBB = MI.getParent();
858
859 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
860 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
861 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
862 .addReg(SrcReg)
863 .addReg(OffsetReg)
864 .addReg(WidthReg);
865 MI.eraseFromParent();
866 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
867}
868
869bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
870 if (STI.getLDSBankCount() != 16)
871 return selectImpl(MI, *CoverageInfo);
872
873 Register Dst = MI.getOperand(0).getReg();
874 Register Src0 = MI.getOperand(2).getReg();
875 Register M0Val = MI.getOperand(6).getReg();
876 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
877 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
878 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
879 return false;
880
881 // This requires 2 instructions. It is possible to write a pattern to support
882 // this, but the generated isel emitter doesn't correctly deal with multiple
883 // output instructions using the same physical register input. The copy to m0
884 // is incorrectly placed before the second instruction.
885 //
886 // TODO: Match source modifiers.
887
888 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
889 const DebugLoc &DL = MI.getDebugLoc();
890 MachineBasicBlock *MBB = MI.getParent();
891
892 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
893 .addReg(M0Val);
894 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
895 .addImm(2)
896 .addImm(MI.getOperand(4).getImm()) // $attr
897 .addImm(MI.getOperand(3).getImm()); // $attrchan
898
899 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
900 .addImm(0) // $src0_modifiers
901 .addReg(Src0) // $src0
902 .addImm(MI.getOperand(4).getImm()) // $attr
903 .addImm(MI.getOperand(3).getImm()) // $attrchan
904 .addImm(0) // $src2_modifiers
905 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
906 .addImm(MI.getOperand(5).getImm()) // $high
907 .addImm(0) // $clamp
908 .addImm(0); // $omod
909
910 MI.eraseFromParent();
911 return true;
912}
913
914// Writelane is special in that it can use SGPR and M0 (which would normally
915// count as using the constant bus twice - but in this case it is allowed since
916// the lane selector doesn't count as a use of the constant bus). However, it is
917// still required to abide by the 1 SGPR rule. Fix this up if we might have
918// multiple SGPRs.
919bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
920 // With a constant bus limit of at least 2, there's no issue.
921 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
922 return selectImpl(MI, *CoverageInfo);
923
924 MachineBasicBlock *MBB = MI.getParent();
925 const DebugLoc &DL = MI.getDebugLoc();
926 Register VDst = MI.getOperand(0).getReg();
927 Register Val = MI.getOperand(2).getReg();
928 Register LaneSelect = MI.getOperand(3).getReg();
929 Register VDstIn = MI.getOperand(4).getReg();
930
931 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
932
933 std::optional<ValueAndVReg> ConstSelect =
934 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
935 if (ConstSelect) {
936 // The selector has to be an inline immediate, so we can use whatever for
937 // the other operands.
938 MIB.addReg(Val);
939 MIB.addImm(ConstSelect->Value.getSExtValue() &
940 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
941 } else {
942 std::optional<ValueAndVReg> ConstVal =
943 getIConstantVRegValWithLookThrough(Val, *MRI);
944
945 // If the value written is an inline immediate, we can get away without a
946 // copy to m0.
947 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
948 STI.hasInv2PiInlineImm())) {
949 MIB.addImm(ConstVal->Value.getSExtValue());
950 MIB.addReg(LaneSelect);
951 } else {
952 MIB.addReg(Val);
953
954 // If the lane selector was originally in a VGPR and copied with
955 // readfirstlane, there's a hazard to read the same SGPR from the
956 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
957 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
958
959 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
960 .addReg(LaneSelect);
961 MIB.addReg(AMDGPU::M0);
962 }
963 }
964
965 MIB.addReg(VDstIn);
966
967 MI.eraseFromParent();
968 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
969}
970
971// We need to handle this here because tablegen doesn't support matching
972// instructions with multiple outputs.
973bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
974 Register Dst0 = MI.getOperand(0).getReg();
975 Register Dst1 = MI.getOperand(1).getReg();
976
977 LLT Ty = MRI->getType(Dst0);
978 unsigned Opc;
979 if (Ty == LLT::scalar(32))
980 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
981 else if (Ty == LLT::scalar(64))
982 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
983 else
984 return false;
985
986 // TODO: Match source modifiers.
987
988 const DebugLoc &DL = MI.getDebugLoc();
989 MachineBasicBlock *MBB = MI.getParent();
990
991 Register Numer = MI.getOperand(3).getReg();
992 Register Denom = MI.getOperand(4).getReg();
993 unsigned ChooseDenom = MI.getOperand(5).getImm();
994
995 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
996
997 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
998 .addDef(Dst1)
999 .addImm(0) // $src0_modifiers
1000 .addUse(Src0) // $src0
1001 .addImm(0) // $src1_modifiers
1002 .addUse(Denom) // $src1
1003 .addImm(0) // $src2_modifiers
1004 .addUse(Numer) // $src2
1005 .addImm(0) // $clamp
1006 .addImm(0); // $omod
1007
1008 MI.eraseFromParent();
1009 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1010}
1011
1012bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1013 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1014 switch (IntrinsicID) {
1015 case Intrinsic::amdgcn_if_break: {
1016 MachineBasicBlock *BB = I.getParent();
1017
1018 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1019 // SelectionDAG uses for wave32 vs wave64.
1020 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1021 .add(I.getOperand(0))
1022 .add(I.getOperand(2))
1023 .add(I.getOperand(3));
1024
1025 Register DstReg = I.getOperand(0).getReg();
1026 Register Src0Reg = I.getOperand(2).getReg();
1027 Register Src1Reg = I.getOperand(3).getReg();
1028
1029 I.eraseFromParent();
1030
1031 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1032 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1033
1034 return true;
1035 }
1036 case Intrinsic::amdgcn_interp_p1_f16:
1037 return selectInterpP1F16(I);
1038 case Intrinsic::amdgcn_wqm:
1039 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1040 case Intrinsic::amdgcn_softwqm:
1041 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1042 case Intrinsic::amdgcn_strict_wwm:
1043 case Intrinsic::amdgcn_wwm:
1044 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1045 case Intrinsic::amdgcn_strict_wqm:
1046 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1047 case Intrinsic::amdgcn_writelane:
1048 return selectWritelane(I);
1049 case Intrinsic::amdgcn_div_scale:
1050 return selectDivScale(I);
1051 case Intrinsic::amdgcn_icmp:
1052 case Intrinsic::amdgcn_fcmp:
1053 if (selectImpl(I, *CoverageInfo))
1054 return true;
1055 return selectIntrinsicCmp(I);
1056 case Intrinsic::amdgcn_ballot:
1057 return selectBallot(I);
1058 case Intrinsic::amdgcn_reloc_constant:
1059 return selectRelocConstant(I);
1060 case Intrinsic::amdgcn_groupstaticsize:
1061 return selectGroupStaticSize(I);
1062 case Intrinsic::returnaddress:
1063 return selectReturnAddress(I);
1064 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1065 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1066 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1067 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1068 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1069 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1070 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1071 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1072 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1073 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1074 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1075 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1076 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1077 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1078 return selectSMFMACIntrin(I);
1079 default:
1080 return selectImpl(I, *CoverageInfo);
1081 }
1082}
1083
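// Map a comparison predicate and operand size (16/32/64) to the corresponding
// VALU V_CMP opcode, or return -1 if the subtarget has no suitable
// instruction.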
1084static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1085 const GCNSubtarget &ST) {
1086 if (Size != 16 && Size != 32 && Size != 64)
1087 return -1;
1088
1089 if (Size == 16 && !ST.has16BitInsts())
1090 return -1;
1091
1092 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
1093 unsigned S64Opc) {
1094 if (Size == 16)
1095 return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
1096 if (Size == 32)
1097 return S32Opc;
1098 return S64Opc;
1099 };
1100
1101 switch (P) {
1102 default:
1103 llvm_unreachable("Unknown condition code!");
1104 case CmpInst::ICMP_NE:
1105 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1106 AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
1107 case CmpInst::ICMP_EQ:
1108 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1109 AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
1110 case CmpInst::ICMP_SGT:
1111 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1112 AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
1113 case CmpInst::ICMP_SGE:
1114 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1115 AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
1116 case CmpInst::ICMP_SLT:
1117 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1118 AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
1119 case CmpInst::ICMP_SLE:
1120 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1121 AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
1122 case CmpInst::ICMP_UGT:
1123 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1124 AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
1125 case CmpInst::ICMP_UGE:
1126 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1127 AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
1128 case CmpInst::ICMP_ULT:
1129 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1130 AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
1131 case CmpInst::ICMP_ULE:
1132 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1133 AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);
1134
1135 case CmpInst::FCMP_OEQ:
1136 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1137 AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
1138 case CmpInst::FCMP_OGT:
1139 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1140 AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
1141 case CmpInst::FCMP_OGE:
1142 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1143 AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
1144 case CmpInst::FCMP_OLT:
1145 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1146 AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
1147 case CmpInst::FCMP_OLE:
1148 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1149 AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
1150 case CmpInst::FCMP_ONE:
1151 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1152 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1153 case CmpInst::FCMP_ORD:
1154 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1155 AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
1156 case CmpInst::FCMP_UNO:
1157 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1158 AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
1159 case CmpInst::FCMP_UEQ:
1160 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1161 AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
1162 case CmpInst::FCMP_UGT:
1163 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1164 AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
1165 case CmpInst::FCMP_UGE:
1166 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1167 AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
1168 case CmpInst::FCMP_ULT:
1169 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1170 AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
1171 case CmpInst::FCMP_ULE:
1172 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1173 AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
1174 case CmpInst::FCMP_UNE:
1175 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1176 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1177 case CmpInst::FCMP_TRUE:
1178 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1179 AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
1180 case CmpInst::FCMP_FALSE:
1181 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1182 AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
1183 }
1184}
1185
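// Map a comparison predicate and operand size to the corresponding SALU S_CMP
// opcode, or return -1 if none exists (e.g. 64-bit compares other than eq/ne,
// or f16/f32 compares without SALU float support).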
1186int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1187 unsigned Size) const {
1188 if (Size == 64) {
1189 if (!STI.hasScalarCompareEq64())
1190 return -1;
1191
1192 switch (P) {
1193 case CmpInst::ICMP_NE:
1194 return AMDGPU::S_CMP_LG_U64;
1195 case CmpInst::ICMP_EQ:
1196 return AMDGPU::S_CMP_EQ_U64;
1197 default:
1198 return -1;
1199 }
1200 }
1201
1202 if (Size == 32) {
1203 switch (P) {
1204 case CmpInst::ICMP_NE:
1205 return AMDGPU::S_CMP_LG_U32;
1206 case CmpInst::ICMP_EQ:
1207 return AMDGPU::S_CMP_EQ_U32;
1208 case CmpInst::ICMP_SGT:
1209 return AMDGPU::S_CMP_GT_I32;
1210 case CmpInst::ICMP_SGE:
1211 return AMDGPU::S_CMP_GE_I32;
1212 case CmpInst::ICMP_SLT:
1213 return AMDGPU::S_CMP_LT_I32;
1214 case CmpInst::ICMP_SLE:
1215 return AMDGPU::S_CMP_LE_I32;
1216 case CmpInst::ICMP_UGT:
1217 return AMDGPU::S_CMP_GT_U32;
1218 case CmpInst::ICMP_UGE:
1219 return AMDGPU::S_CMP_GE_U32;
1220 case CmpInst::ICMP_ULT:
1221 return AMDGPU::S_CMP_LT_U32;
1222 case CmpInst::ICMP_ULE:
1223 return AMDGPU::S_CMP_LE_U32;
1224 case CmpInst::FCMP_OEQ:
1225 return AMDGPU::S_CMP_EQ_F32;
1226 case CmpInst::FCMP_OGT:
1227 return AMDGPU::S_CMP_GT_F32;
1228 case CmpInst::FCMP_OGE:
1229 return AMDGPU::S_CMP_GE_F32;
1230 case CmpInst::FCMP_OLT:
1231 return AMDGPU::S_CMP_LT_F32;
1232 case CmpInst::FCMP_OLE:
1233 return AMDGPU::S_CMP_LE_F32;
1234 case CmpInst::FCMP_ONE:
1235 return AMDGPU::S_CMP_LG_F32;
1236 case CmpInst::FCMP_ORD:
1237 return AMDGPU::S_CMP_O_F32;
1238 case CmpInst::FCMP_UNO:
1239 return AMDGPU::S_CMP_U_F32;
1240 case CmpInst::FCMP_UEQ:
1241 return AMDGPU::S_CMP_NLG_F32;
1242 case CmpInst::FCMP_UGT:
1243 return AMDGPU::S_CMP_NLE_F32;
1244 case CmpInst::FCMP_UGE:
1245 return AMDGPU::S_CMP_NLT_F32;
1246 case CmpInst::FCMP_ULT:
1247 return AMDGPU::S_CMP_NGE_F32;
1248 case CmpInst::FCMP_ULE:
1249 return AMDGPU::S_CMP_NGT_F32;
1250 case CmpInst::FCMP_UNE:
1251 return AMDGPU::S_CMP_NEQ_F32;
1252 default:
1253 llvm_unreachable("Unknown condition code!");
1254 }
1255 }
1256
1257 if (Size == 16) {
1258 if (!STI.hasSALUFloatInsts())
1259 return -1;
1260
1261 switch (P) {
1262 case CmpInst::FCMP_OEQ:
1263 return AMDGPU::S_CMP_EQ_F16;
1264 case CmpInst::FCMP_OGT:
1265 return AMDGPU::S_CMP_GT_F16;
1266 case CmpInst::FCMP_OGE:
1267 return AMDGPU::S_CMP_GE_F16;
1268 case CmpInst::FCMP_OLT:
1269 return AMDGPU::S_CMP_LT_F16;
1270 case CmpInst::FCMP_OLE:
1271 return AMDGPU::S_CMP_LE_F16;
1272 case CmpInst::FCMP_ONE:
1273 return AMDGPU::S_CMP_LG_F16;
1274 case CmpInst::FCMP_ORD:
1275 return AMDGPU::S_CMP_O_F16;
1276 case CmpInst::FCMP_UNO:
1277 return AMDGPU::S_CMP_U_F16;
1278 case CmpInst::FCMP_UEQ:
1279 return AMDGPU::S_CMP_NLG_F16;
1280 case CmpInst::FCMP_UGT:
1281 return AMDGPU::S_CMP_NLE_F16;
1282 case CmpInst::FCMP_UGE:
1283 return AMDGPU::S_CMP_NLT_F16;
1284 case CmpInst::FCMP_ULT:
1285 return AMDGPU::S_CMP_NGE_F16;
1286 case CmpInst::FCMP_ULE:
1287 return AMDGPU::S_CMP_NGT_F16;
1288 case CmpInst::FCMP_UNE:
1289 return AMDGPU::S_CMP_NEQ_F16;
1290 default:
1291 llvm_unreachable("Unknown condition code!");
1292 }
1293 }
1294
1295 return -1;
1296}
1297
1298bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1299
1300 MachineBasicBlock *BB = I.getParent();
1301 const DebugLoc &DL = I.getDebugLoc();
1302
1303 Register SrcReg = I.getOperand(2).getReg();
1304 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1305
1306 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1307
1308 Register CCReg = I.getOperand(0).getReg();
1309 if (!isVCC(CCReg, *MRI)) {
1310 int Opcode = getS_CMPOpcode(Pred, Size);
1311 if (Opcode == -1)
1312 return false;
1313 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1314 .add(I.getOperand(2))
1315 .add(I.getOperand(3));
1316 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1317 .addReg(AMDGPU::SCC);
1318 bool Ret =
1319 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1320 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1321 I.eraseFromParent();
1322 return Ret;
1323 }
1324
1325 if (I.getOpcode() == AMDGPU::G_FCMP)
1326 return false;
1327
1328 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1329 if (Opcode == -1)
1330 return false;
1331
1332 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1333 I.getOperand(0).getReg())
1334 .add(I.getOperand(2))
1335 .add(I.getOperand(3));
1336 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1337 *TRI.getBoolRC(), *MRI);
1338 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1339 I.eraseFromParent();
1340 return Ret;
1341}
1342
1343bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1344 Register Dst = I.getOperand(0).getReg();
1345 if (isVCC(Dst, *MRI))
1346 return false;
1347
1348 LLT DstTy = MRI->getType(Dst);
1349 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1350 return false;
1351
1352 MachineBasicBlock *BB = I.getParent();
1353 const DebugLoc &DL = I.getDebugLoc();
1354 Register SrcReg = I.getOperand(2).getReg();
1355 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1356
1357 // i1 inputs are not supported in GlobalISel.
1358 if (Size == 1)
1359 return false;
1360
1361 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1362 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1363 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1364 I.eraseFromParent();
1365 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1366 }
1367
1368 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1369 if (Opcode == -1)
1370 return false;
1371
1372 MachineInstrBuilder SelectedMI;
1373 MachineOperand &LHS = I.getOperand(2);
1374 MachineOperand &RHS = I.getOperand(3);
1375 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
1376 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
1377 Register Src0Reg =
1378 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1379 Register Src1Reg =
1380 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1381 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1382 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1383 SelectedMI.addImm(Src0Mods);
1384 SelectedMI.addReg(Src0Reg);
1385 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1386 SelectedMI.addImm(Src1Mods);
1387 SelectedMI.addReg(Src1Reg);
1388 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1389 SelectedMI.addImm(0); // clamp
1390 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1391 SelectedMI.addImm(0); // op_sel
1392
1393 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1394 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1395 return false;
1396
1397 I.eraseFromParent();
1398 return true;
1399}
1400
1401bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1402 MachineBasicBlock *BB = I.getParent();
1403 const DebugLoc &DL = I.getDebugLoc();
1404 Register DstReg = I.getOperand(0).getReg();
1405 const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1406 const bool Is64 = Size == 64;
1407 const bool IsWave32 = (STI.getWavefrontSize() == 32);
1408
1409 // In the common case, the return type matches the wave size.
1410 // However we also support emitting i64 ballots in wave32 mode.
1411 if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1412 return false;
1413
1414 std::optional<ValueAndVReg> Arg =
1415 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1416
1417 const auto BuildCopy = [&](Register SrcReg) {
1418 if (Size == STI.getWavefrontSize()) {
1419 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1420 .addReg(SrcReg);
1421 return;
1422 }
1423
1424 // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
1425 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1426 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1427 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1428 .addReg(SrcReg)
1429 .addImm(AMDGPU::sub0)
1430 .addReg(HiReg)
1431 .addImm(AMDGPU::sub1);
1432 };
1433
1434 if (Arg) {
1435 const int64_t Value = Arg->Value.getSExtValue();
1436 if (Value == 0) {
1437 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1438 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1439 } else if (Value == -1) // all ones
1440 BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1441 else
1442 return false;
1443 } else
1444 BuildCopy(I.getOperand(2).getReg());
1445
1446 I.eraseFromParent();
1447 return true;
1448}
1449
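// Lower llvm.amdgcn.reloc.constant by creating (or reusing) an external i32
// global named by the metadata operand and emitting an S_MOV/V_MOV of that
// symbol's address, so the value is filled in by relocation.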
1450bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1451 Register DstReg = I.getOperand(0).getReg();
1452 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1453 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1454 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1455 return false;
1456
1457 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1458
1459 Module *M = MF->getFunction().getParent();
1460 const MDNode *Metadata = I.getOperand(2).getMetadata();
1461 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1462 auto RelocSymbol = cast<GlobalVariable>(
1463 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1464
1465 MachineBasicBlock *BB = I.getParent();
1466 BuildMI(*BB, &I, I.getDebugLoc(),
1467 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1468 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1469
1470 I.eraseFromParent();
1471 return true;
1472}
1473
1474bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1475 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1476
1477 Register DstReg = I.getOperand(0).getReg();
1478 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1479 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1480 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1481
1482 MachineBasicBlock *MBB = I.getParent();
1483 const DebugLoc &DL = I.getDebugLoc();
1484
1485 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1486
1487 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1488 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1489 MIB.addImm(MFI->getLDSSize());
1490 } else {
1491 Module *M = MF->getFunction().getParent();
1492 const GlobalValue *GV
1493 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1494 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1495 }
1496
1497 I.eraseFromParent();
1498 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1499}
1500
1501bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1502 MachineBasicBlock *MBB = I.getParent();
1503 MachineFunction &MF = *MBB->getParent();
1504 const DebugLoc &DL = I.getDebugLoc();
1505
1506 MachineOperand &Dst = I.getOperand(0);
1507 Register DstReg = Dst.getReg();
1508 unsigned Depth = I.getOperand(2).getImm();
1509
1510 const TargetRegisterClass *RC
1511 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1512 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1513 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1514 return false;
1515
1516 // Check for kernel and shader functions
1517 if (Depth != 0 ||
1518 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1519 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1520 .addImm(0);
1521 I.eraseFromParent();
1522 return true;
1523 }
1524
1525 MachineFrameInfo &MFI = MF.getFrameInfo();
1526 // There is a call to @llvm.returnaddress in this function
1527 MFI.setReturnAddressIsTaken(true);
1528
1529 // Get the return address reg and mark it as an implicit live-in
1530 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1531 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1532 AMDGPU::SReg_64RegClass, DL);
1533 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1534 .addReg(LiveIn);
1535 I.eraseFromParent();
1536 return true;
1537}
1538
1539bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1540 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1541 // SelectionDAG uses for wave32 vs wave64.
1542 MachineBasicBlock *BB = MI.getParent();
1543 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1544 .add(MI.getOperand(1));
1545
1546 Register Reg = MI.getOperand(1).getReg();
1547 MI.eraseFromParent();
1548
1549 if (!MRI->getRegClassOrNull(Reg))
1550 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1551 return true;
1552}
1553
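// Select llvm.amdgcn.ds.ordered.add/swap: the index, wave_release, wave_done,
// dword count and shader type are packed into the DS_ORDERED_COUNT offset
// field, with m0 supplying the base value.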
1554bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1555 MachineInstr &MI, Intrinsic::ID IntrID) const {
1556 MachineBasicBlock *MBB = MI.getParent();
1557 MachineFunction *MF = MBB->getParent();
1558 const DebugLoc &DL = MI.getDebugLoc();
1559
1560 unsigned IndexOperand = MI.getOperand(7).getImm();
1561 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1562 bool WaveDone = MI.getOperand(9).getImm() != 0;
1563
1564 if (WaveDone && !WaveRelease)
1565 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1566
1567 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1568 IndexOperand &= ~0x3f;
1569 unsigned CountDw = 0;
1570
1571 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1572 CountDw = (IndexOperand >> 24) & 0xf;
1573 IndexOperand &= ~(0xf << 24);
1574
1575 if (CountDw < 1 || CountDw > 4) {
1576 report_fatal_error(
1577 "ds_ordered_count: dword count must be between 1 and 4");
1578 }
1579 }
1580
1581 if (IndexOperand)
1582 report_fatal_error("ds_ordered_count: bad index operand");
1583
1584 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1585 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1586
1587 unsigned Offset0 = OrderedCountIndex << 2;
1588 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1589
1590 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1591 Offset1 |= (CountDw - 1) << 6;
1592
1593 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1594 Offset1 |= ShaderType << 2;
1595
1596 unsigned Offset = Offset0 | (Offset1 << 8);
1597
1598 Register M0Val = MI.getOperand(2).getReg();
1599 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1600 .addReg(M0Val);
1601
1602 Register DstReg = MI.getOperand(0).getReg();
1603 Register ValReg = MI.getOperand(3).getReg();
1604 MachineInstrBuilder DS =
1605 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1606 .addReg(ValReg)
1607 .addImm(Offset)
1608 .cloneMemRefs(MI);
1609
1610 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1611 return false;
1612
1613 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1614 MI.eraseFromParent();
1615 return Ret;
1616}
1617
1618static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1619 switch (IntrID) {
1620 case Intrinsic::amdgcn_ds_gws_init:
1621 return AMDGPU::DS_GWS_INIT;
1622 case Intrinsic::amdgcn_ds_gws_barrier:
1623 return AMDGPU::DS_GWS_BARRIER;
1624 case Intrinsic::amdgcn_ds_gws_sema_v:
1625 return AMDGPU::DS_GWS_SEMA_V;
1626 case Intrinsic::amdgcn_ds_gws_sema_br:
1627 return AMDGPU::DS_GWS_SEMA_BR;
1628 case Intrinsic::amdgcn_ds_gws_sema_p:
1629 return AMDGPU::DS_GWS_SEMA_P;
1630 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1631 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1632 default:
1633 llvm_unreachable("not a gws intrinsic");
1634 }
1635}
1636
1637bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1638 Intrinsic::ID IID) const {
1639 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1640 !STI.hasGWSSemaReleaseAll()))
1641 return false;
1642
1643 // intrinsic ID, vsrc, offset
1644 const bool HasVSrc = MI.getNumOperands() == 3;
1645 assert(HasVSrc || MI.getNumOperands() == 2);
1646
1647 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1648 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1649 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1650 return false;
1651
1652 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1653 unsigned ImmOffset;
1654
1655 MachineBasicBlock *MBB = MI.getParent();
1656 const DebugLoc &DL = MI.getDebugLoc();
1657
1658 MachineInstr *Readfirstlane = nullptr;
1659
1660 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1661 // incoming offset, in case there's an add of a constant. We'll have to put it
1662 // back later.
1663 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1664 Readfirstlane = OffsetDef;
1665 BaseOffset = OffsetDef->getOperand(1).getReg();
1666 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1667 }
1668
1669 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1670 // If we have a constant offset, try to use the 0 in m0 as the base.
1671 // TODO: Look into changing the default m0 initialization value. If the
1672 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1673 // the immediate offset.
1674
1675 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1676 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1677 .addImm(0);
1678 } else {
1679 std::tie(BaseOffset, ImmOffset) =
1680 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1681
1682 if (Readfirstlane) {
1683 // We have the constant offset now, so put the readfirstlane back on the
1684 // variable component.
1685 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1686 return false;
1687
1688 Readfirstlane->getOperand(1).setReg(BaseOffset);
1689 BaseOffset = Readfirstlane->getOperand(0).getReg();
1690 } else {
1691 if (!RBI.constrainGenericRegister(BaseOffset,
1692 AMDGPU::SReg_32RegClass, *MRI))
1693 return false;
1694 }
1695
1696 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1697 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1698 .addReg(BaseOffset)
1699 .addImm(16)
1700 .setOperandDead(3); // Dead scc
1701
1702 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1703 .addReg(M0Base);
1704 }
1705
1706 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1707 // offset field) % 64. Some versions of the programming guide omit the m0
1708 // part, or claim it's from offset 0.
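// For instance (illustrative, not from the source): if the variable part of
// the offset register is known to hold 3, the S_LSHL_B32 above leaves 3 << 16
// in M0, so M0[21:16] = 3 and the hardware computes
// (<isa opaque base> + 3 + <offset field>) % 64 as the resource id.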
1709 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1710
1711 if (HasVSrc) {
1712 Register VSrc = MI.getOperand(1).getReg();
1713 MIB.addReg(VSrc);
1714
1715 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1716 return false;
1717 }
1718
1719 MIB.addImm(ImmOffset)
1720 .cloneMemRefs(MI);
1721
1722 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1723
1724 MI.eraseFromParent();
1725 return true;
1726}
1727
1728bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1729 bool IsAppend) const {
1730 Register PtrBase = MI.getOperand(2).getReg();
1731 LLT PtrTy = MRI->getType(PtrBase);
1732 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1733
1734 unsigned Offset;
1735 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1736
1737 // TODO: Should this try to look through readfirstlane like GWS?
1738 if (!isDSOffsetLegal(PtrBase, Offset)) {
1739 PtrBase = MI.getOperand(2).getReg();
1740 Offset = 0;
1741 }
1742
1743 MachineBasicBlock *MBB = MI.getParent();
1744 const DebugLoc &DL = MI.getDebugLoc();
1745 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1746
1747 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1748 .addReg(PtrBase);
1749 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1750 return false;
1751
1752 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1753 .addImm(Offset)
1754 .addImm(IsGDS ? -1 : 0)
1755 .cloneMemRefs(MI);
1756 MI.eraseFromParent();
1757 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1758}
1759
1760bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1761 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1762 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1763 if (WGSize <= STI.getWavefrontSize()) {
1764 MachineBasicBlock *MBB = MI.getParent();
1765 const DebugLoc &DL = MI.getDebugLoc();
1766 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1767 MI.eraseFromParent();
1768 return true;
1769 }
1770 }
1771
1772 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1773 if (STI.hasSplitBarriers()) {
1774 MachineBasicBlock *MBB = MI.getParent();
1775 const DebugLoc &DL = MI.getDebugLoc();
1776 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1777 .addImm(AMDGPU::Barrier::WORKGROUP);
1778 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1779 .addImm(AMDGPU::Barrier::WORKGROUP);
1780 MI.eraseFromParent();
1781 return true;
1782 }
1783
1784 return selectImpl(MI, *CoverageInfo);
1785}
1786
1787static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1788 bool &IsTexFail) {
1789 if (TexFailCtrl)
1790 IsTexFail = true;
1791
1792 TFE = (TexFailCtrl & 0x1) ? true : false;
1793 TexFailCtrl &= ~(uint64_t)0x1;
1794 LWE = (TexFailCtrl & 0x2) ? true : false;
1795 TexFailCtrl &= ~(uint64_t)0x2;
1796
1797 return TexFailCtrl == 0;
1798}
1799
1800bool AMDGPUInstructionSelector::selectImageIntrinsic(
1801 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1802 MachineBasicBlock *MBB = MI.getParent();
1803 const DebugLoc &DL = MI.getDebugLoc();
1804
1805 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1806 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1807
1808 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1809 unsigned IntrOpcode = Intr->BaseOpcode;
1810 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1811 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1812 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1813
1814 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1815
1816 Register VDataIn, VDataOut;
1817 LLT VDataTy;
1818 int NumVDataDwords = -1;
1819 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1820 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1821
1822 bool Unorm;
1823 if (!BaseOpcode->Sampler)
1824 Unorm = true;
1825 else
1826 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1827
1828 bool TFE;
1829 bool LWE;
1830 bool IsTexFail = false;
1831 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1832 TFE, LWE, IsTexFail))
1833 return false;
1834
1835 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1836 const bool IsA16 = (Flags & 1) != 0;
1837 const bool IsG16 = (Flags & 2) != 0;
1838
1839 // A16 implies 16 bit gradients if subtarget doesn't support G16
1840 if (IsA16 && !STI.hasG16() && !IsG16)
1841 return false;
1842
1843 unsigned DMask = 0;
1844 unsigned DMaskLanes = 0;
1845
1846 if (BaseOpcode->Atomic) {
1847 VDataOut = MI.getOperand(0).getReg();
1848 VDataIn = MI.getOperand(2).getReg();
1849 LLT Ty = MRI->getType(VDataIn);
1850
1851 // Be careful to allow atomic swap on 16-bit element vectors.
1852 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1853 Ty.getSizeInBits() == 128 :
1854 Ty.getSizeInBits() == 64;
1855
1856 if (BaseOpcode->AtomicX2) {
1857 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1858
1859 DMask = Is64Bit ? 0xf : 0x3;
1860 NumVDataDwords = Is64Bit ? 4 : 2;
1861 } else {
1862 DMask = Is64Bit ? 0x3 : 0x1;
1863 NumVDataDwords = Is64Bit ? 2 : 1;
1864 }
1865 } else {
1866 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1867 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1868
1869 if (BaseOpcode->Store) {
1870 VDataIn = MI.getOperand(1).getReg();
1871 VDataTy = MRI->getType(VDataIn);
1872 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1873 } else {
1874 VDataOut = MI.getOperand(0).getReg();
1875 VDataTy = MRI->getType(VDataOut);
1876 NumVDataDwords = DMaskLanes;
1877
1878 if (IsD16 && !STI.hasUnpackedD16VMem())
1879 NumVDataDwords = (DMaskLanes + 1) / 2;
1880 }
1881 }
1882
1883 // Set G16 opcode
1884 if (Subtarget->hasG16() && IsG16) {
1885 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1886 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1887 assert(G16MappingInfo);
1888 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1889 }
1890
1891 // TODO: Check this in verifier.
1892 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1893
1894 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1895 if (BaseOpcode->Atomic)
1896 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1897 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
1898 AMDGPU::CPol::VOLATILE))
1899 return false;
1900
1901 int NumVAddrRegs = 0;
1902 int NumVAddrDwords = 0;
1903 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1904 // Skip the $noregs and 0s inserted during legalization.
1905 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1906 if (!AddrOp.isReg())
1907 continue; // XXX - Break?
1908
1909 Register Addr = AddrOp.getReg();
1910 if (!Addr)
1911 break;
1912
1913 ++NumVAddrRegs;
1914 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1915 }
1916
1917 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1918 // NSA, these should have been packed into a single value in the first
1919 // address register
1920 const bool UseNSA =
1921 NumVAddrRegs != 1 &&
1922 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
1923 : NumVAddrDwords == NumVAddrRegs);
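// Example (illustrative): three separate 32-bit coordinate registers give
// NumVAddrRegs == NumVAddrDwords == 3, so NSA is used on targets with the
// full NSA encoding. If the legalizer instead packed the coordinates into a
// single 96-bit register, NumVAddrRegs == 1 and the non-NSA form is chosen.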
1924 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1925 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1926 return false;
1927 }
1928
1929 if (IsTexFail)
1930 ++NumVDataDwords;
1931
1932 int Opcode = -1;
1933 if (IsGFX12Plus) {
1934 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
1935 NumVDataDwords, NumVAddrDwords);
1936 } else if (IsGFX11Plus) {
1937 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1938 UseNSA ? AMDGPU::MIMGEncGfx11NSA
1939 : AMDGPU::MIMGEncGfx11Default,
1940 NumVDataDwords, NumVAddrDwords);
1941 } else if (IsGFX10Plus) {
1942 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1943 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1944 : AMDGPU::MIMGEncGfx10Default,
1945 NumVDataDwords, NumVAddrDwords);
1946 } else {
1947 if (Subtarget->hasGFX90AInsts()) {
1948 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
1949 NumVDataDwords, NumVAddrDwords);
1950 if (Opcode == -1) {
1951 LLVM_DEBUG(
1952 dbgs()
1953 << "requested image instruction is not supported on this GPU\n");
1954 return false;
1955 }
1956 }
1957 if (Opcode == -1 &&
1958 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1959 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1960 NumVDataDwords, NumVAddrDwords);
1961 if (Opcode == -1)
1962 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1963 NumVDataDwords, NumVAddrDwords);
1964 }
1965 if (Opcode == -1)
1966 return false;
1967
1968 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1969 .cloneMemRefs(MI);
1970
1971 if (VDataOut) {
1972 if (BaseOpcode->AtomicX2) {
1973 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1974
1975 Register TmpReg = MRI->createVirtualRegister(
1976 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1977 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1978
1979 MIB.addDef(TmpReg);
1980 if (!MRI->use_empty(VDataOut)) {
1981 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1982 .addReg(TmpReg, RegState::Kill, SubReg);
1983 }
1984
1985 } else {
1986 MIB.addDef(VDataOut); // vdata output
1987 }
1988 }
1989
1990 if (VDataIn)
1991 MIB.addReg(VDataIn); // vdata input
1992
1993 for (int I = 0; I != NumVAddrRegs; ++I) {
1994 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1995 if (SrcOp.isReg()) {
1996 assert(SrcOp.getReg() != 0);
1997 MIB.addReg(SrcOp.getReg());
1998 }
1999 }
2000
2001 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2002 if (BaseOpcode->Sampler)
2003 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2004
2005 MIB.addImm(DMask); // dmask
2006
2007 if (IsGFX10Plus)
2008 MIB.addImm(DimInfo->Encoding);
2009 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2010 MIB.addImm(Unorm);
2011
2012 MIB.addImm(CPol);
2013 MIB.addImm(IsA16 && // a16 or r128
2014 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2015 if (IsGFX10Plus)
2016 MIB.addImm(IsA16 ? -1 : 0);
2017
2018 if (!Subtarget->hasGFX90AInsts()) {
2019 MIB.addImm(TFE); // tfe
2020 } else if (TFE) {
2021 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2022 return false;
2023 }
2024
2025 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2026 MIB.addImm(LWE); // lwe
2027 if (!IsGFX10Plus)
2028 MIB.addImm(DimInfo->DA ? -1 : 0);
2029 if (BaseOpcode->HasD16)
2030 MIB.addImm(IsD16 ? -1 : 0);
2031
2032 MI.eraseFromParent();
2033 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2034 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2035 return true;
2036}
2037
2038// We need to handle this here because tablegen doesn't support matching
2039// instructions with multiple outputs.
2040bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2041 MachineInstr &MI) const {
2042 Register Dst0 = MI.getOperand(0).getReg();
2043 Register Dst1 = MI.getOperand(1).getReg();
2044
2045 const DebugLoc &DL = MI.getDebugLoc();
2046 MachineBasicBlock *MBB = MI.getParent();
2047
2048 Register Addr = MI.getOperand(3).getReg();
2049 Register Data0 = MI.getOperand(4).getReg();
2050 Register Data1 = MI.getOperand(5).getReg();
2051 unsigned Offset = MI.getOperand(6).getImm();
2052
2053 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2054 .addDef(Dst1)
2055 .addUse(Addr)
2056 .addUse(Data0)
2057 .addUse(Data1)
2058 .addImm(Offset)
2059 .cloneMemRefs(MI);
2060
2061 MI.eraseFromParent();
2062 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2063}
2064
2065bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2066 MachineInstr &I) const {
2067 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2068 switch (IntrinsicID) {
2069 case Intrinsic::amdgcn_end_cf:
2070 return selectEndCfIntrinsic(I);
2071 case Intrinsic::amdgcn_ds_ordered_add:
2072 case Intrinsic::amdgcn_ds_ordered_swap:
2073 return selectDSOrderedIntrinsic(I, IntrinsicID);
2074 case Intrinsic::amdgcn_ds_gws_init:
2075 case Intrinsic::amdgcn_ds_gws_barrier:
2076 case Intrinsic::amdgcn_ds_gws_sema_v:
2077 case Intrinsic::amdgcn_ds_gws_sema_br:
2078 case Intrinsic::amdgcn_ds_gws_sema_p:
2079 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2080 return selectDSGWSIntrinsic(I, IntrinsicID);
2081 case Intrinsic::amdgcn_ds_append:
2082 return selectDSAppendConsume(I, true);
2083 case Intrinsic::amdgcn_ds_consume:
2084 return selectDSAppendConsume(I, false);
2085 case Intrinsic::amdgcn_s_barrier:
2086 return selectSBarrier(I);
2087 case Intrinsic::amdgcn_raw_buffer_load_lds:
2088 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2089 case Intrinsic::amdgcn_struct_buffer_load_lds:
2090 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2091 return selectBufferLoadLds(I);
2092 case Intrinsic::amdgcn_global_load_lds:
2093 return selectGlobalLoadLds(I);
2094 case Intrinsic::amdgcn_exp_compr:
2095 if (!STI.hasCompressedExport()) {
2096 Function &F = I.getMF()->getFunction();
2098 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2099 F.getContext().diagnose(NoFpRet);
2100 return false;
2101 }
2102 break;
2103 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2104 return selectDSBvhStackIntrinsic(I);
2105 case Intrinsic::amdgcn_s_barrier_init:
2106 case Intrinsic::amdgcn_s_barrier_join:
2107 case Intrinsic::amdgcn_s_wakeup_barrier:
2108 case Intrinsic::amdgcn_s_get_barrier_state:
2109 return selectNamedBarrierInst(I, IntrinsicID);
2110 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2111 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
2112 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2113 case Intrinsic::amdgcn_s_barrier_leave:
2114 return selectSBarrierLeave(I);
2115 }
2116 return selectImpl(I, *CoverageInfo);
2117}
2118
2119bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2120 if (selectImpl(I, *CoverageInfo))
2121 return true;
2122
2123 MachineBasicBlock *BB = I.getParent();
2124 const DebugLoc &DL = I.getDebugLoc();
2125
2126 Register DstReg = I.getOperand(0).getReg();
2127 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2128 assert(Size <= 32 || Size == 64);
2129 const MachineOperand &CCOp = I.getOperand(1);
2130 Register CCReg = CCOp.getReg();
2131 if (!isVCC(CCReg, *MRI)) {
2132 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2133 AMDGPU::S_CSELECT_B32;
2134 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2135 .addReg(CCReg);
2136
2137 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2138 // bank, because it does not cover the register class that we used to represent
2139 // it. So we need to manually set the register class here.
2140 if (!MRI->getRegClassOrNull(CCReg))
2141 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2142 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2143 .add(I.getOperand(2))
2144 .add(I.getOperand(3));
2145
2146 bool Ret = false;
2147 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2148 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2149 I.eraseFromParent();
2150 return Ret;
2151 }
2152
2153 // Wide VGPR select should have been split in RegBankSelect.
2154 if (Size > 32)
2155 return false;
2156
2157 MachineInstr *Select =
2158 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2159 .addImm(0)
2160 .add(I.getOperand(3))
2161 .addImm(0)
2162 .add(I.getOperand(2))
2163 .add(I.getOperand(1));
2164
2165 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2166 I.eraseFromParent();
2167 return Ret;
2168}
2169
2170static int sizeToSubRegIndex(unsigned Size) {
2171 switch (Size) {
2172 case 32:
2173 return AMDGPU::sub0;
2174 case 64:
2175 return AMDGPU::sub0_sub1;
2176 case 96:
2177 return AMDGPU::sub0_sub1_sub2;
2178 case 128:
2179 return AMDGPU::sub0_sub1_sub2_sub3;
2180 case 256:
2181 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2182 default:
2183 if (Size < 32)
2184 return AMDGPU::sub0;
2185 if (Size > 256)
2186 return -1;
2187 llvm_unreachable("unhandled register size");
2188 }
2189}
2190
2191bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2192 Register DstReg = I.getOperand(0).getReg();
2193 Register SrcReg = I.getOperand(1).getReg();
2194 const LLT DstTy = MRI->getType(DstReg);
2195 const LLT SrcTy = MRI->getType(SrcReg);
2196 const LLT S1 = LLT::scalar(1);
2197
2198 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2199 const RegisterBank *DstRB;
2200 if (DstTy == S1) {
2201 // This is a special case. We don't treat s1 for legalization artifacts as
2202 // vcc booleans.
2203 DstRB = SrcRB;
2204 } else {
2205 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2206 if (SrcRB != DstRB)
2207 return false;
2208 }
2209
2210 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2211
2212 unsigned DstSize = DstTy.getSizeInBits();
2213 unsigned SrcSize = SrcTy.getSizeInBits();
2214
2215 const TargetRegisterClass *SrcRC =
2216 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2217 const TargetRegisterClass *DstRC =
2218 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2219 if (!SrcRC || !DstRC)
2220 return false;
2221
2222 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2223 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2224 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2225 return false;
2226 }
2227
2228 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2229 MachineBasicBlock *MBB = I.getParent();
2230 const DebugLoc &DL = I.getDebugLoc();
2231
2232 Register LoReg = MRI->createVirtualRegister(DstRC);
2233 Register HiReg = MRI->createVirtualRegister(DstRC);
2234 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2235 .addReg(SrcReg, 0, AMDGPU::sub0);
2236 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2237 .addReg(SrcReg, 0, AMDGPU::sub1);
2238
2239 if (IsVALU && STI.hasSDWA()) {
2240 // Write the low 16-bits of the high element into the high 16-bits of the
2241 // low element.
2242 MachineInstr *MovSDWA =
2243 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2244 .addImm(0) // $src0_modifiers
2245 .addReg(HiReg) // $src0
2246 .addImm(0) // $clamp
2247 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2248 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2249 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2250 .addReg(LoReg, RegState::Implicit);
2251 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2252 } else {
2253 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2254 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2255 Register ImmReg = MRI->createVirtualRegister(DstRC);
2256 if (IsVALU) {
2257 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2258 .addImm(16)
2259 .addReg(HiReg);
2260 } else {
2261 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2262 .addReg(HiReg)
2263 .addImm(16)
2264 .setOperandDead(3); // Dead scc
2265 }
2266
2267 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2268 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2269 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2270
2271 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2272 .addImm(0xffff);
2273 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2274 .addReg(LoReg)
2275 .addReg(ImmReg);
2276 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2277 .addReg(TmpReg0)
2278 .addReg(TmpReg1);
2279
2280 if (!IsVALU) {
2281 And.setOperandDead(3); // Dead scc
2282 Or.setOperandDead(3); // Dead scc
2283 }
2284 }
2285
2286 I.eraseFromParent();
2287 return true;
2288 }
2289
2290 if (!DstTy.isScalar())
2291 return false;
2292
2293 if (SrcSize > 32) {
2294 int SubRegIdx = sizeToSubRegIndex(DstSize);
2295 if (SubRegIdx == -1)
2296 return false;
2297
2298 // Deal with weird cases where the class only partially supports the subreg
2299 // index.
2300 const TargetRegisterClass *SrcWithSubRC
2301 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2302 if (!SrcWithSubRC)
2303 return false;
2304
2305 if (SrcWithSubRC != SrcRC) {
2306 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2307 return false;
2308 }
2309
2310 I.getOperand(1).setSubReg(SubRegIdx);
2311 }
2312
2313 I.setDesc(TII.get(TargetOpcode::COPY));
2314 return true;
2315}
2316
2317/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2318static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2319 Mask = maskTrailingOnes<unsigned>(Size);
2320 int SignedMask = static_cast<int>(Mask);
2321 return SignedMask >= -16 && SignedMask <= 64;
2322}
2323
2324// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2325const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2326 Register Reg, const MachineRegisterInfo &MRI,
2327 const TargetRegisterInfo &TRI) const {
2328 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2329 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2330 return RB;
2331
2332 // Ignore the type, since we don't use vcc in artifacts.
2333 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2334 return &RBI.getRegBankFromRegClass(*RC, LLT());
2335 return nullptr;
2336}
2337
2338bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2339 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2340 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2341 const DebugLoc &DL = I.getDebugLoc();
2342 MachineBasicBlock &MBB = *I.getParent();
2343 const Register DstReg = I.getOperand(0).getReg();
2344 const Register SrcReg = I.getOperand(1).getReg();
2345
2346 const LLT DstTy = MRI->getType(DstReg);
2347 const LLT SrcTy = MRI->getType(SrcReg);
2348 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2349 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2350 const unsigned DstSize = DstTy.getSizeInBits();
2351 if (!DstTy.isScalar())
2352 return false;
2353
2354 // Artifact casts should never use vcc.
2355 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2356
2357 // FIXME: This should probably be illegal and split earlier.
2358 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2359 if (DstSize <= 32)
2360 return selectCOPY(I);
2361
2362 const TargetRegisterClass *SrcRC =
2363 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2364 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2365 const TargetRegisterClass *DstRC =
2366 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2367
2368 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2369 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2370 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2371 .addReg(SrcReg)
2372 .addImm(AMDGPU::sub0)
2373 .addReg(UndefReg)
2374 .addImm(AMDGPU::sub1);
2375 I.eraseFromParent();
2376
2377 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2378 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2379 }
2380
2381 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2382 // 64-bit should have been split up in RegBankSelect
2383
2384 // Try to use an and with a mask if it will save code size.
2385 unsigned Mask;
2386 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2387 MachineInstr *ExtI =
2388 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2389 .addImm(Mask)
2390 .addReg(SrcReg);
2391 I.eraseFromParent();
2392 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2393 }
2394
2395 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2396 MachineInstr *ExtI =
2397 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2398 .addReg(SrcReg)
2399 .addImm(0) // Offset
2400 .addImm(SrcSize); // Width
2401 I.eraseFromParent();
2402 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2403 }
2404
2405 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2406 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2407 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2408 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2409 return false;
2410
2411 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2412 const unsigned SextOpc = SrcSize == 8 ?
2413 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2414 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2415 .addReg(SrcReg);
2416 I.eraseFromParent();
2417 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2418 }
2419
2420 // Using a single 32-bit SALU to calculate the high half is smaller than
2421 // S_BFE with a literal constant operand.
2422 if (DstSize > 32 && SrcSize == 32) {
2423 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2424 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2425 if (Signed) {
2426 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2427 .addReg(SrcReg, 0, SubReg)
2428 .addImm(31)
2429 .setOperandDead(3); // Dead scc
2430 } else {
2431 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2432 .addImm(0);
2433 }
2434 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2435 .addReg(SrcReg, 0, SubReg)
2436 .addImm(AMDGPU::sub0)
2437 .addReg(HiReg)
2438 .addImm(AMDGPU::sub1);
2439 I.eraseFromParent();
2440 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2441 *MRI);
2442 }
2443
2444 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2445 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2446
2447 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
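// For example (illustrative), a sign extension from 8 bits passes a source-1
// operand of (8 << 16) | 0 = 0x80000: width 8 in bits [22:16] and offset 0 in
// bits [5:0], matching the .addImm(SrcSize << 16) used below.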
2448 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2449 // We need a 64-bit register source, but the high bits don't matter.
2450 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2451 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2452 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2453
2454 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2455 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2456 .addReg(SrcReg, 0, SubReg)
2457 .addImm(AMDGPU::sub0)
2458 .addReg(UndefReg)
2459 .addImm(AMDGPU::sub1);
2460
2461 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2462 .addReg(ExtReg)
2463 .addImm(SrcSize << 16);
2464
2465 I.eraseFromParent();
2466 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2467 }
2468
2469 unsigned Mask;
2470 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2471 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2472 .addReg(SrcReg)
2473 .addImm(Mask)
2474 .setOperandDead(3); // Dead scc
2475 } else {
2476 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2477 .addReg(SrcReg)
2478 .addImm(SrcSize << 16);
2479 }
2480
2481 I.eraseFromParent();
2482 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2483 }
2484
2485 return false;
2486}
2487
2488 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2489 Register &Out) {
2490 Register LShlSrc;
2491 if (mi_match(In, MRI,
2492 m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2493 Out = LShlSrc;
2494 return true;
2495 }
2496 return false;
2497}
2498
2499bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2500 if (!Subtarget->hasSALUFloatInsts())
2501 return false;
2502
2503 Register Dst = I.getOperand(0).getReg();
2504 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2505 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2506 return false;
2507
2508 Register Src = I.getOperand(1).getReg();
2509
2510 if (MRI->getType(Dst) == LLT::scalar(32) &&
2511 MRI->getType(Src) == LLT::scalar(16)) {
2512 if (isExtractHiElt(*MRI, Src, Src)) {
2513 MachineBasicBlock *BB = I.getParent();
2514 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2515 .addUse(Src);
2516 I.eraseFromParent();
2517 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2518 }
2519 }
2520
2521 return false;
2522}
2523
2524bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2525 MachineBasicBlock *BB = I.getParent();
2526 MachineOperand &ImmOp = I.getOperand(1);
2527 Register DstReg = I.getOperand(0).getReg();
2528 unsigned Size = MRI->getType(DstReg).getSizeInBits();
2529 bool IsFP = false;
2530
2531 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2532 if (ImmOp.isFPImm()) {
2533 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2534 ImmOp.ChangeToImmediate(Imm.getZExtValue());
2535 IsFP = true;
2536 } else if (ImmOp.isCImm()) {
2537 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2538 } else {
2539 llvm_unreachable("Not supported by g_constants");
2540 }
2541
2542 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2543 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2544
2545 unsigned Opcode;
2546 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2547 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2548 } else if (Size == 64 &&
2549 AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
2550 Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
2551 I.setDesc(TII.get(Opcode));
2552 I.addImplicitDefUseOperands(*MF);
2553 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2554 } else {
2555 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2556
2557 // We should never produce s1 values on banks other than VCC. If the user of
2558 // this already constrained the register, we may incorrectly think it's VCC
2559 // if it wasn't originally.
2560 if (Size == 1)
2561 return false;
2562 }
2563
2564 if (Size != 64) {
2565 I.setDesc(TII.get(Opcode));
2566 I.addImplicitDefUseOperands(*MF);
2567 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2568 }
2569
2570 const DebugLoc &DL = I.getDebugLoc();
2571
2572 APInt Imm(Size, I.getOperand(1).getImm());
2573
2574 MachineInstr *ResInst;
2575 if (IsSgpr && TII.isInlineConstant(Imm)) {
2576 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2577 .addImm(I.getOperand(1).getImm());
2578 } else {
2579 const TargetRegisterClass *RC = IsSgpr ?
2580 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2581 Register LoReg = MRI->createVirtualRegister(RC);
2582 Register HiReg = MRI->createVirtualRegister(RC);
2583
2584 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2585 .addImm(Imm.trunc(32).getZExtValue());
2586
2587 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2588 .addImm(Imm.ashr(32).getZExtValue());
2589
2590 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2591 .addReg(LoReg)
2592 .addImm(AMDGPU::sub0)
2593 .addReg(HiReg)
2594 .addImm(AMDGPU::sub1);
2595 }
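// Worked example (illustrative constant, not from the source): materializing
// the 64-bit SGPR constant 0x123456789abcdef0 emits S_MOV_B32 of 0x9abcdef0
// into LoReg (Imm.trunc(32)), S_MOV_B32 of 0x12345678 into HiReg
// (Imm.ashr(32)), and a REG_SEQUENCE placing them in sub0 and sub1.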
2596
2597 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2598 // work for target independent opcodes
2599 I.eraseFromParent();
2600 const TargetRegisterClass *DstRC =
2601 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2602 if (!DstRC)
2603 return true;
2604 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2605}
2606
2607bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2608 // Only manually handle the f64 SGPR case.
2609 //
2610 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2611 // the bit ops theoretically have a second result due to the implicit def of
2612 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2613 // that is easy by disabling the check. The result works, but uses a
2614 // nonsensical sreg32orlds_and_sreg_1 regclass.
2615 //
2616 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32
2617 // results to the variadic REG_SEQUENCE operands.
2618
2619 Register Dst = MI.getOperand(0).getReg();
2620 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2621 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2622 MRI->getType(Dst) != LLT::scalar(64))
2623 return false;
2624
2625 Register Src = MI.getOperand(1).getReg();
2626 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2627 if (Fabs)
2628 Src = Fabs->getOperand(1).getReg();
2629
2630 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2631 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2632 return false;
2633
2634 MachineBasicBlock *BB = MI.getParent();
2635 const DebugLoc &DL = MI.getDebugLoc();
2636 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2637 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2638 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2639 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2640
2641 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2642 .addReg(Src, 0, AMDGPU::sub0);
2643 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2644 .addReg(Src, 0, AMDGPU::sub1);
2645 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2646 .addImm(0x80000000);
2647
2648 // Set or toggle sign bit.
2649 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2650 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2651 .addReg(HiReg)
2652 .addReg(ConstReg)
2653 .setOperandDead(3); // Dead scc
2654 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2655 .addReg(LoReg)
2656 .addImm(AMDGPU::sub0)
2657 .addReg(OpReg)
2658 .addImm(AMDGPU::sub1);
2659 MI.eraseFromParent();
2660 return true;
2661}
2662
2663// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2664bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2665 Register Dst = MI.getOperand(0).getReg();
2666 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2667 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2668 MRI->getType(Dst) != LLT::scalar(64))
2669 return false;
2670
2671 Register Src = MI.getOperand(1).getReg();
2672 MachineBasicBlock *BB = MI.getParent();
2673 const DebugLoc &DL = MI.getDebugLoc();
2674 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2675 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2676 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2677 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2678
2679 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2680 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2681 return false;
2682
2683 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2684 .addReg(Src, 0, AMDGPU::sub0);
2685 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2686 .addReg(Src, 0, AMDGPU::sub1);
2687 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2688 .addImm(0x7fffffff);
2689
2690 // Clear sign bit.
2691 // TODO: Should this use S_BITSET0_*?
2692 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2693 .addReg(HiReg)
2694 .addReg(ConstReg)
2695 .setOperandDead(3); // Dead scc
2696 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2697 .addReg(LoReg)
2698 .addImm(AMDGPU::sub0)
2699 .addReg(OpReg)
2700 .addImm(AMDGPU::sub1);
2701
2702 MI.eraseFromParent();
2703 return true;
2704}
2705
2706static bool isConstant(const MachineInstr &MI) {
2707 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2708}
2709
2710void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2711 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2712
2713 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2714 const MachineInstr *PtrMI =
2715 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2716
2717 assert(PtrMI);
2718
2719 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2720 return;
2721
2722 GEPInfo GEPInfo;
2723
2724 for (unsigned i = 1; i != 3; ++i) {
2725 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2726 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2727 assert(OpDef);
2728 if (i == 2 && isConstant(*OpDef)) {
2729 // TODO: Could handle constant base + variable offset, but a combine
2730 // probably should have commuted it.
2731 assert(GEPInfo.Imm == 0);
2732 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2733 continue;
2734 }
2735 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2736 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2737 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2738 else
2739 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2740 }
2741
2742 AddrInfo.push_back(GEPInfo);
2743 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2744}
2745
2746bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2747 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2748}
2749
2750bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2751 if (!MI.hasOneMemOperand())
2752 return false;
2753
2754 const MachineMemOperand *MMO = *MI.memoperands_begin();
2755 const Value *Ptr = MMO->getValue();
2756
2757 // UndefValue means this is a load of a kernel input. These are uniform.
2758 // Sometimes LDS instructions have constant pointers.
2759 // If Ptr is null, then that means this mem operand contains a
2760 // PseudoSourceValue like GOT.
2761 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2762 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2763 return true;
2764
2765 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2766 return true;
2767
2768 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2769 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2770 AMDGPU::SGPRRegBankID;
2771
2772 const Instruction *I = dyn_cast<Instruction>(Ptr);
2773 return I && I->getMetadata("amdgpu.uniform");
2774}
2775
2776bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2777 for (const GEPInfo &GEPInfo : AddrInfo) {
2778 if (!GEPInfo.VgprParts.empty())
2779 return true;
2780 }
2781 return false;
2782}
2783
2784void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2785 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2786 unsigned AS = PtrTy.getAddressSpace();
2787 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2788 STI.ldsRequiresM0Init()) {
2789 MachineBasicBlock *BB = I.getParent();
2790
2791 // If DS instructions require M0 initialization, insert it before selecting.
2792 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2793 .addImm(-1);
2794 }
2795}
2796
2797bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2798 MachineInstr &I) const {
2799 initM0(I);
2800 return selectImpl(I, *CoverageInfo);
2801}
2802
2803 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2804 if (Reg.isPhysical())
2805 return false;
2806
2807 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2808 const unsigned Opcode = MI.getOpcode();
2809
2810 if (Opcode == AMDGPU::COPY)
2811 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2812
2813 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2814 Opcode == AMDGPU::G_XOR)
2815 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2816 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2817
2818 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2819 return GI->is(Intrinsic::amdgcn_class);
2820
2821 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2822}
2823
2824bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2825 MachineBasicBlock *BB = I.getParent();
2826 MachineOperand &CondOp = I.getOperand(0);
2827 Register CondReg = CondOp.getReg();
2828 const DebugLoc &DL = I.getDebugLoc();
2829
2830 unsigned BrOpcode;
2831 Register CondPhysReg;
2832 const TargetRegisterClass *ConstrainRC;
2833
2834 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2835 // whether the branch is uniform when selecting the instruction. In
2836 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2837 // RegBankSelect knows what it's doing if the branch condition is scc, even
2838 // though it currently does not.
2839 if (!isVCC(CondReg, *MRI)) {
2840 if (MRI->getType(CondReg) != LLT::scalar(32))
2841 return false;
2842
2843 CondPhysReg = AMDGPU::SCC;
2844 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2845 ConstrainRC = &AMDGPU::SReg_32RegClass;
2846 } else {
2847 // FIXME: Should scc->vcc copies and with exec?
2848
2849 // Unless the value of CondReg is a result of a V_CMP* instruction then we
2850 // need to insert an and with exec.
2851 if (!isVCmpResult(CondReg, *MRI)) {
2852 const bool Is64 = STI.isWave64();
2853 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2854 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2855
2856 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2857 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2858 .addReg(CondReg)
2859 .addReg(Exec)
2860 .setOperandDead(3); // Dead scc
2861 CondReg = TmpReg;
2862 }
2863
2864 CondPhysReg = TRI.getVCC();
2865 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2866 ConstrainRC = TRI.getBoolRC();
2867 }
2868
2869 if (!MRI->getRegClassOrNull(CondReg))
2870 MRI->setRegClass(CondReg, ConstrainRC);
2871
2872 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2873 .addReg(CondReg);
2874 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2875 .addMBB(I.getOperand(1).getMBB());
2876
2877 I.eraseFromParent();
2878 return true;
2879}
2880
2881bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2882 MachineInstr &I) const {
2883 Register DstReg = I.getOperand(0).getReg();
2884 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2885 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2886 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2887 if (IsVGPR)
2888 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2889
2890 return RBI.constrainGenericRegister(
2891 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2892}
2893
2894bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2895 Register DstReg = I.getOperand(0).getReg();
2896 Register SrcReg = I.getOperand(1).getReg();
2897 Register MaskReg = I.getOperand(2).getReg();
2898 LLT Ty = MRI->getType(DstReg);
2899 LLT MaskTy = MRI->getType(MaskReg);
2900 MachineBasicBlock *BB = I.getParent();
2901 const DebugLoc &DL = I.getDebugLoc();
2902
2903 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2904 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2905 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2906 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2907 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2908 return false;
2909
2910 // Try to avoid emitting a bit operation when we only need to touch half of
2911 // the 64-bit pointer.
2912 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2913 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2914 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2915
2916 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2917 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
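// For instance (illustrative), aligning a 64-bit pointer down to 256 bytes
// uses the constant mask 0xffffffffffffff00: the known-ones analysis sees the
// entire high dword set, so CanCopyHi32 is true, the high half becomes a
// plain COPY, and only the low half needs an AND below.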
2918
2919 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2920 !CanCopyLow32 && !CanCopyHi32) {
2921 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2922 .addReg(SrcReg)
2923 .addReg(MaskReg)
2924 .setOperandDead(3); // Dead scc
2925 I.eraseFromParent();
2926 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2927 }
2928
2929 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2930 const TargetRegisterClass &RegRC
2931 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2932
2933 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2934 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2935 const TargetRegisterClass *MaskRC =
2936 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2937
2938 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2939 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2940 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2941 return false;
2942
2943 if (Ty.getSizeInBits() == 32) {
2944 assert(MaskTy.getSizeInBits() == 32 &&
2945 "ptrmask should have been narrowed during legalize");
2946
2947 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2948 .addReg(SrcReg)
2949 .addReg(MaskReg);
2950
2951 if (!IsVGPR)
2952 NewOp.setOperandDead(3); // Dead scc
2953 I.eraseFromParent();
2954 return true;
2955 }
2956
2957 Register HiReg = MRI->createVirtualRegister(&RegRC);
2958 Register LoReg = MRI->createVirtualRegister(&RegRC);
2959
2960 // Extract the subregisters from the source pointer.
2961 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2962 .addReg(SrcReg, 0, AMDGPU::sub0);
2963 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2964 .addReg(SrcReg, 0, AMDGPU::sub1);
2965
2966 Register MaskedLo, MaskedHi;
2967
2968 if (CanCopyLow32) {
2969 // If all the bits in the low half are 1, we only need a copy for it.
2970 MaskedLo = LoReg;
2971 } else {
2972 // Extract the mask subregister and apply the and.
2973 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2974 MaskedLo = MRI->createVirtualRegister(&RegRC);
2975
2976 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2977 .addReg(MaskReg, 0, AMDGPU::sub0);
2978 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2979 .addReg(LoReg)
2980 .addReg(MaskLo);
2981 }
2982
2983 if (CanCopyHi32) {
2984 // If all the bits in the high half are 1, we only need a copy for it.
2985 MaskedHi = HiReg;
2986 } else {
2987 Register MaskHi = MRI->createVirtualRegister(&RegRC);
2988 MaskedHi = MRI->createVirtualRegister(&RegRC);
2989
2990 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2991 .addReg(MaskReg, 0, AMDGPU::sub1);
2992 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2993 .addReg(HiReg)
2994 .addReg(MaskHi);
2995 }
2996
2997 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2998 .addReg(MaskedLo)
2999 .addImm(AMDGPU::sub0)
3000 .addReg(MaskedHi)
3001 .addImm(AMDGPU::sub1);
3002 I.eraseFromParent();
3003 return true;
3004}
3005
3006/// Return the register to use for the index value, and the subregister to use
3007/// for the indirectly accessed register.
3008static std::pair<Register, unsigned>
3010 const TargetRegisterClass *SuperRC, Register IdxReg,
3011 unsigned EltSize, GISelKnownBits &KnownBits) {
3012 Register IdxBaseReg;
3013 int Offset;
3014
3015 std::tie(IdxBaseReg, Offset) =
3016 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3017 if (IdxBaseReg == AMDGPU::NoRegister) {
3018 // This will happen if the index is a known constant. This should ordinarily
3019 // be legalized out, but handle it as a register just in case.
3020 assert(Offset == 0);
3021 IdxBaseReg = IdxReg;
3022 }
3023
3024 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3025
3026 // Skip out of bounds offsets, or else we would end up using an undefined
3027 // register.
3028 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3029 return std::pair(IdxReg, SubRegs[0]);
3030 return std::pair(IdxBaseReg, SubRegs[Offset]);
3031}
3032
3033bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3034 MachineInstr &MI) const {
3035 Register DstReg = MI.getOperand(0).getReg();
3036 Register SrcReg = MI.getOperand(1).getReg();
3037 Register IdxReg = MI.getOperand(2).getReg();
3038
3039 LLT DstTy = MRI->getType(DstReg);
3040 LLT SrcTy = MRI->getType(SrcReg);
3041
3042 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3043 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3044 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3045
3046 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3047 // into a waterfall loop.
3048 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3049 return false;
3050
3051 const TargetRegisterClass *SrcRC =
3052 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3053 const TargetRegisterClass *DstRC =
3054 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3055 if (!SrcRC || !DstRC)
3056 return false;
3057 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3058 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3059 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3060 return false;
3061
3062 MachineBasicBlock *BB = MI.getParent();
3063 const DebugLoc &DL = MI.getDebugLoc();
3064 const bool Is64 = DstTy.getSizeInBits() == 64;
3065
3066 unsigned SubReg;
3067 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3068 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3069
3070 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3071 if (DstTy.getSizeInBits() != 32 && !Is64)
3072 return false;
3073
3074 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3075 .addReg(IdxReg);
3076
3077 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3078 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3079 .addReg(SrcReg, 0, SubReg)
3080 .addReg(SrcReg, RegState::Implicit);
3081 MI.eraseFromParent();
3082 return true;
3083 }
3084
3085 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3086 return false;
3087
3088 if (!STI.useVGPRIndexMode()) {
3089 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3090 .addReg(IdxReg);
3091 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3092 .addReg(SrcReg, 0, SubReg)
3093 .addReg(SrcReg, RegState::Implicit);
3094 MI.eraseFromParent();
3095 return true;
3096 }
3097
3098 const MCInstrDesc &GPRIDXDesc =
3099 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3100 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3101 .addReg(SrcReg)
3102 .addReg(IdxReg)
3103 .addImm(SubReg);
3104
3105 MI.eraseFromParent();
3106 return true;
3107}
3108
3109// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3110bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3111 MachineInstr &MI) const {
3112 Register DstReg = MI.getOperand(0).getReg();
3113 Register VecReg = MI.getOperand(1).getReg();
3114 Register ValReg = MI.getOperand(2).getReg();
3115 Register IdxReg = MI.getOperand(3).getReg();
3116
3117 LLT VecTy = MRI->getType(DstReg);
3118 LLT ValTy = MRI->getType(ValReg);
3119 unsigned VecSize = VecTy.getSizeInBits();
3120 unsigned ValSize = ValTy.getSizeInBits();
3121
3122 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3123 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3124 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3125
3126 assert(VecTy.getElementType() == ValTy);
3127
3128 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3129 // into a waterfall loop.
3130 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3131 return false;
3132
3133 const TargetRegisterClass *VecRC =
3134 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3135 const TargetRegisterClass *ValRC =
3136 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3137
3138 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3139 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3140 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3141 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3142 return false;
3143
3144 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3145 return false;
3146
3147 unsigned SubReg;
3148 std::tie(IdxReg, SubReg) =
3149 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3150
3151 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3152 STI.useVGPRIndexMode();
3153
3154 MachineBasicBlock *BB = MI.getParent();
3155 const DebugLoc &DL = MI.getDebugLoc();
3156
3157 if (!IndexMode) {
3158 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3159 .addReg(IdxReg);
3160
3161 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3162 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3163 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3164 .addReg(VecReg)
3165 .addReg(ValReg)
3166 .addImm(SubReg);
3167 MI.eraseFromParent();
3168 return true;
3169 }
3170
3171 const MCInstrDesc &GPRIDXDesc =
3172 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3173 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3174 .addReg(VecReg)
3175 .addReg(ValReg)
3176 .addReg(IdxReg)
3177 .addImm(SubReg);
3178
3179 MI.eraseFromParent();
3180 return true;
3181}
3182
3183bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3185 unsigned Opc;
3186 unsigned Size = MI.getOperand(3).getImm();
3187
3188 // The struct intrinsic variants add one additional operand over raw.
3189 const bool HasVIndex = MI.getNumOperands() == 9;
3190 Register VIndex;
3191 int OpOffset = 0;
3192 if (HasVIndex) {
3193 VIndex = MI.getOperand(4).getReg();
3194 OpOffset = 1;
3195 }
3196
3197 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3198 std::optional<ValueAndVReg> MaybeVOffset =
3199 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3200 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3201
3202 switch (Size) {
3203 default:
3204 return false;
3205 case 1:
3206 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3207 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3208 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3209 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3210 break;
3211 case 2:
3212 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3213 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3214 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3215 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3216 break;
3217 case 4:
3218 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3219 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3220 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3221 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3222 break;
3223 }
3224
3225 MachineBasicBlock *MBB = MI.getParent();
3226 const DebugLoc &DL = MI.getDebugLoc();
3227 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3228 .add(MI.getOperand(2));
3229
3230 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3231
3232 if (HasVIndex && HasVOffset) {
3233 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3234 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3235 .addReg(VIndex)
3236 .addImm(AMDGPU::sub0)
3237 .addReg(VOffset)
3238 .addImm(AMDGPU::sub1);
3239
3240 MIB.addReg(IdxReg);
3241 } else if (HasVIndex) {
3242 MIB.addReg(VIndex);
3243 } else if (HasVOffset) {
3244 MIB.addReg(VOffset);
3245 }
3246
3247 MIB.add(MI.getOperand(1)); // rsrc
3248 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3249 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3250 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3251 MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3252 MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3253
3254 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3255 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3256 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3257 MachinePointerInfo StorePtrI = LoadPtrI;
3258 StorePtrI.V = nullptr;
3259 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3260
3261 auto F = LoadMMO->getFlags() &
3262 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3263 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3264 Size, LoadMMO->getBaseAlign());
3265
3266 MachineMemOperand *StoreMMO =
3267 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3268 sizeof(int32_t), LoadMMO->getBaseAlign());
3269
3270 MIB.setMemRefs({LoadMMO, StoreMMO});
3271
3272 MI.eraseFromParent();
3273 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3274}
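// Note on selectBufferLoadLds above: M0 carries the LDS destination address
// (operand 2), the MUBUF opcode is chosen by the element size and by whether a
// vindex and/or voffset are present (BOTHEN/IDXEN/OFFEN/OFFSET), and the
// selected instruction carries both a load memop (buffer source) and a store
// memop (LDS destination).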
3275
3276/// Match a zero extend from a 32-bit value to 64-bits.
3277static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3278 Register ZExtSrc;
3279 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3280 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3281
3282 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3283 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3284 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3285 return Register();
3286
3287 assert(Def->getNumOperands() == 3 &&
3288 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3289 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3290 return Def->getOperand(1).getReg();
3291 }
3292
3293 return Register();
3294}
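// For example, both of these forms yield %x:
//   %r:_(s64) = G_ZEXT %x:_(s32)
//   %r:_(s64) = G_MERGE_VALUES %x:_(s32), 0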
3295
3296bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3297 unsigned Opc;
3298 unsigned Size = MI.getOperand(3).getImm();
3299
3300 switch (Size) {
3301 default:
3302 return false;
3303 case 1:
3304 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3305 break;
3306 case 2:
3307 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3308 break;
3309 case 4:
3310 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3311 break;
3312 }
3313
3314 MachineBasicBlock *MBB = MI.getParent();
3315 const DebugLoc &DL = MI.getDebugLoc();
3316 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3317 .add(MI.getOperand(2));
3318
3319 Register Addr = MI.getOperand(1).getReg();
3320 Register VOffset;
3321 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3322 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3323 if (!isSGPR(Addr)) {
3324 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3325 if (isSGPR(AddrDef->Reg)) {
3326 Addr = AddrDef->Reg;
3327 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3328 Register SAddr =
3329 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3330 if (isSGPR(SAddr)) {
3331 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3332 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3333 Addr = SAddr;
3334 VOffset = Off;
3335 }
3336 }
3337 }
3338 }
3339
3340 if (isSGPR(Addr)) {
3341 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3342 if (!VOffset) {
3343 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3344 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3345 .addImm(0);
3346 }
3347 }
3348
3349 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3350 .addReg(Addr);
3351
3352 if (isSGPR(Addr))
3353 MIB.addReg(VOffset);
3354
3355 MIB.add(MI.getOperand(4)) // offset
3356 .add(MI.getOperand(5)); // cpol
3357
3358 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3359 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3360 LoadPtrI.Offset = MI.getOperand(4).getImm();
3361 MachinePointerInfo StorePtrI = LoadPtrI;
3362 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3363 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3364 auto F = LoadMMO->getFlags() &
3365 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3366 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3367 Size, LoadMMO->getBaseAlign());
3368 MachineMemOperand *StoreMMO =
3369 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3370 sizeof(int32_t), Align(4));
3371
3372 MIB.setMemRefs({LoadMMO, StoreMMO});
3373
3374 MI.eraseFromParent();
3375 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3376}
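// Note on selectGlobalLoadLds above: when the address is not already an SGPR,
// a (G_PTR_ADD sgpr_base, zext(s32)) pattern is split into the SADDR form; the
// SADDR form still expects a VGPR offset operand, so a zero voffset is
// materialized with V_MOV_B32 when none was matched.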
3377
3378bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
3379 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3380 MI.removeOperand(1);
3381 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3382 return true;
3383}
3384
3385bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3386 unsigned Opc;
3387 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3388 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3389 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3390 break;
3391 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3392 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3393 break;
3394 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3395 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3396 break;
3397 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3398 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3399 break;
3400 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3401 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3402 break;
3403 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3404 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3405 break;
3406 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3407 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3408 break;
3409 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3410 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3411 break;
3412 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3413 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3414 break;
3415 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3416 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3417 break;
3418 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3419 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3420 break;
3421 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3422 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3423 break;
3424 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3425 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3426 break;
3427 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3428 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3429 break;
3430 default:
3431 llvm_unreachable("unhandled smfmac intrinsic");
3432 }
3433
3434 auto VDst_In = MI.getOperand(4);
3435
3436 MI.setDesc(TII.get(Opc));
3437 MI.removeOperand(4); // VDst_In
3438 MI.removeOperand(1); // Intrinsic ID
3439 MI.addOperand(VDst_In); // Readd VDst_In to the end
3440 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3441 return true;
3442}
3443
3444bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3445 Register DstReg = MI.getOperand(0).getReg();
3446 Register SrcReg = MI.getOperand(1).getReg();
3447 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3448 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3449 MachineBasicBlock *MBB = MI.getParent();
3450 const DebugLoc &DL = MI.getDebugLoc();
3451
3452 if (IsVALU) {
3453 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3454 .addImm(Subtarget->getWavefrontSizeLog2())
3455 .addReg(SrcReg);
3456 } else {
3457 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3458 .addReg(SrcReg)
3459 .addImm(Subtarget->getWavefrontSizeLog2())
3460 .setOperandDead(3); // Dead scc
3461 }
3462
3463 const TargetRegisterClass &RC =
3464 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3465 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3466 return false;
3467
3468 MI.eraseFromParent();
3469 return true;
3470}
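// Note on selectWaveAddress above: the wave-level SGPR base is the source
// shifted right by log2(wavefront size), e.g. a shift amount of 6 for wave64,
// using a VALU or SALU shift depending on the destination register bank.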
3471
3472bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3473 Register SrcReg = MI.getOperand(0).getReg();
3474 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3475 return false;
3476
3477 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3478 Register SP =
3479 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3480 Register WaveAddr = getWaveAddress(DefMI);
3481 MachineBasicBlock *MBB = MI.getParent();
3482 const DebugLoc &DL = MI.getDebugLoc();
3483
3484 if (!WaveAddr) {
3485 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3486 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3487 .addReg(SrcReg)
3488 .addImm(Subtarget->getWavefrontSizeLog2())
3489 .setOperandDead(3); // Dead scc
3490 }
3491
3492 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3493 .addReg(WaveAddr);
3494
3495 MI.eraseFromParent();
3496 return true;
3497}
3498
3499bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3500
3501 if (!I.isPreISelOpcode()) {
3502 if (I.isCopy())
3503 return selectCOPY(I);
3504 return true;
3505 }
3506
3507 switch (I.getOpcode()) {
3508 case TargetOpcode::G_AND:
3509 case TargetOpcode::G_OR:
3510 case TargetOpcode::G_XOR:
3511 if (selectImpl(I, *CoverageInfo))
3512 return true;
3513 return selectG_AND_OR_XOR(I);
3514 case TargetOpcode::G_ADD:
3515 case TargetOpcode::G_SUB:
3516 case TargetOpcode::G_PTR_ADD:
3517 if (selectImpl(I, *CoverageInfo))
3518 return true;
3519 return selectG_ADD_SUB(I);
3520 case TargetOpcode::G_UADDO:
3521 case TargetOpcode::G_USUBO:
3522 case TargetOpcode::G_UADDE:
3523 case TargetOpcode::G_USUBE:
3524 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3525 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3526 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3527 return selectG_AMDGPU_MAD_64_32(I);
3528 case TargetOpcode::G_INTTOPTR:
3529 case TargetOpcode::G_BITCAST:
3530 case TargetOpcode::G_PTRTOINT:
3531 case TargetOpcode::G_FREEZE:
3532 return selectCOPY(I);
3533 case TargetOpcode::G_CONSTANT:
3534 case TargetOpcode::G_FCONSTANT:
3535 return selectG_CONSTANT(I);
3536 case TargetOpcode::G_FNEG:
3537 if (selectImpl(I, *CoverageInfo))
3538 return true;
3539 return selectG_FNEG(I);
3540 case TargetOpcode::G_FABS:
3541 if (selectImpl(I, *CoverageInfo))
3542 return true;
3543 return selectG_FABS(I);
3544 case TargetOpcode::G_EXTRACT:
3545 return selectG_EXTRACT(I);
3546 case TargetOpcode::G_MERGE_VALUES:
3547 case TargetOpcode::G_CONCAT_VECTORS:
3548 return selectG_MERGE_VALUES(I);
3549 case TargetOpcode::G_UNMERGE_VALUES:
3550 return selectG_UNMERGE_VALUES(I);
3551 case TargetOpcode::G_BUILD_VECTOR:
3552 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3553 return selectG_BUILD_VECTOR(I);
3554 case TargetOpcode::G_IMPLICIT_DEF:
3555 return selectG_IMPLICIT_DEF(I);
3556 case TargetOpcode::G_INSERT:
3557 return selectG_INSERT(I);
3558 case TargetOpcode::G_INTRINSIC:
3559 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3560 return selectG_INTRINSIC(I);
3561 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3562 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3563 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3564 case TargetOpcode::G_ICMP:
3565 case TargetOpcode::G_FCMP:
3566 if (selectG_ICMP_or_FCMP(I))
3567 return true;
3568 return selectImpl(I, *CoverageInfo);
3569 case TargetOpcode::G_LOAD:
3570 case TargetOpcode::G_STORE:
3571 case TargetOpcode::G_ATOMIC_CMPXCHG:
3572 case TargetOpcode::G_ATOMICRMW_XCHG:
3573 case TargetOpcode::G_ATOMICRMW_ADD:
3574 case TargetOpcode::G_ATOMICRMW_SUB:
3575 case TargetOpcode::G_ATOMICRMW_AND:
3576 case TargetOpcode::G_ATOMICRMW_OR:
3577 case TargetOpcode::G_ATOMICRMW_XOR:
3578 case TargetOpcode::G_ATOMICRMW_MIN:
3579 case TargetOpcode::G_ATOMICRMW_MAX:
3580 case TargetOpcode::G_ATOMICRMW_UMIN:
3581 case TargetOpcode::G_ATOMICRMW_UMAX:
3582 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3583 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3584 case TargetOpcode::G_ATOMICRMW_FADD:
3585 case TargetOpcode::G_ATOMICRMW_FMIN:
3586 case TargetOpcode::G_ATOMICRMW_FMAX:
3587 return selectG_LOAD_STORE_ATOMICRMW(I);
3588 case TargetOpcode::G_SELECT:
3589 return selectG_SELECT(I);
3590 case TargetOpcode::G_TRUNC:
3591 return selectG_TRUNC(I);
3592 case TargetOpcode::G_SEXT:
3593 case TargetOpcode::G_ZEXT:
3594 case TargetOpcode::G_ANYEXT:
3595 case TargetOpcode::G_SEXT_INREG:
3596 // This is a workaround. For extension from type i1, `selectImpl()` uses
3597 // patterns from the TD file and generates an illegal VGPR to SGPR COPY, as
3598 // type i1 can only be held in an SGPR class.
3599 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3600 selectImpl(I, *CoverageInfo))
3601 return true;
3602 return selectG_SZA_EXT(I);
3603 case TargetOpcode::G_FPEXT:
3604 if (selectG_FPEXT(I))
3605 return true;
3606 return selectImpl(I, *CoverageInfo);
3607 case TargetOpcode::G_BRCOND:
3608 return selectG_BRCOND(I);
3609 case TargetOpcode::G_GLOBAL_VALUE:
3610 return selectG_GLOBAL_VALUE(I);
3611 case TargetOpcode::G_PTRMASK:
3612 return selectG_PTRMASK(I);
3613 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3614 return selectG_EXTRACT_VECTOR_ELT(I);
3615 case TargetOpcode::G_INSERT_VECTOR_ELT:
3616 return selectG_INSERT_VECTOR_ELT(I);
3617 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3618 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3619 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3620 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3621 const AMDGPU::ImageDimIntrinsicInfo *Intr =
3622 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
3623 assert(Intr && "not an image intrinsic with image pseudo");
3624 return selectImageIntrinsic(I, Intr);
3625 }
3626 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3627 return selectBVHIntrinsic(I);
3628 case AMDGPU::G_SBFX:
3629 case AMDGPU::G_UBFX:
3630 return selectG_SBFX_UBFX(I);
3631 case AMDGPU::G_SI_CALL:
3632 I.setDesc(TII.get(AMDGPU::SI_CALL));
3633 return true;
3634 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3635 return selectWaveAddress(I);
3636 case AMDGPU::G_STACKRESTORE:
3637 return selectStackRestore(I);
3638 case AMDGPU::G_PHI:
3639 return selectPHI(I);
3640 default:
3641 return selectImpl(I, *CoverageInfo);
3642 }
3643 return false;
3644}
3645
3647AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3648 return {{
3649 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3650 }};
3651
3652}
3653
3654std::pair<Register, unsigned>
3655AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3656 bool IsCanonicalizing,
3657 bool AllowAbs, bool OpSel) const {
3658 Register Src = Root.getReg();
3659 unsigned Mods = 0;
3660 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3661
3662 if (MI->getOpcode() == AMDGPU::G_FNEG) {
3663 Src = MI->getOperand(1).getReg();
3664 Mods |= SISrcMods::NEG;
3665 MI = getDefIgnoringCopies(Src, *MRI);
3666 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3667 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3668 // denormal mode, but we're implicitly canonicalizing in a source operand.
3669 const ConstantFP *LHS =
3670 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3671 if (LHS && LHS->isZero()) {
3672 Mods |= SISrcMods::NEG;
3673 Src = MI->getOperand(2).getReg();
3674 }
3675 }
3676
3677 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3678 Src = MI->getOperand(1).getReg();
3679 Mods |= SISrcMods::ABS;
3680 }
3681
3682 if (OpSel)
3683 Mods |= SISrcMods::OP_SEL_0;
3684
3685 return std::pair(Src, Mods);
3686}
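// For example, selectVOP3ModsImpl folds
//   %a = G_FABS %x
//   %b = G_FNEG %a
// into Src = %x with Mods = SISrcMods::NEG | SISrcMods::ABS (when AllowAbs).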
3687
3688Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3689 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3690 bool ForceVGPR) const {
3691 if ((Mods != 0 || ForceVGPR) &&
3692 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3693
3694 // If we looked through copies to find source modifiers on an SGPR operand,
3695 // we now have an SGPR register source. To avoid potentially violating the
3696 // constant bus restriction, we need to insert a copy to a VGPR.
3697 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3698 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3699 TII.get(AMDGPU::COPY), VGPRSrc)
3700 .addReg(Src);
3701 Src = VGPRSrc;
3702 }
3703
3704 return Src;
3705}
3706
3707///
3708/// This will select either an SGPR or VGPR operand and will save us from
3709/// having to write an extra tablegen pattern.
3711AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3712 return {{
3713 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3714 }};
3715}
3716
3718AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3719 Register Src;
3720 unsigned Mods;
3721 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3722
3723 return {{
3724 [=](MachineInstrBuilder &MIB) {
3725 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3726 },
3727 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3728 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3729 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3730 }};
3731}
3732
3734AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3735 Register Src;
3736 unsigned Mods;
3737 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3738 /*IsCanonicalizing=*/true,
3739 /*AllowAbs=*/false);
3740
3741 return {{
3742 [=](MachineInstrBuilder &MIB) {
3743 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3744 },
3745 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3746 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3747 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3748 }};
3749}
3750
3752AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3753 return {{
3754 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3755 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3756 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3757 }};
3758}
3759
3761AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3762 Register Src;
3763 unsigned Mods;
3764 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3765
3766 return {{
3767 [=](MachineInstrBuilder &MIB) {
3768 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3769 },
3770 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3771 }};
3772}
3773
3775AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3776 MachineOperand &Root) const {
3777 Register Src;
3778 unsigned Mods;
3779 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3780
3781 return {{
3782 [=](MachineInstrBuilder &MIB) {
3783 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3784 },
3785 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3786 }};
3787}
3788
3790AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3791 Register Src;
3792 unsigned Mods;
3793 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3794 /*AllowAbs=*/false);
3795
3796 return {{
3797 [=](MachineInstrBuilder &MIB) {
3798 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3799 },
3800 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3801 }};
3802}
3803
3805AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3806 Register Reg = Root.getReg();
3807 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3808 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3809 return {};
3810 return {{
3811 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3812 }};
3813}
3814
3815std::pair<Register, unsigned>
3816AMDGPUInstructionSelector::selectVOP3PModsImpl(
3817 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3818 unsigned Mods = 0;
3819 MachineInstr *MI = MRI.getVRegDef(Src);
3820
3821 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3822 // It's possible to see an f32 fneg here, but unlikely.
3823 // TODO: Treat f32 fneg as only high bit.
3824 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3825 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3826 Src = MI->getOperand(1).getReg();
3827 MI = MRI.getVRegDef(Src);
3828 }
3829
3830 // TODO: Handle G_FSUB 0 as fneg
3831
3832 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3833 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3834
3835 // Packed instructions do not have abs modifiers.
3836 Mods |= SISrcMods::OP_SEL_1;
3837
3838 return std::pair(Src, Mods);
3839}
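// Note on selectVOP3PModsImpl above: packed (VOP3P) sources have no abs
// modifier, so only a whole-vector G_FNEG of a <2 x s16> value is folded
// (toggling NEG and NEG_HI), and OP_SEL_1 is always set in the returned
// modifiers.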
3840
3842AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3843 MachineRegisterInfo &MRI
3844 = Root.getParent()->getParent()->getParent()->getRegInfo();
3845
3846 Register Src;
3847 unsigned Mods;
3848 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3849
3850 return {{
3851 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3852 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3853 }};
3854}
3855
3857AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3858 MachineRegisterInfo &MRI
3859 = Root.getParent()->getParent()->getParent()->getRegInfo();
3860
3861 Register Src;
3862 unsigned Mods;
3863 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3864
3865 return {{
3866 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3867 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3868 }};
3869}
3870
3872AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
3873 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3874 // Value is in Imm operand as i1 sign extended to int64_t.
3875 // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
3876 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3877 "expected i1 value");
3878 unsigned Mods = SISrcMods::OP_SEL_1;
3879 if (Root.getImm() == -1)
3880 Mods ^= SISrcMods::NEG;
3881 return {{
3882 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3883 }};
3884}
3885
3887AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3888 MachineOperand &Root) const {
3889 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3890 "expected i1 value");
3891 unsigned Mods = SISrcMods::OP_SEL_1;
3892 if (Root.getImm() != 0)
3893 Mods |= SISrcMods::OP_SEL_0;
3894
3895 return {{
3896 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3897 }};
3898}
3899
3900static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
3901 MachineInstr *InsertPt,
3902 MachineRegisterInfo &MRI) {
3903 const TargetRegisterClass *DstRegClass;
3904 switch (Elts.size()) {
3905 case 8:
3906 DstRegClass = &AMDGPU::VReg_256RegClass;
3907 break;
3908 case 4:
3909 DstRegClass = &AMDGPU::VReg_128RegClass;
3910 break;
3911 case 2:
3912 DstRegClass = &AMDGPU::VReg_64RegClass;
3913 break;
3914 default:
3915 llvm_unreachable("unhandled Reg sequence size");
3916 }
3917
3918 MachineIRBuilder B(*InsertPt);
3919 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
3920 .addDef(MRI.createVirtualRegister(DstRegClass));
3921 for (unsigned i = 0; i < Elts.size(); ++i) {
3922 MIB.addReg(Elts[i]);
3923 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
3924 }
3925 return MIB->getOperand(0).getReg();
3926}
3927
3928static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3929 SmallVectorImpl<Register> &Elts, Register &Src,
3930 MachineInstr *InsertPt,
3931 MachineRegisterInfo &MRI) {
3932 if (ModOpcode == TargetOpcode::G_FNEG) {
3933 Mods |= SISrcMods::NEG;
3934 // Check if all elements also have abs modifier
3935 SmallVector<Register, 8> NegAbsElts;
3936 for (auto El : Elts) {
3937 Register FabsSrc;
3938 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
3939 break;
3940 NegAbsElts.push_back(FabsSrc);
3941 }
3942 if (Elts.size() != NegAbsElts.size()) {
3943 // Neg
3944 Src = buildRegSequence(Elts, InsertPt, MRI);
3945 } else {
3946 // Neg and Abs
3947 Mods |= SISrcMods::NEG_HI;
3948 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
3949 }
3950 } else {
3951 assert(ModOpcode == TargetOpcode::G_FABS);
3952 // Abs
3953 Mods |= SISrcMods::NEG_HI;
3954 Src = buildRegSequence(Elts, InsertPt, MRI);
3955 }
3956}
3957
3959AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
3960 Register Src = Root.getReg();
3961 unsigned Mods = SISrcMods::OP_SEL_1;
3962 SmallVector<Register, 8> EltsF32;
3963
3964 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
3965 assert(BV->getNumSources() > 0);
3966 // Based on first element decide which mod we match, neg or abs
3967 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
3968 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
3969 ? AMDGPU::G_FNEG
3970 : AMDGPU::G_FABS;
3971 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
3972 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
3973 if (ElF32->getOpcode() != ModOpcode)
3974 break;
3975 EltsF32.push_back(ElF32->getOperand(1).getReg());
3976 }
3977
3978 // All elements had ModOpcode modifier
3979 if (BV->getNumSources() == EltsF32.size()) {
3980 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
3981 *MRI);
3982 }
3983 }
3984
3985 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3986 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
3987}
3988
3990AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
3991 Register Src = Root.getReg();
3992 unsigned Mods = SISrcMods::OP_SEL_1;
3993 SmallVector<Register, 8> EltsV2F16;
3994
3995 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
3996 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
3997 Register FNegSrc;
3998 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
3999 break;
4000 EltsV2F16.push_back(FNegSrc);
4001 }
4002
4003 // All elements had ModOpcode modifier
4004 if (CV->getNumSources() == EltsV2F16.size()) {
4005 Mods |= SISrcMods::NEG;
4006 Mods |= SISrcMods::NEG_HI;
4007 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4008 }
4009 }
4010
4011 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4012 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4013}
4014
4016AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4017 Register Src = Root.getReg();
4018 unsigned Mods = SISrcMods::OP_SEL_1;
4019 SmallVector<Register, 8> EltsV2F16;
4020
4021 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4022 assert(CV->getNumSources() > 0);
4023 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4024 // Based on first element decide which mod we match, neg or abs
4025 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4026 ? AMDGPU::G_FNEG
4027 : AMDGPU::G_FABS;
4028
4029 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4030 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4031 if (ElV2F16->getOpcode() != ModOpcode)
4032 break;
4033 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4034 }
4035
4036 // All elements had ModOpcode modifier
4037 if (CV->getNumSources() == EltsV2F16.size()) {
4038 MachineIRBuilder B(*Root.getParent());
4039 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4040 *MRI);
4041 }
4042 }
4043
4044 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4045 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4046}
4047
4049AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4050 std::optional<FPValueAndVReg> FPValReg;
4051 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4052 if (TII.isInlineConstant(FPValReg->Value)) {
4053 return {{[=](MachineInstrBuilder &MIB) {
4054 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4055 }}};
4056 }
4057 // Non-inlineable splat floats should not fall-through for integer immediate
4058 // checks.
4059 return {};
4060 }
4061
4062 APInt ICst;
4063 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4064 if (TII.isInlineConstant(ICst)) {
4065 return {
4066 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4067 }
4068 }
4069
4070 return {};
4071}
4072
4074AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4075 Register Src =
4076 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4077 unsigned Key = 0;
4078
4079 Register ShiftSrc;
4080 std::optional<ValueAndVReg> ShiftAmt;
4081 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4082 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4083 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4084 Key = ShiftAmt->Value.getZExtValue() / 8;
4085 Src = ShiftSrc;
4086 }
4087
4088 return {{
4089 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4090 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4091 }};
4092}
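// For example, with a 32-bit source %x, (G_LSHR %x, 16) selects index_key 2
// and (G_LSHR %x, 24) selects index_key 3; any other shape keeps index_key 0
// with the original source.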
4093
4095AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4096
4097 Register Src =
4098 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4099 unsigned Key = 0;
4100
4101 Register ShiftSrc;
4102 std::optional<ValueAndVReg> ShiftAmt;
4103 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4104 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4105 ShiftAmt->Value.getZExtValue() == 16) {
4106 Src = ShiftSrc;
4107 Key = 1;
4108 }
4109
4110 return {{
4111 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4112 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4113 }};
4114}
4115
4117AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4118 Register Src;
4119 unsigned Mods;
4120 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4121
4122 // FIXME: Handle op_sel
4123 return {{
4124 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4125 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4126 }};
4127}
4128
4130AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4131 Register Src;
4132 unsigned Mods;
4133 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4134 /*IsCanonicalizing=*/true,
4135 /*AllowAbs=*/false,
4136 /*OpSel=*/false);
4137
4138 return {{
4139 [=](MachineInstrBuilder &MIB) {
4140 MIB.addReg(
4141 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4142 },
4143 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4144 }};
4145}
4146
4148AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4149 Register Src;
4150 unsigned Mods;
4151 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4152 /*IsCanonicalizing=*/true,
4153 /*AllowAbs=*/false,
4154 /*OpSel=*/true);
4155
4156 return {{
4157 [=](MachineInstrBuilder &MIB) {
4158 MIB.addReg(
4159 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4160 },
4161 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4162 }};
4163}
4164
4165bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4166 Register &Base,
4167 Register *SOffset,
4168 int64_t *Offset) const {
4169 MachineInstr *MI = Root.getParent();
4170 MachineBasicBlock *MBB = MI->getParent();
4171
4172 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4173 // then we can select all ptr + 32-bit offsets.
4174 SmallVector<GEPInfo, 4> AddrInfo;
4175 getAddrModeInfo(*MI, *MRI, AddrInfo);
4176
4177 if (AddrInfo.empty())
4178 return false;
4179
4180 const GEPInfo &GEPI = AddrInfo[0];
4181 std::optional<int64_t> EncodedImm;
4182
4183 if (SOffset && Offset) {
4184 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4185 /*HasSOffset=*/true);
4186 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4187 AddrInfo.size() > 1) {
4188 const GEPInfo &GEPI2 = AddrInfo[1];
4189 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4190 if (Register OffsetReg =
4191 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4192 Base = GEPI2.SgprParts[0];
4193 *SOffset = OffsetReg;
4194 *Offset = *EncodedImm;
4195 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4196 return true;
4197
4198 // For unbuffered smem loads, it is illegal for the Immediate Offset
4199 // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4200 // is negative. Handle the case where the Immediate Offset + SOffset
4201 // is negative.
4202 auto SKnown = KB->getKnownBits(*SOffset);
4203 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4204 return false;
4205
4206 return true;
4207 }
4208 }
4209 }
4210 return false;
4211 }
4212
4213 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4214 /*HasSOffset=*/false);
4215 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4216 Base = GEPI.SgprParts[0];
4217 *Offset = *EncodedImm;
4218 return true;
4219 }
4220
4221 // SGPR offset is unsigned.
4222 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4223 GEPI.Imm != 0) {
4224 // If we make it this far we have a load with a 32-bit immediate offset.
4225 // It is OK to select this using a sgpr offset, because we have already
4226 // failed trying to select this load into one of the _IMM variants since
4227 // the _IMM Patterns are considered before the _SGPR patterns.
4228 Base = GEPI.SgprParts[0];
4229 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4230 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4231 .addImm(GEPI.Imm);
4232 return true;
4233 }
4234
4235 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4236 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4237 Base = GEPI.SgprParts[0];
4238 *SOffset = OffsetReg;
4239 return true;
4240 }
4241 }
4242
4243 return false;
4244}
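// Note on selectSmrdOffset above: the caller picks the addressing form by the
// out-parameters it passes, Offset only for the _IMM form, SOffset only for
// the _SGPR form, or both for the _SGPR_IMM form, which also guards against a
// negative immediate whose sum with the known minimum SOffset would be
// negative.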
4245
4247AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4248 Register Base;
4249 int64_t Offset;
4250 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4251 return std::nullopt;
4252
4253 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4254 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4255}
4256
4258AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4259 SmallVector<GEPInfo, 4> AddrInfo;
4260 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4261
4262 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4263 return std::nullopt;
4264
4265 const GEPInfo &GEPInfo = AddrInfo[0];
4266 Register PtrReg = GEPInfo.SgprParts[0];
4267 std::optional<int64_t> EncodedImm =
4268 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4269 if (!EncodedImm)
4270 return std::nullopt;
4271
4272 return {{
4273 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4274 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4275 }};
4276}
4277
4279AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4280 Register Base, SOffset;
4281 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4282 return std::nullopt;
4283
4284 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4285 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4286}
4287
4289AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4290 Register Base, SOffset;
4291 int64_t Offset;
4292 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4293 return std::nullopt;
4294
4295 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4296 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4297 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4298}
4299
4300std::pair<Register, int>
4301AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4302 uint64_t FlatVariant) const {
4303 MachineInstr *MI = Root.getParent();
4304
4305 auto Default = std::pair(Root.getReg(), 0);
4306
4307 if (!STI.hasFlatInstOffsets())
4308 return Default;
4309
4310 Register PtrBase;
4311 int64_t ConstOffset;
4312 std::tie(PtrBase, ConstOffset) =
4313 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4314
4315 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4316 !isFlatScratchBaseLegal(Root.getReg())))
4317 return Default;
4318
4319 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4320 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4321 return Default;
4322
4323 return std::pair(PtrBase, ConstOffset);
4324}
4325
4327AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4328 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4329
4330 return {{
4331 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4332 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4333 }};
4334}
4335
4337AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4338 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4339
4340 return {{
4341 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4342 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4343 }};
4344}
4345
4347AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4348 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4349
4350 return {{
4351 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4352 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4353 }};
4354}
4355
4356// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4358AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4359 Register Addr = Root.getReg();
4360 Register PtrBase;
4361 int64_t ConstOffset;
4362 int64_t ImmOffset = 0;
4363
4364 // Match the immediate offset first, which canonically is moved as low as
4365 // possible.
4366 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4367
4368 if (ConstOffset != 0) {
4369 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4370 SIInstrFlags::FlatGlobal)) {
4371 Addr = PtrBase;
4372 ImmOffset = ConstOffset;
4373 } else {
4374 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4375 if (isSGPR(PtrBaseDef->Reg)) {
4376 if (ConstOffset > 0) {
4377 // Offset is too large.
4378 //
4379 // saddr + large_offset -> saddr +
4380 // (voffset = large_offset & ~MaxOffset) +
4381 // (large_offset & MaxOffset);
4382 int64_t SplitImmOffset, RemainderOffset;
4383 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4384 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4385
4386 if (isUInt<32>(RemainderOffset)) {
4387 MachineInstr *MI = Root.getParent();
4388 MachineBasicBlock *MBB = MI->getParent();
4389 Register HighBits =
4390 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4391
4392 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4393 HighBits)
4394 .addImm(RemainderOffset);
4395
4396 return {{
4397 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4398 [=](MachineInstrBuilder &MIB) {
4399 MIB.addReg(HighBits);
4400 }, // voffset
4401 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4402 }};
4403 }
4404 }
4405
4406 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4407 // is 1 we would need to perform 1 or 2 extra moves for each half of
4408 // the constant and it is better to do a scalar add and then issue a
4409 // single VALU instruction to materialize zero. Otherwise it takes fewer
4410 // instructions to perform VALU adds with immediates or inline literals.
4411 unsigned NumLiterals =
4412 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4413 !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4414 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4415 return std::nullopt;
4416 }
4417 }
4418 }
4419
4420 // Match the variable offset.
4421 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4422 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4423 // Look through the SGPR->VGPR copy.
4424 Register SAddr =
4425 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4426
4427 if (isSGPR(SAddr)) {
4428 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4429
4430 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4431 // inserted later.
4432 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4433 return {{[=](MachineInstrBuilder &MIB) { // saddr
4434 MIB.addReg(SAddr);
4435 },
4436 [=](MachineInstrBuilder &MIB) { // voffset
4437 MIB.addReg(VOffset);
4438 },
4439 [=](MachineInstrBuilder &MIB) { // offset
4440 MIB.addImm(ImmOffset);
4441 }}};
4442 }
4443 }
4444 }
4445
4446 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4447 // drop this.
4448 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4449 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4450 return std::nullopt;
4451
4452 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4453 // moves required to copy a 64-bit SGPR to VGPR.
4454 MachineInstr *MI = Root.getParent();
4455 MachineBasicBlock *MBB = MI->getParent();
4456 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4457
4458 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4459 .addImm(0);
4460
4461 return {{
4462 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4463 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4464 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4465 }};
4466}
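// The renderers returned above always fill the operands in the order saddr
// (64-bit SGPR base), voffset (32-bit VGPR), and the immediate offset, which
// is the operand order expected by the SADDR forms of global FLAT
// instructions.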
4467
4469AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4470 Register Addr = Root.getReg();
4471 Register PtrBase;
4472 int64_t ConstOffset;
4473 int64_t ImmOffset = 0;
4474
4475 // Match the immediate offset first, which canonically is moved as low as
4476 // possible.
4477 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4478
4479 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4480 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4481 SIInstrFlags::FlatScratch)) {
4482 Addr = PtrBase;
4483 ImmOffset = ConstOffset;
4484 }
4485
4486 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4487 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4488 int FI = AddrDef->MI->getOperand(1).getIndex();
4489 return {{
4490 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4491 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4492 }};
4493 }
4494
4495 Register SAddr = AddrDef->Reg;
4496
4497 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4498 Register LHS = AddrDef->MI->getOperand(1).getReg();
4499 Register RHS = AddrDef->MI->getOperand(2).getReg();
4500 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4501 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4502
4503 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4504 isSGPR(RHSDef->Reg)) {
4505 int FI = LHSDef->MI->getOperand(1).getIndex();
4506 MachineInstr &I = *Root.getParent();
4507 MachineBasicBlock *BB = I.getParent();
4508 const DebugLoc &DL = I.getDebugLoc();
4509 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4510
4511 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4512 .addFrameIndex(FI)
4513 .addReg(RHSDef->Reg)
4514 .setOperandDead(3); // Dead scc
4515 }
4516 }
4517
4518 if (!isSGPR(SAddr))
4519 return std::nullopt;
4520
4521 return {{
4522 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4523 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4524 }};
4525}
4526
4527// Check whether the flat scratch SVS swizzle bug affects this access.
4528bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4529 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4530 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4531 return false;
4532
4533 // The bug affects the swizzling of SVS accesses if there is any carry out
4534 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4535 // voffset to (soffset + inst_offset).
4536 auto VKnown = KB->getKnownBits(VAddr);
4537 auto SKnown = KnownBits::computeForAddSub(
4538 /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KB->getKnownBits(SAddr),
4539 KnownBits::makeConstant(APInt(32, ImmOffset)));
4540 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4541 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4542 return (VMax & 3) + (SMax & 3) >= 4;
4543}
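// For example, if VAddr is known to end in 0b11 and SAddr + ImmOffset is known
// to end in 0b01, the low two bits of the sum carry into bit 2 (3 + 1 >= 4),
// so the SVS form must be rejected on affected subtargets.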
4544
4546AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4547 Register Addr = Root.getReg();
4548 Register PtrBase;
4549 int64_t ConstOffset;
4550 int64_t ImmOffset = 0;
4551
4552 // Match the immediate offset first, which canonically is moved as low as
4553 // possible.
4554 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4555
4556 Register OrigAddr = Addr;
4557 if (ConstOffset != 0 &&
4558 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4559 Addr = PtrBase;
4560 ImmOffset = ConstOffset;
4561 }
4562
4563 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4564 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4565 return std::nullopt;
4566
4567 Register RHS = AddrDef->MI->getOperand(2).getReg();
4568 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4569 return std::nullopt;
4570
4571 Register LHS = AddrDef->MI->getOperand(1).getReg();
4572 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4573
4574 if (OrigAddr != Addr) {
4575 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4576 return std::nullopt;
4577 } else {
4578 if (!isFlatScratchBaseLegalSV(OrigAddr))
4579 return std::nullopt;
4580 }
4581
4582 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4583 return std::nullopt;
4584
4585 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4586 int FI = LHSDef->MI->getOperand(1).getIndex();
4587 return {{
4588 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4589 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4590 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4591 }};
4592 }
4593
4594 if (!isSGPR(LHS))
4595 return std::nullopt;
4596
4597 return {{
4598 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4599 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4600 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4601 }};
4602}
4603
4605AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4606 MachineInstr *MI = Root.getParent();
4607 MachineBasicBlock *MBB = MI->getParent();
4608 MachineFunction *MF = MBB->getParent();
4609 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4610
4611 int64_t Offset = 0;
4612 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4613 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
4614 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4615
4616 // TODO: Should this be inside the render function? The iterator seems to
4617 // move.
4618 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4619 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4620 HighBits)
4621 .addImm(Offset & ~MaxOffset);
4622
4623 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4624 MIB.addReg(Info->getScratchRSrcReg());
4625 },
4626 [=](MachineInstrBuilder &MIB) { // vaddr
4627 MIB.addReg(HighBits);
4628 },
4629 [=](MachineInstrBuilder &MIB) { // soffset
4630 // Use constant zero for soffset and rely on eliminateFrameIndex
4631 // to choose the appropriate frame register if need be.
4632 MIB.addImm(0);
4633 },
4634 [=](MachineInstrBuilder &MIB) { // offset
4635 MIB.addImm(Offset & MaxOffset);
4636 }}};
4637 }
4638
4639 assert(Offset == 0 || Offset == -1);
4640
4641 // Try to fold a frame index directly into the MUBUF vaddr field, and any
4642 // offsets.
4643 std::optional<int> FI;
4644 Register VAddr = Root.getReg();
4645 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4646 Register PtrBase;
4647 int64_t ConstOffset;
4648 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4649 if (ConstOffset != 0) {
4650 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
4651 (!STI.privateMemoryResourceIsRangeChecked() ||
4652 KB->signBitIsZero(PtrBase))) {
4653 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4654 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4655 FI = PtrBaseDef->getOperand(1).getIndex();
4656 else
4657 VAddr = PtrBase;
4658 Offset = ConstOffset;
4659 }
4660 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4661 FI = RootDef->getOperand(1).getIndex();
4662 }
4663 }
4664
4665 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4666 MIB.addReg(Info->getScratchRSrcReg());
4667 },
4668 [=](MachineInstrBuilder &MIB) { // vaddr
4669 if (FI)
4670 MIB.addFrameIndex(*FI);
4671 else
4672 MIB.addReg(VAddr);
4673 },
4674 [=](MachineInstrBuilder &MIB) { // soffset
4675 // Use constant zero for soffset and rely on eliminateFrameIndex
4676 // to choose the appropriate frame register if need be.
4677 MIB.addImm(0);
4678 },
4679 [=](MachineInstrBuilder &MIB) { // offset
4680 MIB.addImm(Offset);
4681 }}};
4682}
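// Note on selectMUBUFScratchOffen above: a pure constant address is split into
// (Offset & ~MaxOffset), materialized into a VGPR and used as vaddr, plus
// (Offset & MaxOffset) as the immediate offset; otherwise a frame index or
// pointer base is folded into vaddr together with any legal constant offset.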
4683
4684bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4685 int64_t Offset) const {
4686 if (!isUInt<16>(Offset))
4687 return false;
4688
4689 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4690 return true;
4691
4692 // On Southern Islands, instructions with a negative base value and an
4693 // offset don't seem to work.
4694 return KB->signBitIsZero(Base);
4695}
4696
4697bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4698 int64_t Offset1,
4699 unsigned Size) const {
4700 if (Offset0 % Size != 0 || Offset1 % Size != 0)
4701 return false;
4702 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4703 return false;
4704
4705 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4706 return true;
4707
4708 // On Southern Islands, instructions with a negative base value and an
4709 // offset don't seem to work.
4710 return KB->signBitIsZero(Base);
4711}
4712
4713// Return whether the operation has NoUnsignedWrap property.
4714static bool isNoUnsignedWrap(MachineInstr *Addr) {
4715 return Addr->getOpcode() == TargetOpcode::G_OR ||
4716 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4717 Addr->getFlag(MachineInstr::NoUWrap));
4718}
4719
4720// Check that the base address of a flat scratch load/store in the form of
4721// `base + offset` is legal to put in an SGPR/VGPR (i.e. unsigned per hardware
4722// requirement). We always treat the first operand as the base address here.
4723bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
4724 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4725
4726 if (isNoUnsignedWrap(AddrMI))
4727 return true;
4728
4729 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4730 // values.
4731 if (STI.hasSignedScratchOffsets())
4732 return true;
4733
4734 Register LHS = AddrMI->getOperand(1).getReg();
4735 Register RHS = AddrMI->getOperand(2).getReg();
4736
4737 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
4738 std::optional<ValueAndVReg> RhsValReg =
4739 getIConstantVRegValWithLookThrough(RHS, *MRI);
4740 // If the immediate offset is negative and within a certain range, the base
4741 // address cannot also be negative. If the base is also negative, the sum
4742 // would be either negative or much larger than the valid range of scratch
4743 // memory a thread can access.
4744 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
4745 RhsValReg->Value.getSExtValue() > -0x40000000)
4746 return true;
4747 }
4748
4749 return KB->signBitIsZero(LHS);
4750}
4751
4752// Check that address values in SGPR/VGPR are legal for flat scratch in the
4753// form of: SGPR + VGPR.
4754bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
4755 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4756
4757 if (isNoUnsignedWrap(AddrMI))
4758 return true;
4759
4760 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4761 // values.
4762 if (STI.hasSignedScratchOffsets())
4763 return true;
4764
4765 Register LHS = AddrMI->getOperand(1).getReg();
4766 Register RHS = AddrMI->getOperand(2).getReg();
4767 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4768}
4769
4770// Check that address values in SGPR/VGPR are legal for flat scratch in the
4771// form of: SGPR + VGPR + Imm.
4772bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
4773 Register Addr) const {
4774 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4775 // values.
4776 if (STI.hasSignedScratchOffsets())
4777 return true;
4778
4779 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4780 Register Base = AddrMI->getOperand(1).getReg();
4781 std::optional<DefinitionAndSourceRegister> BaseDef =
4782 getDefSrcRegIgnoringCopies(Base, *MRI);
4783 std::optional<ValueAndVReg> RHSOffset =
4784 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
4785 assert(RHSOffset);
4786
4787 // If the immediate offset is negative and within a certain range, the base
4788 // address cannot also be negative. If the base is also negative, the sum
4789 // would be either negative or much larger than the valid range of scratch
4790 // memory a thread can access.
4791 if (isNoUnsignedWrap(BaseDef->MI) &&
4792 (isNoUnsignedWrap(AddrMI) ||
4793 (RHSOffset->Value.getSExtValue() < 0 &&
4794 RHSOffset->Value.getSExtValue() > -0x40000000)))
4795 return true;
4796
4797 Register LHS = BaseDef->MI->getOperand(1).getReg();
4798 Register RHS = BaseDef->MI->getOperand(2).getReg();
4799 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4800}
4801
4802bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4803 unsigned ShAmtBits) const {
4804 assert(MI.getOpcode() == TargetOpcode::G_AND);
4805
4806 std::optional<APInt> RHS =
4807 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4808 if (!RHS)
4809 return false;
4810
4811 if (RHS->countr_one() >= ShAmtBits)
4812 return true;
4813
4814 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
4815 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
4816}
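// For example, for a 32-bit shift ShAmtBits is 5, so a mask such as
// (G_AND %amt, 31), or any mask whose low 5 bits are known to be ones, is
// redundant and can be looked through.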
4817
4819AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4820 MachineOperand &Root) const {
4821 Register Reg = Root.getReg();
4822 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4823
4824 std::optional<DefinitionAndSourceRegister> Def =
4825 getDefSrcRegIgnoringCopies(Reg, *MRI);
4826 assert(Def && "this shouldn't be an optional result");
4827 Reg = Def->Reg;
4828
4829 if (Register WaveBase = getWaveAddress(Def->MI)) {
4830 return {{
4831 [=](MachineInstrBuilder &MIB) { // rsrc
4832 MIB.addReg(Info->getScratchRSrcReg());
4833 },
4834 [=](MachineInstrBuilder &MIB) { // soffset
4835 MIB.addReg(WaveBase);
4836 },
4837 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4838 }};
4839 }
4840
4841 int64_t Offset = 0;
4842
4843 // FIXME: Copy check is a hack
4844 Register BasePtr;
4845 if (mi_match(Reg, *MRI,
4846 m_GPtrAdd(m_Reg(BasePtr),
4847 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
4848 if (!TII.isLegalMUBUFImmOffset(Offset))
4849 return {};
4850 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
4851 Register WaveBase = getWaveAddress(BasePtrDef);
4852 if (!WaveBase)
4853 return {};
4854
4855 return {{
4856 [=](MachineInstrBuilder &MIB) { // rsrc
4857 MIB.addReg(Info->getScratchRSrcReg());
4858 },
4859 [=](MachineInstrBuilder &MIB) { // soffset
4860 MIB.addReg(WaveBase);
4861 },
4862 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4863 }};
4864 }
4865
4866 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
4867 !TII.isLegalMUBUFImmOffset(Offset))
4868 return {};
4869
4870 return {{
4871 [=](MachineInstrBuilder &MIB) { // rsrc
4872 MIB.addReg(Info->getScratchRSrcReg());
4873 },
4874 [=](MachineInstrBuilder &MIB) { // soffset
4875 MIB.addImm(0);
4876 },
4877 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4878 }};
4879}
4880
4881std::pair<Register, unsigned>
4882AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4883 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4884 if (!RootDef)
4885 return std::pair(Root.getReg(), 0);
4886
4887 int64_t ConstAddr = 0;
4888
4889 Register PtrBase;
4890 int64_t Offset;
4891 std::tie(PtrBase, Offset) =
4892 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4893
4894 if (Offset) {
4895 if (isDSOffsetLegal(PtrBase, Offset)) {
4896 // (add n0, c0)
4897 return std::pair(PtrBase, Offset);
4898 }
4899 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4900 // TODO
4901
4902
4903 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4904 // TODO
4905
4906 }
4907
4908 return std::pair(Root.getReg(), 0);
4909}
4910
4912AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4913 Register Reg;
4914 unsigned Offset;
4915 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
4916 return {{
4917 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4918 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
4919 }};
4920}
4921
4923AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4924 return selectDSReadWrite2(Root, 4);
4925}
4926
4928AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4929 return selectDSReadWrite2(Root, 8);
4930}
4931
4933AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4934 unsigned Size) const {
4935 Register Reg;
4936 unsigned Offset;
4937 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
4938 return {{
4939 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4940 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4941 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4942 }};
4943}
4944
4945std::pair<Register, unsigned>
4946AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4947 unsigned Size) const {
4948 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4949 if (!RootDef)
4950 return std::pair(Root.getReg(), 0);
4951
4952 int64_t ConstAddr = 0;
4953
4954 Register PtrBase;
4955 int64_t Offset;
4956 std::tie(PtrBase, Offset) =
4957 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4958
4959 if (Offset) {
4960 int64_t OffsetValue0 = Offset;
4961 int64_t OffsetValue1 = Offset + Size;
4962 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
4963 // (add n0, c0)
4964 return std::pair(PtrBase, OffsetValue0 / Size);
4965 }
4966 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4967 // TODO
4968
4969 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4970 // TODO
4971
4972 }
4973
4974 return std::pair(Root.getReg(), 0);
4975}
4976
4977/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4978/// the base value with the constant offset. There may be intervening copies
4979/// between \p Root and the identified constant. Returns \p Root, 0 if this does
4980/// not match the pattern.
4981std::pair<Register, int64_t>
4982AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4983 Register Root, const MachineRegisterInfo &MRI) const {
4984 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4985 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4986 return {Root, 0};
4987
4988 MachineOperand &RHS = RootI->getOperand(2);
4989 std::optional<ValueAndVReg> MaybeOffset =
4990 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4991 if (!MaybeOffset)
4992 return {Root, 0};
4993 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
4994}
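// For illustration, given roughly:
//   %c:_(s32) = G_CONSTANT i32 16
//   %ptr:_(p3) = G_PTR_ADD %base, %c
// this returns {%base, 16}; any root that is not a G_PTR_ADD with a constant
// right-hand side (possibly behind copies) returns {Root, 0}.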
4995
4996 static void addZeroImm(MachineInstrBuilder &MIB) {
4997 MIB.addImm(0);
4998}
4999
5000/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5001/// BasePtr is not valid, a null base pointer will be used.
5002 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5003 uint32_t FormatLo, uint32_t FormatHi,
5004 Register BasePtr) {
5005 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5006 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5007 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5008 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5009
5010 B.buildInstr(AMDGPU::S_MOV_B32)
5011 .addDef(RSrc2)
5012 .addImm(FormatLo);
5013 B.buildInstr(AMDGPU::S_MOV_B32)
5014 .addDef(RSrc3)
5015 .addImm(FormatHi);
5016
5017 // Build the half of the register holding the constants before building the
5018 // full 128-bit register. If we are building multiple resource descriptors,
5019 // this will allow CSEing of the 2-component register.
5020 B.buildInstr(AMDGPU::REG_SEQUENCE)
5021 .addDef(RSrcHi)
5022 .addReg(RSrc2)
5023 .addImm(AMDGPU::sub0)
5024 .addReg(RSrc3)
5025 .addImm(AMDGPU::sub1);
5026
5027 Register RSrcLo = BasePtr;
5028 if (!BasePtr) {
5029 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5030 B.buildInstr(AMDGPU::S_MOV_B64)
5031 .addDef(RSrcLo)
5032 .addImm(0);
5033 }
5034
5035 B.buildInstr(AMDGPU::REG_SEQUENCE)
5036 .addDef(RSrc)
5037 .addReg(RSrcLo)
5038 .addImm(AMDGPU::sub0_sub1)
5039 .addReg(RSrcHi)
5040 .addImm(AMDGPU::sub2_sub3);
5041
5042 return RSrc;
5043}
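// The resulting descriptor layout, roughly: sub0_sub1 holds the 64-bit base
// pointer (or 0 when no BasePtr is supplied) and sub2_sub3 holds the
// FormatLo/FormatHi constants, so the constant half can be CSE'd across
// descriptors as noted above.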
5044
5045 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5046 const SIInstrInfo &TII, Register BasePtr) {
5047 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5048
5049 // FIXME: Why are half the "default" bits ignored based on the addressing
5050 // mode?
5051 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5052}
5053
5054 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5055 const SIInstrInfo &TII, Register BasePtr) {
5056 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5057
5058 // FIXME: Why are half the "default" bits ignored based on the addressing
5059 // mode?
5060 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5061}
5062
5063AMDGPUInstructionSelector::MUBUFAddressData
5064AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5065 MUBUFAddressData Data;
5066 Data.N0 = Src;
5067
5068 Register PtrBase;
5069 int64_t Offset;
5070
5071 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5072 if (isUInt<32>(Offset)) {
5073 Data.N0 = PtrBase;
5074 Data.Offset = Offset;
5075 }
5076
5077 if (MachineInstr *InputAdd
5078 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5079 Data.N2 = InputAdd->getOperand(1).getReg();
5080 Data.N3 = InputAdd->getOperand(2).getReg();
5081
5082 // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted
5083 // FIXME: We don't know that this was defined by operand 0
5084 //
5085 // TODO: Remove this when we have copy folding optimizations after
5086 // RegBankSelect.
5087 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5088 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5089 }
5090
5091 return Data;
5092}
5093
5094/// Return if the addr64 mubuf mode should be used for the given address.
5095bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5096 // (ptr_add N2, N3) -> addr64, or
5097 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5098 if (Addr.N2)
5099 return true;
5100
5101 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5102 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5103}
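// e.g. (ptr_add (ptr_add %sgpr_base, %vgpr_index), 16) fills N2/N3 in
// parseMUBUFAddress, so addr64 mode is used; a uniform SGPR base with only a
// constant offset leaves N2 empty and falls through to the N0 bank check.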
5104
5105/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5106/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5107/// component.
5108void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5109 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5110 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5111 return;
5112
5113 // Illegal offset, store it in soffset.
5114 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5115 B.buildInstr(AMDGPU::S_MOV_B32)
5116 .addDef(SOffset)
5117 .addImm(ImmOffset);
5118 ImmOffset = 0;
5119}
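// e.g. an ImmOffset too large for the MUBUF immediate field is materialized
// into SOffset with S_MOV_B32 and the immediate is reset to 0; offsets that
// already fit are left untouched.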
5120
5121bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5122 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5123 Register &SOffset, int64_t &Offset) const {
5124 // FIXME: Predicates should stop this from reaching here.
5125 // The addr64 bit was removed for Volcanic Islands.
5126 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5127 return false;
5128
5129 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5130 if (!shouldUseAddr64(AddrData))
5131 return false;
5132
5133 Register N0 = AddrData.N0;
5134 Register N2 = AddrData.N2;
5135 Register N3 = AddrData.N3;
5136 Offset = AddrData.Offset;
5137
5138 // Base pointer for the SRD.
5139 Register SRDPtr;
5140
5141 if (N2) {
5142 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5143 assert(N3);
5144 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5145 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5146 // addr64, and construct the default resource from a 0 address.
5147 VAddr = N0;
5148 } else {
5149 SRDPtr = N3;
5150 VAddr = N2;
5151 }
5152 } else {
5153 // N2 is not divergent.
5154 SRDPtr = N2;
5155 VAddr = N3;
5156 }
5157 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5158 // Use the default null pointer in the resource
5159 VAddr = N0;
5160 } else {
5161 // N0 -> offset, or
5162 // (N0 + C1) -> offset
5163 SRDPtr = N0;
5164 }
5165
5166 MachineIRBuilder B(*Root.getParent());
5167 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5168 splitIllegalMUBUFOffset(B, SOffset, Offset);
5169 return true;
5170}
5171
5172bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5173 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5174 int64_t &Offset) const {
5175
5176 // FIXME: Pattern should not reach here.
5177 if (STI.useFlatForGlobal())
5178 return false;
5179
5180 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5181 if (shouldUseAddr64(AddrData))
5182 return false;
5183
5184 // N0 -> offset, or
5185 // (N0 + C1) -> offset
5186 Register SRDPtr = AddrData.N0;
5187 Offset = AddrData.Offset;
5188
5189 // TODO: Look through extensions for 32-bit soffset.
5190 MachineIRBuilder B(*Root.getParent());
5191
5192 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5193 splitIllegalMUBUFOffset(B, SOffset, Offset);
5194 return true;
5195}
5196
5197 InstructionSelector::ComplexRendererFns
5198 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5199 Register VAddr;
5200 Register RSrcReg;
5201 Register SOffset;
5202 int64_t Offset = 0;
5203
5204 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5205 return {};
5206
5207 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5208 // pattern.
5209 return {{
5210 [=](MachineInstrBuilder &MIB) { // rsrc
5211 MIB.addReg(RSrcReg);
5212 },
5213 [=](MachineInstrBuilder &MIB) { // vaddr
5214 MIB.addReg(VAddr);
5215 },
5216 [=](MachineInstrBuilder &MIB) { // soffset
5217 if (SOffset)
5218 MIB.addReg(SOffset);
5219 else if (STI.hasRestrictedSOffset())
5220 MIB.addReg(AMDGPU::SGPR_NULL);
5221 else
5222 MIB.addImm(0);
5223 },
5224 [=](MachineInstrBuilder &MIB) { // offset
5225 MIB.addImm(Offset);
5226 },
5227 addZeroImm, // cpol
5228 addZeroImm, // tfe
5229 addZeroImm // swz
5230 }};
5231}
5232
5233 InstructionSelector::ComplexRendererFns
5234 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5235 Register RSrcReg;
5236 Register SOffset;
5237 int64_t Offset = 0;
5238
5239 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5240 return {};
5241
5242 return {{
5243 [=](MachineInstrBuilder &MIB) { // rsrc
5244 MIB.addReg(RSrcReg);
5245 },
5246 [=](MachineInstrBuilder &MIB) { // soffset
5247 if (SOffset)
5248 MIB.addReg(SOffset);
5249 else if (STI.hasRestrictedSOffset())
5250 MIB.addReg(AMDGPU::SGPR_NULL);
5251 else
5252 MIB.addImm(0);
5253 },
5254 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5255 addZeroImm, // cpol
5256 addZeroImm, // tfe
5257 addZeroImm, // swz
5258 }};
5259}
5260
5261 InstructionSelector::ComplexRendererFns
5262 AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5263
5264 Register SOffset = Root.getReg();
5265
5266 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5267 SOffset = AMDGPU::SGPR_NULL;
5268
5269 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5270}
5271
5272 /// Get an immediate that must be 32 bits and is treated as zero extended.
5273static std::optional<uint64_t>
5274 getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5275 // getIConstantVRegSExtVal sign extends any value, so see if that matters.
5276 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5277 if (!OffsetVal || !isInt<32>(*OffsetVal))
5278 return std::nullopt;
5279 return Lo_32(*OffsetVal);
5280}
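// e.g. a G_CONSTANT i32 -1 comes back from getIConstantVRegSExtVal as -1,
// which passes the isInt<32> check, and Lo_32 then yields 0xffffffff as the
// zero-extended 32-bit immediate.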
5281
5282 InstructionSelector::ComplexRendererFns
5283 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5284 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5285 if (!OffsetVal)
5286 return {};
5287
5288 std::optional<int64_t> EncodedImm =
5289 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5290 if (!EncodedImm)
5291 return {};
5292
5293 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5294}
5295
5296 InstructionSelector::ComplexRendererFns
5297 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5299
5300 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5301 if (!OffsetVal)
5302 return {};
5303
5304 std::optional<int64_t> EncodedImm =
5305 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5306 if (!EncodedImm)
5307 return {};
5308
5309 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5310}
5311
5312 InstructionSelector::ComplexRendererFns
5313 AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5314 // Match the (soffset + offset) pair as a 32-bit register base and
5315 // an immediate offset.
5316 Register SOffset;
5317 unsigned Offset;
5318 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5319 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5320 if (!SOffset)
5321 return std::nullopt;
5322
5323 std::optional<int64_t> EncodedOffset =
5324 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5325 if (!EncodedOffset)
5326 return std::nullopt;
5327
5328 assert(MRI->getType(SOffset) == LLT::scalar(32));
5329 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5330 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5331}
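// e.g. a root computed as (G_ADD %soffset, 64) with the no-unsigned-wrap flag
// is split so %soffset feeds the soffset operand and 64 becomes the encoded
// immediate, provided the subtarget can encode that buffer offset.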
5332
5333// Variant of stripBitCast that returns the instruction instead of a
5334// MachineOperand.
5335 static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
5336 if (MI->getOpcode() == AMDGPU::G_BITCAST)
5337 return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
5338 return MI;
5339}
5340
5341 // Figure out if this is really an extract of the high 16 bits of a dword;
5342 // returns nullptr if it isn't.
5343 static MachineInstr *isExtractHiElt(MachineInstr *Inst,
5344 MachineRegisterInfo &MRI) {
5345 Inst = stripBitCast(Inst, MRI);
5346
5347 if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5348 return nullptr;
5349
5350 MachineInstr *TruncOp =
5351 getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
5352 TruncOp = stripBitCast(TruncOp, MRI);
5353
5354 // G_LSHR x, (G_CONSTANT i32 16)
5355 if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5356 auto SrlAmount = getIConstantVRegValWithLookThrough(
5357 TruncOp->getOperand(2).getReg(), MRI);
5358 if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5359 MachineInstr *SrlOp =
5360 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5361 return stripBitCast(SrlOp, MRI);
5362 }
5363 }
5364
5365 // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
5366 // 1, 0 swaps the low/high 16 bits.
5367 // 1, 1 makes both halves of the result equal to the source's high 16 bits.
5368 // In either case, the result's low element is the source's high half.
5369 if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5370 assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5371 LLT::fixed_vector(2, 16));
5372
5373 ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
5374 assert(Mask.size() == 2);
5375
5376 if (Mask[0] == 1 && Mask[1] <= 1) {
5377 MachineInstr *LHS =
5378 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5379 return stripBitCast(LHS, MRI);
5380 }
5381 }
5382
5383 return nullptr;
5384}
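// For illustration, the common pattern
//   %srl:_(s32) = G_LSHR %x:_(s32), 16
//   %hi:_(s16) = G_TRUNC %srl
// matches here and returns the (bitcast-stripped) definition of %x.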
5385
5386std::pair<Register, unsigned>
5387AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5388 bool &Matched) const {
5389 Matched = false;
5390
5391 Register Src;
5392 unsigned Mods;
5393 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
5394
5395 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
5396 if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5397 MachineOperand *MO = &MI->getOperand(1);
5398 Src = MO->getReg();
5399 MI = getDefIgnoringCopies(Src, *MRI);
5400
5401 assert(MRI->getType(Src) == LLT::scalar(16));
5402
5403 // See through bitcasts.
5404 // FIXME: Would be nice to use stripBitCast here.
5405 if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5406 MO = &MI->getOperand(1);
5407 Src = MO->getReg();
5408 MI = getDefIgnoringCopies(Src, *MRI);
5409 }
5410
5411 const auto CheckAbsNeg = [&]() {
5412 // Be careful about folding modifiers if we already have an abs. fneg is
5413 // applied last, so we don't want to apply an earlier fneg.
5414 if ((Mods & SISrcMods::ABS) == 0) {
5415 unsigned ModsTmp;
5416 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
5417 MI = getDefIgnoringCopies(Src, *MRI);
5418
5419 if ((ModsTmp & SISrcMods::NEG) != 0)
5420 Mods ^= SISrcMods::NEG;
5421
5422 if ((ModsTmp & SISrcMods::ABS) != 0)
5423 Mods |= SISrcMods::ABS;
5424 }
5425 };
5426
5427 CheckAbsNeg();
5428
5429 // op_sel/op_sel_hi decide the source type and source.
5430 // If the source's op_sel_hi is set, it indicates to do a conversion from
5431 // fp16. If the source's op_sel is set, it picks the high half of the
5432 // source register.
5433
5434 Mods |= SISrcMods::OP_SEL_1;
5435
5436 if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
5437 Mods |= SISrcMods::OP_SEL_0;
5438 MI = ExtractHiEltMI;
5439 MO = &MI->getOperand(0);
5440 Src = MO->getReg();
5441
5442 CheckAbsNeg();
5443 }
5444
5445 Matched = true;
5446 }
5447
5448 return {Src, Mods};
5449}
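// For illustration, an operand defined as
//   %hi:_(s16) = G_TRUNC (G_LSHR %v:_(s32), 16)
//   %f:_(s32) = G_FPEXT %hi
// matches with OP_SEL_1 | OP_SEL_0 set, i.e. an f16 source taken from the
// high half of %v; a plain f32 operand leaves Matched == false and keeps only
// the ordinary neg/abs modifiers.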
5450
5451 InstructionSelector::ComplexRendererFns
5452 AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5453 MachineOperand &Root) const {
5454 Register Src;
5455 unsigned Mods;
5456 bool Matched;
5457 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5458 if (!Matched)
5459 return {};
5460
5461 return {{
5462 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5463 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5464 }};
5465}
5466
5467 InstructionSelector::ComplexRendererFns
5468 AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5469 Register Src;
5470 unsigned Mods;
5471 bool Matched;
5472 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5473
5474 return {{
5475 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5476 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5477 }};
5478}
5479
5480bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5481 MachineInstr &I, Intrinsic::ID IntrID) const {
5482 MachineBasicBlock *MBB = I.getParent();
5483 const DebugLoc &DL = I.getDebugLoc();
5484 Register CCReg = I.getOperand(0).getReg();
5485
5486 bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
5487
5488 if (HasM0) {
5489 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5490 .addReg(I.getOperand(2).getReg());
5491 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
5492 if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
5493 return false;
5494 } else {
5495 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5496 .addImm(I.getOperand(2).getImm());
5497 }
5498
5499 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5500
5501 I.eraseFromParent();
5502 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5503 *MRI);
5504}
5505
5506unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5507 if (HasInlineConst) {
5508 switch (IntrID) {
5509 default:
5510 llvm_unreachable("not a named barrier op");
5511 case Intrinsic::amdgcn_s_barrier_init:
5512 return AMDGPU::S_BARRIER_INIT_IMM;
5513 case Intrinsic::amdgcn_s_barrier_join:
5514 return AMDGPU::S_BARRIER_JOIN_IMM;
5515 case Intrinsic::amdgcn_s_wakeup_barrier:
5516 return AMDGPU::S_WAKEUP_BARRIER_IMM;
5517 case Intrinsic::amdgcn_s_get_barrier_state:
5518 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5519 };
5520 } else {
5521 switch (IntrID) {
5522 default:
5523 llvm_unreachable("not a named barrier op");
5524 case Intrinsic::amdgcn_s_barrier_init:
5525 return AMDGPU::S_BARRIER_INIT_M0;
5526 case Intrinsic::amdgcn_s_barrier_join:
5527 return AMDGPU::S_BARRIER_JOIN_M0;
5528 case Intrinsic::amdgcn_s_wakeup_barrier:
5529 return AMDGPU::S_WAKEUP_BARRIER_M0;
5530 case Intrinsic::amdgcn_s_get_barrier_state:
5531 return AMDGPU::S_GET_BARRIER_STATE_M0;
5532 };
5533 }
5534}
5535
5536bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5537 MachineInstr &I, Intrinsic::ID IntrID) const {
5538 MachineBasicBlock *MBB = I.getParent();
5539 const DebugLoc &DL = I.getDebugLoc();
5540 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
5541 ? I.getOperand(2)
5542 : I.getOperand(1);
5543 std::optional<int64_t> BarValImm =
5544 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5545 Register M0Val;
5546 Register TmpReg0;
5547
5548 // For S_BARRIER_INIT, member count will always be read from M0[16:22]
5549 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5550 Register MemberCount = I.getOperand(2).getReg();
5551 TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5552 // TODO: This should be expanded during legalization so that the S_LSHL
5553 // and S_OR can be constant-folded
5554 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
5555 .addImm(16)
5556 .addReg(MemberCount);
5557 M0Val = TmpReg0;
5558 }
5559
5560 // If not inlinable, get reference to barrier depending on the instruction
5561 if (!BarValImm) {
5562 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5563 // If reference to barrier id is not an inlinable constant then it must be
5564 // referenced with M0[4:0]. Perform an OR with the member count to include
5565 // it in M0 for S_BARRIER_INIT.
5566 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5567 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
5568 .addReg(BarOp.getReg())
5569 .addReg(TmpReg0);
5570 M0Val = TmpReg1;
5571 } else {
5572 M0Val = BarOp.getReg();
5573 }
5574 }
5575
5576 // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
5577 if (M0Val) {
5578 auto CopyMIB =
5579 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
5580 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5581 }
5582
5583 MachineInstrBuilder MIB;
5584 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5585 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5586
5587 if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
5588 MIB.addDef(I.getOperand(0).getReg());
5589
5590 if (BarValImm)
5591 MIB.addImm(*BarValImm);
5592
5593 I.eraseFromParent();
5594 return true;
5595}
5596
5597bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
5598 MachineBasicBlock *BB = I.getParent();
5599 const DebugLoc &DL = I.getDebugLoc();
5600 Register CCReg = I.getOperand(0).getReg();
5601
5602 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
5603 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5604
5605 I.eraseFromParent();
5606 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5607 *MRI);
5608}
5609
5610void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5611 const MachineInstr &MI,
5612 int OpIdx) const {
5613 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5614 "Expected G_CONSTANT");
5615 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5616}
5617
5618void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5619 const MachineInstr &MI,
5620 int OpIdx) const {
5621 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5622 "Expected G_CONSTANT");
5623 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5624}
5625
5626void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
5627 const MachineInstr &MI,
5628 int OpIdx) const {
5629 assert(OpIdx == -1);
5630
5631 const MachineOperand &Op = MI.getOperand(1);
5632 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
5633 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5634 else {
5635 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
5636 MIB.addImm(Op.getCImm()->getSExtValue());
5637 }
5638}
5639
5640void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5641 const MachineInstr &MI,
5642 int OpIdx) const {
5643 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5644 "Expected G_CONSTANT");
5645 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5646}
5647
5648/// This only really exists to satisfy DAG type checking machinery, so is a
5649/// no-op here.
5650void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
5651 const MachineInstr &MI,
5652 int OpIdx) const {
5653 MIB.addImm(MI.getOperand(OpIdx).getImm());
5654}
5655
5656void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
5657 const MachineInstr &MI,
5658 int OpIdx) const {
5659 assert(OpIdx >= 0 && "expected to match an immediate operand");
5660 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
5661}
5662
5663void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
5664 const MachineInstr &MI,
5665 int OpIdx) const {
5666 assert(OpIdx >= 0 && "expected to match an immediate operand");
5667 MIB.addImm(MI.getOperand(OpIdx).getImm() &
5668 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5669 : AMDGPU::CPol::ALL_pregfx12));
5670 }
5671
5672void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
5673 const MachineInstr &MI,
5674 int OpIdx) const {
5675 assert(OpIdx >= 0 && "expected to match an immediate operand");
5676 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
5677 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
5678 : AMDGPU::CPol::SWZ_pregfx12);
5679 MIB.addImm(Swizzle);
5680}
5681
5682void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
5683 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
5684 assert(OpIdx >= 0 && "expected to match an immediate operand");
5685 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
5686 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5687 : AMDGPU::CPol::ALL_pregfx12);
5688 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
5689}
5690
5691void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
5692 const MachineInstr &MI,
5693 int OpIdx) const {
5694 MIB.addFrameIndex(MI.getOperand(1).getIndex());
5695}
5696
5697void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
5698 const MachineInstr &MI,
5699 int OpIdx) const {
5700 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
5701 int ExpVal = APF.getExactLog2Abs();
5702 assert(ExpVal != INT_MIN);
5703 MIB.addImm(ExpVal);
5704}
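// e.g. an FP immediate of 8.0 renders the exponent 3 and 0.5 renders -1; the
// assert documents that this renderer only sees exact powers of two.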
5705
5706bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
5707 return TII.isInlineConstant(Imm);
5708}
5709
5710bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
5711 return TII.isInlineConstant(Imm);
5712}