1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
49#define GET_GLOBALISEL_PREDICATES_INIT
50#include "AMDGPUGenGlobalISel.inc"
51#undef GET_GLOBALISEL_PREDICATES_INIT
52#define GET_GLOBALISEL_TEMPORARIES_INIT
53#include "AMDGPUGenGlobalISel.inc"
54#undef GET_GLOBALISEL_TEMPORARIES_INIT
55{
56}
57
58const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
59
60void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
61 CodeGenCoverage *CoverageInfo,
62 ProfileSummaryInfo *PSI,
63 BlockFrequencyInfo *BFI) {
64 MRI = &MF.getRegInfo();
65 Subtarget = &MF.getSubtarget<GCNSubtarget>();
67 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
68}
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
120 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
121}
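
// Illustrative example of constrainCopyLikeIntrin (a sketch, not exhaustive):
// an @llvm.amdgcn.wqm call arrives here as
//   %dst:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wqm), %src:vgpr(s32)
// and is rewritten in place (intrinsic-ID operand dropped, exec use added) to
//   %dst:vgpr_32 = WQM %src, implicit $exec
// with both operands constrained to the same register class.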
122
123bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
124 const DebugLoc &DL = I.getDebugLoc();
125 MachineBasicBlock *BB = I.getParent();
126 I.setDesc(TII.get(TargetOpcode::COPY));
127
128 const MachineOperand &Src = I.getOperand(1);
129 MachineOperand &Dst = I.getOperand(0);
130 Register DstReg = Dst.getReg();
131 Register SrcReg = Src.getReg();
132
133 if (isVCC(DstReg, *MRI)) {
134 if (SrcReg == AMDGPU::SCC) {
135 const TargetRegisterClass *RC
136 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
137 if (!RC)
138 return true;
139 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
140 }
141
142 if (!isVCC(SrcReg, *MRI)) {
143 // TODO: Should probably leave the copy and let copyPhysReg expand it.
144 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
145 return false;
146
147 const TargetRegisterClass *SrcRC
148 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
149
150 std::optional<ValueAndVReg> ConstVal =
151 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
152 if (ConstVal) {
153 unsigned MovOpc =
154 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
155 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
156 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
157 } else {
158 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
159
160 // We can't trust the high bits at this point, so clear them.
161
162 // TODO: Skip masking high bits if def is known boolean.
163
164 bool IsSGPR = TRI.isSGPRClass(SrcRC);
165 unsigned AndOpc =
166 IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
167 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
168 .addImm(1)
169 .addReg(SrcReg);
170 if (IsSGPR)
171 And.setOperandDead(3); // Dead scc
172
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
174 .addImm(0)
175 .addReg(MaskedReg);
176 }
177
178 if (!MRI->getRegClassOrNull(SrcReg))
179 MRI->setRegClass(SrcReg, SrcRC);
180 I.eraseFromParent();
181 return true;
182 }
183
184 const TargetRegisterClass *RC =
185 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
186 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
187 return false;
188
189 return true;
190 }
191
192 for (const MachineOperand &MO : I.operands()) {
193 if (MO.getReg().isPhysical())
194 continue;
195
196 const TargetRegisterClass *RC =
197 TRI.getConstrainedRegClassForOperand(MO, *MRI);
198 if (!RC)
199 continue;
200 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
201 }
202 return true;
203}
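
// Illustrative sketch of the non-VCC to VCC copy case above, assuming a
// wave64 target and a VGPR source:
//   %vcc_dst:vcc(s1) = COPY %src:vgpr(s1)
// is rewritten so that only bit 0 of the source is trusted:
//   %masked:vgpr_32 = V_AND_B32_e32 1, %src, implicit $exec
//   %vcc_dst = V_CMP_NE_U32_e64 0, %masked, implicit $exec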
204
205bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
206 const Register DefReg = I.getOperand(0).getReg();
207 const LLT DefTy = MRI->getType(DefReg);
208
209 // S1 G_PHIs should not be selected in instruction-select, instead:
210 // - divergent S1 G_PHI should go through lane mask merging algorithm
211 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
212 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
213 if (DefTy == LLT::scalar(1))
214 return false;
215
216 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
217
218 const RegClassOrRegBank &RegClassOrBank =
219 MRI->getRegClassOrRegBank(DefReg);
220
221 const TargetRegisterClass *DefRC
222 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
223 if (!DefRC) {
224 if (!DefTy.isValid()) {
225 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
226 return false;
227 }
228
229 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
230 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
231 if (!DefRC) {
232 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
233 return false;
234 }
235 }
236
237 // TODO: Verify that all registers have the same bank
238 I.setDesc(TII.get(TargetOpcode::PHI));
239 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
240}
241
242MachineOperand
243AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
244 const TargetRegisterClass &SubRC,
245 unsigned SubIdx) const {
246
247 MachineInstr *MI = MO.getParent();
248 MachineBasicBlock *BB = MO.getParent()->getParent();
249 Register DstReg = MRI->createVirtualRegister(&SubRC);
250
251 if (MO.isReg()) {
252 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
253 Register Reg = MO.getReg();
254 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
255 .addReg(Reg, 0, ComposedSubIdx);
256
257 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
258 MO.isKill(), MO.isDead(), MO.isUndef(),
259 MO.isEarlyClobber(), 0, MO.isDebug(),
260 MO.isInternalRead());
261 }
262
263 assert(MO.isImm());
264
265 APInt Imm(64, MO.getImm());
266
267 switch (SubIdx) {
268 default:
269 llvm_unreachable("do not know to split immediate with this sub index.");
270 case AMDGPU::sub0:
271 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
272 case AMDGPU::sub1:
273 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
274 }
275}
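
// Worked example (illustrative): splitting the 64-bit immediate
// 0x1234567880000000 with getSubOperand64 yields two 32-bit pieces,
//   sub0 -> 0x80000000 (low half), sub1 -> 0x12345678 (high half),
// while a register operand instead gets a COPY of the corresponding
// subregister into a fresh 32-bit virtual register.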
276
277static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
278 switch (Opc) {
279 case AMDGPU::G_AND:
280 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
281 case AMDGPU::G_OR:
282 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
283 case AMDGPU::G_XOR:
284 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
285 default:
286 llvm_unreachable("not a bit op");
287 }
288}
289
290bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
293
294 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
295 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
296 DstRB->getID() != AMDGPU::VCCRegBankID)
297 return false;
298
299 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
300 STI.isWave64());
301 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
302
303 // Dead implicit-def of scc
304 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
305 true, // isImp
306 false, // isKill
307 true)); // isDead
308 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
309}
310
311bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
312 MachineBasicBlock *BB = I.getParent();
313 MachineFunction *MF = BB->getParent();
314 Register DstReg = I.getOperand(0).getReg();
315 const DebugLoc &DL = I.getDebugLoc();
316 LLT Ty = MRI->getType(DstReg);
317 if (Ty.isVector())
318 return false;
319
320 unsigned Size = Ty.getSizeInBits();
321 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
322 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
323 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
324
325 if (Size == 32) {
326 if (IsSALU) {
327 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
328 MachineInstr *Add =
329 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
330 .add(I.getOperand(1))
331 .add(I.getOperand(2))
332 .setOperandDead(3); // Dead scc
333 I.eraseFromParent();
334 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
335 }
336
337 if (STI.hasAddNoCarry()) {
338 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
339 I.setDesc(TII.get(Opc));
340 I.addOperand(*MF, MachineOperand::CreateImm(0));
341 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
342 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
343 }
344
345 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
346
347 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
348 MachineInstr *Add
349 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
350 .addDef(UnusedCarry, RegState::Dead)
351 .add(I.getOperand(1))
352 .add(I.getOperand(2))
353 .addImm(0);
354 I.eraseFromParent();
355 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
356 }
357
358 assert(!Sub && "illegal sub should not reach here");
359
360 const TargetRegisterClass &RC
361 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
362 const TargetRegisterClass &HalfRC
363 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
364
365 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
366 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
367 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
368 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
369
370 Register DstLo = MRI->createVirtualRegister(&HalfRC);
371 Register DstHi = MRI->createVirtualRegister(&HalfRC);
372
373 if (IsSALU) {
374 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
375 .add(Lo1)
376 .add(Lo2);
377 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
378 .add(Hi1)
379 .add(Hi2)
380 .setOperandDead(3); // Dead scc
381 } else {
382 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
383 Register CarryReg = MRI->createVirtualRegister(CarryRC);
384 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
385 .addDef(CarryReg)
386 .add(Lo1)
387 .add(Lo2)
388 .addImm(0);
389 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
390 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
391 .add(Hi1)
392 .add(Hi2)
393 .addReg(CarryReg, RegState::Kill)
394 .addImm(0);
395
396 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
397 return false;
398 }
399
400 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
401 .addReg(DstLo)
402 .addImm(AMDGPU::sub0)
403 .addReg(DstHi)
404 .addImm(AMDGPU::sub1);
405
406
407 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
408 return false;
409
410 I.eraseFromParent();
411 return true;
412}
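
// Illustrative MIR sketch of the 64-bit SALU expansion above (virtual
// register names invented):
//   %dst:sgpr(s64) = G_ADD %a:sgpr(s64), %b:sgpr(s64)
// becomes, after getSubOperand64 materializes the 32-bit halves,
//   %lo:sreg_32 = S_ADD_U32  %a0, %b0, implicit-def $scc
//   %hi:sreg_32 = S_ADDC_U32 %a1, %b1, implicit-def dead $scc, implicit $scc
//   %dst:sreg_64_xexec = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1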
413
414bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
415 MachineInstr &I) const {
416 MachineBasicBlock *BB = I.getParent();
417 MachineFunction *MF = BB->getParent();
418 const DebugLoc &DL = I.getDebugLoc();
419 Register Dst0Reg = I.getOperand(0).getReg();
420 Register Dst1Reg = I.getOperand(1).getReg();
421 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
422 I.getOpcode() == AMDGPU::G_UADDE;
423 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
424 I.getOpcode() == AMDGPU::G_USUBE;
425
426 if (isVCC(Dst1Reg, *MRI)) {
427 unsigned NoCarryOpc =
428 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
429 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
430 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
431 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
432 I.addOperand(*MF, MachineOperand::CreateImm(0));
433 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
434 }
435
436 Register Src0Reg = I.getOperand(2).getReg();
437 Register Src1Reg = I.getOperand(3).getReg();
438
439 if (HasCarryIn) {
440 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
441 .addReg(I.getOperand(4).getReg());
442 }
443
444 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
445 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
446
447 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
448 .add(I.getOperand(2))
449 .add(I.getOperand(3));
450
451 if (MRI->use_nodbg_empty(Dst1Reg)) {
452 CarryInst.setOperandDead(3); // Dead scc
453 } else {
454 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
455 .addReg(AMDGPU::SCC);
456 if (!MRI->getRegClassOrNull(Dst1Reg))
457 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
458 }
459
460 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
461 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
462 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
463 return false;
464
465 if (HasCarryIn &&
466 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
467 AMDGPU::SReg_32RegClass, *MRI))
468 return false;
469
470 I.eraseFromParent();
471 return true;
472}
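
// Illustrative sketch of the scalar path above: a uniform G_UADDE such as
//   %sum, %cout = G_UADDE %x, %y, %cin
// is selected roughly as
//   $scc = COPY %cin
//   %sum:sreg_32 = S_ADDC_U32 %x, %y, implicit-def $scc, implicit $scc
//   %cout:sreg_32 = COPY $scc
// where the final copy is skipped (and scc marked dead) if the carry-out has
// no non-debug uses.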
473
474bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
475 MachineInstr &I) const {
476 MachineBasicBlock *BB = I.getParent();
477 MachineFunction *MF = BB->getParent();
478 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
479
480 unsigned Opc;
481 if (Subtarget->hasMADIntraFwdBug())
482 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
483 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
484 else
485 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
486 I.setDesc(TII.get(Opc));
487 I.addOperand(*MF, MachineOperand::CreateImm(0));
488 I.addImplicitDefUseOperands(*MF);
489 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
490}
491
492// TODO: We should probably legalize these to only using 32-bit results.
493bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
494 MachineBasicBlock *BB = I.getParent();
495 Register DstReg = I.getOperand(0).getReg();
496 Register SrcReg = I.getOperand(1).getReg();
497 LLT DstTy = MRI->getType(DstReg);
498 LLT SrcTy = MRI->getType(SrcReg);
499 const unsigned SrcSize = SrcTy.getSizeInBits();
500 unsigned DstSize = DstTy.getSizeInBits();
501
502 // TODO: Should handle any multiple of 32 offset.
503 unsigned Offset = I.getOperand(2).getImm();
504 if (Offset % 32 != 0 || DstSize > 128)
505 return false;
506
507 // 16-bit operations really use 32-bit registers.
508 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
509 if (DstSize == 16)
510 DstSize = 32;
511
512 const TargetRegisterClass *DstRC =
513 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
514 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
515 return false;
516
517 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
518 const TargetRegisterClass *SrcRC =
519 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
520 if (!SrcRC)
521 return false;
522 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
523 DstSize / 32);
524 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
525 if (!SrcRC)
526 return false;
527
528 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
529 *SrcRC, I.getOperand(1));
530 const DebugLoc &DL = I.getDebugLoc();
531 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
532 .addReg(SrcReg, 0, SubReg);
533
534 I.eraseFromParent();
535 return true;
536}
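
// Illustrative example for selectG_EXTRACT: extracting bits [64, 96) of a
// 128-bit SGPR value,
//   %dst:sgpr(s32) = G_EXTRACT %src:sgpr(s128), 64
// reduces to a plain subregister copy,
//   %dst:sreg_32 = COPY %src.sub2
// i.e. the (offset, size) pair only selects the subregister index.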
537
538bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
539 MachineBasicBlock *BB = MI.getParent();
540 Register DstReg = MI.getOperand(0).getReg();
541 LLT DstTy = MRI->getType(DstReg);
542 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
543
544 const unsigned SrcSize = SrcTy.getSizeInBits();
545 if (SrcSize < 32)
546 return selectImpl(MI, *CoverageInfo);
547
548 const DebugLoc &DL = MI.getDebugLoc();
549 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
550 const unsigned DstSize = DstTy.getSizeInBits();
551 const TargetRegisterClass *DstRC =
552 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
553 if (!DstRC)
554 return false;
555
556 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
557 MachineInstrBuilder MIB =
558 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
559 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
560 MachineOperand &Src = MI.getOperand(I + 1);
561 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
562 MIB.addImm(SubRegs[I]);
563
564 const TargetRegisterClass *SrcRC
565 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
566 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
567 return false;
568 }
569
570 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
571 return false;
572
573 MI.eraseFromParent();
574 return true;
575}
576
577bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
578 MachineBasicBlock *BB = MI.getParent();
579 const int NumDst = MI.getNumOperands() - 1;
580
581 MachineOperand &Src = MI.getOperand(NumDst);
582
583 Register SrcReg = Src.getReg();
584 Register DstReg0 = MI.getOperand(0).getReg();
585 LLT DstTy = MRI->getType(DstReg0);
586 LLT SrcTy = MRI->getType(SrcReg);
587
588 const unsigned DstSize = DstTy.getSizeInBits();
589 const unsigned SrcSize = SrcTy.getSizeInBits();
590 const DebugLoc &DL = MI.getDebugLoc();
591 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
592
593 const TargetRegisterClass *SrcRC =
594 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
595 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
596 return false;
597
598 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
599 // source, and this relies on the fact that the same subregister indices are
600 // used for both.
601 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
602 for (int I = 0, E = NumDst; I != E; ++I) {
603 MachineOperand &Dst = MI.getOperand(I);
604 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
605 .addReg(SrcReg, 0, SubRegs[I]);
606
607 // Make sure the subregister index is valid for the source register.
608 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
609 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
610 return false;
611
612 const TargetRegisterClass *DstRC =
613 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
614 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
615 return false;
616 }
617
618 MI.eraseFromParent();
619 return true;
620}
621
622bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
623 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
624 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
625
626 Register Src0 = MI.getOperand(1).getReg();
627 Register Src1 = MI.getOperand(2).getReg();
628 LLT SrcTy = MRI->getType(Src0);
629 const unsigned SrcSize = SrcTy.getSizeInBits();
630
631 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
632 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
633 return selectG_MERGE_VALUES(MI);
634 }
635
636 // Selection logic below is for V2S16 only.
637 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
638 Register Dst = MI.getOperand(0).getReg();
639 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
640 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
641 SrcTy != LLT::scalar(32)))
642 return selectImpl(MI, *CoverageInfo);
643
644 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
645 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
646 return false;
647
648 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
649 DstBank->getID() == AMDGPU::VGPRRegBankID);
650 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
651
652 const DebugLoc &DL = MI.getDebugLoc();
653 MachineBasicBlock *BB = MI.getParent();
654
655 // First, before trying TableGen patterns, check if both sources are
656 // constants. In those cases, we can trivially compute the final constant
657 // and emit a simple move.
658 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
659 if (ConstSrc1) {
660 auto ConstSrc0 =
661 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
662 if (ConstSrc0) {
663 const int64_t K0 = ConstSrc0->Value.getSExtValue();
664 const int64_t K1 = ConstSrc1->Value.getSExtValue();
665 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
666 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
667 uint32_t Imm = Lo16 | (Hi16 << 16);
668
669 // VALU
670 if (IsVector) {
671 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
672 MI.eraseFromParent();
673 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
674 }
675
676 // SALU
677 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
678 MI.eraseFromParent();
679 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
680 }
681 }
682
683 // Now try TableGen patterns.
684 if (selectImpl(MI, *CoverageInfo))
685 return true;
686
687 // TODO: This should probably be a combine somewhere
688 // (build_vector $src0, undef) -> copy $src0
689 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
690 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
691 MI.setDesc(TII.get(AMDGPU::COPY));
692 MI.removeOperand(2);
693 const auto &RC =
694 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
695 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
696 RBI.constrainGenericRegister(Src0, RC, *MRI);
697 }
698
699 // TODO: Can be improved?
700 if (IsVector) {
701 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
702 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
703 .addImm(0xFFFF)
704 .addReg(Src0);
705 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
706 return false;
707
708 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
709 .addReg(Src1)
710 .addImm(16)
711 .addReg(TmpReg);
712 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
713 return false;
714
715 MI.eraseFromParent();
716 return true;
717 }
718
719 Register ShiftSrc0;
720 Register ShiftSrc1;
721
722 // With multiple uses of the shift, this will duplicate the shift and
723 // increase register pressure.
724 //
725 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
726 // => (S_PACK_HH_B32_B16 $src0, $src1)
727 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
728 // => (S_PACK_HL_B32_B16 $src0, $src1)
729 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
730 // => (S_PACK_LH_B32_B16 $src0, $src1)
731 // (build_vector $src0, $src1)
732 // => (S_PACK_LL_B32_B16 $src0, $src1)
733
734 bool Shift0 = mi_match(
735 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
736
737 bool Shift1 = mi_match(
738 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
739
740 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
741 if (Shift0 && Shift1) {
742 Opc = AMDGPU::S_PACK_HH_B32_B16;
743 MI.getOperand(1).setReg(ShiftSrc0);
744 MI.getOperand(2).setReg(ShiftSrc1);
745 } else if (Shift1) {
746 Opc = AMDGPU::S_PACK_LH_B32_B16;
747 MI.getOperand(2).setReg(ShiftSrc1);
748 } else if (Shift0) {
749 auto ConstSrc1 =
750 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
751 if (ConstSrc1 && ConstSrc1->Value == 0) {
752 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
753 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
754 .addReg(ShiftSrc0)
755 .addImm(16)
756 .setOperandDead(3); // Dead scc
757
758 MI.eraseFromParent();
759 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
760 }
761 if (STI.hasSPackHL()) {
762 Opc = AMDGPU::S_PACK_HL_B32_B16;
763 MI.getOperand(1).setReg(ShiftSrc0);
764 }
765 }
766
767 MI.setDesc(TII.get(Opc));
768 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
769}
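
// Illustrative example of the constant-folding path above (SGPR case):
//   %dst:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %c0:sgpr(s32), %c1:sgpr(s32)
// with %c0 = G_CONSTANT i32 1 and %c1 = G_CONSTANT i32 2 folds to
//   %dst:sreg_32 = S_MOV_B32 0x20001
// since the low half of the packed value holds 1 and the high half holds 2.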
770
771bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
772 const MachineOperand &MO = I.getOperand(0);
773
774 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
775 // regbank check here is to know why getConstrainedRegClassForOperand failed.
776 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
777 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
778 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
779 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
780 return true;
781 }
782
783 return false;
784}
785
786bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
787 MachineBasicBlock *BB = I.getParent();
788
789 Register DstReg = I.getOperand(0).getReg();
790 Register Src0Reg = I.getOperand(1).getReg();
791 Register Src1Reg = I.getOperand(2).getReg();
792 LLT Src1Ty = MRI->getType(Src1Reg);
793
794 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
795 unsigned InsSize = Src1Ty.getSizeInBits();
796
797 int64_t Offset = I.getOperand(3).getImm();
798
799 // FIXME: These cases should have been illegal and unnecessary to check here.
800 if (Offset % 32 != 0 || InsSize % 32 != 0)
801 return false;
802
803 // Currently not handled by getSubRegFromChannel.
804 if (InsSize > 128)
805 return false;
806
807 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
808 if (SubReg == AMDGPU::NoSubRegister)
809 return false;
810
811 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
812 const TargetRegisterClass *DstRC =
813 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
814 if (!DstRC)
815 return false;
816
817 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
818 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
819 const TargetRegisterClass *Src0RC =
820 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
821 const TargetRegisterClass *Src1RC =
822 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
823
824 // Deal with weird cases where the class only partially supports the subreg
825 // index.
826 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
827 if (!Src0RC || !Src1RC)
828 return false;
829
830 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
831 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
832 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
833 return false;
834
835 const DebugLoc &DL = I.getDebugLoc();
836 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
837 .addReg(Src0Reg)
838 .addReg(Src1Reg)
839 .addImm(SubReg);
840
841 I.eraseFromParent();
842 return true;
843}
844
845bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
846 Register DstReg = MI.getOperand(0).getReg();
847 Register SrcReg = MI.getOperand(1).getReg();
848 Register OffsetReg = MI.getOperand(2).getReg();
849 Register WidthReg = MI.getOperand(3).getReg();
850
851 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
852 "scalar BFX instructions are expanded in regbankselect");
853 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
854 "64-bit vector BFX instructions are expanded in regbankselect");
855
856 const DebugLoc &DL = MI.getDebugLoc();
857 MachineBasicBlock *MBB = MI.getParent();
858
859 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
860 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
861 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
862 .addReg(SrcReg)
863 .addReg(OffsetReg)
864 .addReg(WidthReg);
865 MI.eraseFromParent();
866 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
867}
868
869bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
870 if (STI.getLDSBankCount() != 16)
871 return selectImpl(MI, *CoverageInfo);
872
873 Register Dst = MI.getOperand(0).getReg();
874 Register Src0 = MI.getOperand(2).getReg();
875 Register M0Val = MI.getOperand(6).getReg();
876 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
877 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
878 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
879 return false;
880
881 // This requires 2 instructions. It is possible to write a pattern to support
882 // this, but the generated isel emitter doesn't correctly deal with multiple
883 // output instructions using the same physical register input. The copy to m0
884 // is incorrectly placed before the second instruction.
885 //
886 // TODO: Match source modifiers.
887
888 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
889 const DebugLoc &DL = MI.getDebugLoc();
890 MachineBasicBlock *MBB = MI.getParent();
891
892 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
893 .addReg(M0Val);
894 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
895 .addImm(2)
896 .addImm(MI.getOperand(4).getImm()) // $attr
897 .addImm(MI.getOperand(3).getImm()); // $attrchan
898
899 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
900 .addImm(0) // $src0_modifiers
901 .addReg(Src0) // $src0
902 .addImm(MI.getOperand(4).getImm()) // $attr
903 .addImm(MI.getOperand(3).getImm()) // $attrchan
904 .addImm(0) // $src2_modifiers
905 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
906 .addImm(MI.getOperand(5).getImm()) // $high
907 .addImm(0) // $clamp
908 .addImm(0); // $omod
909
910 MI.eraseFromParent();
911 return true;
912}
913
914// Writelane is special in that it can use SGPR and M0 (which would normally
915// count as using the constant bus twice - but in this case it is allowed since
916// the lane selector doesn't count as a use of the constant bus). However, it is
917// still required to abide by the 1 SGPR rule. Fix this up if we might have
918// multiple SGPRs.
919bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
920 // With a constant bus limit of at least 2, there's no issue.
921 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
922 return selectImpl(MI, *CoverageInfo);
923
924 MachineBasicBlock *MBB = MI.getParent();
925 const DebugLoc &DL = MI.getDebugLoc();
926 Register VDst = MI.getOperand(0).getReg();
927 Register Val = MI.getOperand(2).getReg();
928 Register LaneSelect = MI.getOperand(3).getReg();
929 Register VDstIn = MI.getOperand(4).getReg();
930
931 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
932
933 std::optional<ValueAndVReg> ConstSelect =
934 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
935 if (ConstSelect) {
936 // The selector has to be an inline immediate, so we can use whatever for
937 // the other operands.
938 MIB.addReg(Val);
939 MIB.addImm(ConstSelect->Value.getSExtValue() &
940 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
941 } else {
942 std::optional<ValueAndVReg> ConstVal =
943 getIConstantVRegValWithLookThrough(Val, *MRI);
944
945 // If the value written is an inline immediate, we can get away without a
946 // copy to m0.
947 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
948 STI.hasInv2PiInlineImm())) {
949 MIB.addImm(ConstVal->Value.getSExtValue());
950 MIB.addReg(LaneSelect);
951 } else {
952 MIB.addReg(Val);
953
954 // If the lane selector was originally in a VGPR and copied with
955 // readfirstlane, there's a hazard to read the same SGPR from the
956 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
957 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
958
959 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
960 .addReg(LaneSelect);
961 MIB.addReg(AMDGPU::M0);
962 }
963 }
964
965 MIB.addReg(VDstIn);
966
967 MI.eraseFromParent();
968 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
969}
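
// Illustrative sketch of the fallback above when neither the value nor the
// lane select is an inline immediate: the lane select is routed through m0 so
// only one ordinary SGPR is read,
//   $m0 = COPY %lane:sreg_32_xm0
//   %vdst:vgpr_32 = V_WRITELANE_B32 %val, $m0, %vdst_in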
970
971// We need to handle this here because tablegen doesn't support matching
972// instructions with multiple outputs.
973bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
974 Register Dst0 = MI.getOperand(0).getReg();
975 Register Dst1 = MI.getOperand(1).getReg();
976
977 LLT Ty = MRI->getType(Dst0);
978 unsigned Opc;
979 if (Ty == LLT::scalar(32))
980 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
981 else if (Ty == LLT::scalar(64))
982 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
983 else
984 return false;
985
986 // TODO: Match source modifiers.
987
988 const DebugLoc &DL = MI.getDebugLoc();
989 MachineBasicBlock *MBB = MI.getParent();
990
991 Register Numer = MI.getOperand(3).getReg();
992 Register Denom = MI.getOperand(4).getReg();
993 unsigned ChooseDenom = MI.getOperand(5).getImm();
994
995 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
996
997 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
998 .addDef(Dst1)
999 .addImm(0) // $src0_modifiers
1000 .addUse(Src0) // $src0
1001 .addImm(0) // $src1_modifiers
1002 .addUse(Denom) // $src1
1003 .addImm(0) // $src2_modifiers
1004 .addUse(Numer) // $src2
1005 .addImm(0) // $clamp
1006 .addImm(0); // $omod
1007
1008 MI.eraseFromParent();
1009 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1010}
1011
1012bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1013 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1014 switch (IntrinsicID) {
1015 case Intrinsic::amdgcn_if_break: {
1016 MachineBasicBlock *BB = I.getParent();
1017
1018 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1019 // SelectionDAG uses for wave32 vs wave64.
1020 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1021 .add(I.getOperand(0))
1022 .add(I.getOperand(2))
1023 .add(I.getOperand(3));
1024
1025 Register DstReg = I.getOperand(0).getReg();
1026 Register Src0Reg = I.getOperand(2).getReg();
1027 Register Src1Reg = I.getOperand(3).getReg();
1028
1029 I.eraseFromParent();
1030
1031 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1032 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1033
1034 return true;
1035 }
1036 case Intrinsic::amdgcn_interp_p1_f16:
1037 return selectInterpP1F16(I);
1038 case Intrinsic::amdgcn_wqm:
1039 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1040 case Intrinsic::amdgcn_softwqm:
1041 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1042 case Intrinsic::amdgcn_strict_wwm:
1043 case Intrinsic::amdgcn_wwm:
1044 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1045 case Intrinsic::amdgcn_strict_wqm:
1046 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1047 case Intrinsic::amdgcn_writelane:
1048 return selectWritelane(I);
1049 case Intrinsic::amdgcn_div_scale:
1050 return selectDivScale(I);
1051 case Intrinsic::amdgcn_icmp:
1052 case Intrinsic::amdgcn_fcmp:
1053 if (selectImpl(I, *CoverageInfo))
1054 return true;
1055 return selectIntrinsicCmp(I);
1056 case Intrinsic::amdgcn_ballot:
1057 return selectBallot(I);
1058 case Intrinsic::amdgcn_inverse_ballot:
1059 return selectInverseBallot(I);
1060 case Intrinsic::amdgcn_reloc_constant:
1061 return selectRelocConstant(I);
1062 case Intrinsic::amdgcn_groupstaticsize:
1063 return selectGroupStaticSize(I);
1064 case Intrinsic::returnaddress:
1065 return selectReturnAddress(I);
1066 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1067 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1068 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1069 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1070 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1071 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1072 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1073 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1074 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1075 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1076 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1077 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1078 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1079 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1080 return selectSMFMACIntrin(I);
1081 default:
1082 return selectImpl(I, *CoverageInfo);
1083 }
1084}
1085
1086static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1087 const GCNSubtarget &ST) {
1088 if (Size != 16 && Size != 32 && Size != 64)
1089 return -1;
1090
1091 if (Size == 16 && !ST.has16BitInsts())
1092 return -1;
1093
1094 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
1095 unsigned S64Opc) {
1096 if (Size == 16)
1097 return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
1098 if (Size == 32)
1099 return S32Opc;
1100 return S64Opc;
1101 };
1102
1103 switch (P) {
1104 default:
1105 llvm_unreachable("Unknown condition code!");
1106 case CmpInst::ICMP_NE:
1107 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1108 AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
1109 case CmpInst::ICMP_EQ:
1110 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1111 AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
1112 case CmpInst::ICMP_SGT:
1113 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1114 AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
1115 case CmpInst::ICMP_SGE:
1116 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1117 AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
1118 case CmpInst::ICMP_SLT:
1119 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1120 AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
1121 case CmpInst::ICMP_SLE:
1122 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1123 AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
1124 case CmpInst::ICMP_UGT:
1125 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1126 AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
1127 case CmpInst::ICMP_UGE:
1128 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1129 AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
1130 case CmpInst::ICMP_ULT:
1131 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1132 AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
1133 case CmpInst::ICMP_ULE:
1134 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1135 AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);
1136
1137 case CmpInst::FCMP_OEQ:
1138 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1139 AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
1140 case CmpInst::FCMP_OGT:
1141 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1142 AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
1143 case CmpInst::FCMP_OGE:
1144 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1145 AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
1146 case CmpInst::FCMP_OLT:
1147 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1148 AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
1149 case CmpInst::FCMP_OLE:
1150 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1151 AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
1152 case CmpInst::FCMP_ONE:
1153 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1154 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1155 case CmpInst::FCMP_ORD:
1156 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1157 AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
1158 case CmpInst::FCMP_UNO:
1159 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1160 AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
1161 case CmpInst::FCMP_UEQ:
1162 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1163 AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
1164 case CmpInst::FCMP_UGT:
1165 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1166 AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
1167 case CmpInst::FCMP_UGE:
1168 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1169 AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
1170 case CmpInst::FCMP_ULT:
1171 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1172 AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
1173 case CmpInst::FCMP_ULE:
1174 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1175 AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
1176 case CmpInst::FCMP_UNE:
1177 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1178 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1179 case CmpInst::FCMP_TRUE:
1180 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1181 AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
1182 case CmpInst::FCMP_FALSE:
1183 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1184 AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
1185 }
1186}
1187
1188int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1189 unsigned Size) const {
1190 if (Size == 64) {
1191 if (!STI.hasScalarCompareEq64())
1192 return -1;
1193
1194 switch (P) {
1195 case CmpInst::ICMP_NE:
1196 return AMDGPU::S_CMP_LG_U64;
1197 case CmpInst::ICMP_EQ:
1198 return AMDGPU::S_CMP_EQ_U64;
1199 default:
1200 return -1;
1201 }
1202 }
1203
1204 if (Size == 32) {
1205 switch (P) {
1206 case CmpInst::ICMP_NE:
1207 return AMDGPU::S_CMP_LG_U32;
1208 case CmpInst::ICMP_EQ:
1209 return AMDGPU::S_CMP_EQ_U32;
1210 case CmpInst::ICMP_SGT:
1211 return AMDGPU::S_CMP_GT_I32;
1212 case CmpInst::ICMP_SGE:
1213 return AMDGPU::S_CMP_GE_I32;
1214 case CmpInst::ICMP_SLT:
1215 return AMDGPU::S_CMP_LT_I32;
1216 case CmpInst::ICMP_SLE:
1217 return AMDGPU::S_CMP_LE_I32;
1218 case CmpInst::ICMP_UGT:
1219 return AMDGPU::S_CMP_GT_U32;
1220 case CmpInst::ICMP_UGE:
1221 return AMDGPU::S_CMP_GE_U32;
1222 case CmpInst::ICMP_ULT:
1223 return AMDGPU::S_CMP_LT_U32;
1224 case CmpInst::ICMP_ULE:
1225 return AMDGPU::S_CMP_LE_U32;
1226 case CmpInst::FCMP_OEQ:
1227 return AMDGPU::S_CMP_EQ_F32;
1228 case CmpInst::FCMP_OGT:
1229 return AMDGPU::S_CMP_GT_F32;
1230 case CmpInst::FCMP_OGE:
1231 return AMDGPU::S_CMP_GE_F32;
1232 case CmpInst::FCMP_OLT:
1233 return AMDGPU::S_CMP_LT_F32;
1234 case CmpInst::FCMP_OLE:
1235 return AMDGPU::S_CMP_LE_F32;
1236 case CmpInst::FCMP_ONE:
1237 return AMDGPU::S_CMP_LG_F32;
1238 case CmpInst::FCMP_ORD:
1239 return AMDGPU::S_CMP_O_F32;
1240 case CmpInst::FCMP_UNO:
1241 return AMDGPU::S_CMP_U_F32;
1242 case CmpInst::FCMP_UEQ:
1243 return AMDGPU::S_CMP_NLG_F32;
1244 case CmpInst::FCMP_UGT:
1245 return AMDGPU::S_CMP_NLE_F32;
1246 case CmpInst::FCMP_UGE:
1247 return AMDGPU::S_CMP_NLT_F32;
1248 case CmpInst::FCMP_ULT:
1249 return AMDGPU::S_CMP_NGE_F32;
1250 case CmpInst::FCMP_ULE:
1251 return AMDGPU::S_CMP_NGT_F32;
1252 case CmpInst::FCMP_UNE:
1253 return AMDGPU::S_CMP_NEQ_F32;
1254 default:
1255 llvm_unreachable("Unknown condition code!");
1256 }
1257 }
1258
1259 if (Size == 16) {
1260 if (!STI.hasSALUFloatInsts())
1261 return -1;
1262
1263 switch (P) {
1264 case CmpInst::FCMP_OEQ:
1265 return AMDGPU::S_CMP_EQ_F16;
1266 case CmpInst::FCMP_OGT:
1267 return AMDGPU::S_CMP_GT_F16;
1268 case CmpInst::FCMP_OGE:
1269 return AMDGPU::S_CMP_GE_F16;
1270 case CmpInst::FCMP_OLT:
1271 return AMDGPU::S_CMP_LT_F16;
1272 case CmpInst::FCMP_OLE:
1273 return AMDGPU::S_CMP_LE_F16;
1274 case CmpInst::FCMP_ONE:
1275 return AMDGPU::S_CMP_LG_F16;
1276 case CmpInst::FCMP_ORD:
1277 return AMDGPU::S_CMP_O_F16;
1278 case CmpInst::FCMP_UNO:
1279 return AMDGPU::S_CMP_U_F16;
1280 case CmpInst::FCMP_UEQ:
1281 return AMDGPU::S_CMP_NLG_F16;
1282 case CmpInst::FCMP_UGT:
1283 return AMDGPU::S_CMP_NLE_F16;
1284 case CmpInst::FCMP_UGE:
1285 return AMDGPU::S_CMP_NLT_F16;
1286 case CmpInst::FCMP_ULT:
1287 return AMDGPU::S_CMP_NGE_F16;
1288 case CmpInst::FCMP_ULE:
1289 return AMDGPU::S_CMP_NGT_F16;
1290 case CmpInst::FCMP_UNE:
1291 return AMDGPU::S_CMP_NEQ_F16;
1292 default:
1293 llvm_unreachable("Unknown condition code!");
1294 }
1295 }
1296
1297 return -1;
1298}
1299
1300bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1301
1302 MachineBasicBlock *BB = I.getParent();
1303 const DebugLoc &DL = I.getDebugLoc();
1304
1305 Register SrcReg = I.getOperand(2).getReg();
1306 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1307
1308 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1309
1310 Register CCReg = I.getOperand(0).getReg();
1311 if (!isVCC(CCReg, *MRI)) {
1312 int Opcode = getS_CMPOpcode(Pred, Size);
1313 if (Opcode == -1)
1314 return false;
1315 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1316 .add(I.getOperand(2))
1317 .add(I.getOperand(3));
1318 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1319 .addReg(AMDGPU::SCC);
1320 bool Ret =
1321 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1322 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1323 I.eraseFromParent();
1324 return Ret;
1325 }
1326
1327 if (I.getOpcode() == AMDGPU::G_FCMP)
1328 return false;
1329
1330 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1331 if (Opcode == -1)
1332 return false;
1333
1334 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1335 I.getOperand(0).getReg())
1336 .add(I.getOperand(2))
1337 .add(I.getOperand(3));
1338 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1339 *TRI.getBoolRC(), *MRI);
1340 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1341 I.eraseFromParent();
1342 return Ret;
1343}
1344
1345bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1346 Register Dst = I.getOperand(0).getReg();
1347 if (isVCC(Dst, *MRI))
1348 return false;
1349
1350 LLT DstTy = MRI->getType(Dst);
1351 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1352 return false;
1353
1354 MachineBasicBlock *BB = I.getParent();
1355 const DebugLoc &DL = I.getDebugLoc();
1356 Register SrcReg = I.getOperand(2).getReg();
1357 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1358
1359 // i1 inputs are not supported in GlobalISel.
1360 if (Size == 1)
1361 return false;
1362
1363 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1364 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1365 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1366 I.eraseFromParent();
1367 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1368 }
1369
1370 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1371 if (Opcode == -1)
1372 return false;
1373
1374 MachineInstrBuilder SelectedMI;
1375 MachineOperand &LHS = I.getOperand(2);
1376 MachineOperand &RHS = I.getOperand(3);
1377 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
1378 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
1379 Register Src0Reg =
1380 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1381 Register Src1Reg =
1382 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1383 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1384 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1385 SelectedMI.addImm(Src0Mods);
1386 SelectedMI.addReg(Src0Reg);
1387 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1388 SelectedMI.addImm(Src1Mods);
1389 SelectedMI.addReg(Src1Reg);
1390 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1391 SelectedMI.addImm(0); // clamp
1392 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1393 SelectedMI.addImm(0); // op_sel
1394
1395 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1396 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1397 return false;
1398
1399 I.eraseFromParent();
1400 return true;
1401}
1402
1403bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1404 MachineBasicBlock *BB = I.getParent();
1405 const DebugLoc &DL = I.getDebugLoc();
1406 Register DstReg = I.getOperand(0).getReg();
1407 const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1408 const bool Is64 = Size == 64;
1409 const bool IsWave32 = (STI.getWavefrontSize() == 32);
1410
1411 // In the common case, the return type matches the wave size.
1412 // However we also support emitting i64 ballots in wave32 mode.
1413 if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1414 return false;
1415
1416 std::optional<ValueAndVReg> Arg =
1417 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1418
1419 const auto BuildCopy = [&](Register SrcReg) {
1420 if (Size == STI.getWavefrontSize()) {
1421 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1422 .addReg(SrcReg);
1423 return;
1424 }
1425
1426 // If emitting an i64 ballot in wave32, fill the upper bits with zeroes.
1427 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1428 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1429 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1430 .addReg(SrcReg)
1431 .addImm(AMDGPU::sub0)
1432 .addReg(HiReg)
1433 .addImm(AMDGPU::sub1);
1434 };
1435
1436 if (Arg) {
1437 const int64_t Value = Arg->Value.getSExtValue();
1438 if (Value == 0) {
1439 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1440 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1441 } else if (Value == -1) // all ones
1442 BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1443 else
1444 return false;
1445 } else
1446 BuildCopy(I.getOperand(2).getReg());
1447
1448 I.eraseFromParent();
1449 return true;
1450}
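
// Illustrative example of the wave32 + 64-bit result case above: for a
// non-constant ballot argument %src the upper half is zeroed explicitly,
//   %hi:sreg_32 = S_MOV_B32 0
//   %dst = REG_SEQUENCE %src, %subreg.sub0, %hi, %subreg.sub1
// while a constant all-ones argument simply copies $exec_lo / $exec.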
1451
1452bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
1453 MachineBasicBlock *BB = I.getParent();
1454 const DebugLoc &DL = I.getDebugLoc();
1455 const Register DstReg = I.getOperand(0).getReg();
1456 const Register MaskReg = I.getOperand(2).getReg();
1457
1458 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg);
1459 I.eraseFromParent();
1460 return true;
1461}
1462
1463bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1464 Register DstReg = I.getOperand(0).getReg();
1465 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1466 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1467 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1468 return false;
1469
1470 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1471
1472 Module *M = MF->getFunction().getParent();
1473 const MDNode *Metadata = I.getOperand(2).getMetadata();
1474 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1475 auto RelocSymbol = cast<GlobalVariable>(
1476 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1477
1478 MachineBasicBlock *BB = I.getParent();
1479 BuildMI(*BB, &I, I.getDebugLoc(),
1480 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1481 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1482
1483 I.eraseFromParent();
1484 return true;
1485}
1486
1487bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1488 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1489
1490 Register DstReg = I.getOperand(0).getReg();
1491 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1492 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1493 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1494
1495 MachineBasicBlock *MBB = I.getParent();
1496 const DebugLoc &DL = I.getDebugLoc();
1497
1498 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1499
1500 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1501 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1502 MIB.addImm(MFI->getLDSSize());
1503 } else {
1504 Module *M = MF->getFunction().getParent();
1505 const GlobalValue *GV
1506 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1507 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1508 }
1509
1510 I.eraseFromParent();
1511 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1512}
1513
1514bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1515 MachineBasicBlock *MBB = I.getParent();
1516 MachineFunction &MF = *MBB->getParent();
1517 const DebugLoc &DL = I.getDebugLoc();
1518
1519 MachineOperand &Dst = I.getOperand(0);
1520 Register DstReg = Dst.getReg();
1521 unsigned Depth = I.getOperand(2).getImm();
1522
1523 const TargetRegisterClass *RC
1524 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1525 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1526 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1527 return false;
1528
1529 // Check for kernel and shader functions
1530 if (Depth != 0 ||
1531 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1532 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1533 .addImm(0);
1534 I.eraseFromParent();
1535 return true;
1536 }
1537
1538 MachineFrameInfo &MFI = MF.getFrameInfo();
1539 // There is a call to @llvm.returnaddress in this function
1540 MFI.setReturnAddressIsTaken(true);
1541
1542 // Get the return address reg and mark it as an implicit live-in
1543 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1544 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1545 AMDGPU::SReg_64RegClass, DL);
1546 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1547 .addReg(LiveIn);
1548 I.eraseFromParent();
1549 return true;
1550}
1551
1552bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1553 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1554 // SelectionDAG uses for wave32 vs wave64.
1555 MachineBasicBlock *BB = MI.getParent();
1556 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1557 .add(MI.getOperand(1));
1558
1559 Register Reg = MI.getOperand(1).getReg();
1560 MI.eraseFromParent();
1561
1562 if (!MRI->getRegClassOrNull(Reg))
1563 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1564 return true;
1565}
1566
1567bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1568 MachineInstr &MI, Intrinsic::ID IntrID) const {
1569 MachineBasicBlock *MBB = MI.getParent();
1570 MachineFunction *MF = MBB->getParent();
1571 const DebugLoc &DL = MI.getDebugLoc();
1572
1573 unsigned IndexOperand = MI.getOperand(7).getImm();
1574 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1575 bool WaveDone = MI.getOperand(9).getImm() != 0;
1576
1577 if (WaveDone && !WaveRelease)
1578 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1579
1580 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1581 IndexOperand &= ~0x3f;
1582 unsigned CountDw = 0;
1583
1584 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1585 CountDw = (IndexOperand >> 24) & 0xf;
1586 IndexOperand &= ~(0xf << 24);
1587
1588 if (CountDw < 1 || CountDw > 4) {
1590 "ds_ordered_count: dword count must be between 1 and 4");
1591 }
1592 }
1593
1594 if (IndexOperand)
1595 report_fatal_error("ds_ordered_count: bad index operand");
1596
1597 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1598 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1599
1600 unsigned Offset0 = OrderedCountIndex << 2;
1601 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1602
1603 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1604 Offset1 |= (CountDw - 1) << 6;
1605
1606 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1607 Offset1 |= ShaderType << 2;
1608
1609 unsigned Offset = Offset0 | (Offset1 << 8);
1610
1611 Register M0Val = MI.getOperand(2).getReg();
1612 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1613 .addReg(M0Val);
1614
1615 Register DstReg = MI.getOperand(0).getReg();
1616 Register ValReg = MI.getOperand(3).getReg();
1617 MachineInstrBuilder DS =
1618 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1619 .addReg(ValReg)
1620 .addImm(Offset)
1621 .cloneMemRefs(MI);
1622
1623 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1624 return false;
1625
1626 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1627 MI.eraseFromParent();
1628 return Ret;
1629}
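
// Worked example of the offset encoding above (illustrative, subtarget bits
// permitting): for ds_ordered_add with OrderedCountIndex = 0, WaveRelease = 1,
// WaveDone = 0, Instruction = 0, CountDw = 1 and ShaderType = 0,
//   Offset0 = 0 << 2 = 0
//   Offset1 = 1 | (0 << 1) | (0 << 4) | (0 << 6) | (0 << 2) = 1
//   Offset  = Offset0 | (Offset1 << 8) = 0x100
// which is emitted as the immediate operand of DS_ORDERED_COUNT.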
1630
1631static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1632 switch (IntrID) {
1633 case Intrinsic::amdgcn_ds_gws_init:
1634 return AMDGPU::DS_GWS_INIT;
1635 case Intrinsic::amdgcn_ds_gws_barrier:
1636 return AMDGPU::DS_GWS_BARRIER;
1637 case Intrinsic::amdgcn_ds_gws_sema_v:
1638 return AMDGPU::DS_GWS_SEMA_V;
1639 case Intrinsic::amdgcn_ds_gws_sema_br:
1640 return AMDGPU::DS_GWS_SEMA_BR;
1641 case Intrinsic::amdgcn_ds_gws_sema_p:
1642 return AMDGPU::DS_GWS_SEMA_P;
1643 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1644 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1645 default:
1646 llvm_unreachable("not a gws intrinsic");
1647 }
1648}
1649
1650bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1651 Intrinsic::ID IID) const {
1652 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1653 !STI.hasGWSSemaReleaseAll()))
1654 return false;
1655
1656 // intrinsic ID, vsrc, offset
1657 const bool HasVSrc = MI.getNumOperands() == 3;
1658 assert(HasVSrc || MI.getNumOperands() == 2);
1659
1660 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1661 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1662 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1663 return false;
1664
1665 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1666 unsigned ImmOffset;
1667
1668 MachineBasicBlock *MBB = MI.getParent();
1669 const DebugLoc &DL = MI.getDebugLoc();
1670
1671 MachineInstr *Readfirstlane = nullptr;
1672
1673 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1674 // incoming offset, in case there's an add of a constant. We'll have to put it
1675 // back later.
1676 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1677 Readfirstlane = OffsetDef;
1678 BaseOffset = OffsetDef->getOperand(1).getReg();
1679 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1680 }
1681
1682 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1683 // If we have a constant offset, try to use the 0 in m0 as the base.
1684 // TODO: Look into changing the default m0 initialization value. If the
1685 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1686 // the immediate offset.
1687
1688 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1689 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1690 .addImm(0);
1691 } else {
1692 std::tie(BaseOffset, ImmOffset) =
1693 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1694
1695 if (Readfirstlane) {
1696 // We have the constant offset now, so put the readfirstlane back on the
1697 // variable component.
1698 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1699 return false;
1700
1701 Readfirstlane->getOperand(1).setReg(BaseOffset);
1702 BaseOffset = Readfirstlane->getOperand(0).getReg();
1703 } else {
1704 if (!RBI.constrainGenericRegister(BaseOffset,
1705 AMDGPU::SReg_32RegClass, *MRI))
1706 return false;
1707 }
1708
1709 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1710 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1711 .addReg(BaseOffset)
1712 .addImm(16)
1713 .setOperandDead(3); // Dead scc
1714
1715 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1716 .addReg(M0Base);
1717 }
1718
1719 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1720 // offset field) % 64. Some versions of the programming guide omit the m0
1721 // part, or claim it's from offset 0.
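// Put differently (illustrative): a purely constant offset goes entirely into
// the instruction's offset field with M0 set to 0, while a variable offset is
// shifted into M0[21:16] by the S_LSHL_B32 above and any constant part split
// off from it lands in the offset field.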
1722 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1723
1724 if (HasVSrc) {
1725 Register VSrc = MI.getOperand(1).getReg();
1726 MIB.addReg(VSrc);
1727
1728 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1729 return false;
1730 }
1731
1732 MIB.addImm(ImmOffset)
1733 .cloneMemRefs(MI);
1734
1735 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1736
1737 MI.eraseFromParent();
1738 return true;
1739}
1740
1741bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1742 bool IsAppend) const {
1743 Register PtrBase = MI.getOperand(2).getReg();
1744 LLT PtrTy = MRI->getType(PtrBase);
1745 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1746
1747 unsigned Offset;
1748 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1749
1750 // TODO: Should this try to look through readfirstlane like GWS?
1751 if (!isDSOffsetLegal(PtrBase, Offset)) {
1752 PtrBase = MI.getOperand(2).getReg();
1753 Offset = 0;
1754 }
1755
1756 MachineBasicBlock *MBB = MI.getParent();
1757 const DebugLoc &DL = MI.getDebugLoc();
1758 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1759
1760 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1761 .addReg(PtrBase);
1762 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1763 return false;
1764
1765 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1766 .addImm(Offset)
1767 .addImm(IsGDS ? -1 : 0)
1768 .cloneMemRefs(MI);
1769 MI.eraseFromParent();
1770 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1771}
1772
1773bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1774 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1775 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1776 if (WGSize <= STI.getWavefrontSize()) {
1777 MachineBasicBlock *MBB = MI.getParent();
1778 const DebugLoc &DL = MI.getDebugLoc();
1779 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1780 MI.eraseFromParent();
1781 return true;
1782 }
1783 }
1784
1785 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1786 if (STI.hasSplitBarriers()) {
1787 MachineBasicBlock *MBB = MI.getParent();
1788 const DebugLoc &DL = MI.getDebugLoc();
1789 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1790 .addImm(AMDGPU::Barrier::WORKGROUP);
1791 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1792 .addImm(AMDGPU::Barrier::WORKGROUP);
1793 MI.eraseFromParent();
1794 return true;
1795 }
1796
1797 return selectImpl(MI, *CoverageInfo);
1798}
1799
1800static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1801 bool &IsTexFail) {
1802 if (TexFailCtrl)
1803 IsTexFail = true;
1804
1805 TFE = (TexFailCtrl & 0x1) ? true : false;
1806 TexFailCtrl &= ~(uint64_t)0x1;
1807 LWE = (TexFailCtrl & 0x2) ? true : false;
1808 TexFailCtrl &= ~(uint64_t)0x2;
1809
1810 return TexFailCtrl == 0;
1811}
1812
1813bool AMDGPUInstructionSelector::selectImageIntrinsic(
1814 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1815 MachineBasicBlock *MBB = MI.getParent();
1816 const DebugLoc &DL = MI.getDebugLoc();
1817
1818 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1819 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1820
1821 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1822 unsigned IntrOpcode = Intr->BaseOpcode;
1823 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1824 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1825 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1826
1827 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1828
1829 Register VDataIn, VDataOut;
1830 LLT VDataTy;
1831 int NumVDataDwords = -1;
1832 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1833 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1834
1835 bool Unorm;
1836 if (!BaseOpcode->Sampler)
1837 Unorm = true;
1838 else
1839 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1840
1841 bool TFE;
1842 bool LWE;
1843 bool IsTexFail = false;
1844 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1845 TFE, LWE, IsTexFail))
1846 return false;
1847
1848 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1849 const bool IsA16 = (Flags & 1) != 0;
1850 const bool IsG16 = (Flags & 2) != 0;
1851
1852 // A16 implies 16 bit gradients if subtarget doesn't support G16
1853 if (IsA16 && !STI.hasG16() && !IsG16)
1854 return false;
1855
1856 unsigned DMask = 0;
1857 unsigned DMaskLanes = 0;
1858
1859 if (BaseOpcode->Atomic) {
1860 VDataOut = MI.getOperand(0).getReg();
1861 VDataIn = MI.getOperand(2).getReg();
1862 LLT Ty = MRI->getType(VDataIn);
1863
1864 // Be careful to allow atomic swap on 16-bit element vectors.
1865 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1866 Ty.getSizeInBits() == 128 :
1867 Ty.getSizeInBits() == 64;
1868
1869 if (BaseOpcode->AtomicX2) {
1870 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1871
1872 DMask = Is64Bit ? 0xf : 0x3;
1873 NumVDataDwords = Is64Bit ? 4 : 2;
1874 } else {
1875 DMask = Is64Bit ? 0x3 : 0x1;
1876 NumVDataDwords = Is64Bit ? 2 : 1;
1877 }
1878 } else {
1879 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1880 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1881
1882 if (BaseOpcode->Store) {
1883 VDataIn = MI.getOperand(1).getReg();
1884 VDataTy = MRI->getType(VDataIn);
1885 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1886 } else {
1887 VDataOut = MI.getOperand(0).getReg();
1888 VDataTy = MRI->getType(VDataOut);
1889 NumVDataDwords = DMaskLanes;
1890
1891 if (IsD16 && !STI.hasUnpackedD16VMem())
1892 NumVDataDwords = (DMaskLanes + 1) / 2;
1893 }
1894 }
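// Illustrative example for the non-atomic case above: a non-gather load with
// DMask == 0xb has 3 enabled lanes, so it needs 3 result dwords, but with
// packed D16 two 16-bit components share a dword and NumVDataDwords becomes
// (3 + 1) / 2 == 2.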
1895
1896 // Set G16 opcode
1897 if (Subtarget->hasG16() && IsG16) {
1898 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1899 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1900 assert(G16MappingInfo);
1901 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1902 }
1903
1904 // TODO: Check this in verifier.
1905 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1906
1907 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1908 if (BaseOpcode->Atomic)
1909 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1910 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
1911 AMDGPU::CPol::VOLATILE))
1912 return false;
1913
1914 int NumVAddrRegs = 0;
1915 int NumVAddrDwords = 0;
1916 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1917 // Skip the $noregs and 0s inserted during legalization.
1918 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1919 if (!AddrOp.isReg())
1920 continue; // XXX - Break?
1921
1922 Register Addr = AddrOp.getReg();
1923 if (!Addr)
1924 break;
1925
1926 ++NumVAddrRegs;
1927 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1928 }
1929
1930 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1931 // NSA, these should have been packed into a single value in the first
1932 // address register
1933 const bool UseNSA =
1934 NumVAddrRegs != 1 &&
1935 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
1936 : NumVAddrDwords == NumVAddrRegs);
1937 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1938 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1939 return false;
1940 }
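// Illustrative example: three 32-bit address registers give NumVAddrRegs == 3
// and NumVAddrDwords == 3, so the NSA form is chosen (when the subtarget has
// it); if the legalizer instead packed the coordinates into one 96-bit
// register, NumVAddrRegs == 1 and the non-NSA encoding is used.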
1941
1942 if (IsTexFail)
1943 ++NumVDataDwords;
1944
1945 int Opcode = -1;
1946 if (IsGFX12Plus) {
1947 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
1948 NumVDataDwords, NumVAddrDwords);
1949 } else if (IsGFX11Plus) {
1950 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1951 UseNSA ? AMDGPU::MIMGEncGfx11NSA
1952 : AMDGPU::MIMGEncGfx11Default,
1953 NumVDataDwords, NumVAddrDwords);
1954 } else if (IsGFX10Plus) {
1955 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1956 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1957 : AMDGPU::MIMGEncGfx10Default,
1958 NumVDataDwords, NumVAddrDwords);
1959 } else {
1960 if (Subtarget->hasGFX90AInsts()) {
1961 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
1962 NumVDataDwords, NumVAddrDwords);
1963 if (Opcode == -1) {
1964 LLVM_DEBUG(
1965 dbgs()
1966 << "requested image instruction is not supported on this GPU\n");
1967 return false;
1968 }
1969 }
1970 if (Opcode == -1 &&
1971 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1972 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1973 NumVDataDwords, NumVAddrDwords);
1974 if (Opcode == -1)
1975 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1976 NumVDataDwords, NumVAddrDwords);
1977 }
1978 if (Opcode == -1)
1979 return false;
1980
1981 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1982 .cloneMemRefs(MI);
1983
1984 if (VDataOut) {
1985 if (BaseOpcode->AtomicX2) {
1986 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1987
1988 Register TmpReg = MRI->createVirtualRegister(
1989 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1990 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1991
1992 MIB.addDef(TmpReg);
1993 if (!MRI->use_empty(VDataOut)) {
1994 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1995 .addReg(TmpReg, RegState::Kill, SubReg);
1996 }
1997
1998 } else {
1999 MIB.addDef(VDataOut); // vdata output
2000 }
2001 }
2002
2003 if (VDataIn)
2004 MIB.addReg(VDataIn); // vdata input
2005
2006 for (int I = 0; I != NumVAddrRegs; ++I) {
2007 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2008 if (SrcOp.isReg()) {
2009 assert(SrcOp.getReg() != 0);
2010 MIB.addReg(SrcOp.getReg());
2011 }
2012 }
2013
2014 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2015 if (BaseOpcode->Sampler)
2016 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2017
2018 MIB.addImm(DMask); // dmask
2019
2020 if (IsGFX10Plus)
2021 MIB.addImm(DimInfo->Encoding);
2022 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2023 MIB.addImm(Unorm);
2024
2025 MIB.addImm(CPol);
2026 MIB.addImm(IsA16 && // a16 or r128
2027 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2028 if (IsGFX10Plus)
2029 MIB.addImm(IsA16 ? -1 : 0);
2030
2031 if (!Subtarget->hasGFX90AInsts()) {
2032 MIB.addImm(TFE); // tfe
2033 } else if (TFE) {
2034 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2035 return false;
2036 }
2037
2038 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2039 MIB.addImm(LWE); // lwe
2040 if (!IsGFX10Plus)
2041 MIB.addImm(DimInfo->DA ? -1 : 0);
2042 if (BaseOpcode->HasD16)
2043 MIB.addImm(IsD16 ? -1 : 0);
2044
2045 MI.eraseFromParent();
2046 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2047 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2048 return true;
2049}
2050
2051// We need to handle this here because tablegen doesn't support matching
2052// instructions with multiple outputs.
2053bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2054 MachineInstr &MI) const {
2055 Register Dst0 = MI.getOperand(0).getReg();
2056 Register Dst1 = MI.getOperand(1).getReg();
2057
2058 const DebugLoc &DL = MI.getDebugLoc();
2059 MachineBasicBlock *MBB = MI.getParent();
2060
2061 Register Addr = MI.getOperand(3).getReg();
2062 Register Data0 = MI.getOperand(4).getReg();
2063 Register Data1 = MI.getOperand(5).getReg();
2064 unsigned Offset = MI.getOperand(6).getImm();
2065
2066 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2067 .addDef(Dst1)
2068 .addUse(Addr)
2069 .addUse(Data0)
2070 .addUse(Data1)
2071 .addImm(Offset)
2072 .cloneMemRefs(MI);
2073
2074 MI.eraseFromParent();
2075 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2076}
2077
2078bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2079 MachineInstr &I) const {
2080 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2081 switch (IntrinsicID) {
2082 case Intrinsic::amdgcn_end_cf:
2083 return selectEndCfIntrinsic(I);
2084 case Intrinsic::amdgcn_ds_ordered_add:
2085 case Intrinsic::amdgcn_ds_ordered_swap:
2086 return selectDSOrderedIntrinsic(I, IntrinsicID);
2087 case Intrinsic::amdgcn_ds_gws_init:
2088 case Intrinsic::amdgcn_ds_gws_barrier:
2089 case Intrinsic::amdgcn_ds_gws_sema_v:
2090 case Intrinsic::amdgcn_ds_gws_sema_br:
2091 case Intrinsic::amdgcn_ds_gws_sema_p:
2092 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2093 return selectDSGWSIntrinsic(I, IntrinsicID);
2094 case Intrinsic::amdgcn_ds_append:
2095 return selectDSAppendConsume(I, true);
2096 case Intrinsic::amdgcn_ds_consume:
2097 return selectDSAppendConsume(I, false);
2098 case Intrinsic::amdgcn_s_barrier:
2099 return selectSBarrier(I);
2100 case Intrinsic::amdgcn_raw_buffer_load_lds:
2101 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2102 case Intrinsic::amdgcn_struct_buffer_load_lds:
2103 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2104 return selectBufferLoadLds(I);
2105 case Intrinsic::amdgcn_global_load_lds:
2106 return selectGlobalLoadLds(I);
2107 case Intrinsic::amdgcn_exp_compr:
2108 if (!STI.hasCompressedExport()) {
2109 Function &F = I.getMF()->getFunction();
2110 DiagnosticInfoUnsupported NoFpRet(
2111 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2112 F.getContext().diagnose(NoFpRet);
2113 return false;
2114 }
2115 break;
2116 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2117 return selectDSBvhStackIntrinsic(I);
2118 case Intrinsic::amdgcn_s_barrier_init:
2119 case Intrinsic::amdgcn_s_barrier_join:
2120 case Intrinsic::amdgcn_s_wakeup_barrier:
2121 case Intrinsic::amdgcn_s_get_barrier_state:
2122 return selectNamedBarrierInst(I, IntrinsicID);
2123 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2124 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
2125 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2126 case Intrinsic::amdgcn_s_barrier_leave:
2127 return selectSBarrierLeave(I);
2128 }
2129 return selectImpl(I, *CoverageInfo);
2130}
2131
2132bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2133 if (selectImpl(I, *CoverageInfo))
2134 return true;
2135
2136 MachineBasicBlock *BB = I.getParent();
2137 const DebugLoc &DL = I.getDebugLoc();
2138
2139 Register DstReg = I.getOperand(0).getReg();
2140 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2141 assert(Size <= 32 || Size == 64);
2142 const MachineOperand &CCOp = I.getOperand(1);
2143 Register CCReg = CCOp.getReg();
2144 if (!isVCC(CCReg, *MRI)) {
2145 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2146 AMDGPU::S_CSELECT_B32;
2147 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2148 .addReg(CCReg);
2149
2150 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2151 // bank, because it does not cover the register class that we used to represent
2152 // for it. So we need to manually set the register class here.
2153 if (!MRI->getRegClassOrNull(CCReg))
2154 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2155 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2156 .add(I.getOperand(2))
2157 .add(I.getOperand(3));
2158
2159 bool Ret = false;
2160 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2161 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2162 I.eraseFromParent();
2163 return Ret;
2164 }
2165
2166 // Wide VGPR select should have been split in RegBankSelect.
2167 if (Size > 32)
2168 return false;
2169
2170 MachineInstr *Select =
2171 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2172 .addImm(0)
2173 .add(I.getOperand(3))
2174 .addImm(0)
2175 .add(I.getOperand(2))
2176 .add(I.getOperand(1));
2177
2178 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2179 I.eraseFromParent();
2180 return Ret;
2181}
2182
2183static int sizeToSubRegIndex(unsigned Size) {
2184 switch (Size) {
2185 case 32:
2186 return AMDGPU::sub0;
2187 case 64:
2188 return AMDGPU::sub0_sub1;
2189 case 96:
2190 return AMDGPU::sub0_sub1_sub2;
2191 case 128:
2192 return AMDGPU::sub0_sub1_sub2_sub3;
2193 case 256:
2194 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2195 default:
2196 if (Size < 32)
2197 return AMDGPU::sub0;
2198 if (Size > 256)
2199 return -1;
2200 return sizeToSubRegIndex(llvm::bit_ceil(Size));
2201 }
2202}
2203
2204bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2205 Register DstReg = I.getOperand(0).getReg();
2206 Register SrcReg = I.getOperand(1).getReg();
2207 const LLT DstTy = MRI->getType(DstReg);
2208 const LLT SrcTy = MRI->getType(SrcReg);
2209 const LLT S1 = LLT::scalar(1);
2210
2211 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2212 const RegisterBank *DstRB;
2213 if (DstTy == S1) {
2214 // This is a special case. We don't treat s1 for legalization artifacts as
2215 // vcc booleans.
2216 DstRB = SrcRB;
2217 } else {
2218 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2219 if (SrcRB != DstRB)
2220 return false;
2221 }
2222
2223 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2224
2225 unsigned DstSize = DstTy.getSizeInBits();
2226 unsigned SrcSize = SrcTy.getSizeInBits();
2227
2228 const TargetRegisterClass *SrcRC =
2229 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2230 const TargetRegisterClass *DstRC =
2231 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2232 if (!SrcRC || !DstRC)
2233 return false;
2234
2235 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2236 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2237 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2238 return false;
2239 }
2240
2241 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2242 MachineBasicBlock *MBB = I.getParent();
2243 const DebugLoc &DL = I.getDebugLoc();
2244
2245 Register LoReg = MRI->createVirtualRegister(DstRC);
2246 Register HiReg = MRI->createVirtualRegister(DstRC);
2247 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2248 .addReg(SrcReg, 0, AMDGPU::sub0);
2249 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2250 .addReg(SrcReg, 0, AMDGPU::sub1);
2251
2252 if (IsVALU && STI.hasSDWA()) {
2253 // Write the low 16-bits of the high element into the high 16-bits of the
2254 // low element.
2255 MachineInstr *MovSDWA =
2256 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2257 .addImm(0) // $src0_modifiers
2258 .addReg(HiReg) // $src0
2259 .addImm(0) // $clamp
2260 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2261 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2262 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2263 .addReg(LoReg, RegState::Implicit);
2264 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2265 } else {
2266 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2267 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2268 Register ImmReg = MRI->createVirtualRegister(DstRC);
2269 if (IsVALU) {
2270 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2271 .addImm(16)
2272 .addReg(HiReg);
2273 } else {
2274 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2275 .addReg(HiReg)
2276 .addImm(16)
2277 .setOperandDead(3); // Dead scc
2278 }
2279
2280 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2281 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2282 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2283
2284 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2285 .addImm(0xffff);
2286 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2287 .addReg(LoReg)
2288 .addReg(ImmReg);
2289 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2290 .addReg(TmpReg0)
2291 .addReg(TmpReg1);
2292
2293 if (!IsVALU) {
2294 And.setOperandDead(3); // Dead scc
2295 Or.setOperandDead(3); // Dead scc
2296 }
2297 }
2298
2299 I.eraseFromParent();
2300 return true;
2301 }
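// The fixed_vector(2, 16) path above packs the two truncated halves into one
// 32-bit register; illustratively, with Lo == 0x1234aaaa and Hi == 0x5678bbbb
// the non-SDWA sequence computes (Hi << 16) | (Lo & 0xffff) == 0xbbbbaaaa,
// while the SDWA mov writes Hi's low word straight into Dst's high word.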
2302
2303 if (!DstTy.isScalar())
2304 return false;
2305
2306 if (SrcSize > 32) {
2307 int SubRegIdx = sizeToSubRegIndex(DstSize);
2308 if (SubRegIdx == -1)
2309 return false;
2310
2311 // Deal with weird cases where the class only partially supports the subreg
2312 // index.
2313 const TargetRegisterClass *SrcWithSubRC
2314 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2315 if (!SrcWithSubRC)
2316 return false;
2317
2318 if (SrcWithSubRC != SrcRC) {
2319 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2320 return false;
2321 }
2322
2323 I.getOperand(1).setSubReg(SubRegIdx);
2324 }
2325
2326 I.setDesc(TII.get(TargetOpcode::COPY));
2327 return true;
2328}
2329
2330/// \returns true if a bitmask for \p Size bits will be an inline immediate.
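/// For example (illustrative), Size == 4 gives Mask == 0xf (an inline
/// immediate, so the callers prefer an AND), Size == 16 gives Mask == 0xffff
/// (not inline, so a BFE is used instead), and Size == 32 gives
/// SignedMask == -1, which is inline again.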
2331static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2332 Mask = maskTrailingOnes<unsigned>(Size);
2333 int SignedMask = static_cast<int>(Mask);
2334 return SignedMask >= -16 && SignedMask <= 64;
2335}
2336
2337// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2338const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2339 Register Reg, const MachineRegisterInfo &MRI,
2340 const TargetRegisterInfo &TRI) const {
2341 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2342 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2343 return RB;
2344
2345 // Ignore the type, since we don't use vcc in artifacts.
2346 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2347 return &RBI.getRegBankFromRegClass(*RC, LLT());
2348 return nullptr;
2349}
2350
2351bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2352 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2353 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2354 const DebugLoc &DL = I.getDebugLoc();
2355 MachineBasicBlock &MBB = *I.getParent();
2356 const Register DstReg = I.getOperand(0).getReg();
2357 const Register SrcReg = I.getOperand(1).getReg();
2358
2359 const LLT DstTy = MRI->getType(DstReg);
2360 const LLT SrcTy = MRI->getType(SrcReg);
2361 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2362 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2363 const unsigned DstSize = DstTy.getSizeInBits();
2364 if (!DstTy.isScalar())
2365 return false;
2366
2367 // Artifact casts should never use vcc.
2368 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2369
2370 // FIXME: This should probably be illegal and split earlier.
2371 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2372 if (DstSize <= 32)
2373 return selectCOPY(I);
2374
2375 const TargetRegisterClass *SrcRC =
2376 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2377 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2378 const TargetRegisterClass *DstRC =
2379 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2380
2381 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2382 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2383 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2384 .addReg(SrcReg)
2385 .addImm(AMDGPU::sub0)
2386 .addReg(UndefReg)
2387 .addImm(AMDGPU::sub1);
2388 I.eraseFromParent();
2389
2390 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2391 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2392 }
2393
2394 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2395 // 64-bit should have been split up in RegBankSelect
2396
2397 // Try to use an and with a mask if it will save code size.
2398 unsigned Mask;
2399 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2400 MachineInstr *ExtI =
2401 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2402 .addImm(Mask)
2403 .addReg(SrcReg);
2404 I.eraseFromParent();
2405 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2406 }
2407
2408 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2409 MachineInstr *ExtI =
2410 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2411 .addReg(SrcReg)
2412 .addImm(0) // Offset
2413 .addImm(SrcSize); // Width
2414 I.eraseFromParent();
2415 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2416 }
2417
2418 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2419 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2420 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2421 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2422 return false;
2423
2424 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2425 const unsigned SextOpc = SrcSize == 8 ?
2426 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2427 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2428 .addReg(SrcReg);
2429 I.eraseFromParent();
2430 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2431 }
2432
2433 // Using a single 32-bit SALU to calculate the high half is smaller than
2434 // S_BFE with a literal constant operand.
2435 if (DstSize > 32 && SrcSize == 32) {
2436 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2437 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2438 if (Signed) {
2439 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2440 .addReg(SrcReg, 0, SubReg)
2441 .addImm(31)
2442 .setOperandDead(3); // Dead scc
2443 } else {
2444 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2445 .addImm(0);
2446 }
2447 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2448 .addReg(SrcReg, 0, SubReg)
2449 .addImm(AMDGPU::sub0)
2450 .addReg(HiReg)
2451 .addImm(AMDGPU::sub1);
2452 I.eraseFromParent();
2453 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2454 *MRI);
2455 }
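// Illustrative example for the path above: sign-extending SrcReg == 0x80000000
// produces HiReg == 0xffffffff via the arithmetic shift by 31, while the
// zero-extend case simply moves 0 into HiReg; the REG_SEQUENCE then forms the
// 64-bit result.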
2456
2457 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2458 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2459
2460 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2461 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2462 // We need a 64-bit register source, but the high bits don't matter.
2463 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2464 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2465 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2466
2467 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2468 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2469 .addReg(SrcReg, 0, SubReg)
2470 .addImm(AMDGPU::sub0)
2471 .addReg(UndefReg)
2472 .addImm(AMDGPU::sub1);
2473
2474 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2475 .addReg(ExtReg)
2476 .addImm(SrcSize << 16);
2477
2478 I.eraseFromParent();
2479 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2480 }
2481
2482 unsigned Mask;
2483 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2484 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2485 .addReg(SrcReg)
2486 .addImm(Mask)
2487 .setOperandDead(3); // Dead scc
2488 } else {
2489 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2490 .addReg(SrcReg)
2491 .addImm(SrcSize << 16);
2492 }
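// Illustrative example: for SrcSize == 16 the packed BFE operand is
// 16 << 16 == 0x100000, i.e. offset 0 in bits [5:0] and width 16 in bits
// [22:16], matching the encoding described above.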
2493
2494 I.eraseFromParent();
2495 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2496 }
2497
2498 return false;
2499}
2500
2501 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2502 Register &Out) {
2503 Register LShlSrc;
2504 if (mi_match(In, MRI,
2505 m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2506 Out = LShlSrc;
2507 return true;
2508 }
2509 return false;
2510}
2511
2512bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2513 if (!Subtarget->hasSALUFloatInsts())
2514 return false;
2515
2516 Register Dst = I.getOperand(0).getReg();
2517 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2518 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2519 return false;
2520
2521 Register Src = I.getOperand(1).getReg();
2522
2523 if (MRI->getType(Dst) == LLT::scalar(32) &&
2524 MRI->getType(Src) == LLT::scalar(16)) {
2525 if (isExtractHiElt(*MRI, Src, Src)) {
2526 MachineBasicBlock *BB = I.getParent();
2527 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2528 .addUse(Src);
2529 I.eraseFromParent();
2530 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2531 }
2532 }
2533
2534 return false;
2535}
2536
2537bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2538 MachineBasicBlock *BB = I.getParent();
2539 MachineOperand &ImmOp = I.getOperand(1);
2540 Register DstReg = I.getOperand(0).getReg();
2541 unsigned Size = MRI->getType(DstReg).getSizeInBits();
2542 bool IsFP = false;
2543
2544 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2545 if (ImmOp.isFPImm()) {
2546 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2547 ImmOp.ChangeToImmediate(Imm.getZExtValue());
2548 IsFP = true;
2549 } else if (ImmOp.isCImm()) {
2550 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2551 } else {
2552 llvm_unreachable("Not supported by g_constants");
2553 }
2554
2555 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2556 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2557
2558 unsigned Opcode;
2559 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2560 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2561 } else if (Size == 64 &&
2562 AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
2563 Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
2564 I.setDesc(TII.get(Opcode));
2565 I.addImplicitDefUseOperands(*MF);
2566 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2567 } else {
2568 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2569
2570 // We should never produce s1 values on banks other than VCC. If the user of
2571 // this already constrained the register, we may incorrectly think it's VCC
2572 // if it wasn't originally.
2573 if (Size == 1)
2574 return false;
2575 }
2576
2577 if (Size != 64) {
2578 I.setDesc(TII.get(Opcode));
2579 I.addImplicitDefUseOperands(*MF);
2580 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2581 }
2582
2583 const DebugLoc &DL = I.getDebugLoc();
2584
2585 APInt Imm(Size, I.getOperand(1).getImm());
2586
2587 MachineInstr *ResInst;
2588 if (IsSgpr && TII.isInlineConstant(Imm)) {
2589 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2590 .addImm(I.getOperand(1).getImm());
2591 } else {
2592 const TargetRegisterClass *RC = IsSgpr ?
2593 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2594 Register LoReg = MRI->createVirtualRegister(RC);
2595 Register HiReg = MRI->createVirtualRegister(RC);
2596
2597 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2598 .addImm(Imm.trunc(32).getZExtValue());
2599
2600 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2601 .addImm(Imm.ashr(32).getZExtValue());
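// Illustrative example: Imm == 0x123456789abcdef0 splits into LoReg ==
// 0x9abcdef0 and HiReg == 0x12345678; using ashr(32) means a negative
// constant such as -1 yields -1 for the high half as well.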
2602
2603 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2604 .addReg(LoReg)
2605 .addImm(AMDGPU::sub0)
2606 .addReg(HiReg)
2607 .addImm(AMDGPU::sub1);
2608 }
2609
2610 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2611 // work for target independent opcodes
2612 I.eraseFromParent();
2613 const TargetRegisterClass *DstRC =
2614 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2615 if (!DstRC)
2616 return true;
2617 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2618}
2619
2620bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2621 // Only manually handle the f64 SGPR case.
2622 //
2623 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2624 // the bit ops theoretically have a second result due to the implicit def of
2625 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2626 // that is easy by disabling the check. The result works, but uses a
2627 // nonsensical sreg32orlds_and_sreg_1 regclass.
2628 //
2629 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2630 // the variadic REG_SEQUENCE operands.
2631
2632 Register Dst = MI.getOperand(0).getReg();
2633 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2634 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2635 MRI->getType(Dst) != LLT::scalar(64))
2636 return false;
2637
2638 Register Src = MI.getOperand(1).getReg();
2639 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2640 if (Fabs)
2641 Src = Fabs->getOperand(1).getReg();
2642
2643 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2644 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2645 return false;
2646
2647 MachineBasicBlock *BB = MI.getParent();
2648 const DebugLoc &DL = MI.getDebugLoc();
2649 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2650 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2651 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2652 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2653
2654 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2655 .addReg(Src, 0, AMDGPU::sub0);
2656 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2657 .addReg(Src, 0, AMDGPU::sub1);
2658 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2659 .addImm(0x80000000);
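// The f64 sign bit is bit 63, i.e. bit 31 of the high dword, so only HiReg
// needs the 0x80000000 mask applied below; the low dword is copied unchanged.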
2660
2661 // Set or toggle sign bit.
2662 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2663 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2664 .addReg(HiReg)
2665 .addReg(ConstReg)
2666 .setOperandDead(3); // Dead scc
2667 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2668 .addReg(LoReg)
2669 .addImm(AMDGPU::sub0)
2670 .addReg(OpReg)
2671 .addImm(AMDGPU::sub1);
2672 MI.eraseFromParent();
2673 return true;
2674}
2675
2676// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2677bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2678 Register Dst = MI.getOperand(0).getReg();
2679 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2680 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2681 MRI->getType(Dst) != LLT::scalar(64))
2682 return false;
2683
2684 Register Src = MI.getOperand(1).getReg();
2685 MachineBasicBlock *BB = MI.getParent();
2686 const DebugLoc &DL = MI.getDebugLoc();
2687 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2688 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2689 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2690 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2691
2692 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2693 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2694 return false;
2695
2696 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2697 .addReg(Src, 0, AMDGPU::sub0);
2698 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2699 .addReg(Src, 0, AMDGPU::sub1);
2700 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2701 .addImm(0x7fffffff);
2702
2703 // Clear sign bit.
2704 // TODO: Should this used S_BITSET0_*?
2705 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2706 .addReg(HiReg)
2707 .addReg(ConstReg)
2708 .setOperandDead(3); // Dead scc
2709 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2710 .addReg(LoReg)
2711 .addImm(AMDGPU::sub0)
2712 .addReg(OpReg)
2713 .addImm(AMDGPU::sub1);
2714
2715 MI.eraseFromParent();
2716 return true;
2717}
2718
2719static bool isConstant(const MachineInstr &MI) {
2720 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2721}
2722
2723void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2724 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2725
2726 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2727 const MachineInstr *PtrMI =
2728 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2729
2730 assert(PtrMI);
2731
2732 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2733 return;
2734
2735 GEPInfo GEPInfo;
2736
2737 for (unsigned i = 1; i != 3; ++i) {
2738 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2739 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2740 assert(OpDef);
2741 if (i == 2 && isConstant(*OpDef)) {
2742 // TODO: Could handle constant base + variable offset, but a combine
2743 // probably should have commuted it.
2744 assert(GEPInfo.Imm == 0);
2745 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2746 continue;
2747 }
2748 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2749 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2750 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2751 else
2752 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2753 }
2754
2755 AddrInfo.push_back(GEPInfo);
2756 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2757}
2758
2759bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2760 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2761}
2762
2763bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2764 if (!MI.hasOneMemOperand())
2765 return false;
2766
2767 const MachineMemOperand *MMO = *MI.memoperands_begin();
2768 const Value *Ptr = MMO->getValue();
2769
2770 // UndefValue means this is a load of a kernel input. These are uniform.
2771 // Sometimes LDS instructions have constant pointers.
2772 // If Ptr is null, then that means this mem operand contains a
2773 // PseudoSourceValue like GOT.
2774 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2775 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2776 return true;
2777
2778 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2779 return true;
2780
2781 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2782 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2783 AMDGPU::SGPRRegBankID;
2784
2785 const Instruction *I = dyn_cast<Instruction>(Ptr);
2786 return I && I->getMetadata("amdgpu.uniform");
2787}
2788
2789bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2790 for (const GEPInfo &GEPInfo : AddrInfo) {
2791 if (!GEPInfo.VgprParts.empty())
2792 return true;
2793 }
2794 return false;
2795}
2796
2797void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2798 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2799 unsigned AS = PtrTy.getAddressSpace();
2800 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2801 STI.ldsRequiresM0Init()) {
2802 MachineBasicBlock *BB = I.getParent();
2803
2804 // If DS instructions require M0 initialization, insert it before selecting.
2805 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2806 .addImm(-1);
2807 }
2808}
2809
2810bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2811 MachineInstr &I) const {
2812 initM0(I);
2813 return selectImpl(I, *CoverageInfo);
2814}
2815
2816 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2817 if (Reg.isPhysical())
2818 return false;
2819
2820 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2821 const unsigned Opcode = MI.getOpcode();
2822
2823 if (Opcode == AMDGPU::COPY)
2824 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2825
2826 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2827 Opcode == AMDGPU::G_XOR)
2828 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2829 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2830
2831 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2832 return GI->is(Intrinsic::amdgcn_class);
2833
2834 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2835}
2836
2837bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2838 MachineBasicBlock *BB = I.getParent();
2839 MachineOperand &CondOp = I.getOperand(0);
2840 Register CondReg = CondOp.getReg();
2841 const DebugLoc &DL = I.getDebugLoc();
2842
2843 unsigned BrOpcode;
2844 Register CondPhysReg;
2845 const TargetRegisterClass *ConstrainRC;
2846
2847 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2848 // whether the branch is uniform when selecting the instruction. In
2849 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2850 // RegBankSelect knows what it's doing if the branch condition is scc, even
2851 // though it currently does not.
2852 if (!isVCC(CondReg, *MRI)) {
2853 if (MRI->getType(CondReg) != LLT::scalar(32))
2854 return false;
2855
2856 CondPhysReg = AMDGPU::SCC;
2857 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2858 ConstrainRC = &AMDGPU::SReg_32RegClass;
2859 } else {
2860 // FIXME: Should scc->vcc copies and with exec?
2861
2862 // Unless the value of CondReg is a result of a V_CMP* instruction then we
2863 // need to insert an and with exec.
2864 if (!isVCmpResult(CondReg, *MRI)) {
2865 const bool Is64 = STI.isWave64();
2866 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2867 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2868
2869 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2870 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2871 .addReg(CondReg)
2872 .addReg(Exec)
2873 .setOperandDead(3); // Dead scc
2874 CondReg = TmpReg;
2875 }
2876
2877 CondPhysReg = TRI.getVCC();
2878 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2879 ConstrainRC = TRI.getBoolRC();
2880 }
2881
2882 if (!MRI->getRegClassOrNull(CondReg))
2883 MRI->setRegClass(CondReg, ConstrainRC);
2884
2885 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2886 .addReg(CondReg);
2887 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2888 .addMBB(I.getOperand(1).getMBB());
2889
2890 I.eraseFromParent();
2891 return true;
2892}
2893
2894bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2895 MachineInstr &I) const {
2896 Register DstReg = I.getOperand(0).getReg();
2897 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2898 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2899 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2900 if (IsVGPR)
2901 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2902
2903 return RBI.constrainGenericRegister(
2904 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2905}
2906
2907bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2908 Register DstReg = I.getOperand(0).getReg();
2909 Register SrcReg = I.getOperand(1).getReg();
2910 Register MaskReg = I.getOperand(2).getReg();
2911 LLT Ty = MRI->getType(DstReg);
2912 LLT MaskTy = MRI->getType(MaskReg);
2913 MachineBasicBlock *BB = I.getParent();
2914 const DebugLoc &DL = I.getDebugLoc();
2915
2916 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2917 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2918 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2919 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2920 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2921 return false;
2922
2923 // Try to avoid emitting a bit operation when we only need to touch half of
2924 // the 64-bit pointer.
2925 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2926 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2927 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2928
2929 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2930 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
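// Illustrative example: masking a 64-bit pointer with 0xffffffffffff0000
// (e.g. aligning down to 64 KiB) has an all-ones high half, so only the low
// half needs a real AND and the high half is just copied through below.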
2931
2932 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2933 !CanCopyLow32 && !CanCopyHi32) {
2934 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2935 .addReg(SrcReg)
2936 .addReg(MaskReg)
2937 .setOperandDead(3); // Dead scc
2938 I.eraseFromParent();
2939 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2940 }
2941
2942 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2943 const TargetRegisterClass &RegRC
2944 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2945
2946 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2947 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2948 const TargetRegisterClass *MaskRC =
2949 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2950
2951 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2952 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2953 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2954 return false;
2955
2956 if (Ty.getSizeInBits() == 32) {
2957 assert(MaskTy.getSizeInBits() == 32 &&
2958 "ptrmask should have been narrowed during legalize");
2959
2960 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2961 .addReg(SrcReg)
2962 .addReg(MaskReg);
2963
2964 if (!IsVGPR)
2965 NewOp.setOperandDead(3); // Dead scc
2966 I.eraseFromParent();
2967 return true;
2968 }
2969
2970 Register HiReg = MRI->createVirtualRegister(&RegRC);
2971 Register LoReg = MRI->createVirtualRegister(&RegRC);
2972
2973 // Extract the subregisters from the source pointer.
2974 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2975 .addReg(SrcReg, 0, AMDGPU::sub0);
2976 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2977 .addReg(SrcReg, 0, AMDGPU::sub1);
2978
2979 Register MaskedLo, MaskedHi;
2980
2981 if (CanCopyLow32) {
2982 // If all the bits in the low half are 1, we only need a copy for it.
2983 MaskedLo = LoReg;
2984 } else {
2985 // Extract the mask subregister and apply the and.
2986 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2987 MaskedLo = MRI->createVirtualRegister(&RegRC);
2988
2989 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2990 .addReg(MaskReg, 0, AMDGPU::sub0);
2991 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2992 .addReg(LoReg)
2993 .addReg(MaskLo);
2994 }
2995
2996 if (CanCopyHi32) {
2997 // If all the bits in the high half are 1, we only need a copy for it.
2998 MaskedHi = HiReg;
2999 } else {
3000 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3001 MaskedHi = MRI->createVirtualRegister(&RegRC);
3002
3003 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3004 .addReg(MaskReg, 0, AMDGPU::sub1);
3005 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3006 .addReg(HiReg)
3007 .addReg(MaskHi);
3008 }
3009
3010 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3011 .addReg(MaskedLo)
3012 .addImm(AMDGPU::sub0)
3013 .addReg(MaskedHi)
3014 .addImm(AMDGPU::sub1);
3015 I.eraseFromParent();
3016 return true;
3017}
3018
3019/// Return the register to use for the index value, and the subregister to use
3020/// for the indirectly accessed register.
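/// For example (illustrative), indexing 32-bit elements of a 128-bit super
/// register with IdxReg == Base + 2 returns {Base, AMDGPU::sub2}, so the
/// dynamic index only has to cover the base and the constant part is folded
/// into the subregister.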
3021static std::pair<Register, unsigned>
3022 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3023 const TargetRegisterClass *SuperRC, Register IdxReg,
3024 unsigned EltSize, GISelKnownBits &KnownBits) {
3025 Register IdxBaseReg;
3026 int Offset;
3027
3028 std::tie(IdxBaseReg, Offset) =
3029 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3030 if (IdxBaseReg == AMDGPU::NoRegister) {
3031 // This will happen if the index is a known constant. This should ordinarily
3032 // be legalized out, but handle it as a register just in case.
3033 assert(Offset == 0);
3034 IdxBaseReg = IdxReg;
3035 }
3036
3037 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3038
3039 // Skip out of bounds offsets, or else we would end up using an undefined
3040 // register.
3041 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3042 return std::pair(IdxReg, SubRegs[0]);
3043 return std::pair(IdxBaseReg, SubRegs[Offset]);
3044}
3045
3046bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3047 MachineInstr &MI) const {
3048 Register DstReg = MI.getOperand(0).getReg();
3049 Register SrcReg = MI.getOperand(1).getReg();
3050 Register IdxReg = MI.getOperand(2).getReg();
3051
3052 LLT DstTy = MRI->getType(DstReg);
3053 LLT SrcTy = MRI->getType(SrcReg);
3054
3055 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3056 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3057 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3058
3059 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3060 // into a waterfall loop.
3061 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3062 return false;
3063
3064 const TargetRegisterClass *SrcRC =
3065 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3066 const TargetRegisterClass *DstRC =
3067 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3068 if (!SrcRC || !DstRC)
3069 return false;
3070 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3071 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3072 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3073 return false;
3074
3075 MachineBasicBlock *BB = MI.getParent();
3076 const DebugLoc &DL = MI.getDebugLoc();
3077 const bool Is64 = DstTy.getSizeInBits() == 64;
3078
3079 unsigned SubReg;
3080 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3081 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3082
3083 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3084 if (DstTy.getSizeInBits() != 32 && !Is64)
3085 return false;
3086
3087 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3088 .addReg(IdxReg);
3089
3090 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3091 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3092 .addReg(SrcReg, 0, SubReg)
3093 .addReg(SrcReg, RegState::Implicit);
3094 MI.eraseFromParent();
3095 return true;
3096 }
3097
3098 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3099 return false;
3100
3101 if (!STI.useVGPRIndexMode()) {
3102 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3103 .addReg(IdxReg);
3104 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3105 .addReg(SrcReg, 0, SubReg)
3106 .addReg(SrcReg, RegState::Implicit);
3107 MI.eraseFromParent();
3108 return true;
3109 }
3110
3111 const MCInstrDesc &GPRIDXDesc =
3112 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3113 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3114 .addReg(SrcReg)
3115 .addReg(IdxReg)
3116 .addImm(SubReg);
3117
3118 MI.eraseFromParent();
3119 return true;
3120}
3121
3122// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3123bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3124 MachineInstr &MI) const {
3125 Register DstReg = MI.getOperand(0).getReg();
3126 Register VecReg = MI.getOperand(1).getReg();
3127 Register ValReg = MI.getOperand(2).getReg();
3128 Register IdxReg = MI.getOperand(3).getReg();
3129
3130 LLT VecTy = MRI->getType(DstReg);
3131 LLT ValTy = MRI->getType(ValReg);
3132 unsigned VecSize = VecTy.getSizeInBits();
3133 unsigned ValSize = ValTy.getSizeInBits();
3134
3135 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3136 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3137 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3138
3139 assert(VecTy.getElementType() == ValTy);
3140
3141 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3142 // into a waterfall loop.
3143 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3144 return false;
3145
3146 const TargetRegisterClass *VecRC =
3147 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3148 const TargetRegisterClass *ValRC =
3149 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3150
3151 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3152 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3153 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3154 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3155 return false;
3156
3157 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3158 return false;
3159
3160 unsigned SubReg;
3161 std::tie(IdxReg, SubReg) =
3162 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3163
3164 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3165 STI.useVGPRIndexMode();
3166
3167 MachineBasicBlock *BB = MI.getParent();
3168 const DebugLoc &DL = MI.getDebugLoc();
3169
3170 if (!IndexMode) {
3171 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3172 .addReg(IdxReg);
3173
3174 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3175 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3176 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3177 .addReg(VecReg)
3178 .addReg(ValReg)
3179 .addImm(SubReg);
3180 MI.eraseFromParent();
3181 return true;
3182 }
3183
3184 const MCInstrDesc &GPRIDXDesc =
3185 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3186 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3187 .addReg(VecReg)
3188 .addReg(ValReg)
3189 .addReg(IdxReg)
3190 .addImm(SubReg);
3191
3192 MI.eraseFromParent();
3193 return true;
3194}
3195
3196bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3198 unsigned Opc;
3199 unsigned Size = MI.getOperand(3).getImm();
3200
3201 // The struct intrinsic variants add one additional operand over raw.
3202 const bool HasVIndex = MI.getNumOperands() == 9;
3203 Register VIndex;
3204 int OpOffset = 0;
3205 if (HasVIndex) {
3206 VIndex = MI.getOperand(4).getReg();
3207 OpOffset = 1;
3208 }
3209
3210 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3211 std::optional<ValueAndVReg> MaybeVOffset =
3212 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3213 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3214
3215 switch (Size) {
3216 default:
3217 return false;
3218 case 1:
3219 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3220 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3221 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3222 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3223 break;
3224 case 2:
3225 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3226 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3227 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3228 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3229 break;
3230 case 4:
3231 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3232 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3233 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3234 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3235 break;
3236 }
3237
3238 MachineBasicBlock *MBB = MI.getParent();
3239 const DebugLoc &DL = MI.getDebugLoc();
3240 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3241 .add(MI.getOperand(2));
3242
3243 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3244
3245 if (HasVIndex && HasVOffset) {
3246 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3247 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3248 .addReg(VIndex)
3249 .addImm(AMDGPU::sub0)
3250 .addReg(VOffset)
3251 .addImm(AMDGPU::sub1);
3252
3253 MIB.addReg(IdxReg);
3254 } else if (HasVIndex) {
3255 MIB.addReg(VIndex);
3256 } else if (HasVOffset) {
3257 MIB.addReg(VOffset);
3258 }
3259
3260 MIB.add(MI.getOperand(1)); // rsrc
3261 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3262 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3263 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3264 MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3265 MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3266
3267 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3268 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3269 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3270 MachinePointerInfo StorePtrI = LoadPtrI;
3271 StorePtrI.V = nullptr;
3273
3274 auto F = LoadMMO->getFlags() &
3275          ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3276 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3277 Size, LoadMMO->getBaseAlign());
3278
3279 MachineMemOperand *StoreMMO =
3280     MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3281                              sizeof(int32_t), LoadMMO->getBaseAlign());
3282
3283 MIB.setMemRefs({LoadMMO, StoreMMO});
3284
3285 MI.eraseFromParent();
3286 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3287}
3288
3289/// Match a zero extend from a 32-bit value to 64 bits.
3290static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3291 Register ZExtSrc;
3292 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3293 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3294
3295 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3296 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3297 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3298 return Register();
3299
3300 assert(Def->getNumOperands() == 3 &&
3301 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3302 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3303 return Def->getOperand(1).getReg();
3304 }
3305
3306 return Register();
3307}
3308
3309bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3310 unsigned Opc;
3311 unsigned Size = MI.getOperand(3).getImm();
3312
3313 switch (Size) {
3314 default:
3315 return false;
3316 case 1:
3317 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3318 break;
3319 case 2:
3320 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3321 break;
3322 case 4:
3323 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3324 break;
3325 }
3326
3327 MachineBasicBlock *MBB = MI.getParent();
3328 const DebugLoc &DL = MI.getDebugLoc();
3329 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3330 .add(MI.getOperand(2));
3331
3332 Register Addr = MI.getOperand(1).getReg();
3333 Register VOffset;
3334 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3335 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3336 if (!isSGPR(Addr)) {
3337 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3338 if (isSGPR(AddrDef->Reg)) {
3339 Addr = AddrDef->Reg;
3340 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3341 Register SAddr =
3342 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3343 if (isSGPR(SAddr)) {
3344 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3345 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3346 Addr = SAddr;
3347 VOffset = Off;
3348 }
3349 }
3350 }
3351 }
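  // e.g. %addr = G_PTR_ADD %sgpr_base, (G_ZEXT %voff:s32) selects the SADDR
  // form with %sgpr_base as the scalar base and %voff as the VGPR offset; a
  // purely scalar address instead gets a zero VGPR offset materialized below.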
3352
3353 if (isSGPR(Addr)) {
3354 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3355 if (!VOffset) {
3356 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3357 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3358 .addImm(0);
3359 }
3360 }
3361
3362 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3363 .addReg(Addr);
3364
3365 if (isSGPR(Addr))
3366 MIB.addReg(VOffset);
3367
3368 MIB.add(MI.getOperand(4)) // offset
3369 .add(MI.getOperand(5)); // cpol
3370
3371 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3372 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3373 LoadPtrI.Offset = MI.getOperand(4).getImm();
3374 MachinePointerInfo StorePtrI = LoadPtrI;
3377 auto F = LoadMMO->getFlags() &
3378          ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3379 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3380 Size, LoadMMO->getBaseAlign());
3381 MachineMemOperand *StoreMMO =
3382     MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3383                              sizeof(int32_t), Align(4));
3384
3385 MIB.setMemRefs({LoadMMO, StoreMMO});
3386
3387 MI.eraseFromParent();
3388 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3389}
3390
3391bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3392 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3393 MI.removeOperand(1);
3394 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3395 return true;
3396}
3397
3398bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3399 unsigned Opc;
3400 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3401 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3402 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3403 break;
3404 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3405 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3406 break;
3407 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3408 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3409 break;
3410 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3411 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3412 break;
3413 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3414 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3415 break;
3416 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3417 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3418 break;
3419 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3420 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3421 break;
3422 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3423 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3424 break;
3425 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3426 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3427 break;
3428 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3429 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3430 break;
3431 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3432 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3433 break;
3434 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3435 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3436 break;
3437 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3438 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3439 break;
3440 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3441 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3442 break;
3443 default:
3444 llvm_unreachable("unhandled smfmac intrinsic");
3445 }
3446
3447 auto VDst_In = MI.getOperand(4);
3448
3449 MI.setDesc(TII.get(Opc));
3450 MI.removeOperand(4); // VDst_In
3451 MI.removeOperand(1); // Intrinsic ID
3452 MI.addOperand(VDst_In); // Readd VDst_In to the end
3453 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3454 return true;
3455}
3456
3457bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3458 Register DstReg = MI.getOperand(0).getReg();
3459 Register SrcReg = MI.getOperand(1).getReg();
3460 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3461 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3462 MachineBasicBlock *MBB = MI.getParent();
3463 const DebugLoc &DL = MI.getDebugLoc();
3464
3465 if (IsVALU) {
3466 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3467 .addImm(Subtarget->getWavefrontSizeLog2())
3468 .addReg(SrcReg);
3469 } else {
3470 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3471 .addReg(SrcReg)
3472 .addImm(Subtarget->getWavefrontSizeLog2())
3473 .setOperandDead(3); // Dead scc
3474 }
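  // e.g. with a 64-wide wave the shift amount is 6: S_LSHR_B32 for an SGPR
  // result, V_LSHRREV_B32 when the result needs to live in a VGPR.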
3475
3476 const TargetRegisterClass &RC =
3477 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3478 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3479 return false;
3480
3481 MI.eraseFromParent();
3482 return true;
3483}
3484
3485bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3486 Register SrcReg = MI.getOperand(0).getReg();
3487 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3488 return false;
3489
3490 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3491 Register SP =
3493 Register WaveAddr = getWaveAddress(DefMI);
3494 MachineBasicBlock *MBB = MI.getParent();
3495 const DebugLoc &DL = MI.getDebugLoc();
3496
3497 if (!WaveAddr) {
3498 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3499 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3500 .addReg(SrcReg)
3501 .addImm(Subtarget->getWavefrontSizeLog2())
3502 .setOperandDead(3); // Dead scc
3503 }
3504
3505 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3506 .addReg(WaveAddr);
3507
3508 MI.eraseFromParent();
3509 return true;
3510}
3511
3512bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3513
3514 if (!I.isPreISelOpcode()) {
3515 if (I.isCopy())
3516 return selectCOPY(I);
3517 return true;
3518 }
3519
3520 switch (I.getOpcode()) {
3521 case TargetOpcode::G_AND:
3522 case TargetOpcode::G_OR:
3523 case TargetOpcode::G_XOR:
3524 if (selectImpl(I, *CoverageInfo))
3525 return true;
3526 return selectG_AND_OR_XOR(I);
3527 case TargetOpcode::G_ADD:
3528 case TargetOpcode::G_SUB:
3529 case TargetOpcode::G_PTR_ADD:
3530 if (selectImpl(I, *CoverageInfo))
3531 return true;
3532 return selectG_ADD_SUB(I);
3533 case TargetOpcode::G_UADDO:
3534 case TargetOpcode::G_USUBO:
3535 case TargetOpcode::G_UADDE:
3536 case TargetOpcode::G_USUBE:
3537 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3538 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3539 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3540 return selectG_AMDGPU_MAD_64_32(I);
3541 case TargetOpcode::G_INTTOPTR:
3542 case TargetOpcode::G_BITCAST:
3543 case TargetOpcode::G_PTRTOINT:
3544 case TargetOpcode::G_FREEZE:
3545 return selectCOPY(I);
3546 case TargetOpcode::G_CONSTANT:
3547 case TargetOpcode::G_FCONSTANT:
3548 return selectG_CONSTANT(I);
3549 case TargetOpcode::G_FNEG:
3550 if (selectImpl(I, *CoverageInfo))
3551 return true;
3552 return selectG_FNEG(I);
3553 case TargetOpcode::G_FABS:
3554 if (selectImpl(I, *CoverageInfo))
3555 return true;
3556 return selectG_FABS(I);
3557 case TargetOpcode::G_EXTRACT:
3558 return selectG_EXTRACT(I);
3559 case TargetOpcode::G_MERGE_VALUES:
3560 case TargetOpcode::G_CONCAT_VECTORS:
3561 return selectG_MERGE_VALUES(I);
3562 case TargetOpcode::G_UNMERGE_VALUES:
3563 return selectG_UNMERGE_VALUES(I);
3564 case TargetOpcode::G_BUILD_VECTOR:
3565 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3566 return selectG_BUILD_VECTOR(I);
3567 case TargetOpcode::G_IMPLICIT_DEF:
3568 return selectG_IMPLICIT_DEF(I);
3569 case TargetOpcode::G_INSERT:
3570 return selectG_INSERT(I);
3571 case TargetOpcode::G_INTRINSIC:
3572 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3573 return selectG_INTRINSIC(I);
3574 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3575 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3576 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3577 case TargetOpcode::G_ICMP:
3578 case TargetOpcode::G_FCMP:
3579 if (selectG_ICMP_or_FCMP(I))
3580 return true;
3581 return selectImpl(I, *CoverageInfo);
3582 case TargetOpcode::G_LOAD:
3583 case TargetOpcode::G_STORE:
3584 case TargetOpcode::G_ATOMIC_CMPXCHG:
3585 case TargetOpcode::G_ATOMICRMW_XCHG:
3586 case TargetOpcode::G_ATOMICRMW_ADD:
3587 case TargetOpcode::G_ATOMICRMW_SUB:
3588 case TargetOpcode::G_ATOMICRMW_AND:
3589 case TargetOpcode::G_ATOMICRMW_OR:
3590 case TargetOpcode::G_ATOMICRMW_XOR:
3591 case TargetOpcode::G_ATOMICRMW_MIN:
3592 case TargetOpcode::G_ATOMICRMW_MAX:
3593 case TargetOpcode::G_ATOMICRMW_UMIN:
3594 case TargetOpcode::G_ATOMICRMW_UMAX:
3595 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3596 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3597 case TargetOpcode::G_ATOMICRMW_FADD:
3598 case TargetOpcode::G_ATOMICRMW_FMIN:
3599 case TargetOpcode::G_ATOMICRMW_FMAX:
3600 return selectG_LOAD_STORE_ATOMICRMW(I);
3601 case TargetOpcode::G_SELECT:
3602 return selectG_SELECT(I);
3603 case TargetOpcode::G_TRUNC:
3604 return selectG_TRUNC(I);
3605 case TargetOpcode::G_SEXT:
3606 case TargetOpcode::G_ZEXT:
3607 case TargetOpcode::G_ANYEXT:
3608 case TargetOpcode::G_SEXT_INREG:
3609 // This is a workaround. For extension from type i1, `selectImpl()` uses
3610 // patterns from the TD file and generates an illegal VGPR to SGPR COPY,
3611 // since type i1 can only be held in an SGPR class.
3612 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3613 selectImpl(I, *CoverageInfo))
3614 return true;
3615 return selectG_SZA_EXT(I);
3616 case TargetOpcode::G_FPEXT:
3617 if (selectG_FPEXT(I))
3618 return true;
3619 return selectImpl(I, *CoverageInfo);
3620 case TargetOpcode::G_BRCOND:
3621 return selectG_BRCOND(I);
3622 case TargetOpcode::G_GLOBAL_VALUE:
3623 return selectG_GLOBAL_VALUE(I);
3624 case TargetOpcode::G_PTRMASK:
3625 return selectG_PTRMASK(I);
3626 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3627 return selectG_EXTRACT_VECTOR_ELT(I);
3628 case TargetOpcode::G_INSERT_VECTOR_ELT:
3629 return selectG_INSERT_VECTOR_ELT(I);
3630 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3631 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3632 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3633 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3636 assert(Intr && "not an image intrinsic with image pseudo");
3637 return selectImageIntrinsic(I, Intr);
3638 }
3639 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3640 return selectBVHIntrinsic(I);
3641 case AMDGPU::G_SBFX:
3642 case AMDGPU::G_UBFX:
3643 return selectG_SBFX_UBFX(I);
3644 case AMDGPU::G_SI_CALL:
3645 I.setDesc(TII.get(AMDGPU::SI_CALL));
3646 return true;
3647 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3648 return selectWaveAddress(I);
3649 case AMDGPU::G_STACKRESTORE:
3650 return selectStackRestore(I);
3651 case AMDGPU::G_PHI:
3652 return selectPHI(I);
3653 default:
3654 return selectImpl(I, *CoverageInfo);
3655 }
3656 return false;
3657}
3658
3660AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3661 return {{
3662 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3663 }};
3664
3665}
3666
3667std::pair<Register, unsigned>
3668AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3669 bool IsCanonicalizing,
3670 bool AllowAbs, bool OpSel) const {
3671 Register Src = Root.getReg();
3672 unsigned Mods = 0;
3673 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3674
3675 if (MI->getOpcode() == AMDGPU::G_FNEG) {
3676 Src = MI->getOperand(1).getReg();
3677 Mods |= SISrcMods::NEG;
3678 MI = getDefIgnoringCopies(Src, *MRI);
3679 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3680 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3681 // denormal mode, but we're implicitly canonicalizing in a source operand.
3682 const ConstantFP *LHS =
3683 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3684 if (LHS && LHS->isZero()) {
3685 Mods |= SISrcMods::NEG;
3686 Src = MI->getOperand(2).getReg();
3687 }
3688 }
3689
3690 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3691 Src = MI->getOperand(1).getReg();
3692 Mods |= SISrcMods::ABS;
3693 }
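  // e.g. a source defined as G_FNEG (G_FABS %x) folds to (%x, NEG | ABS),
  // while a bare G_FNEG %x folds to (%x, NEG).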
3694
3695 if (OpSel)
3696 Mods |= SISrcMods::OP_SEL_0;
3697
3698 return std::pair(Src, Mods);
3699}
3700
3701Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3702 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3703 bool ForceVGPR) const {
3704 if ((Mods != 0 || ForceVGPR) &&
3705 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3706
3707 // If we looked through copies to find source modifiers on an SGPR operand,
3708 // we now have an SGPR register source. To avoid potentially violating the
3709 // constant bus restriction, we need to insert a copy to a VGPR.
3710 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3711 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3712 TII.get(AMDGPU::COPY), VGPRSrc)
3713 .addReg(Src);
3714 Src = VGPRSrc;
3715 }
3716
3717 return Src;
3718}
3719
3720///
3721/// This will select either an SGPR or VGPR operand and will save us from
3722/// having to write an extra tablegen pattern.
3724AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3725 return {{
3726 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3727 }};
3728}
3729
3731AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3732 Register Src;
3733 unsigned Mods;
3734 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3735
3736 return {{
3737 [=](MachineInstrBuilder &MIB) {
3738 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3739 },
3740 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3741 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3742 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3743 }};
3744}
3745
3747AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3748 Register Src;
3749 unsigned Mods;
3750 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3751 /*IsCanonicalizing=*/true,
3752 /*AllowAbs=*/false);
3753
3754 return {{
3755 [=](MachineInstrBuilder &MIB) {
3756 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3757 },
3758 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3759 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3760 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3761 }};
3762}
3763
3765AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3766 return {{
3767 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3768 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3769 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3770 }};
3771}
3772
3774AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3775 Register Src;
3776 unsigned Mods;
3777 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3778
3779 return {{
3780 [=](MachineInstrBuilder &MIB) {
3781 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3782 },
3783 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3784 }};
3785}
3786
3788AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3789 MachineOperand &Root) const {
3790 Register Src;
3791 unsigned Mods;
3792 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3793
3794 return {{
3795 [=](MachineInstrBuilder &MIB) {
3796 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3797 },
3798 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3799 }};
3800}
3801
3803AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3804 Register Src;
3805 unsigned Mods;
3806 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3807 /*AllowAbs=*/false);
3808
3809 return {{
3810 [=](MachineInstrBuilder &MIB) {
3811 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3812 },
3813 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3814 }};
3815}
3816
3818AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3819 Register Reg = Root.getReg();
3820 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3821 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3822 return {};
3823 return {{
3824 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3825 }};
3826}
3827
3828std::pair<Register, unsigned>
3829AMDGPUInstructionSelector::selectVOP3PModsImpl(
3830 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3831 unsigned Mods = 0;
3832 MachineInstr *MI = MRI.getVRegDef(Src);
3833
3834 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3835 // It's possible to see an f32 fneg here, but unlikely.
3836 // TODO: Treat f32 fneg as only high bit.
3837 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3839 Src = MI->getOperand(1).getReg();
3840 MI = MRI.getVRegDef(Src);
3841 }
3842
3843 // TODO: Handle G_FSUB 0 as fneg
3844
3845 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3846 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3847
3848 // Packed instructions do not have abs modifiers.
3849 Mods |= SISrcMods::OP_SEL_1;
3850
3851 return std::pair(Src, Mods);
3852}
3853
3855AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3857 = Root.getParent()->getParent()->getParent()->getRegInfo();
3858
3859 Register Src;
3860 unsigned Mods;
3861 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3862
3863 return {{
3864 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3865 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3866 }};
3867}
3868
3870AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3872 = Root.getParent()->getParent()->getParent()->getRegInfo();
3873
3874 Register Src;
3875 unsigned Mods;
3876 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3877
3878 return {{
3879 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3880 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3881 }};
3882}
3883
3885AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
3886 // A literal i1 value set in the intrinsic represents SrcMods for the next
3887 // operand. The value is in the Imm operand as an i1 sign-extended to int64_t.
3888 // 1 (i.e. -1) promotes packed values to signed; 0 treats them as unsigned.
3889 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3890 "expected i1 value");
3891 unsigned Mods = SISrcMods::OP_SEL_1;
3892 if (Root.getImm() == -1)
3893 Mods ^= SISrcMods::NEG;
3894 return {{
3895 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3896 }};
3897}
3898
3900AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3901 MachineOperand &Root) const {
3902 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3903 "expected i1 value");
3904 unsigned Mods = SISrcMods::OP_SEL_1;
3905 if (Root.getImm() != 0)
3906 Mods |= SISrcMods::OP_SEL_0;
3907
3908 return {{
3909 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3910 }};
3911}
3912
3914 MachineInstr *InsertPt,
3916 const TargetRegisterClass *DstRegClass;
3917 switch (Elts.size()) {
3918 case 8:
3919 DstRegClass = &AMDGPU::VReg_256RegClass;
3920 break;
3921 case 4:
3922 DstRegClass = &AMDGPU::VReg_128RegClass;
3923 break;
3924 case 2:
3925 DstRegClass = &AMDGPU::VReg_64RegClass;
3926 break;
3927 default:
3928 llvm_unreachable("unhandled Reg sequence size");
3929 }
3930
3931 MachineIRBuilder B(*InsertPt);
3932 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
3933 .addDef(MRI.createVirtualRegister(DstRegClass));
3934 for (unsigned i = 0; i < Elts.size(); ++i) {
3935 MIB.addReg(Elts[i]);
3937 }
3938 return MIB->getOperand(0).getReg();
3939}
3940
3941static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3943 MachineInstr *InsertPt,
3945 if (ModOpcode == TargetOpcode::G_FNEG) {
3946 Mods |= SISrcMods::NEG;
3947 // Check if all elements also have abs modifier
3948 SmallVector<Register, 8> NegAbsElts;
3949 for (auto El : Elts) {
3950 Register FabsSrc;
3951 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
3952 break;
3953 NegAbsElts.push_back(FabsSrc);
3954 }
3955 if (Elts.size() != NegAbsElts.size()) {
3956 // Neg
3957 Src = buildRegSequence(Elts, InsertPt, MRI);
3958 } else {
3959 // Neg and Abs
3960 Mods |= SISrcMods::NEG_HI;
3961 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
3962 }
3963 } else {
3964 assert(ModOpcode == TargetOpcode::G_FABS);
3965 // Abs
3966 Mods |= SISrcMods::NEG_HI;
3967 Src = buildRegSequence(Elts, InsertPt, MRI);
3968 }
3969}
3970
3972AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
3973 Register Src = Root.getReg();
3974 unsigned Mods = SISrcMods::OP_SEL_1;
3976
3977 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
3978 assert(BV->getNumSources() > 0);
3979 // Based on first element decide which mod we match, neg or abs
3980 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
3981 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
3982 ? AMDGPU::G_FNEG
3983 : AMDGPU::G_FABS;
3984 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
3985 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
3986 if (ElF32->getOpcode() != ModOpcode)
3987 break;
3988 EltsF32.push_back(ElF32->getOperand(1).getReg());
3989 }
3990
3991 // All elements had ModOpcode modifier
3992 if (BV->getNumSources() == EltsF32.size()) {
3993 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
3994 *MRI);
3995 }
3996 }
3997
3998 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3999 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4000}
4001
4003AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
4004 Register Src = Root.getReg();
4005 unsigned Mods = SISrcMods::OP_SEL_1;
4006 SmallVector<Register, 8> EltsV2F16;
4007
4008 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4009 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4010 Register FNegSrc;
4011 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4012 break;
4013 EltsV2F16.push_back(FNegSrc);
4014 }
4015
4016 // All elements had ModOpcode modifier
4017 if (CV->getNumSources() == EltsV2F16.size()) {
4018 Mods |= SISrcMods::NEG;
4019 Mods |= SISrcMods::NEG_HI;
4020 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4021 }
4022 }
4023
4024 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4025 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4026}
4027
4029AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4030 Register Src = Root.getReg();
4031 unsigned Mods = SISrcMods::OP_SEL_1;
4032 SmallVector<Register, 8> EltsV2F16;
4033
4034 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4035 assert(CV->getNumSources() > 0);
4036 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4037 // Based on first element decide which mod we match, neg or abs
4038 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4039 ? AMDGPU::G_FNEG
4040 : AMDGPU::G_FABS;
4041
4042 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4043 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4044 if (ElV2F16->getOpcode() != ModOpcode)
4045 break;
4046 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4047 }
4048
4049 // All elements had ModOpcode modifier
4050 if (CV->getNumSources() == EltsV2F16.size()) {
4051 MachineIRBuilder B(*Root.getParent());
4052 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4053 *MRI);
4054 }
4055 }
4056
4057 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4058 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4059}
4060
4062AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4063 std::optional<FPValueAndVReg> FPValReg;
4064 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4065 if (TII.isInlineConstant(FPValReg->Value)) {
4066 return {{[=](MachineInstrBuilder &MIB) {
4067 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4068 }}};
4069 }
4070 // Non-inlineable splat floats should not fall through to the integer
4071 // immediate checks.
4072 return {};
4073 }
4074
4075 APInt ICst;
4076 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4077 if (TII.isInlineConstant(ICst)) {
4078 return {
4079 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4080 }
4081 }
4082
4083 return {};
4084}
4085
4087AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4088 Register Src =
4089 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4090 unsigned Key = 0;
4091
4092 Register ShiftSrc;
4093 std::optional<ValueAndVReg> ShiftAmt;
4094 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4095 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4096 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4097 Key = ShiftAmt->Value.getZExtValue() / 8;
4098 Src = ShiftSrc;
4099 }
4100
4101 return {{
4102 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4103 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4104 }};
4105}
4106
4108AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4109
4110 Register Src =
4111 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4112 unsigned Key = 0;
4113
4114 Register ShiftSrc;
4115 std::optional<ValueAndVReg> ShiftAmt;
4116 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4117 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4118 ShiftAmt->Value.getZExtValue() == 16) {
4119 Src = ShiftSrc;
4120 Key = 1;
4121 }
4122
4123 return {{
4124 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4125 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4126 }};
4127}
4128
4130AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4131 Register Src;
4132 unsigned Mods;
4133 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4134
4135 // FIXME: Handle op_sel
4136 return {{
4137 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4138 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4139 }};
4140}
4141
4143AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4144 Register Src;
4145 unsigned Mods;
4146 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4147 /*IsCanonicalizing=*/true,
4148 /*AllowAbs=*/false,
4149 /*OpSel=*/false);
4150
4151 return {{
4152 [=](MachineInstrBuilder &MIB) {
4153 MIB.addReg(
4154 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4155 },
4156 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4157 }};
4158}
4159
4161AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4162 Register Src;
4163 unsigned Mods;
4164 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4165 /*IsCanonicalizing=*/true,
4166 /*AllowAbs=*/false,
4167 /*OpSel=*/true);
4168
4169 return {{
4170 [=](MachineInstrBuilder &MIB) {
4171 MIB.addReg(
4172 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4173 },
4174 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4175 }};
4176}
4177
4178bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4179 Register &Base,
4180 Register *SOffset,
4181 int64_t *Offset) const {
4182 MachineInstr *MI = Root.getParent();
4183 MachineBasicBlock *MBB = MI->getParent();
4184
4185 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4186 // then we can select all ptr + 32-bit offsets.
4187 SmallVector<GEPInfo, 4> AddrInfo;
4188 getAddrModeInfo(*MI, *MRI, AddrInfo);
4189
4190 if (AddrInfo.empty())
4191 return false;
4192
4193 const GEPInfo &GEPI = AddrInfo[0];
4194 std::optional<int64_t> EncodedImm;
4195
4196 if (SOffset && Offset) {
4197 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4198 /*HasSOffset=*/true);
4199 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4200 AddrInfo.size() > 1) {
4201 const GEPInfo &GEPI2 = AddrInfo[1];
4202 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4203 if (Register OffsetReg =
4204 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4205 Base = GEPI2.SgprParts[0];
4206 *SOffset = OffsetReg;
4207 *Offset = *EncodedImm;
4208 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4209 return true;
4210
4211 // For unbuffered smem loads, it is illegal for the Immediate Offset
4212 // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4213 // is negative. Handle the case where the Immediate Offset + SOffset
4214 // is negative.
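          // e.g. an encoded immediate offset of -8 can only be kept when
          // SOffset is known to be at least 8, so the sum stays non-negative.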
4215 auto SKnown = KB->getKnownBits(*SOffset);
4216 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4217 return false;
4218
4219 return true;
4220 }
4221 }
4222 }
4223 return false;
4224 }
4225
4226 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4227 /*HasSOffset=*/false);
4228 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4229 Base = GEPI.SgprParts[0];
4230 *Offset = *EncodedImm;
4231 return true;
4232 }
4233
4234 // SGPR offset is unsigned.
4235 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4236 GEPI.Imm != 0) {
4237 // If we make it this far we have a load with a 32-bit immediate offset.
4238 // It is OK to select this using an SGPR offset, because we have already
4239 // failed trying to select this load into one of the _IMM variants since
4240 // the _IMM patterns are considered before the _SGPR patterns.
4241 Base = GEPI.SgprParts[0];
4242 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4243 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4244 .addImm(GEPI.Imm);
4245 return true;
4246 }
4247
4248 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4249 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4250 Base = GEPI.SgprParts[0];
4251 *SOffset = OffsetReg;
4252 return true;
4253 }
4254 }
4255
4256 return false;
4257}
4258
4260AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4261 Register Base;
4262 int64_t Offset;
4263 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4264 return std::nullopt;
4265
4266 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4267 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4268}
4269
4271AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4272 SmallVector<GEPInfo, 4> AddrInfo;
4273 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4274
4275 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4276 return std::nullopt;
4277
4278 const GEPInfo &GEPInfo = AddrInfo[0];
4279 Register PtrReg = GEPInfo.SgprParts[0];
4280 std::optional<int64_t> EncodedImm =
4281 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4282 if (!EncodedImm)
4283 return std::nullopt;
4284
4285 return {{
4286 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4287 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4288 }};
4289}
4290
4292AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4293 Register Base, SOffset;
4294 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4295 return std::nullopt;
4296
4297 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4298 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4299}
4300
4302AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4303 Register Base, SOffset;
4304 int64_t Offset;
4305 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4306 return std::nullopt;
4307
4308 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4309 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4310 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4311}
4312
4313std::pair<Register, int>
4314AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4315 uint64_t FlatVariant) const {
4316 MachineInstr *MI = Root.getParent();
4317
4318 auto Default = std::pair(Root.getReg(), 0);
4319
4320 if (!STI.hasFlatInstOffsets())
4321 return Default;
4322
4323 Register PtrBase;
4324 int64_t ConstOffset;
4325 std::tie(PtrBase, ConstOffset) =
4326 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4327
4328 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4329 !isFlatScratchBaseLegal(Root.getReg())))
4330 return Default;
4331
4332 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4333 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4334 return Default;
4335
4336 return std::pair(PtrBase, ConstOffset);
4337}
4338
4340AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4341 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4342
4343 return {{
4344 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4345 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4346 }};
4347}
4348
4350AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4351 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4352
4353 return {{
4354 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4355 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4356 }};
4357}
4358
4360AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4361 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4362
4363 return {{
4364 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4365 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4366 }};
4367}
4368
4369// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4371AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4372 Register Addr = Root.getReg();
4373 Register PtrBase;
4374 int64_t ConstOffset;
4375 int64_t ImmOffset = 0;
4376
4377 // Match the immediate offset first, which canonically is moved as low as
4378 // possible.
4379 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4380
4381 if (ConstOffset != 0) {
4382 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4384 Addr = PtrBase;
4385 ImmOffset = ConstOffset;
4386 } else {
4387 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4388 if (isSGPR(PtrBaseDef->Reg)) {
4389 if (ConstOffset > 0) {
4390 // Offset is too large.
4391 //
4392 // saddr + large_offset -> saddr +
4393 // (voffset = large_offset & ~MaxOffset) +
4394 // (large_offset & MaxOffset);
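          // For illustration, if MaxOffset were 0xfff, a large_offset of
          // 0x12345 would split into voffset = 0x12000 and an immediate
          // offset of 0x345.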
4395 int64_t SplitImmOffset, RemainderOffset;
4396 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4398
4399 if (isUInt<32>(RemainderOffset)) {
4400 MachineInstr *MI = Root.getParent();
4401 MachineBasicBlock *MBB = MI->getParent();
4402 Register HighBits =
4403 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4404
4405 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4406 HighBits)
4407 .addImm(RemainderOffset);
4408
4409 return {{
4410 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4411 [=](MachineInstrBuilder &MIB) {
4412 MIB.addReg(HighBits);
4413 }, // voffset
4414 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4415 }};
4416 }
4417 }
4418
4419 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4420 // is 1 we would need to perform 1 or 2 extra moves for each half of
4421 // the constant, and it is better to do a scalar add and then issue a
4422 // single VALU instruction to materialize zero. Otherwise it takes fewer
4423 // instructions to perform VALU adds with immediates or inline literals.
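        // For example, ConstOffset = 0x100000001 has 32-bit halves 1 and 1,
        // both inline constants, so NumLiterals is 0 and the fold below is
        // skipped in favor of a scalar add.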
4424 unsigned NumLiterals =
4425 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4426 !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4427 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4428 return std::nullopt;
4429 }
4430 }
4431 }
4432
4433 // Match the variable offset.
4434 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4435 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4436 // Look through the SGPR->VGPR copy.
4437 Register SAddr =
4438 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4439
4440 if (isSGPR(SAddr)) {
4441 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4442
4443 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4444 // inserted later.
4445 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4446 return {{[=](MachineInstrBuilder &MIB) { // saddr
4447 MIB.addReg(SAddr);
4448 },
4449 [=](MachineInstrBuilder &MIB) { // voffset
4450 MIB.addReg(VOffset);
4451 },
4452 [=](MachineInstrBuilder &MIB) { // offset
4453 MIB.addImm(ImmOffset);
4454 }}};
4455 }
4456 }
4457 }
4458
4459 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4460 // drop this.
4461 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4462 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4463 return std::nullopt;
4464
4465 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4466 // moves required to copy a 64-bit SGPR to VGPR.
4467 MachineInstr *MI = Root.getParent();
4468 MachineBasicBlock *MBB = MI->getParent();
4469 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4470
4471 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4472 .addImm(0);
4473
4474 return {{
4475 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4476 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4477 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4478 }};
4479}
4480
4482AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4483 Register Addr = Root.getReg();
4484 Register PtrBase;
4485 int64_t ConstOffset;
4486 int64_t ImmOffset = 0;
4487
4488 // Match the immediate offset first, which canonically is moved as low as
4489 // possible.
4490 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4491
4492 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4495 Addr = PtrBase;
4496 ImmOffset = ConstOffset;
4497 }
4498
4499 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4500 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4501 int FI = AddrDef->MI->getOperand(1).getIndex();
4502 return {{
4503 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4504 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4505 }};
4506 }
4507
4508 Register SAddr = AddrDef->Reg;
4509
4510 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4511 Register LHS = AddrDef->MI->getOperand(1).getReg();
4512 Register RHS = AddrDef->MI->getOperand(2).getReg();
4513 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4514 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4515
4516 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4517 isSGPR(RHSDef->Reg)) {
4518 int FI = LHSDef->MI->getOperand(1).getIndex();
4519 MachineInstr &I = *Root.getParent();
4520 MachineBasicBlock *BB = I.getParent();
4521 const DebugLoc &DL = I.getDebugLoc();
4522 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4523
4524 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4525 .addFrameIndex(FI)
4526 .addReg(RHSDef->Reg)
4527 .setOperandDead(3); // Dead scc
4528 }
4529 }
4530
4531 if (!isSGPR(SAddr))
4532 return std::nullopt;
4533
4534 return {{
4535 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4536 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4537 }};
4538}
4539
4540// Check whether the flat scratch SVS swizzle bug affects this access.
4541bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4542 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4543 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4544 return false;
4545
4546 // The bug affects the swizzling of SVS accesses if there is any carry out
4547 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4548 // voffset to (soffset + inst_offset).
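  // For example, if the known maximum values have low bits VMax & 3 == 2 and
  // SMax & 3 == 3, then 2 + 3 >= 4 means a carry out of bit 1 cannot be ruled
  // out, and the check below conservatively reports the bug as possible.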
4549 auto VKnown = KB->getKnownBits(VAddr);
4550 auto SKnown = KnownBits::computeForAddSub(
4551 /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KB->getKnownBits(SAddr),
4552 KnownBits::makeConstant(APInt(32, ImmOffset)));
4553 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4554 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4555 return (VMax & 3) + (SMax & 3) >= 4;
4556}
4557
4559AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4560 Register Addr = Root.getReg();
4561 Register PtrBase;
4562 int64_t ConstOffset;
4563 int64_t ImmOffset = 0;
4564
4565 // Match the immediate offset first, which canonically is moved as low as
4566 // possible.
4567 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4568
4569 Register OrigAddr = Addr;
4570 if (ConstOffset != 0 &&
4571 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4572 Addr = PtrBase;
4573 ImmOffset = ConstOffset;
4574 }
4575
4576 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4577 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4578 return std::nullopt;
4579
4580 Register RHS = AddrDef->MI->getOperand(2).getReg();
4581 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4582 return std::nullopt;
4583
4584 Register LHS = AddrDef->MI->getOperand(1).getReg();
4585 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4586
4587 if (OrigAddr != Addr) {
4588 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4589 return std::nullopt;
4590 } else {
4591 if (!isFlatScratchBaseLegalSV(OrigAddr))
4592 return std::nullopt;
4593 }
4594
4595 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4596 return std::nullopt;
4597
4598 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4599 int FI = LHSDef->MI->getOperand(1).getIndex();
4600 return {{
4601 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4602 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4603 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4604 }};
4605 }
4606
4607 if (!isSGPR(LHS))
4608 return std::nullopt;
4609
4610 return {{
4611 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4612 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4613 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4614 }};
4615}
4616
4618AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4619 MachineInstr *MI = Root.getParent();
4620 MachineBasicBlock *MBB = MI->getParent();
4623
4624 int64_t Offset = 0;
4625 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4627 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4628
4629 // TODO: Should this be inside the render function? The iterator seems to
4630 // move.
4631 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4632 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4633 HighBits)
4634 .addImm(Offset & ~MaxOffset);
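    // e.g. if MaxOffset is 0xfff, an Offset of 0x2345 puts 0x2000 in the VGPR
    // here and leaves 0x345 for the immediate offset operand below.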
4635
4636 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4637 MIB.addReg(Info->getScratchRSrcReg());
4638 },
4639 [=](MachineInstrBuilder &MIB) { // vaddr
4640 MIB.addReg(HighBits);
4641 },
4642 [=](MachineInstrBuilder &MIB) { // soffset
4643 // Use constant zero for soffset and rely on eliminateFrameIndex
4644 // to choose the appropriate frame register if need be.
4645 MIB.addImm(0);
4646 },
4647 [=](MachineInstrBuilder &MIB) { // offset
4648 MIB.addImm(Offset & MaxOffset);
4649 }}};
4650 }
4651
4652 assert(Offset == 0 || Offset == -1);
4653
4654 // Try to fold a frame index directly into the MUBUF vaddr field, and any
4655 // offsets.
4656 std::optional<int> FI;
4657 Register VAddr = Root.getReg();
4658 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4659 Register PtrBase;
4660 int64_t ConstOffset;
4661 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4662 if (ConstOffset != 0) {
4663 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
4665 KB->signBitIsZero(PtrBase))) {
4666 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4667 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4668 FI = PtrBaseDef->getOperand(1).getIndex();
4669 else
4670 VAddr = PtrBase;
4671 Offset = ConstOffset;
4672 }
4673 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4674 FI = RootDef->getOperand(1).getIndex();
4675 }
4676 }
4677
4678 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4679 MIB.addReg(Info->getScratchRSrcReg());
4680 },
4681 [=](MachineInstrBuilder &MIB) { // vaddr
4682 if (FI)
4683 MIB.addFrameIndex(*FI);
4684 else
4685 MIB.addReg(VAddr);
4686 },
4687 [=](MachineInstrBuilder &MIB) { // soffset
4688 // Use constant zero for soffset and rely on eliminateFrameIndex
4689 // to choose the appropriate frame register if need be.
4690 MIB.addImm(0);
4691 },
4692 [=](MachineInstrBuilder &MIB) { // offset
4693 MIB.addImm(Offset);
4694 }}};
4695}
4696
4697bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4698 int64_t Offset) const {
4699 if (!isUInt<16>(Offset))
4700 return false;
4701
4703 return true;
4704
4705 // On Southern Islands, instructions with a negative base value and an
4706 // offset don't seem to work.
4707 return KB->signBitIsZero(Base);
4708}
4709
4710bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4711 int64_t Offset1,
4712 unsigned Size) const {
4713 if (Offset0 % Size != 0 || Offset1 % Size != 0)
4714 return false;
4715 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4716 return false;
4717
4719 return true;
4720
4721 // On Southern Islands, instructions with a negative base value and an
4722 // offset don't seem to work.
4723 return KB->signBitIsZero(Base);
4724}
4725
4726// Return whether the operation has NoUnsignedWrap property.
4728 return Addr->getOpcode() == TargetOpcode::G_OR ||
4729 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4730 Addr->getFlag(MachineInstr::NoUWrap));
4731}
4732
4733// Check that the base address of flat scratch load/store in the form of `base +
4734// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
4735// requirement). We always treat the first operand as the base address here.
4736bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
4737 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4738
4739 if (isNoUnsignedWrap(AddrMI))
4740 return true;
4741
4742 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4743 // values.
4744 if (STI.hasSignedScratchOffsets())
4745 return true;
4746
4747 Register LHS = AddrMI->getOperand(1).getReg();
4748 Register RHS = AddrMI->getOperand(2).getReg();
4749
4750 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
4751 std::optional<ValueAndVReg> RhsValReg =
4753 // If the immediate offset is negative and within a certain range, the base
4754 // address cannot also be negative. If the base is also negative, the sum
4755 // would be either negative or much larger than the valid range of scratch
4756 // memory a thread can access.
4757 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
4758 RhsValReg->Value.getSExtValue() > -0x40000000)
4759 return true;
4760 }
4761
4762 return KB->signBitIsZero(LHS);
4763}
4764
4765// Check that the address values in the SGPR/VGPR are legal for flat scratch in
4766// the form of: SGPR + VGPR.
4767bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
4768 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4769
4770 if (isNoUnsignedWrap(AddrMI))
4771 return true;
4772
4773 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4774 // values.
4775 if (STI.hasSignedScratchOffsets())
4776 return true;
4777
4778 Register LHS = AddrMI->getOperand(1).getReg();
4779 Register RHS = AddrMI->getOperand(2).getReg();
4780 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4781}
4782
4783// Check that the address values in the SGPR/VGPR are legal for flat scratch in
4784// the form of: SGPR + VGPR + Imm.
4785bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
4786 Register Addr) const {
4787 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4788 // values.
4789 if (STI.hasSignedScratchOffsets())
4790 return true;
4791
4792 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4793 Register Base = AddrMI->getOperand(1).getReg();
4794 std::optional<DefinitionAndSourceRegister> BaseDef =
4796 std::optional<ValueAndVReg> RHSOffset =
4798 assert(RHSOffset);
4799
4800 // If the immediate offset is negative and within a certain range, the base
4801 // address cannot also be negative. If the base is also negative, the sum
4802 // would be either negative or much larger than the valid range of scratch
4803 // memory a thread can access.
4804 if (isNoUnsignedWrap(BaseDef->MI) &&
4805 (isNoUnsignedWrap(AddrMI) ||
4806 (RHSOffset->Value.getSExtValue() < 0 &&
4807 RHSOffset->Value.getSExtValue() > -0x40000000)))
4808 return true;
4809
4810 Register LHS = BaseDef->MI->getOperand(1).getReg();
4811 Register RHS = BaseDef->MI->getOperand(2).getReg();
4812 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4813}
4814
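// Return true if the G_AND only reproduces the masking a shift applies to its
// shift amount anyway (e.g. (x & 31) feeding a shift that reads only
// ShAmtBits == 5 low bits), so the mask can likely be dropped during selection.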
4815bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4816 unsigned ShAmtBits) const {
4817 assert(MI.getOpcode() == TargetOpcode::G_AND);
4818
4819 std::optional<APInt> RHS =
4820 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4821 if (!RHS)
4822 return false;
4823
4824 if (RHS->countr_one() >= ShAmtBits)
4825 return true;
4826
4827 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
4828 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
4829}
4830
4832AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4833 MachineOperand &Root) const {
4834 Register Reg = Root.getReg();
4836
4837 std::optional<DefinitionAndSourceRegister> Def =
4838 getDefSrcRegIgnoringCopies(Reg, *MRI);
4839 assert(Def && "this shouldn't be an optional result");
4840 Reg = Def->Reg;
4841
4842 if (Register WaveBase = getWaveAddress(Def->MI)) {
4843 return {{
4844 [=](MachineInstrBuilder &MIB) { // rsrc
4845 MIB.addReg(Info->getScratchRSrcReg());
4846 },
4847 [=](MachineInstrBuilder &MIB) { // soffset
4848 MIB.addReg(WaveBase);
4849 },
4850 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4851 }};
4852 }
4853
4854 int64_t Offset = 0;
4855
4856 // FIXME: Copy check is a hack
4858 if (mi_match(Reg, *MRI,
4859 m_GPtrAdd(m_Reg(BasePtr),
4861 if (!TII.isLegalMUBUFImmOffset(Offset))
4862 return {};
4863 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
4864 Register WaveBase = getWaveAddress(BasePtrDef);
4865 if (!WaveBase)
4866 return {};
4867
4868 return {{
4869 [=](MachineInstrBuilder &MIB) { // rsrc
4870 MIB.addReg(Info->getScratchRSrcReg());
4871 },
4872 [=](MachineInstrBuilder &MIB) { // soffset
4873 MIB.addReg(WaveBase);
4874 },
4875 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4876 }};
4877 }
4878
4879 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
4881 return {};
4882
4883 return {{
4884 [=](MachineInstrBuilder &MIB) { // rsrc
4885 MIB.addReg(Info->getScratchRSrcReg());
4886 },
4887 [=](MachineInstrBuilder &MIB) { // soffset
4888 MIB.addImm(0);
4889 },
4890 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4891 }};
4892}
4893
4894std::pair<Register, unsigned>
4895AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4896 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4897 if (!RootDef)
4898 return std::pair(Root.getReg(), 0);
4899
4900 int64_t ConstAddr = 0;
4901
4902 Register PtrBase;
4903 int64_t Offset;
4904 std::tie(PtrBase, Offset) =
4905 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4906
4907 if (Offset) {
4908 if (isDSOffsetLegal(PtrBase, Offset)) {
4909 // (add n0, c0)
4910 return std::pair(PtrBase, Offset);
4911 }
4912 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4913 // TODO
4914
4915
4916 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4917 // TODO
4918
4919 }
4920
4921 return std::pair(Root.getReg(), 0);
4922}
4923
4925AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4926 Register Reg;
4927 unsigned Offset;
4928 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
4929 return {{
4930 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4931 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
4932 }};
4933}
4934
4936AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4937 return selectDSReadWrite2(Root, 4);
4938}
4939
4941AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4942 return selectDSReadWrite2(Root, 8);
4943}
4944
4946AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4947 unsigned Size) const {
4948 Register Reg;
4949 unsigned Offset;
4950 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
4951 return {{
4952 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4953 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4954 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4955 }};
4956}
4957
4958std::pair<Register, unsigned>
4959AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4960 unsigned Size) const {
4961 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4962 if (!RootDef)
4963 return std::pair(Root.getReg(), 0);
4964
4965 int64_t ConstAddr = 0;
4966
4967 Register PtrBase;
4968 int64_t Offset;
4969 std::tie(PtrBase, Offset) =
4970 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4971
4972 if (Offset) {
4973 int64_t OffsetValue0 = Offset;
4974 int64_t OffsetValue1 = Offset + Size;
4975 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
4976 // (add n0, c0)
4977 return std::pair(PtrBase, OffsetValue0 / Size);
4978 }
4979 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4980 // TODO
4981
4982 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4983 // TODO
4984
4985 }
4986
4987 return std::pair(Root.getReg(), 0);
4988}
4989
4990/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4991/// the base value with the constant offset. There may be intervening copies
4992/// between \p Root and the identified constant. Returns \p Root, 0 if this does
4993/// not match the pattern.
4994std::pair<Register, int64_t>
4995AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4996 Register Root, const MachineRegisterInfo &MRI) const {
4997 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4998 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4999 return {Root, 0};
5000
5001 MachineOperand &RHS = RootI->getOperand(2);
5002 std::optional<ValueAndVReg> MaybeOffset =
5003 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5004 if (!MaybeOffset)
5005 return {Root, 0};
5006 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
5007}
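// Illustrative sketch (not part of the source file; register names are
// hypothetical): the shape of MIR this helper decomposes.
//
//   %base:_(p1) = COPY $sgpr0_sgpr1
//   %c:_(s64) = G_CONSTANT i64 16
//   %ptr:_(p1) = G_PTR_ADD %base, %c
//
// getPtrBaseWithConstantOffset(%ptr, MRI) would yield {%base, 16}; any root
// that is not a G_PTR_ADD with a constant right-hand side yields {%ptr, 0}.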
5008
5009 static void addZeroImm(MachineInstrBuilder &MIB) {
5010 MIB.addImm(0);
5011}
5012
5013/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5014/// BasePtr is not valid, a null base pointer will be used.
5015 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5016 uint32_t FormatLo, uint32_t FormatHi,
5017 Register BasePtr) {
5018 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5019 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5020 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5021 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5022
5023 B.buildInstr(AMDGPU::S_MOV_B32)
5024 .addDef(RSrc2)
5025 .addImm(FormatLo);
5026 B.buildInstr(AMDGPU::S_MOV_B32)
5027 .addDef(RSrc3)
5028 .addImm(FormatHi);
5029
5030 // Build the half of the subregister with the constants before building the
5031 // full 128-bit register. If we are building multiple resource descriptors,
5032 // this will allow CSEing of the 2-component register.
5033 B.buildInstr(AMDGPU::REG_SEQUENCE)
5034 .addDef(RSrcHi)
5035 .addReg(RSrc2)
5036 .addImm(AMDGPU::sub0)
5037 .addReg(RSrc3)
5038 .addImm(AMDGPU::sub1);
5039
5040 Register RSrcLo = BasePtr;
5041 if (!BasePtr) {
5042 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5043 B.buildInstr(AMDGPU::S_MOV_B64)
5044 .addDef(RSrcLo)
5045 .addImm(0);
5046 }
5047
5048 B.buildInstr(AMDGPU::REG_SEQUENCE)
5049 .addDef(RSrc)
5050 .addReg(RSrcLo)
5051 .addImm(AMDGPU::sub0_sub1)
5052 .addReg(RSrcHi)
5053 .addImm(AMDGPU::sub2_sub3);
5054
5055 return RSrc;
5056}
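// Illustrative sketch (editor's summary of the code above, not a normative
// layout description): per the REG_SEQUENCE instructions, the resulting
// 128-bit SRD is assembled as
//   sub0_sub1 = BasePtr (or an S_MOV_B64 0 when no base pointer is given)
//   sub2      = FormatLo
//   sub3      = FormatHi
// so the two format dwords always occupy the upper half of the descriptor.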
5057
5058 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5059 const SIInstrInfo &TII, Register BasePtr) {
5060 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5061
5062 // FIXME: Why are half the "default" bits ignored based on the addressing
5063 // mode?
5064 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5065}
5066
5067 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5068 const SIInstrInfo &TII, Register BasePtr) {
5069 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5070
5071 // FIXME: Why are half the "default" bits ignored based on the addressing
5072 // mode?
5073 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5074}
5075
5076AMDGPUInstructionSelector::MUBUFAddressData
5077AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5078 MUBUFAddressData Data;
5079 Data.N0 = Src;
5080
5081 Register PtrBase;
5082 int64_t Offset;
5083
5084 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5085 if (isUInt<32>(Offset)) {
5086 Data.N0 = PtrBase;
5087 Data.Offset = Offset;
5088 }
5089
5090 if (MachineInstr *InputAdd
5091 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5092 Data.N2 = InputAdd->getOperand(1).getReg();
5093 Data.N3 = InputAdd->getOperand(2).getReg();
5094
5095 // FIXME: Need to fix extra SGPR->VGPR copies inserted
5096 // FIXME: Don't know that this was defined by operand 0
5097 //
5098 // TODO: Remove this when we have copy folding optimizations after
5099 // RegBankSelect.
5100 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5101 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5102 }
5103
5104 return Data;
5105}
5106
5107 /// Return true if the addr64 MUBUF mode should be used for the given address.
5108bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5109 // (ptr_add N2, N3) -> addr64, or
5110 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5111 if (Addr.N2)
5112 return true;
5113
5114 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5115 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5116}
5117
5118/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5119/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5120/// component.
5121void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5122 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5123 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5124 return;
5125
5126 // Illegal offset, store it in soffset.
5127 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5128 B.buildInstr(AMDGPU::S_MOV_B32)
5129 .addDef(SOffset)
5130 .addImm(ImmOffset);
5131 ImmOffset = 0;
5132}
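// Illustrative sketch (hypothetical values): on a subtarget whose MUBUF
// immediate field cannot hold 0x12345 (legality is checked with
// TII.isLegalMUBUFImmOffset), the call above would leave
//   SOffset   = a fresh SGPR defined by S_MOV_B32 0x12345
//   ImmOffset = 0
// while a small offset such as 64 would pass through untouched.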
5133
5134bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5135 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5136 Register &SOffset, int64_t &Offset) const {
5137 // FIXME: Predicates should stop this from reaching here.
5138 // addr64 bit was removed for volcanic islands.
5139 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5140 return false;
5141
5142 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5143 if (!shouldUseAddr64(AddrData))
5144 return false;
5145
5146 Register N0 = AddrData.N0;
5147 Register N2 = AddrData.N2;
5148 Register N3 = AddrData.N3;
5149 Offset = AddrData.Offset;
5150
5151 // Base pointer for the SRD.
5152 Register SRDPtr;
5153
5154 if (N2) {
5155 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5156 assert(N3);
5157 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5158 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5159 // addr64, and construct the default resource from a 0 address.
5160 VAddr = N0;
5161 } else {
5162 SRDPtr = N3;
5163 VAddr = N2;
5164 }
5165 } else {
5166 // N2 is not divergent.
5167 SRDPtr = N2;
5168 VAddr = N3;
5169 }
5170 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5171 // Use the default null pointer in the resource
5172 VAddr = N0;
5173 } else {
5174 // N0 -> offset, or
5175 // (N0 + C1) -> offset
5176 SRDPtr = N0;
5177 }
5178
5179 MachineIRBuilder B(*Root.getParent());
5180 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5181 splitIllegalMUBUFOffset(B, SOffset, Offset);
5182 return true;
5183}
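// Summary sketch of the selection above (derived from the code, not a
// normative table): with Addr = (ptr_add N2, N3) [+ C1],
//   N2 VGPR, N3 VGPR -> VAddr = N0, SRD base = null (0)
//   N2 VGPR, N3 SGPR -> VAddr = N2, SRD base = N3
//   N2 SGPR          -> VAddr = N3, SRD base = N2
// With no inner ptr_add, a VGPR N0 becomes VAddr, while an SGPR N0 becomes
// the SRD base and only the immediate/soffset carries the offset.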
5184
5185bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5186 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5187 int64_t &Offset) const {
5188
5189 // FIXME: Pattern should not reach here.
5190 if (STI.useFlatForGlobal())
5191 return false;
5192
5193 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5194 if (shouldUseAddr64(AddrData))
5195 return false;
5196
5197 // N0 -> offset, or
5198 // (N0 + C1) -> offset
5199 Register SRDPtr = AddrData.N0;
5200 Offset = AddrData.Offset;
5201
5202 // TODO: Look through extensions for 32-bit soffset.
5203 MachineIRBuilder B(*Root.getParent());
5204
5205 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5206 splitIllegalMUBUFOffset(B, SOffset, Offset);
5207 return true;
5208}
5209
5210 InstructionSelector::ComplexRendererFns
5211 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5212 Register VAddr;
5213 Register RSrcReg;
5214 Register SOffset;
5215 int64_t Offset = 0;
5216
5217 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5218 return {};
5219
5220 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5221 // pattern.
5222 return {{
5223 [=](MachineInstrBuilder &MIB) { // rsrc
5224 MIB.addReg(RSrcReg);
5225 },
5226 [=](MachineInstrBuilder &MIB) { // vaddr
5227 MIB.addReg(VAddr);
5228 },
5229 [=](MachineInstrBuilder &MIB) { // soffset
5230 if (SOffset)
5231 MIB.addReg(SOffset);
5232 else if (STI.hasRestrictedSOffset())
5233 MIB.addReg(AMDGPU::SGPR_NULL);
5234 else
5235 MIB.addImm(0);
5236 },
5237 [=](MachineInstrBuilder &MIB) { // offset
5238 MIB.addImm(Offset);
5239 },
5240 addZeroImm, // cpol
5241 addZeroImm, // tfe
5242 addZeroImm // swz
5243 }};
5244}
5245
5246 InstructionSelector::ComplexRendererFns
5247 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5248 Register RSrcReg;
5249 Register SOffset;
5250 int64_t Offset = 0;
5251
5252 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5253 return {};
5254
5255 return {{
5256 [=](MachineInstrBuilder &MIB) { // rsrc
5257 MIB.addReg(RSrcReg);
5258 },
5259 [=](MachineInstrBuilder &MIB) { // soffset
5260 if (SOffset)
5261 MIB.addReg(SOffset);
5262 else if (STI.hasRestrictedSOffset())
5263 MIB.addReg(AMDGPU::SGPR_NULL);
5264 else
5265 MIB.addImm(0);
5266 },
5267 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5268 addZeroImm, // cpol
5269 addZeroImm, // tfe
5270 addZeroImm, // swz
5271 }};
5272}
5273
5274 InstructionSelector::ComplexRendererFns
5275 AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5276
5277 Register SOffset = Root.getReg();
5278
5279 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5280 SOffset = AMDGPU::SGPR_NULL;
5281
5282 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5283}
5284
5285/// Get an immediate that must be 32-bits, and treated as zero extended.
5286static std::optional<uint64_t>
5287 getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5288 // getIConstantVRegVal sexts any values, so see if that matters.
5289 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5290 if (!OffsetVal || !isInt<32>(*OffsetVal))
5291 return std::nullopt;
5292 return Lo_32(*OffsetVal);
5293}
5294
5295 InstructionSelector::ComplexRendererFns
5296 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5297 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5298 if (!OffsetVal)
5299 return {};
5300
5301 std::optional<int64_t> EncodedImm =
5302 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5303 if (!EncodedImm)
5304 return {};
5305
5306 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5307}
5308
5309 InstructionSelector::ComplexRendererFns
5310 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5312
5313 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5314 if (!OffsetVal)
5315 return {};
5316
5317 std::optional<int64_t> EncodedImm =
5318 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5319 if (!EncodedImm)
5320 return {};
5321
5322 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5323}
5324
5325 InstructionSelector::ComplexRendererFns
5326 AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5327 // Match the (soffset + offset) pair as a 32-bit register base and
5328 // an immediate offset.
5329 Register SOffset;
5330 unsigned Offset;
5331 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5332 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5333 if (!SOffset)
5334 return std::nullopt;
5335
5336 std::optional<int64_t> EncodedOffset =
5337 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5338 if (!EncodedOffset)
5339 return std::nullopt;
5340
5341 assert(MRI->getType(SOffset) == LLT::scalar(32));
5342 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5343 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5344}
5345
5346// Variant of stripBitCast that returns the instruction instead of a
5347// MachineOperand.
5348 static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
5349 if (MI->getOpcode() == AMDGPU::G_BITCAST)
5350 return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
5351 return MI;
5352}
5353
5354// Figure out if this is really an extract of the high 16-bits of a dword,
5355// returns nullptr if it isn't.
5356 static MachineInstr *isExtractHiElt(MachineInstr *Inst,
5357 MachineRegisterInfo &MRI) {
5358 Inst = stripBitCast(Inst, MRI);
5359
5360 if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5361 return nullptr;
5362
5363 MachineInstr *TruncOp =
5364 getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
5365 TruncOp = stripBitCast(TruncOp, MRI);
5366
5367 // G_LSHR x, (G_CONSTANT i32 16)
5368 if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5369 auto SrlAmount = getIConstantVRegValWithLookThrough(
5370 TruncOp->getOperand(2).getReg(), MRI);
5371 if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5372 MachineInstr *SrlOp =
5373 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5374 return stripBitCast(SrlOp, MRI);
5375 }
5376 }
5377
5378 // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
5379 // 1, 0 swaps the low/high 16 bits.
5380 // 1, 1 sets the high 16 bits to be the same as the low 16.
5381 // In either case, it selects the high elts.
5382 if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5383 assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5384 LLT::fixed_vector(2, 16));
5385
5386 ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
5387 assert(Mask.size() == 2);
5388
5389 if (Mask[0] == 1 && Mask[1] <= 1) {
5390 MachineInstr *LHS =
5391 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5392 return stripBitCast(LHS, MRI);
5393 }
5394 }
5395
5396 return nullptr;
5397}
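// Illustrative sketch (hypothetical vregs): the code above treats, e.g.,
//   %hi:_(s16) = G_TRUNC (G_LSHR %x:_(s32), 16)
// or a G_TRUNC of a (possibly bitcast) G_SHUFFLE_VECTOR whose mask is
// (1, 1) or (1, 0) as "take the high 16 bits of %x", and it returns the
// instruction defining %x with any bitcasts stripped.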
5398
5399std::pair<Register, unsigned>
5400AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5401 bool &Matched) const {
5402 Matched = false;
5403
5404 Register Src;
5405 unsigned Mods;
5406 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
5407
5408 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
5409 if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5410 MachineOperand *MO = &MI->getOperand(1);
5411 Src = MO->getReg();
5412 MI = getDefIgnoringCopies(Src, *MRI);
5413
5414 assert(MRI->getType(Src) == LLT::scalar(16));
5415
5416 // See through bitcasts.
5417 // FIXME: Would be nice to use stripBitCast here.
5418 if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5419 MO = &MI->getOperand(1);
5420 Src = MO->getReg();
5421 MI = getDefIgnoringCopies(Src, *MRI);
5422 }
5423
5424 const auto CheckAbsNeg = [&]() {
5425 // Be careful about folding modifiers if we already have an abs. fneg is
5426 // applied last, so we don't want to apply an earlier fneg.
5427 if ((Mods & SISrcMods::ABS) == 0) {
5428 unsigned ModsTmp;
5429 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
5430 MI = getDefIgnoringCopies(Src, *MRI);
5431
5432 if ((ModsTmp & SISrcMods::NEG) != 0)
5433 Mods ^= SISrcMods::NEG;
5434
5435 if ((ModsTmp & SISrcMods::ABS) != 0)
5436 Mods |= SISrcMods::ABS;
5437 }
5438 };
5439
5440 CheckAbsNeg();
5441
5442 // op_sel/op_sel_hi decide the source type and source.
5443 // If the source's op_sel_hi is set, it indicates to do a conversion from
5444 // fp16. If the source's op_sel is set, it picks the high half of the
5445 // source register.
5446
5447 Mods |= SISrcMods::OP_SEL_1;
5448
5449 if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
5450 Mods |= SISrcMods::OP_SEL_0;
5451 MI = ExtractHiEltMI;
5452 MO = &MI->getOperand(0);
5453 Src = MO->getReg();
5454
5455 CheckAbsNeg();
5456 }
5457
5458 Matched = true;
5459 }
5460
5461 return {Src, Mods};
5462}
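// Illustrative sketch (hypothetical vregs): for a mixed-precision source like
//   %h:_(s16) = <high half of a packed 32-bit value>
//   %f:_(s32) = G_FPEXT %h
// the code above folds the G_FPEXT into the source modifiers: OP_SEL_1 marks
// the operand as an f16 value the instruction converts, OP_SEL_0 is added
// when %h is the high half, and Src becomes the original 32-bit register.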
5463
5464 InstructionSelector::ComplexRendererFns
5465 AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5466 MachineOperand &Root) const {
5467 Register Src;
5468 unsigned Mods;
5469 bool Matched;
5470 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5471 if (!Matched)
5472 return {};
5473
5474 return {{
5475 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5476 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5477 }};
5478}
5479
5480 InstructionSelector::ComplexRendererFns
5481 AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5482 Register Src;
5483 unsigned Mods;
5484 bool Matched;
5485 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5486
5487 return {{
5488 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5489 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5490 }};
5491}
5492
5493bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5494 MachineInstr &I, Intrinsic::ID IntrID) const {
5495 MachineBasicBlock *MBB = I.getParent();
5496 const DebugLoc &DL = I.getDebugLoc();
5497 Register CCReg = I.getOperand(0).getReg();
5498
5499 bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
5500
5501 if (HasM0) {
5502 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5503 .addReg(I.getOperand(2).getReg());
5504 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
5505 if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
5506 return false;
5507 } else {
5508 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5509 .addImm(I.getOperand(2).getImm());
5510 }
5511
5512 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5513
5514 I.eraseFromParent();
5515 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5516 *MRI);
5517}
5518
5519unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5520 if (HasInlineConst) {
5521 switch (IntrID) {
5522 default:
5523 llvm_unreachable("not a named barrier op");
5524 case Intrinsic::amdgcn_s_barrier_init:
5525 return AMDGPU::S_BARRIER_INIT_IMM;
5526 case Intrinsic::amdgcn_s_barrier_join:
5527 return AMDGPU::S_BARRIER_JOIN_IMM;
5528 case Intrinsic::amdgcn_s_wakeup_barrier:
5529 return AMDGPU::S_WAKEUP_BARRIER_IMM;
5530 case Intrinsic::amdgcn_s_get_barrier_state:
5531 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5532 };
5533 } else {
5534 switch (IntrID) {
5535 default:
5536 llvm_unreachable("not a named barrier op");
5537 case Intrinsic::amdgcn_s_barrier_init:
5538 return AMDGPU::S_BARRIER_INIT_M0;
5539 case Intrinsic::amdgcn_s_barrier_join:
5540 return AMDGPU::S_BARRIER_JOIN_M0;
5541 case Intrinsic::amdgcn_s_wakeup_barrier:
5542 return AMDGPU::S_WAKEUP_BARRIER_M0;
5543 case Intrinsic::amdgcn_s_get_barrier_state:
5544 return AMDGPU::S_GET_BARRIER_STATE_M0;
5545 };
5546 }
5547}
5548
5549bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5550 MachineInstr &I, Intrinsic::ID IntrID) const {
5551 MachineBasicBlock *MBB = I.getParent();
5552 const DebugLoc &DL = I.getDebugLoc();
5553 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
5554 ? I.getOperand(2)
5555 : I.getOperand(1);
5556 std::optional<int64_t> BarValImm =
5557 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5558 Register M0Val;
5559 Register TmpReg0;
5560
5561 // For S_BARRIER_INIT, member count will always be read from M0[16:22]
5562 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5563 Register MemberCount = I.getOperand(2).getReg();
5564 TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5565 // TODO: This should be expanded during legalization so that the S_LSHL
5566 // and S_OR can be constant-folded
5567 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
5568 .addImm(16)
5569 .addReg(MemberCount);
5570 M0Val = TmpReg0;
5571 }
5572
5573 // If not inlinable, get reference to barrier depending on the instruction
5574 if (!BarValImm) {
5575 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5576 // If the reference to the barrier id is not an inlinable constant, it must be
5577 // referenced with M0[4:0]. Perform an OR with the member count to include
5578 // it in M0 for S_BARRIER_INIT.
5579 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5580 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
5581 .addReg(BarOp.getReg())
5582 .addReg(TmpReg0);
5583 M0Val = TmpReg1;
5584 } else {
5585 M0Val = BarOp.getReg();
5586 }
5587 }
5588
5589 // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
5590 if (M0Val) {
5591 auto CopyMIB =
5592 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
5593 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5594 }
5595
5596 MachineInstrBuilder MIB;
5597 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5598 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5599
5600 if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
5601 MIB.addDef(I.getOperand(0).getReg());
5602
5603 if (BarValImm)
5604 MIB.addImm(*BarValImm);
5605
5606 I.eraseFromParent();
5607 return true;
5608}
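// Illustrative sketch of the M0 packing built above (derived from the code):
// for S_BARRIER_INIT with a non-inline barrier id,
//   M0 = (MemberCount << 16) | BarrierId
// i.e. the member count occupies M0[16:22] and the barrier id M0[4:0],
// matching the S_LSHL_B32 / S_OR_B32 sequence emitted before the barrier op.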
5609
5610bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
5611 MachineBasicBlock *BB = I.getParent();
5612 const DebugLoc &DL = I.getDebugLoc();
5613 Register CCReg = I.getOperand(0).getReg();
5614
5615 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
5616 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5617
5618 I.eraseFromParent();
5619 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5620 *MRI);
5621}
5622
5623void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5624 const MachineInstr &MI,
5625 int OpIdx) const {
5626 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5627 "Expected G_CONSTANT");
5628 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5629}
5630
5631void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5632 const MachineInstr &MI,
5633 int OpIdx) const {
5634 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5635 "Expected G_CONSTANT");
5636 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5637}
5638
5639void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
5640 const MachineInstr &MI,
5641 int OpIdx) const {
5642 assert(OpIdx == -1);
5643
5644 const MachineOperand &Op = MI.getOperand(1);
5645 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
5646 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5647 else {
5648 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
5649 MIB.addImm(Op.getCImm()->getSExtValue());
5650 }
5651}
5652
5653void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5654 const MachineInstr &MI,
5655 int OpIdx) const {
5656 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5657 "Expected G_CONSTANT");
5658 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5659}
5660
5661 /// This only really exists to satisfy DAG type-checking machinery, so it is a
5662/// no-op here.
5663void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
5664 const MachineInstr &MI,
5665 int OpIdx) const {
5666 MIB.addImm(MI.getOperand(OpIdx).getImm());
5667}
5668
5669void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
5670 const MachineInstr &MI,
5671 int OpIdx) const {
5672 assert(OpIdx >= 0 && "expected to match an immediate operand");
5673 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
5674}
5675
5676void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
5677 const MachineInstr &MI,
5678 int OpIdx) const {
5679 assert(OpIdx >= 0 && "expected to match an immediate operand");
5680 MIB.addImm(MI.getOperand(OpIdx).getImm() &
5681 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5682 : AMDGPU::CPol::ALL_pregfx12));
5683 }
5684
5685void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
5686 const MachineInstr &MI,
5687 int OpIdx) const {
5688 assert(OpIdx >= 0 && "expected to match an immediate operand");
5689 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
5690 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
5691 : AMDGPU::CPol::SWZ_pregfx12);
5692 MIB.addImm(Swizzle);
5693}
5694
5695void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
5696 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
5697 assert(OpIdx >= 0 && "expected to match an immediate operand");
5698 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
5699 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5700 : AMDGPU::CPol::ALL_pregfx12);
5701 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
5702}
5703
5704void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
5705 const MachineInstr &MI,
5706 int OpIdx) const {
5707 MIB.addFrameIndex(MI.getOperand(1).getIndex());
5708}
5709
5710void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
5711 const MachineInstr &MI,
5712 int OpIdx) const {
5713 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
5714 int ExpVal = APF.getExactLog2Abs();
5715 assert(ExpVal != INT_MIN);
5716 MIB.addImm(ExpVal);
5717}
5718
5719bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
5720 return TII.isInlineConstant(Imm);
5721}
5722
5723bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
5724 return TII.isInlineConstant(Imm);
5725}