AMDGPUInstructionSelector.cpp
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
49#define GET_GLOBALISEL_PREDICATES_INIT
50#include "AMDGPUGenGlobalISel.inc"
51#undef GET_GLOBALISEL_PREDICATES_INIT
52#define GET_GLOBALISEL_TEMPORARIES_INIT
53#include "AMDGPUGenGlobalISel.inc"
54#undef GET_GLOBALISEL_TEMPORARIES_INIT
55{
56}
57
58const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
59
60void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
61 CodeGenCoverage *CoverageInfo,
62 ProfileSummaryInfo *PSI,
63 BlockFrequencyInfo *BFI) {
64 MRI = &MF.getRegInfo();
65 Subtarget = &MF.getSubtarget<GCNSubtarget>();
68}
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
120 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
121}
122
123bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
124 const DebugLoc &DL = I.getDebugLoc();
125 MachineBasicBlock *BB = I.getParent();
126 I.setDesc(TII.get(TargetOpcode::COPY));
127
128 const MachineOperand &Src = I.getOperand(1);
129 MachineOperand &Dst = I.getOperand(0);
130 Register DstReg = Dst.getReg();
131 Register SrcReg = Src.getReg();
132
133 if (isVCC(DstReg, *MRI)) {
134 if (SrcReg == AMDGPU::SCC) {
135 const TargetRegisterClass *RC
136 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
137 if (!RC)
138 return true;
139 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
140 }
141
142 if (!isVCC(SrcReg, *MRI)) {
143 // TODO: Should probably leave the copy and let copyPhysReg expand it.
144 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
145 return false;
146
147 const TargetRegisterClass *SrcRC
148 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
149
150 std::optional<ValueAndVReg> ConstVal =
151 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
152 if (ConstVal) {
153 unsigned MovOpc =
154 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
155 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
156 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
157 } else {
158 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
159
160 // We can't trust the high bits at this point, so clear them.
161
162 // TODO: Skip masking high bits if def is known boolean.
163
164 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
165 assert(Subtarget->useRealTrue16Insts());
166 const int64_t NoMods = 0;
167 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
168 .addImm(NoMods)
169 .addImm(1)
170 .addImm(NoMods)
171 .addReg(SrcReg)
172 .addImm(NoMods);
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
174 .addImm(NoMods)
175 .addImm(0)
176 .addImm(NoMods)
177 .addReg(MaskedReg)
178 .addImm(NoMods);
179 } else {
180 bool IsSGPR = TRI.isSGPRClass(SrcRC);
181 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
182 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
183 .addImm(1)
184 .addReg(SrcReg);
185 if (IsSGPR)
186 And.setOperandDead(3); // Dead scc
187
188 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
189 .addImm(0)
190 .addReg(MaskedReg);
191 }
192 }
193
194 if (!MRI->getRegClassOrNull(SrcReg))
195 MRI->setRegClass(SrcReg, SrcRC);
196 I.eraseFromParent();
197 return true;
198 }
199
200 const TargetRegisterClass *RC =
201 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
202 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
203 return false;
204
205 return true;
206 }
207
208 for (const MachineOperand &MO : I.operands()) {
209 if (MO.getReg().isPhysical())
210 continue;
211
212 const TargetRegisterClass *RC =
213 TRI.getConstrainedRegClassForOperand(MO, *MRI);
214 if (!RC)
215 continue;
216 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
217 }
218 return true;
219}
220
221bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
222 const Register DefReg = I.getOperand(0).getReg();
223 const LLT DefTy = MRI->getType(DefReg);
224
225 // S1 G_PHIs should not be selected in instruction-select, instead:
226 // - divergent S1 G_PHI should go through lane mask merging algorithm
227 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
228 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
229 if (DefTy == LLT::scalar(1))
230 return false;
231
232 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
233
234 const RegClassOrRegBank &RegClassOrBank =
235 MRI->getRegClassOrRegBank(DefReg);
236
237 const TargetRegisterClass *DefRC
238 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
239 if (!DefRC) {
240 if (!DefTy.isValid()) {
241 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
242 return false;
243 }
244
245 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
246 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
247 if (!DefRC) {
248 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
249 return false;
250 }
251 }
252
253 // TODO: Verify that all registers have the same bank
254 I.setDesc(TII.get(TargetOpcode::PHI));
255 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
256}
257
258MachineOperand
259AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
260 const TargetRegisterClass &SubRC,
261 unsigned SubIdx) const {
262
263 MachineInstr *MI = MO.getParent();
264 MachineBasicBlock *BB = MO.getParent()->getParent();
265 Register DstReg = MRI->createVirtualRegister(&SubRC);
266
267 if (MO.isReg()) {
268 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
269 Register Reg = MO.getReg();
270 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
271 .addReg(Reg, 0, ComposedSubIdx);
272
273 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
274 MO.isKill(), MO.isDead(), MO.isUndef(),
275 MO.isEarlyClobber(), 0, MO.isDebug(),
276 MO.isInternalRead());
277 }
278
279 assert(MO.isImm());
280
281 APInt Imm(64, MO.getImm());
282
283 switch (SubIdx) {
284 default:
285 llvm_unreachable("do not know how to split immediate with this sub index.");
286 case AMDGPU::sub0:
287 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
288 case AMDGPU::sub1:
289 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
290 }
291}
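// Illustrative example (not from the original file): splitting the 64-bit
// immediate 0x1122334455667788 with the code above produces
//   sub0 -> MachineOperand::CreateImm(0x55667788)   (low 32 bits)
//   sub1 -> MachineOperand::CreateImm(0x11223344)   (high 32 bits)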
292
293static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
294 switch (Opc) {
295 case AMDGPU::G_AND:
296 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
297 case AMDGPU::G_OR:
298 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
299 case AMDGPU::G_XOR:
300 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
301 default:
302 llvm_unreachable("not a bit op");
303 }
304}
305
306bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
307 Register DstReg = I.getOperand(0).getReg();
308 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
309
310 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
311 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
312 DstRB->getID() != AMDGPU::VCCRegBankID)
313 return false;
314
315 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
316 STI.isWave64());
317 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
318
319 // Dead implicit-def of scc
320 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
321 true, // isImp
322 false, // isKill
323 true)); // isDead
324 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
325}
326
327bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
328 MachineBasicBlock *BB = I.getParent();
330 Register DstReg = I.getOperand(0).getReg();
331 const DebugLoc &DL = I.getDebugLoc();
332 LLT Ty = MRI->getType(DstReg);
333 if (Ty.isVector())
334 return false;
335
336 unsigned Size = Ty.getSizeInBits();
337 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
338 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
339 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
340
341 if (Size == 32) {
342 if (IsSALU) {
343 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
344 MachineInstr *Add =
345 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
346 .add(I.getOperand(1))
347 .add(I.getOperand(2))
348 .setOperandDead(3); // Dead scc
349 I.eraseFromParent();
350 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
351 }
352
353 if (STI.hasAddNoCarry()) {
354 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
355 I.setDesc(TII.get(Opc));
356 I.addOperand(*MF, MachineOperand::CreateImm(0));
357 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
358 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
359 }
360
361 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
362
363 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
364 MachineInstr *Add
365 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
366 .addDef(UnusedCarry, RegState::Dead)
367 .add(I.getOperand(1))
368 .add(I.getOperand(2))
369 .addImm(0);
370 I.eraseFromParent();
371 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
372 }
373
374 assert(!Sub && "illegal sub should not reach here");
375
376 const TargetRegisterClass &RC
377 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
378 const TargetRegisterClass &HalfRC
379 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
380
381 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
382 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
383 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
384 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
385
386 Register DstLo = MRI->createVirtualRegister(&HalfRC);
387 Register DstHi = MRI->createVirtualRegister(&HalfRC);
388
389 if (IsSALU) {
390 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
391 .add(Lo1)
392 .add(Lo2);
393 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
394 .add(Hi1)
395 .add(Hi2)
396 .setOperandDead(3); // Dead scc
397 } else {
398 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
399 Register CarryReg = MRI->createVirtualRegister(CarryRC);
400 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
401 .addDef(CarryReg)
402 .add(Lo1)
403 .add(Lo2)
404 .addImm(0);
405 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
406 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
407 .add(Hi1)
408 .add(Hi2)
409 .addReg(CarryReg, RegState::Kill)
410 .addImm(0);
411
412 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
413 return false;
414 }
415
416 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
417 .addReg(DstLo)
418 .addImm(AMDGPU::sub0)
419 .addReg(DstHi)
420 .addImm(AMDGPU::sub1);
421
422
423 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
424 return false;
425
426 I.eraseFromParent();
427 return true;
428}
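// For reference, a hypothetical SALU s64 add of %a and %b therefore expands
// roughly to (illustrative MIR, register names invented):
//   %lo:sreg_32 = S_ADD_U32 %a.sub0, %b.sub0        // sets SCC on carry-out
//   %hi:sreg_32 = S_ADDC_U32 %a.sub1, %b.sub1       // consumes SCC
//   %dst:sreg_64_xexec = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// The VALU path has the same shape with V_ADD_CO_U32_e64/V_ADDC_U32_e64 and a
// wave-mask carry register instead of SCC.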
429
430bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
431 MachineInstr &I) const {
432 MachineBasicBlock *BB = I.getParent();
434 const DebugLoc &DL = I.getDebugLoc();
435 Register Dst0Reg = I.getOperand(0).getReg();
436 Register Dst1Reg = I.getOperand(1).getReg();
437 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
438 I.getOpcode() == AMDGPU::G_UADDE;
439 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
440 I.getOpcode() == AMDGPU::G_USUBE;
441
442 if (isVCC(Dst1Reg, *MRI)) {
443 unsigned NoCarryOpc =
444 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
445 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
446 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
447 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
448 I.addOperand(*MF, MachineOperand::CreateImm(0));
449 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
450 }
451
452 Register Src0Reg = I.getOperand(2).getReg();
453 Register Src1Reg = I.getOperand(3).getReg();
454
455 if (HasCarryIn) {
456 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
457 .addReg(I.getOperand(4).getReg());
458 }
459
460 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
461 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
462
463 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
464 .add(I.getOperand(2))
465 .add(I.getOperand(3));
466
467 if (MRI->use_nodbg_empty(Dst1Reg)) {
468 CarryInst.setOperandDead(3); // Dead scc
469 } else {
470 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
471 .addReg(AMDGPU::SCC);
472 if (!MRI->getRegClassOrNull(Dst1Reg))
473 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
474 }
475
476 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
477 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
478 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
479 return false;
480
481 if (HasCarryIn &&
482 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
483 AMDGPU::SReg_32RegClass, *MRI))
484 return false;
485
486 I.eraseFromParent();
487 return true;
488}
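// For reference: on the scalar path above, the carry-out of a hypothetical s32
// G_UADDO is produced by copying SCC into Dst1Reg after S_ADD_U32 (or SCC is
// marked dead when the carry-out has no uses), while the VCC path keeps the
// carry in the wave mask via V_ADD_CO_U32_e64/V_ADDC_U32_e64 directly.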
489
490bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
491 MachineInstr &I) const {
492 MachineBasicBlock *BB = I.getParent();
494 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
495
496 unsigned Opc;
497 if (Subtarget->hasMADIntraFwdBug())
498 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
499 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
500 else
501 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
502 I.setDesc(TII.get(Opc));
503 I.addOperand(*MF, MachineOperand::CreateImm(0));
504 I.addImplicitDefUseOperands(*MF);
505 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
506}
507
508// TODO: We should probably legalize these to only using 32-bit results.
509bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
510 MachineBasicBlock *BB = I.getParent();
511 Register DstReg = I.getOperand(0).getReg();
512 Register SrcReg = I.getOperand(1).getReg();
513 LLT DstTy = MRI->getType(DstReg);
514 LLT SrcTy = MRI->getType(SrcReg);
515 const unsigned SrcSize = SrcTy.getSizeInBits();
516 unsigned DstSize = DstTy.getSizeInBits();
517
518 // TODO: Should handle any multiple of 32 offset.
519 unsigned Offset = I.getOperand(2).getImm();
520 if (Offset % 32 != 0 || DstSize > 128)
521 return false;
522
523 // 16-bit operations really use 32-bit registers.
524 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
525 if (DstSize == 16)
526 DstSize = 32;
527
528 const TargetRegisterClass *DstRC =
529 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
530 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
531 return false;
532
533 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
534 const TargetRegisterClass *SrcRC =
535 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
536 if (!SrcRC)
537 return false;
538 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
539 DstSize / 32);
540 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
541 if (!SrcRC)
542 return false;
543
544 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
545 *SrcRC, I.getOperand(1));
546 const DebugLoc &DL = I.getDebugLoc();
547 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
548 .addReg(SrcReg, 0, SubReg);
549
550 I.eraseFromParent();
551 return true;
552}
553
554bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
555 MachineBasicBlock *BB = MI.getParent();
556 Register DstReg = MI.getOperand(0).getReg();
557 LLT DstTy = MRI->getType(DstReg);
558 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
559
560 const unsigned SrcSize = SrcTy.getSizeInBits();
561 if (SrcSize < 32)
562 return selectImpl(MI, *CoverageInfo);
563
564 const DebugLoc &DL = MI.getDebugLoc();
565 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
566 const unsigned DstSize = DstTy.getSizeInBits();
567 const TargetRegisterClass *DstRC =
568 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
569 if (!DstRC)
570 return false;
571
572 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
573 MachineInstrBuilder MIB =
574 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
575 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
576 MachineOperand &Src = MI.getOperand(I + 1);
577 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
578 MIB.addImm(SubRegs[I]);
579
580 const TargetRegisterClass *SrcRC
581 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
582 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
583 return false;
584 }
585
586 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
587 return false;
588
589 MI.eraseFromParent();
590 return true;
591}
592
593bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
594 MachineBasicBlock *BB = MI.getParent();
595 const int NumDst = MI.getNumOperands() - 1;
596
597 MachineOperand &Src = MI.getOperand(NumDst);
598
599 Register SrcReg = Src.getReg();
600 Register DstReg0 = MI.getOperand(0).getReg();
601 LLT DstTy = MRI->getType(DstReg0);
602 LLT SrcTy = MRI->getType(SrcReg);
603
604 const unsigned DstSize = DstTy.getSizeInBits();
605 const unsigned SrcSize = SrcTy.getSizeInBits();
606 const DebugLoc &DL = MI.getDebugLoc();
607 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
608
609 const TargetRegisterClass *SrcRC =
610 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
611 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
612 return false;
613
614 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
615 // source, and this relies on the fact that the same subregister indices are
616 // used for both.
617 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
618 for (int I = 0, E = NumDst; I != E; ++I) {
619 MachineOperand &Dst = MI.getOperand(I);
620 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
621 .addReg(SrcReg, 0, SubRegs[I]);
622
623 // Make sure the subregister index is valid for the source register.
624 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
625 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
626 return false;
627
628 const TargetRegisterClass *DstRC =
629 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
630 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
631 return false;
632 }
633
634 MI.eraseFromParent();
635 return true;
636}
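// For reference, unmerging an s64 source into two s32 destinations with the
// loop above emits one COPY per destination (illustrative, names invented):
//   %dst0 = COPY %src.sub0
//   %dst1 = COPY %src.sub1
// re-constraining the source class so each used subregister index stays valid.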
637
638bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
639 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
640 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
641
642 Register Src0 = MI.getOperand(1).getReg();
643 Register Src1 = MI.getOperand(2).getReg();
644 LLT SrcTy = MRI->getType(Src0);
645 const unsigned SrcSize = SrcTy.getSizeInBits();
646
647 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
648 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
649 return selectG_MERGE_VALUES(MI);
650 }
651
652 // Selection logic below is for V2S16 only.
653 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
654 Register Dst = MI.getOperand(0).getReg();
655 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
656 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
657 SrcTy != LLT::scalar(32)))
658 return selectImpl(MI, *CoverageInfo);
659
660 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
661 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
662 return false;
663
664 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
665 DstBank->getID() == AMDGPU::VGPRRegBankID);
666 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
667
668 const DebugLoc &DL = MI.getDebugLoc();
669 MachineBasicBlock *BB = MI.getParent();
670
671 // First, before trying TableGen patterns, check if both sources are
672 // constants. In those cases, we can trivially compute the final constant
673 // and emit a simple move.
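 // For example, packing the s16 constants 1 and 2 gives the 32-bit immediate
 // 0x00020001 (low half = src0, high half = src1), emitted as a single
 // V_MOV_B32 or S_MOV_B32 depending on the destination bank.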
674 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
675 if (ConstSrc1) {
676 auto ConstSrc0 =
677 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
678 if (ConstSrc0) {
679 const int64_t K0 = ConstSrc0->Value.getSExtValue();
680 const int64_t K1 = ConstSrc1->Value.getSExtValue();
681 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
682 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
683 uint32_t Imm = Lo16 | (Hi16 << 16);
684
685 // VALU
686 if (IsVector) {
687 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
688 MI.eraseFromParent();
689 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
690 }
691
692 // SALU
693 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
694 MI.eraseFromParent();
695 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
696 }
697 }
698
699 // Now try TableGen patterns.
700 if (selectImpl(MI, *CoverageInfo))
701 return true;
702
703 // TODO: This should probably be a combine somewhere
704 // (build_vector $src0, undef) -> copy $src0
705 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
706 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
707 MI.setDesc(TII.get(AMDGPU::COPY));
708 MI.removeOperand(2);
709 const auto &RC =
710 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
711 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
712 RBI.constrainGenericRegister(Src0, RC, *MRI);
713 }
714
715 // TODO: Can be improved?
716 if (IsVector) {
717 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
718 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
719 .addImm(0xFFFF)
720 .addReg(Src0);
721 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
722 return false;
723
724 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
725 .addReg(Src1)
726 .addImm(16)
727 .addReg(TmpReg);
728 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
729 return false;
730
731 MI.eraseFromParent();
732 return true;
733 }
734
735 Register ShiftSrc0;
736 Register ShiftSrc1;
737
738 // With multiple uses of the shift, this will duplicate the shift and
739 // increase register pressure.
740 //
741 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
742 // => (S_PACK_HH_B32_B16 $src0, $src1)
743 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
744 // => (S_PACK_HL_B32_B16 $src0, $src1)
745 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
746 // => (S_PACK_LH_B32_B16 $src0, $src1)
747 // (build_vector $src0, $src1)
748 // => (S_PACK_LL_B32_B16 $src0, $src1)
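 // In these opcode names the two letters name the half taken from each source:
 // e.g. S_PACK_HL_B32_B16 packs the high 16 bits of $src0 with the low 16 bits
 // of $src1, which is why a one-use (lshr $srcN, 16) lets the shift fold away.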
749
750 bool Shift0 = mi_match(
751 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
752
753 bool Shift1 = mi_match(
754 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
755
756 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
757 if (Shift0 && Shift1) {
758 Opc = AMDGPU::S_PACK_HH_B32_B16;
759 MI.getOperand(1).setReg(ShiftSrc0);
760 MI.getOperand(2).setReg(ShiftSrc1);
761 } else if (Shift1) {
762 Opc = AMDGPU::S_PACK_LH_B32_B16;
763 MI.getOperand(2).setReg(ShiftSrc1);
764 } else if (Shift0) {
765 auto ConstSrc1 =
766 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
767 if (ConstSrc1 && ConstSrc1->Value == 0) {
768 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
769 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
770 .addReg(ShiftSrc0)
771 .addImm(16)
772 .setOperandDead(3); // Dead scc
773
774 MI.eraseFromParent();
775 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
776 }
777 if (STI.hasSPackHL()) {
778 Opc = AMDGPU::S_PACK_HL_B32_B16;
779 MI.getOperand(1).setReg(ShiftSrc0);
780 }
781 }
782
783 MI.setDesc(TII.get(Opc));
784 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
785}
786
787bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
788 const MachineOperand &MO = I.getOperand(0);
789
790 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
791 // regbank check here is to know why getConstrainedRegClassForOperand failed.
792 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
793 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
794 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
795 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
796 return true;
797 }
798
799 return false;
800}
801
802bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
803 MachineBasicBlock *BB = I.getParent();
804
805 Register DstReg = I.getOperand(0).getReg();
806 Register Src0Reg = I.getOperand(1).getReg();
807 Register Src1Reg = I.getOperand(2).getReg();
808 LLT Src1Ty = MRI->getType(Src1Reg);
809
810 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
811 unsigned InsSize = Src1Ty.getSizeInBits();
812
813 int64_t Offset = I.getOperand(3).getImm();
814
815 // FIXME: These cases should have been illegal and unnecessary to check here.
816 if (Offset % 32 != 0 || InsSize % 32 != 0)
817 return false;
818
819 // Currently not handled by getSubRegFromChannel.
820 if (InsSize > 128)
821 return false;
822
823 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
824 if (SubReg == AMDGPU::NoSubRegister)
825 return false;
826
827 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
828 const TargetRegisterClass *DstRC =
829 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
830 if (!DstRC)
831 return false;
832
833 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
834 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
835 const TargetRegisterClass *Src0RC =
836 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
837 const TargetRegisterClass *Src1RC =
838 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
839
840 // Deal with weird cases where the class only partially supports the subreg
841 // index.
842 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
843 if (!Src0RC || !Src1RC)
844 return false;
845
846 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
847 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
848 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
849 return false;
850
851 const DebugLoc &DL = I.getDebugLoc();
852 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
853 .addReg(Src0Reg)
854 .addReg(Src1Reg)
855 .addImm(SubReg);
856
857 I.eraseFromParent();
858 return true;
859}
860
861bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
862 Register DstReg = MI.getOperand(0).getReg();
863 Register SrcReg = MI.getOperand(1).getReg();
864 Register OffsetReg = MI.getOperand(2).getReg();
865 Register WidthReg = MI.getOperand(3).getReg();
866
867 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
868 "scalar BFX instructions are expanded in regbankselect");
869 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
870 "64-bit vector BFX instructions are expanded in regbankselect");
871
872 const DebugLoc &DL = MI.getDebugLoc();
873 MachineBasicBlock *MBB = MI.getParent();
874
875 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
876 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
877 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
878 .addReg(SrcReg)
879 .addReg(OffsetReg)
880 .addReg(WidthReg);
881 MI.eraseFromParent();
882 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
883}
884
885bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
886 if (STI.getLDSBankCount() != 16)
887 return selectImpl(MI, *CoverageInfo);
888
889 Register Dst = MI.getOperand(0).getReg();
890 Register Src0 = MI.getOperand(2).getReg();
891 Register M0Val = MI.getOperand(6).getReg();
892 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
893 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
894 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
895 return false;
896
897 // This requires 2 instructions. It is possible to write a pattern to support
898 // this, but the generated isel emitter doesn't correctly deal with multiple
899 // output instructions using the same physical register input. The copy to m0
900 // is incorrectly placed before the second instruction.
901 //
902 // TODO: Match source modifiers.
903
904 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
905 const DebugLoc &DL = MI.getDebugLoc();
906 MachineBasicBlock *MBB = MI.getParent();
907
908 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
909 .addReg(M0Val);
910 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
911 .addImm(2)
912 .addImm(MI.getOperand(4).getImm()) // $attr
913 .addImm(MI.getOperand(3).getImm()); // $attrchan
914
915 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
916 .addImm(0) // $src0_modifiers
917 .addReg(Src0) // $src0
918 .addImm(MI.getOperand(4).getImm()) // $attr
919 .addImm(MI.getOperand(3).getImm()) // $attrchan
920 .addImm(0) // $src2_modifiers
921 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
922 .addImm(MI.getOperand(5).getImm()) // $high
923 .addImm(0) // $clamp
924 .addImm(0); // $omod
925
926 MI.eraseFromParent();
927 return true;
928}
929
930// Writelane is special in that it can use SGPR and M0 (which would normally
931// count as using the constant bus twice - but in this case it is allowed since
932// the lane selector doesn't count as a use of the constant bus). However, it is
933// still required to abide by the 1 SGPR rule. Fix this up if we might have
934// multiple SGPRs.
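// Concretely: a constant lane selector is encoded as an inline immediate
// (masked to the wave size, e.g. idx & 31 in wave32); if instead the written
// value is an inline immediate, the SGPR lane selector can be used directly;
// otherwise the lane selector is routed through M0.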
935bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
936 // With a constant bus limit of at least 2, there's no issue.
937 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
938 return selectImpl(MI, *CoverageInfo);
939
940 MachineBasicBlock *MBB = MI.getParent();
941 const DebugLoc &DL = MI.getDebugLoc();
942 Register VDst = MI.getOperand(0).getReg();
943 Register Val = MI.getOperand(2).getReg();
944 Register LaneSelect = MI.getOperand(3).getReg();
945 Register VDstIn = MI.getOperand(4).getReg();
946
947 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
948
949 std::optional<ValueAndVReg> ConstSelect =
950 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
951 if (ConstSelect) {
952 // The selector has to be an inline immediate, so we can use whatever for
953 // the other operands.
954 MIB.addReg(Val);
955 MIB.addImm(ConstSelect->Value.getSExtValue() &
956 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
957 } else {
958 std::optional<ValueAndVReg> ConstVal =
959 getIConstantVRegValWithLookThrough(Val, *MRI);
960
961 // If the value written is an inline immediate, we can get away without a
962 // copy to m0.
963 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
964 STI.hasInv2PiInlineImm())) {
965 MIB.addImm(ConstVal->Value.getSExtValue());
966 MIB.addReg(LaneSelect);
967 } else {
968 MIB.addReg(Val);
969
970 // If the lane selector was originally in a VGPR and copied with
971 // readfirstlane, there's a hazard to read the same SGPR from the
972 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
973 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
974
975 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
976 .addReg(LaneSelect);
977 MIB.addReg(AMDGPU::M0);
978 }
979 }
980
981 MIB.addReg(VDstIn);
982
983 MI.eraseFromParent();
984 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
985}
986
987// We need to handle this here because tablegen doesn't support matching
988// instructions with multiple outputs.
989bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
990 Register Dst0 = MI.getOperand(0).getReg();
991 Register Dst1 = MI.getOperand(1).getReg();
992
993 LLT Ty = MRI->getType(Dst0);
994 unsigned Opc;
995 if (Ty == LLT::scalar(32))
996 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
997 else if (Ty == LLT::scalar(64))
998 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
999 else
1000 return false;
1001
1002 // TODO: Match source modifiers.
1003
1004 const DebugLoc &DL = MI.getDebugLoc();
1005 MachineBasicBlock *MBB = MI.getParent();
1006
1007 Register Numer = MI.getOperand(3).getReg();
1008 Register Denom = MI.getOperand(4).getReg();
1009 unsigned ChooseDenom = MI.getOperand(5).getImm();
1010
1011 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1012
1013 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1014 .addDef(Dst1)
1015 .addImm(0) // $src0_modifiers
1016 .addUse(Src0) // $src0
1017 .addImm(0) // $src1_modifiers
1018 .addUse(Denom) // $src1
1019 .addImm(0) // $src2_modifiers
1020 .addUse(Numer) // $src2
1021 .addImm(0) // $clamp
1022 .addImm(0); // $omod
1023
1024 MI.eraseFromParent();
1025 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1026}
1027
1028bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1029 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1030 switch (IntrinsicID) {
1031 case Intrinsic::amdgcn_if_break: {
1032 MachineBasicBlock *BB = I.getParent();
1033
1034 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1035 // SelectionDAG uses for wave32 vs wave64.
1036 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1037 .add(I.getOperand(0))
1038 .add(I.getOperand(2))
1039 .add(I.getOperand(3));
1040
1041 Register DstReg = I.getOperand(0).getReg();
1042 Register Src0Reg = I.getOperand(2).getReg();
1043 Register Src1Reg = I.getOperand(3).getReg();
1044
1045 I.eraseFromParent();
1046
1047 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1048 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1049
1050 return true;
1051 }
1052 case Intrinsic::amdgcn_interp_p1_f16:
1053 return selectInterpP1F16(I);
1054 case Intrinsic::amdgcn_wqm:
1055 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1056 case Intrinsic::amdgcn_softwqm:
1057 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1058 case Intrinsic::amdgcn_strict_wwm:
1059 case Intrinsic::amdgcn_wwm:
1060 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1061 case Intrinsic::amdgcn_strict_wqm:
1062 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1063 case Intrinsic::amdgcn_writelane:
1064 return selectWritelane(I);
1065 case Intrinsic::amdgcn_div_scale:
1066 return selectDivScale(I);
1067 case Intrinsic::amdgcn_icmp:
1068 case Intrinsic::amdgcn_fcmp:
1069 if (selectImpl(I, *CoverageInfo))
1070 return true;
1071 return selectIntrinsicCmp(I);
1072 case Intrinsic::amdgcn_ballot:
1073 return selectBallot(I);
1074 case Intrinsic::amdgcn_reloc_constant:
1075 return selectRelocConstant(I);
1076 case Intrinsic::amdgcn_groupstaticsize:
1077 return selectGroupStaticSize(I);
1078 case Intrinsic::returnaddress:
1079 return selectReturnAddress(I);
1080 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1081 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1082 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1083 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1084 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1085 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1086 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1087 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1088 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1089 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1090 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1091 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1092 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1093 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1094 return selectSMFMACIntrin(I);
1095 default:
1096 return selectImpl(I, *CoverageInfo);
1097 }
1098}
1099
1100static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1101 const GCNSubtarget &ST) {
1102 if (Size != 16 && Size != 32 && Size != 64)
1103 return -1;
1104
1105 if (Size == 16 && !ST.has16BitInsts())
1106 return -1;
1107
1108 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
1109 unsigned S64Opc) {
1110 if (Size == 16)
1111 return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
1112 if (Size == 32)
1113 return S32Opc;
1114 return S64Opc;
1115 };
1116
1117 switch (P) {
1118 default:
1119 llvm_unreachable("Unknown condition code!");
1120 case CmpInst::ICMP_NE:
1121 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1122 AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
1123 case CmpInst::ICMP_EQ:
1124 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1125 AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
1126 case CmpInst::ICMP_SGT:
1127 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1128 AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
1129 case CmpInst::ICMP_SGE:
1130 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1131 AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
1132 case CmpInst::ICMP_SLT:
1133 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1134 AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
1135 case CmpInst::ICMP_SLE:
1136 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1137 AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
1138 case CmpInst::ICMP_UGT:
1139 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1140 AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
1141 case CmpInst::ICMP_UGE:
1142 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1143 AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
1144 case CmpInst::ICMP_ULT:
1145 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1146 AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
1147 case CmpInst::ICMP_ULE:
1148 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1149 AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);
1150
1151 case CmpInst::FCMP_OEQ:
1152 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1153 AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
1154 case CmpInst::FCMP_OGT:
1155 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1156 AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
1157 case CmpInst::FCMP_OGE:
1158 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1159 AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
1160 case CmpInst::FCMP_OLT:
1161 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1162 AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
1163 case CmpInst::FCMP_OLE:
1164 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1165 AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
1166 case CmpInst::FCMP_ONE:
1167 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1168 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1169 case CmpInst::FCMP_ORD:
1170 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1171 AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
1172 case CmpInst::FCMP_UNO:
1173 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1174 AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
1175 case CmpInst::FCMP_UEQ:
1176 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1177 AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
1178 case CmpInst::FCMP_UGT:
1179 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1180 AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
1181 case CmpInst::FCMP_UGE:
1182 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1183 AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
1184 case CmpInst::FCMP_ULT:
1185 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1186 AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
1187 case CmpInst::FCMP_ULE:
1188 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1189 AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
1190 case CmpInst::FCMP_UNE:
1191 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1192 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1193 case CmpInst::FCMP_TRUE:
1194 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1195 AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
1196 case CmpInst::FCMP_FALSE:
1197 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1198 AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
1199 }
1200}
1201
1202int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1203 unsigned Size) const {
1204 if (Size == 64) {
1205 if (!STI.hasScalarCompareEq64())
1206 return -1;
1207
1208 switch (P) {
1209 case CmpInst::ICMP_NE:
1210 return AMDGPU::S_CMP_LG_U64;
1211 case CmpInst::ICMP_EQ:
1212 return AMDGPU::S_CMP_EQ_U64;
1213 default:
1214 return -1;
1215 }
1216 }
1217
1218 if (Size == 32) {
1219 switch (P) {
1220 case CmpInst::ICMP_NE:
1221 return AMDGPU::S_CMP_LG_U32;
1222 case CmpInst::ICMP_EQ:
1223 return AMDGPU::S_CMP_EQ_U32;
1224 case CmpInst::ICMP_SGT:
1225 return AMDGPU::S_CMP_GT_I32;
1226 case CmpInst::ICMP_SGE:
1227 return AMDGPU::S_CMP_GE_I32;
1228 case CmpInst::ICMP_SLT:
1229 return AMDGPU::S_CMP_LT_I32;
1230 case CmpInst::ICMP_SLE:
1231 return AMDGPU::S_CMP_LE_I32;
1232 case CmpInst::ICMP_UGT:
1233 return AMDGPU::S_CMP_GT_U32;
1234 case CmpInst::ICMP_UGE:
1235 return AMDGPU::S_CMP_GE_U32;
1236 case CmpInst::ICMP_ULT:
1237 return AMDGPU::S_CMP_LT_U32;
1238 case CmpInst::ICMP_ULE:
1239 return AMDGPU::S_CMP_LE_U32;
1240 case CmpInst::FCMP_OEQ:
1241 return AMDGPU::S_CMP_EQ_F32;
1242 case CmpInst::FCMP_OGT:
1243 return AMDGPU::S_CMP_GT_F32;
1244 case CmpInst::FCMP_OGE:
1245 return AMDGPU::S_CMP_GE_F32;
1246 case CmpInst::FCMP_OLT:
1247 return AMDGPU::S_CMP_LT_F32;
1248 case CmpInst::FCMP_OLE:
1249 return AMDGPU::S_CMP_LE_F32;
1250 case CmpInst::FCMP_ONE:
1251 return AMDGPU::S_CMP_LG_F32;
1252 case CmpInst::FCMP_ORD:
1253 return AMDGPU::S_CMP_O_F32;
1254 case CmpInst::FCMP_UNO:
1255 return AMDGPU::S_CMP_U_F32;
1256 case CmpInst::FCMP_UEQ:
1257 return AMDGPU::S_CMP_NLG_F32;
1258 case CmpInst::FCMP_UGT:
1259 return AMDGPU::S_CMP_NLE_F32;
1260 case CmpInst::FCMP_UGE:
1261 return AMDGPU::S_CMP_NLT_F32;
1262 case CmpInst::FCMP_ULT:
1263 return AMDGPU::S_CMP_NGE_F32;
1264 case CmpInst::FCMP_ULE:
1265 return AMDGPU::S_CMP_NGT_F32;
1266 case CmpInst::FCMP_UNE:
1267 return AMDGPU::S_CMP_NEQ_F32;
1268 default:
1269 llvm_unreachable("Unknown condition code!");
1270 }
1271 }
1272
1273 if (Size == 16) {
1274 if (!STI.hasSALUFloatInsts())
1275 return -1;
1276
1277 switch (P) {
1278 case CmpInst::FCMP_OEQ:
1279 return AMDGPU::S_CMP_EQ_F16;
1280 case CmpInst::FCMP_OGT:
1281 return AMDGPU::S_CMP_GT_F16;
1282 case CmpInst::FCMP_OGE:
1283 return AMDGPU::S_CMP_GE_F16;
1284 case CmpInst::FCMP_OLT:
1285 return AMDGPU::S_CMP_LT_F16;
1286 case CmpInst::FCMP_OLE:
1287 return AMDGPU::S_CMP_LE_F16;
1288 case CmpInst::FCMP_ONE:
1289 return AMDGPU::S_CMP_LG_F16;
1290 case CmpInst::FCMP_ORD:
1291 return AMDGPU::S_CMP_O_F16;
1292 case CmpInst::FCMP_UNO:
1293 return AMDGPU::S_CMP_U_F16;
1294 case CmpInst::FCMP_UEQ:
1295 return AMDGPU::S_CMP_NLG_F16;
1296 case CmpInst::FCMP_UGT:
1297 return AMDGPU::S_CMP_NLE_F16;
1298 case CmpInst::FCMP_UGE:
1299 return AMDGPU::S_CMP_NLT_F16;
1300 case CmpInst::FCMP_ULT:
1301 return AMDGPU::S_CMP_NGE_F16;
1302 case CmpInst::FCMP_ULE:
1303 return AMDGPU::S_CMP_NGT_F16;
1304 case CmpInst::FCMP_UNE:
1305 return AMDGPU::S_CMP_NEQ_F16;
1306 default:
1307 llvm_unreachable("Unknown condition code!");
1308 }
1309 }
1310
1311 return -1;
1312}
1313
1314bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1315
1316 MachineBasicBlock *BB = I.getParent();
1317 const DebugLoc &DL = I.getDebugLoc();
1318
1319 Register SrcReg = I.getOperand(2).getReg();
1320 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1321
1322 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1323
1324 Register CCReg = I.getOperand(0).getReg();
1325 if (!isVCC(CCReg, *MRI)) {
1326 int Opcode = getS_CMPOpcode(Pred, Size);
1327 if (Opcode == -1)
1328 return false;
1329 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1330 .add(I.getOperand(2))
1331 .add(I.getOperand(3));
1332 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1333 .addReg(AMDGPU::SCC);
1334 bool Ret =
1335 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1336 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1337 I.eraseFromParent();
1338 return Ret;
1339 }
1340
1341 if (I.getOpcode() == AMDGPU::G_FCMP)
1342 return false;
1343
1344 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1345 if (Opcode == -1)
1346 return false;
1347
1348 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1349 I.getOperand(0).getReg())
1350 .add(I.getOperand(2))
1351 .add(I.getOperand(3));
1352 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1353 *TRI.getBoolRC(), *MRI);
1354 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1355 I.eraseFromParent();
1356 return Ret;
1357}
1358
1359bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1360 Register Dst = I.getOperand(0).getReg();
1361 if (isVCC(Dst, *MRI))
1362 return false;
1363
1364 LLT DstTy = MRI->getType(Dst);
1365 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1366 return false;
1367
1368 MachineBasicBlock *BB = I.getParent();
1369 const DebugLoc &DL = I.getDebugLoc();
1370 Register SrcReg = I.getOperand(2).getReg();
1371 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1372
1373 // i1 inputs are not supported in GlobalISel.
1374 if (Size == 1)
1375 return false;
1376
1377 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1378 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1379 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1380 I.eraseFromParent();
1381 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1382 }
1383
1384 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1385 if (Opcode == -1)
1386 return false;
1387
1388 MachineInstrBuilder SelectedMI;
1389 MachineOperand &LHS = I.getOperand(2);
1390 MachineOperand &RHS = I.getOperand(3);
1391 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1392 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1393 Register Src0Reg =
1394 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1395 Register Src1Reg =
1396 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1397 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1398 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1399 SelectedMI.addImm(Src0Mods);
1400 SelectedMI.addReg(Src0Reg);
1401 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1402 SelectedMI.addImm(Src1Mods);
1403 SelectedMI.addReg(Src1Reg);
1404 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1405 SelectedMI.addImm(0); // clamp
1406 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1407 SelectedMI.addImm(0); // op_sel
1408
1409 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1410 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1411 return false;
1412
1413 I.eraseFromParent();
1414 return true;
1415}
1416
1417bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1418 MachineBasicBlock *BB = I.getParent();
1419 const DebugLoc &DL = I.getDebugLoc();
1420 Register DstReg = I.getOperand(0).getReg();
1421 const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1422 const bool Is64 = Size == 64;
1423 const bool IsWave32 = (STI.getWavefrontSize() == 32);
1424
1425 // In the common case, the return type matches the wave size.
1426 // However we also support emitting i64 ballots in wave32 mode.
1427 if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1428 return false;
1429
1430 std::optional<ValueAndVReg> Arg =
1431 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1432
1433 const auto BuildCopy = [&](Register SrcReg) {
1434 if (Size == STI.getWavefrontSize()) {
1435 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1436 .addReg(SrcReg);
1437 return;
1438 }
1439
1440 // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
1441 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1442 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1443 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1444 .addReg(SrcReg)
1445 .addImm(AMDGPU::sub0)
1446 .addReg(HiReg)
1447 .addImm(AMDGPU::sub1);
1448 };
1449
1450 if (Arg) {
1451 const int64_t Value = Arg->Value.getSExtValue();
1452 if (Value == 0) {
1453 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1454 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1455 } else if (Value == -1) // all ones
1456 BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1457 else
1458 return false;
1459 } else
1460 BuildCopy(I.getOperand(2).getReg());
1461
1462 I.eraseFromParent();
1463 return true;
1464}
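// For reference: a constant-true argument copies EXEC (EXEC_LO in wave32),
// a constant-false argument materializes 0, any other constant is rejected,
// and a non-constant argument (already a lane mask here) is copied as-is.
// A 64-bit ballot in wave32 mode widens the 32-bit mask with a zero high half
// via REG_SEQUENCE.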
1465
1466bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1467 Register DstReg = I.getOperand(0).getReg();
1468 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1469 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1470 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1471 return false;
1472
1473 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1474
1475 Module *M = MF->getFunction().getParent();
1476 const MDNode *Metadata = I.getOperand(2).getMetadata();
1477 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1478 auto RelocSymbol = cast<GlobalVariable>(
1479 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1480
1481 MachineBasicBlock *BB = I.getParent();
1482 BuildMI(*BB, &I, I.getDebugLoc(),
1483 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1484 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1485
1486 I.eraseFromParent();
1487 return true;
1488}
1489
1490bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1491 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1492
1493 Register DstReg = I.getOperand(0).getReg();
1494 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1495 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1496 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1497
1498 MachineBasicBlock *MBB = I.getParent();
1499 const DebugLoc &DL = I.getDebugLoc();
1500
1501 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1502
1503 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1504 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1505 MIB.addImm(MFI->getLDSSize());
1506 } else {
1507 Module *M = MF->getFunction().getParent();
1508 const GlobalValue *GV
1509 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1510 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1511 }
1512
1513 I.eraseFromParent();
1514 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1515}
1516
1517bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1518 MachineBasicBlock *MBB = I.getParent();
1519 MachineFunction &MF = *MBB->getParent();
1520 const DebugLoc &DL = I.getDebugLoc();
1521
1522 MachineOperand &Dst = I.getOperand(0);
1523 Register DstReg = Dst.getReg();
1524 unsigned Depth = I.getOperand(2).getImm();
1525
1526 const TargetRegisterClass *RC
1527 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1528 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1529 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1530 return false;
1531
1532 // Check for kernel and shader functions
1533 if (Depth != 0 ||
1534 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1535 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1536 .addImm(0);
1537 I.eraseFromParent();
1538 return true;
1539 }
1540
1541 MachineFrameInfo &MFI = MF.getFrameInfo();
1542 // There is a call to @llvm.returnaddress in this function
1543 MFI.setReturnAddressIsTaken(true);
1544
1545 // Get the return address reg and mark it as an implicit live-in
1546 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1547 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1548 AMDGPU::SReg_64RegClass, DL);
1549 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1550 .addReg(LiveIn);
1551 I.eraseFromParent();
1552 return true;
1553}
1554
1555bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1556 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1557 // SelectionDAG uses for wave32 vs wave64.
1558 MachineBasicBlock *BB = MI.getParent();
1559 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1560 .add(MI.getOperand(1));
1561
1562 Register Reg = MI.getOperand(1).getReg();
1563 MI.eraseFromParent();
1564
1565 if (!MRI->getRegClassOrNull(Reg))
1566 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1567 return true;
1568}
1569
1570bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1571 MachineInstr &MI, Intrinsic::ID IntrID) const {
1572 MachineBasicBlock *MBB = MI.getParent();
1574 const DebugLoc &DL = MI.getDebugLoc();
1575
1576 unsigned IndexOperand = MI.getOperand(7).getImm();
1577 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1578 bool WaveDone = MI.getOperand(9).getImm() != 0;
1579
1580 if (WaveDone && !WaveRelease)
1581 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1582
1583 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1584 IndexOperand &= ~0x3f;
1585 unsigned CountDw = 0;
1586
1587 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1588 CountDw = (IndexOperand >> 24) & 0xf;
1589 IndexOperand &= ~(0xf << 24);
1590
1591 if (CountDw < 1 || CountDw > 4) {
1593 "ds_ordered_count: dword count must be between 1 and 4");
1594 }
1595 }
1596
1597 if (IndexOperand)
1598 report_fatal_error("ds_ordered_count: bad index operand");
1599
1600 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1601 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1602
1603 unsigned Offset0 = OrderedCountIndex << 2;
1604 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1605
1606 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1607 Offset1 |= (CountDw - 1) << 6;
1608
1609 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1610 Offset1 |= ShaderType << 2;
1611
1612 unsigned Offset = Offset0 | (Offset1 << 8);
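 // Worked example (illustrative): ds_ordered_add with index 1, wave_release
 // set, wave_done clear, a dword count of 1 and shader type 0 gives
 // Offset0 = 1 << 2 = 4 and Offset1 = 1, so Offset = 4 | (1 << 8) = 0x104.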
1613
1614 Register M0Val = MI.getOperand(2).getReg();
1615 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1616 .addReg(M0Val);
1617
1618 Register DstReg = MI.getOperand(0).getReg();
1619 Register ValReg = MI.getOperand(3).getReg();
1620 auto DS =
1621 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1622 .addReg(ValReg)
1623 .addImm(Offset)
1624 .cloneMemRefs(MI);
1625
1626 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1627 return false;
1628
1629 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1630 MI.eraseFromParent();
1631 return Ret;
1632}
1633
1634static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1635 switch (IntrID) {
1636 case Intrinsic::amdgcn_ds_gws_init:
1637 return AMDGPU::DS_GWS_INIT;
1638 case Intrinsic::amdgcn_ds_gws_barrier:
1639 return AMDGPU::DS_GWS_BARRIER;
1640 case Intrinsic::amdgcn_ds_gws_sema_v:
1641 return AMDGPU::DS_GWS_SEMA_V;
1642 case Intrinsic::amdgcn_ds_gws_sema_br:
1643 return AMDGPU::DS_GWS_SEMA_BR;
1644 case Intrinsic::amdgcn_ds_gws_sema_p:
1645 return AMDGPU::DS_GWS_SEMA_P;
1646 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1647 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1648 default:
1649 llvm_unreachable("not a gws intrinsic");
1650 }
1651}
1652
1653bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1654 Intrinsic::ID IID) const {
1655 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1656 !STI.hasGWSSemaReleaseAll()))
1657 return false;
1658
1659 // intrinsic ID, vsrc, offset
1660 const bool HasVSrc = MI.getNumOperands() == 3;
1661 assert(HasVSrc || MI.getNumOperands() == 2);
1662
1663 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1664 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1665 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1666 return false;
1667
1668 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1669 unsigned ImmOffset;
1670
1671 MachineBasicBlock *MBB = MI.getParent();
1672 const DebugLoc &DL = MI.getDebugLoc();
1673
1674 MachineInstr *Readfirstlane = nullptr;
1675
1676 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1677 // incoming offset, in case there's an add of a constant. We'll have to put it
1678 // back later.
1679 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1680 Readfirstlane = OffsetDef;
1681 BaseOffset = OffsetDef->getOperand(1).getReg();
1682 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1683 }
1684
1685 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1686 // If we have a constant offset, try to use the 0 in m0 as the base.
1687 // TODO: Look into changing the default m0 initialization value. If the
1688 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1689 // the immediate offset.
1690
1691 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1692 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1693 .addImm(0);
1694 } else {
1695 std::tie(BaseOffset, ImmOffset) =
1696 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1697
1698 if (Readfirstlane) {
1699 // We have the constant offset now, so put the readfirstlane back on the
1700 // variable component.
1701 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1702 return false;
1703
1704 Readfirstlane->getOperand(1).setReg(BaseOffset);
1705 BaseOffset = Readfirstlane->getOperand(0).getReg();
1706 } else {
1707 if (!RBI.constrainGenericRegister(BaseOffset,
1708 AMDGPU::SReg_32RegClass, *MRI))
1709 return false;
1710 }
1711
1712 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1713 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1714 .addReg(BaseOffset)
1715 .addImm(16)
1716 .setOperandDead(3); // Dead scc
1717
1718 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1719 .addReg(M0Base);
1720 }
1721
1722 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1723 // offset field) % 64. Some versions of the programming guide omit the m0
1724 // part, or claim it's from offset 0.
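 // Worked example of that split (illustrative values and register names
 // only): if the incoming offset is %base + 5 and the constant is peeled off
 // above, then ImmOffset = 5 goes into the instruction's offset field while
 // %base is shifted with S_LSHL_B32 %base, 16 so its low bits land in
 // m0[21:16]; the hardware then adds m0[21:16], the offset field and its
 // opaque base, modulo 64.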
1725 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1726
1727 if (HasVSrc) {
1728 Register VSrc = MI.getOperand(1).getReg();
1729 MIB.addReg(VSrc);
1730
1731 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1732 return false;
1733 }
1734
1735 MIB.addImm(ImmOffset)
1736 .cloneMemRefs(MI);
1737
1738 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1739
1740 MI.eraseFromParent();
1741 return true;
1742}
1743
1744bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1745 bool IsAppend) const {
1746 Register PtrBase = MI.getOperand(2).getReg();
1747 LLT PtrTy = MRI->getType(PtrBase);
1748 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1749
1750 unsigned Offset;
1751 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1752
1753 // TODO: Should this try to look through readfirstlane like GWS?
1754 if (!isDSOffsetLegal(PtrBase, Offset)) {
1755 PtrBase = MI.getOperand(2).getReg();
1756 Offset = 0;
1757 }
1758
1759 MachineBasicBlock *MBB = MI.getParent();
1760 const DebugLoc &DL = MI.getDebugLoc();
1761 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1762
1763 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1764 .addReg(PtrBase);
1765 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1766 return false;
1767
1768 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1769 .addImm(Offset)
1770 .addImm(IsGDS ? -1 : 0)
1771 .cloneMemRefs(MI);
1772 MI.eraseFromParent();
1773 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1774}
1775
1776bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1777 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1778 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1779 if (WGSize <= STI.getWavefrontSize()) {
1780 MachineBasicBlock *MBB = MI.getParent();
1781 const DebugLoc &DL = MI.getDebugLoc();
1782 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1783 MI.eraseFromParent();
1784 return true;
1785 }
1786 }
1787
1788 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1789 if (STI.hasSplitBarriers()) {
1790 MachineBasicBlock *MBB = MI.getParent();
1791 const DebugLoc &DL = MI.getDebugLoc();
1792 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1793 .addImm(AMDGPU::Barrier::WORKGROUP);
1794 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1795 .addImm(AMDGPU::Barrier::WORKGROUP);
1796 MI.eraseFromParent();
1797 return true;
1798 }
1799
1800 return selectImpl(MI, *CoverageInfo);
1801}
1802
1803static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1804 bool &IsTexFail) {
1805 if (TexFailCtrl)
1806 IsTexFail = true;
1807
1808 TFE = (TexFailCtrl & 0x1) ? true : false;
1809 TexFailCtrl &= ~(uint64_t)0x1;
1810 LWE = (TexFailCtrl & 0x2) ? true : false;
1811 TexFailCtrl &= ~(uint64_t)0x2;
1812
1813 return TexFailCtrl == 0;
1814}
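// Minimal usage sketch for parseTexFail (hypothetical values): a texfailctrl
// immediate of 3 sets both bits, so TFE and LWE come back true, IsTexFail is
// set, and the function returns true because no unknown bits remain; a value
// such as 4 leaves an unhandled bit behind and the function returns false.
//   bool TFE, LWE, IsTexFail = false;
//   bool Ok = parseTexFail(/*TexFailCtrl=*/3, TFE, LWE, IsTexFail);
//   // Ok == true, TFE == true, LWE == true, IsTexFail == true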
1815
1816 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1817 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1818 MachineBasicBlock *MBB = MI.getParent();
1819 const DebugLoc &DL = MI.getDebugLoc();
1820
1821 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1822 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1823
1824 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1825 unsigned IntrOpcode = Intr->BaseOpcode;
1826 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1827 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1828 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1829
1830 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1831
1832 Register VDataIn, VDataOut;
1833 LLT VDataTy;
1834 int NumVDataDwords = -1;
1835 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1836 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1837
1838 bool Unorm;
1839 if (!BaseOpcode->Sampler)
1840 Unorm = true;
1841 else
1842 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1843
1844 bool TFE;
1845 bool LWE;
1846 bool IsTexFail = false;
1847 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1848 TFE, LWE, IsTexFail))
1849 return false;
1850
1851 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1852 const bool IsA16 = (Flags & 1) != 0;
1853 const bool IsG16 = (Flags & 2) != 0;
1854
1855 // A16 implies 16-bit gradients if the subtarget doesn't support G16.
1856 if (IsA16 && !STI.hasG16() && !IsG16)
1857 return false;
1858
1859 unsigned DMask = 0;
1860 unsigned DMaskLanes = 0;
1861
1862 if (BaseOpcode->Atomic) {
1863 VDataOut = MI.getOperand(0).getReg();
1864 VDataIn = MI.getOperand(2).getReg();
1865 LLT Ty = MRI->getType(VDataIn);
1866
1867 // Be careful to allow atomic swap on 16-bit element vectors.
1868 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1869 Ty.getSizeInBits() == 128 :
1870 Ty.getSizeInBits() == 64;
1871
1872 if (BaseOpcode->AtomicX2) {
1873 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1874
1875 DMask = Is64Bit ? 0xf : 0x3;
1876 NumVDataDwords = Is64Bit ? 4 : 2;
1877 } else {
1878 DMask = Is64Bit ? 0x3 : 0x1;
1879 NumVDataDwords = Is64Bit ? 2 : 1;
1880 }
1881 } else {
1882 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1883 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1884
1885 if (BaseOpcode->Store) {
1886 VDataIn = MI.getOperand(1).getReg();
1887 VDataTy = MRI->getType(VDataIn);
1888 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1889 } else if (BaseOpcode->NoReturn) {
1890 NumVDataDwords = 0;
1891 } else {
1892 VDataOut = MI.getOperand(0).getReg();
1893 VDataTy = MRI->getType(VDataOut);
1894 NumVDataDwords = DMaskLanes;
1895
1896 if (IsD16 && !STI.hasUnpackedD16VMem())
1897 NumVDataDwords = (DMaskLanes + 1) / 2;
1898 }
1899 }
1900
1901 // Set G16 opcode
1902 if (Subtarget->hasG16() && IsG16) {
1903 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1904 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1905 assert(G16MappingInfo);
1906 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1907 }
1908
1909 // TODO: Check this in verifier.
1910 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1911
1912 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1913 if (BaseOpcode->Atomic)
1914 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1915 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
1916 AMDGPU::CPol::VOLATILE))
1917 return false;
1918
1919 int NumVAddrRegs = 0;
1920 int NumVAddrDwords = 0;
1921 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1922 // Skip the $noregs and 0s inserted during legalization.
1923 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1924 if (!AddrOp.isReg())
1925 continue; // XXX - Break?
1926
1927 Register Addr = AddrOp.getReg();
1928 if (!Addr)
1929 break;
1930
1931 ++NumVAddrRegs;
1932 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1933 }
1934
1935 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1936 // NSA, these should have been packed into a single value in the first
1937 // address register
1938 const bool UseNSA =
1939 NumVAddrRegs != 1 &&
1940 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
1941 : NumVAddrDwords == NumVAddrRegs);
1942 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1943 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1944 return false;
1945 }
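 // Illustrative NSA decision (assumed operand shapes): three separate 32-bit
 // address registers give NumVAddrRegs == 3 and NumVAddrDwords == 3, so
 // UseNSA is true on targets with the NSA encoding; if the legalizer instead
 // packed the coordinates into one 96-bit register, NumVAddrRegs == 1 and the
 // non-NSA (contiguous vaddr) form is chosen.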
1946
1947 if (IsTexFail)
1948 ++NumVDataDwords;
1949
1950 int Opcode = -1;
1951 if (IsGFX12Plus) {
1952 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
1953 NumVDataDwords, NumVAddrDwords);
1954 } else if (IsGFX11Plus) {
1955 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1956 UseNSA ? AMDGPU::MIMGEncGfx11NSA
1957 : AMDGPU::MIMGEncGfx11Default,
1958 NumVDataDwords, NumVAddrDwords);
1959 } else if (IsGFX10Plus) {
1960 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1961 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1962 : AMDGPU::MIMGEncGfx10Default,
1963 NumVDataDwords, NumVAddrDwords);
1964 } else {
1965 if (Subtarget->hasGFX90AInsts()) {
1966 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
1967 NumVDataDwords, NumVAddrDwords);
1968 if (Opcode == -1) {
1969 LLVM_DEBUG(
1970 dbgs()
1971 << "requested image instruction is not supported on this GPU\n");
1972 return false;
1973 }
1974 }
1975 if (Opcode == -1 &&
1976 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1977 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1978 NumVDataDwords, NumVAddrDwords);
1979 if (Opcode == -1)
1980 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1981 NumVDataDwords, NumVAddrDwords);
1982 }
1983 if (Opcode == -1)
1984 return false;
1985
1986 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1987 .cloneMemRefs(MI);
1988
1989 if (VDataOut) {
1990 if (BaseOpcode->AtomicX2) {
1991 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1992
1993 Register TmpReg = MRI->createVirtualRegister(
1994 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1995 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1996
1997 MIB.addDef(TmpReg);
1998 if (!MRI->use_empty(VDataOut)) {
1999 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2000 .addReg(TmpReg, RegState::Kill, SubReg);
2001 }
2002
2003 } else {
2004 MIB.addDef(VDataOut); // vdata output
2005 }
2006 }
2007
2008 if (VDataIn)
2009 MIB.addReg(VDataIn); // vdata input
2010
2011 for (int I = 0; I != NumVAddrRegs; ++I) {
2012 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2013 if (SrcOp.isReg()) {
2014 assert(SrcOp.getReg() != 0);
2015 MIB.addReg(SrcOp.getReg());
2016 }
2017 }
2018
2019 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2020 if (BaseOpcode->Sampler)
2021 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2022
2023 MIB.addImm(DMask); // dmask
2024
2025 if (IsGFX10Plus)
2026 MIB.addImm(DimInfo->Encoding);
2027 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2028 MIB.addImm(Unorm);
2029
2030 MIB.addImm(CPol);
2031 MIB.addImm(IsA16 && // a16 or r128
2032 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2033 if (IsGFX10Plus)
2034 MIB.addImm(IsA16 ? -1 : 0);
2035
2036 if (!Subtarget->hasGFX90AInsts()) {
2037 MIB.addImm(TFE); // tfe
2038 } else if (TFE) {
2039 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2040 return false;
2041 }
2042
2043 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2044 MIB.addImm(LWE); // lwe
2045 if (!IsGFX10Plus)
2046 MIB.addImm(DimInfo->DA ? -1 : 0);
2047 if (BaseOpcode->HasD16)
2048 MIB.addImm(IsD16 ? -1 : 0);
2049
2050 MI.eraseFromParent();
2051 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2052 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2053 return true;
2054}
2055
2056// We need to handle this here because tablegen doesn't support matching
2057// instructions with multiple outputs.
2058bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2059 MachineInstr &MI) const {
2060 Register Dst0 = MI.getOperand(0).getReg();
2061 Register Dst1 = MI.getOperand(1).getReg();
2062
2063 const DebugLoc &DL = MI.getDebugLoc();
2064 MachineBasicBlock *MBB = MI.getParent();
2065
2066 Register Addr = MI.getOperand(3).getReg();
2067 Register Data0 = MI.getOperand(4).getReg();
2068 Register Data1 = MI.getOperand(5).getReg();
2069 unsigned Offset = MI.getOperand(6).getImm();
2070
2071 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2072 .addDef(Dst1)
2073 .addUse(Addr)
2074 .addUse(Data0)
2075 .addUse(Data1)
2076 .addImm(Offset)
2077 .cloneMemRefs(MI);
2078
2079 MI.eraseFromParent();
2080 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2081}
2082
2083bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2084 MachineInstr &I) const {
2085 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2086 switch (IntrinsicID) {
2087 case Intrinsic::amdgcn_end_cf:
2088 return selectEndCfIntrinsic(I);
2089 case Intrinsic::amdgcn_ds_ordered_add:
2090 case Intrinsic::amdgcn_ds_ordered_swap:
2091 return selectDSOrderedIntrinsic(I, IntrinsicID);
2092 case Intrinsic::amdgcn_ds_gws_init:
2093 case Intrinsic::amdgcn_ds_gws_barrier:
2094 case Intrinsic::amdgcn_ds_gws_sema_v:
2095 case Intrinsic::amdgcn_ds_gws_sema_br:
2096 case Intrinsic::amdgcn_ds_gws_sema_p:
2097 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2098 return selectDSGWSIntrinsic(I, IntrinsicID);
2099 case Intrinsic::amdgcn_ds_append:
2100 return selectDSAppendConsume(I, true);
2101 case Intrinsic::amdgcn_ds_consume:
2102 return selectDSAppendConsume(I, false);
2103 case Intrinsic::amdgcn_s_barrier:
2104 return selectSBarrier(I);
2105 case Intrinsic::amdgcn_raw_buffer_load_lds:
2106 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2107 case Intrinsic::amdgcn_struct_buffer_load_lds:
2108 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2109 return selectBufferLoadLds(I);
2110 case Intrinsic::amdgcn_global_load_lds:
2111 return selectGlobalLoadLds(I);
2112 case Intrinsic::amdgcn_exp_compr:
2113 if (!STI.hasCompressedExport()) {
2114 Function &F = I.getMF()->getFunction();
2115 DiagnosticInfoUnsupported NoFpRet(
2116 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2117 F.getContext().diagnose(NoFpRet);
2118 return false;
2119 }
2120 break;
2121 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2122 return selectDSBvhStackIntrinsic(I);
2123 case Intrinsic::amdgcn_s_barrier_init:
2124 case Intrinsic::amdgcn_s_barrier_join:
2125 case Intrinsic::amdgcn_s_wakeup_barrier:
2126 case Intrinsic::amdgcn_s_get_barrier_state:
2127 return selectNamedBarrierInst(I, IntrinsicID);
2128 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2129 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
2130 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2131 case Intrinsic::amdgcn_s_barrier_leave:
2132 return selectSBarrierLeave(I);
2133 }
2134 return selectImpl(I, *CoverageInfo);
2135}
2136
2137bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2138 if (selectImpl(I, *CoverageInfo))
2139 return true;
2140
2141 MachineBasicBlock *BB = I.getParent();
2142 const DebugLoc &DL = I.getDebugLoc();
2143
2144 Register DstReg = I.getOperand(0).getReg();
2145 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2146 assert(Size <= 32 || Size == 64);
2147 const MachineOperand &CCOp = I.getOperand(1);
2148 Register CCReg = CCOp.getReg();
2149 if (!isVCC(CCReg, *MRI)) {
2150 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2151 AMDGPU::S_CSELECT_B32;
2152 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2153 .addReg(CCReg);
2154
2155 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2156 // bank, because it does not cover the register class we use to represent the
2157 // scc bank. So we need to set the register class manually here.
2158 if (!MRI->getRegClassOrNull(CCReg))
2159 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2160 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2161 .add(I.getOperand(2))
2162 .add(I.getOperand(3));
2163
2164 bool Ret = false;
2165 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2166 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2167 I.eraseFromParent();
2168 return Ret;
2169 }
2170
2171 // Wide VGPR select should have been split in RegBankSelect.
2172 if (Size > 32)
2173 return false;
2174
2175 MachineInstr *Select =
2176 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2177 .addImm(0)
2178 .add(I.getOperand(3))
2179 .addImm(0)
2180 .add(I.getOperand(2))
2181 .add(I.getOperand(1));
2182
2183 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2184 I.eraseFromParent();
2185 return Ret;
2186}
2187
2188bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2189 Register DstReg = I.getOperand(0).getReg();
2190 Register SrcReg = I.getOperand(1).getReg();
2191 const LLT DstTy = MRI->getType(DstReg);
2192 const LLT SrcTy = MRI->getType(SrcReg);
2193 const LLT S1 = LLT::scalar(1);
2194
2195 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2196 const RegisterBank *DstRB;
2197 if (DstTy == S1) {
2198 // This is a special case. We don't treat s1 for legalization artifacts as
2199 // vcc booleans.
2200 DstRB = SrcRB;
2201 } else {
2202 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2203 if (SrcRB != DstRB)
2204 return false;
2205 }
2206
2207 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2208
2209 unsigned DstSize = DstTy.getSizeInBits();
2210 unsigned SrcSize = SrcTy.getSizeInBits();
2211
2212 const TargetRegisterClass *SrcRC =
2213 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2214 const TargetRegisterClass *DstRC =
2215 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2216 if (!SrcRC || !DstRC)
2217 return false;
2218
2219 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2220 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2221 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2222 return false;
2223 }
2224
2225 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2227 const DebugLoc &DL = I.getDebugLoc();
2228 MachineBasicBlock *MBB = I.getParent();
2229 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2230 .addReg(SrcReg, 0, AMDGPU::lo16);
2231 I.eraseFromParent();
2232 return true;
2233 }
2234
2235 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2236 MachineBasicBlock *MBB = I.getParent();
2237 const DebugLoc &DL = I.getDebugLoc();
2238
2239 Register LoReg = MRI->createVirtualRegister(DstRC);
2240 Register HiReg = MRI->createVirtualRegister(DstRC);
2241 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2242 .addReg(SrcReg, 0, AMDGPU::sub0);
2243 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2244 .addReg(SrcReg, 0, AMDGPU::sub1);
2245
2246 if (IsVALU && STI.hasSDWA()) {
2247 // Write the low 16-bits of the high element into the high 16-bits of the
2248 // low element.
2249 MachineInstr *MovSDWA =
2250 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2251 .addImm(0) // $src0_modifiers
2252 .addReg(HiReg) // $src0
2253 .addImm(0) // $clamp
2254 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2255 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2256 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2257 .addReg(LoReg, RegState::Implicit);
2258 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2259 } else {
2260 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2261 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2262 Register ImmReg = MRI->createVirtualRegister(DstRC);
2263 if (IsVALU) {
2264 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2265 .addImm(16)
2266 .addReg(HiReg);
2267 } else {
2268 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2269 .addReg(HiReg)
2270 .addImm(16)
2271 .setOperandDead(3); // Dead scc
2272 }
2273
2274 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2275 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2276 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2277
2278 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2279 .addImm(0xffff);
2280 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2281 .addReg(LoReg)
2282 .addReg(ImmReg);
2283 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2284 .addReg(TmpReg0)
2285 .addReg(TmpReg1);
2286
2287 if (!IsVALU) {
2288 And.setOperandDead(3); // Dead scc
2289 Or.setOperandDead(3); // Dead scc
2290 }
2291 }
2292
2293 I.eraseFromParent();
2294 return true;
2295 }
2296
2297 if (!DstTy.isScalar())
2298 return false;
2299
2300 if (SrcSize > 32) {
2301 unsigned SubRegIdx =
2302 DstSize < 32 ? AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32);
2303 if (SubRegIdx == AMDGPU::NoSubRegister)
2304 return false;
2305
2306 // Deal with weird cases where the class only partially supports the subreg
2307 // index.
2308 const TargetRegisterClass *SrcWithSubRC
2309 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2310 if (!SrcWithSubRC)
2311 return false;
2312
2313 if (SrcWithSubRC != SrcRC) {
2314 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2315 return false;
2316 }
2317
2318 I.getOperand(1).setSubReg(SubRegIdx);
2319 }
2320
2321 I.setDesc(TII.get(TargetOpcode::COPY));
2322 return true;
2323}
2324
2325/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2326static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2327 Mask = maskTrailingOnes<unsigned>(Size);
2328 int SignedMask = static_cast<int>(Mask);
2329 return SignedMask >= -16 && SignedMask <= 64;
2330}
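// Worked examples for shouldUseAndMask (values follow directly from the
// definition above): Size == 4 gives Mask == 0xf (15), an inline immediate,
// so a plain AND is used; Size == 6 gives 0x3f (63), still inline; Size == 16
// gives 0xffff (65535), which is outside [-16, 64], so the BFE path is taken
// instead.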
2331
2332// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2333const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2334 Register Reg, const MachineRegisterInfo &MRI,
2335 const TargetRegisterInfo &TRI) const {
2336 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2337 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2338 return RB;
2339
2340 // Ignore the type, since we don't use vcc in artifacts.
2341 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2342 return &RBI.getRegBankFromRegClass(*RC, LLT());
2343 return nullptr;
2344}
2345
2346bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2347 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2348 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2349 const DebugLoc &DL = I.getDebugLoc();
2350 MachineBasicBlock &MBB = *I.getParent();
2351 const Register DstReg = I.getOperand(0).getReg();
2352 const Register SrcReg = I.getOperand(1).getReg();
2353
2354 const LLT DstTy = MRI->getType(DstReg);
2355 const LLT SrcTy = MRI->getType(SrcReg);
2356 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2357 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2358 const unsigned DstSize = DstTy.getSizeInBits();
2359 if (!DstTy.isScalar())
2360 return false;
2361
2362 // Artifact casts should never use vcc.
2363 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2364
2365 // FIXME: This should probably be illegal and split earlier.
2366 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2367 if (DstSize <= 32)
2368 return selectCOPY(I);
2369
2370 const TargetRegisterClass *SrcRC =
2371 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2372 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2373 const TargetRegisterClass *DstRC =
2374 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2375
2376 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2377 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2378 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2379 .addReg(SrcReg)
2380 .addImm(AMDGPU::sub0)
2381 .addReg(UndefReg)
2382 .addImm(AMDGPU::sub1);
2383 I.eraseFromParent();
2384
2385 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2386 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2387 }
2388
2389 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2390 // 64-bit should have been split up in RegBankSelect
2391
2392 // Try to use an and with a mask if it will save code size.
2393 unsigned Mask;
2394 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2395 MachineInstr *ExtI =
2396 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2397 .addImm(Mask)
2398 .addReg(SrcReg);
2399 I.eraseFromParent();
2400 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2401 }
2402
2403 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2404 MachineInstr *ExtI =
2405 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2406 .addReg(SrcReg)
2407 .addImm(0) // Offset
2408 .addImm(SrcSize); // Width
2409 I.eraseFromParent();
2410 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2411 }
2412
2413 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2414 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2415 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2416 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2417 return false;
2418
2419 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2420 const unsigned SextOpc = SrcSize == 8 ?
2421 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2422 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2423 .addReg(SrcReg);
2424 I.eraseFromParent();
2425 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2426 }
2427
2428 // Using a single 32-bit SALU to calculate the high half is smaller than
2429 // S_BFE with a literal constant operand.
2430 if (DstSize > 32 && SrcSize == 32) {
2431 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2432 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2433 if (Signed) {
2434 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2435 .addReg(SrcReg, 0, SubReg)
2436 .addImm(31)
2437 .setOperandDead(3); // Dead scc
2438 } else {
2439 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2440 .addImm(0);
2441 }
2442 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2443 .addReg(SrcReg, 0, SubReg)
2444 .addImm(AMDGPU::sub0)
2445 .addReg(HiReg)
2446 .addImm(AMDGPU::sub1);
2447 I.eraseFromParent();
2448 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2449 *MRI);
2450 }
2451
2452 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2453 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2454
2455 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
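 // For example (following that encoding), a sign extension from s8 uses
 // SrcSize << 16 == 8 << 16 == 0x80000, i.e. width = 8 in bits [22:16] and
 // offset = 0 in bits [5:0], so S_BFE_I32 sign-extends the low 8 bits.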
2456 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2457 // We need a 64-bit register source, but the high bits don't matter.
2458 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2459 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2460 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2461
2462 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2463 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2464 .addReg(SrcReg, 0, SubReg)
2465 .addImm(AMDGPU::sub0)
2466 .addReg(UndefReg)
2467 .addImm(AMDGPU::sub1);
2468
2469 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2470 .addReg(ExtReg)
2471 .addImm(SrcSize << 16);
2472
2473 I.eraseFromParent();
2474 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2475 }
2476
2477 unsigned Mask;
2478 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2479 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2480 .addReg(SrcReg)
2481 .addImm(Mask)
2482 .setOperandDead(3); // Dead scc
2483 } else {
2484 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2485 .addReg(SrcReg)
2486 .addImm(SrcSize << 16);
2487 }
2488
2489 I.eraseFromParent();
2490 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2491 }
2492
2493 return false;
2494}
2495
2496static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2497 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2498}
2499
2500static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2501 Register BitcastSrc;
2502 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2503 Reg = BitcastSrc;
2504 return Reg;
2505}
2506
2507static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2508 Register &Out) {
2509 Register Trunc;
2510 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2511 return false;
2512
2513 Register LShlSrc;
2514 Register Cst;
2515 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2516 Cst = stripCopy(Cst, MRI);
2517 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2518 Out = stripBitCast(LShlSrc, MRI);
2519 return true;
2520 }
2521 }
2522
2523 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2524 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2525 return false;
2526
2527 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2528 LLT::fixed_vector(2, 16));
2529
2530 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2531 assert(Mask.size() == 2);
2532
2533 if (Mask[0] == 1 && Mask[1] <= 1) {
2534 Out = Shuffle->getOperand(0).getReg();
2535 return true;
2536 }
2537
2538 return false;
2539}
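// Sketch of the two shapes isExtractHiElt accepts, in rough generic MIR
// (register names are invented): either a truncated right shift,
//   %shift:_(s32) = G_LSHR %packed, 16
//   %hi:_(s16)    = G_TRUNC %shift
// or a <2 x s16> shuffle whose mask moves element 1 into lane 0,
//   %hi:_(<2 x s16>) = G_SHUFFLE_VECTOR %packed, %other, shufflemask(1, ...)
// In both cases Out is set to the register holding the original packed value.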
2540
2541bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2542 if (!Subtarget->hasSALUFloatInsts())
2543 return false;
2544
2545 Register Dst = I.getOperand(0).getReg();
2546 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2547 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2548 return false;
2549
2550 Register Src = I.getOperand(1).getReg();
2551
2552 if (MRI->getType(Dst) == LLT::scalar(32) &&
2553 MRI->getType(Src) == LLT::scalar(16)) {
2554 if (isExtractHiElt(*MRI, Src, Src)) {
2555 MachineBasicBlock *BB = I.getParent();
2556 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2557 .addUse(Src);
2558 I.eraseFromParent();
2559 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2560 }
2561 }
2562
2563 return false;
2564}
2565
2566bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2567 // Only manually handle the f64 SGPR case.
2568 //
2569 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2570 // the bit ops theoretically have a second result due to the implicit def of
2571 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2572 // that is easy by disabling the check. The result works, but uses a
2573 // nonsensical sreg32orlds_and_sreg_1 regclass.
2574 //
2575 // The DAG emitter is more problematic, and incorrectly adds both results of
2576 // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2577
2578 Register Dst = MI.getOperand(0).getReg();
2579 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2580 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2581 MRI->getType(Dst) != LLT::scalar(64))
2582 return false;
2583
2584 Register Src = MI.getOperand(1).getReg();
2585 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2586 if (Fabs)
2587 Src = Fabs->getOperand(1).getReg();
2588
2589 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2590 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2591 return false;
2592
2593 MachineBasicBlock *BB = MI.getParent();
2594 const DebugLoc &DL = MI.getDebugLoc();
2595 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2596 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2597 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2598 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2599
2600 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2601 .addReg(Src, 0, AMDGPU::sub0);
2602 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2603 .addReg(Src, 0, AMDGPU::sub1);
2604 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2605 .addImm(0x80000000);
2606
2607 // Set or toggle sign bit.
2608 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2609 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2610 .addReg(HiReg)
2611 .addReg(ConstReg)
2612 .setOperandDead(3); // Dead scc
2613 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2614 .addReg(LoReg)
2615 .addImm(AMDGPU::sub0)
2616 .addReg(OpReg)
2617 .addImm(AMDGPU::sub1);
2618 MI.eraseFromParent();
2619 return true;
2620}
2621
2622// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2623bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2624 Register Dst = MI.getOperand(0).getReg();
2625 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2626 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2627 MRI->getType(Dst) != LLT::scalar(64))
2628 return false;
2629
2630 Register Src = MI.getOperand(1).getReg();
2631 MachineBasicBlock *BB = MI.getParent();
2632 const DebugLoc &DL = MI.getDebugLoc();
2633 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2634 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2635 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2636 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2637
2638 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2639 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2640 return false;
2641
2642 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2643 .addReg(Src, 0, AMDGPU::sub0);
2644 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2645 .addReg(Src, 0, AMDGPU::sub1);
2646 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2647 .addImm(0x7fffffff);
2648
2649 // Clear sign bit.
2650 // TODO: Should this use S_BITSET0_*?
2651 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2652 .addReg(HiReg)
2653 .addReg(ConstReg)
2654 .setOperandDead(3); // Dead scc
2655 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2656 .addReg(LoReg)
2657 .addImm(AMDGPU::sub0)
2658 .addReg(OpReg)
2659 .addImm(AMDGPU::sub1);
2660
2661 MI.eraseFromParent();
2662 return true;
2663}
2664
2665static bool isConstant(const MachineInstr &MI) {
2666 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2667}
2668
2669void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2670 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2671
2672 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2673 const MachineInstr *PtrMI =
2674 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2675
2676 assert(PtrMI);
2677
2678 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2679 return;
2680
2681 GEPInfo GEPInfo;
2682
2683 for (unsigned i = 1; i != 3; ++i) {
2684 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2685 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2686 assert(OpDef);
2687 if (i == 2 && isConstant(*OpDef)) {
2688 // TODO: Could handle constant base + variable offset, but a combine
2689 // probably should have commuted it.
2690 assert(GEPInfo.Imm == 0);
2691 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2692 continue;
2693 }
2694 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2695 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2696 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2697 else
2698 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2699 }
2700
2701 AddrInfo.push_back(GEPInfo);
2702 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2703}
2704
2705bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2706 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2707}
2708
2709bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2710 if (!MI.hasOneMemOperand())
2711 return false;
2712
2713 const MachineMemOperand *MMO = *MI.memoperands_begin();
2714 const Value *Ptr = MMO->getValue();
2715
2716 // UndefValue means this is a load of a kernel input. These are uniform.
2717 // Sometimes LDS instructions have constant pointers.
2718 // If Ptr is null, then that means this mem operand contains a
2719 // PseudoSourceValue like GOT.
2720 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2721 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2722 return true;
2723
2724 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2725 return true;
2726
2727 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2728 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2729 AMDGPU::SGPRRegBankID;
2730
2731 const Instruction *I = dyn_cast<Instruction>(Ptr);
2732 return I && I->getMetadata("amdgpu.uniform");
2733}
2734
2735bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2736 for (const GEPInfo &GEPInfo : AddrInfo) {
2737 if (!GEPInfo.VgprParts.empty())
2738 return true;
2739 }
2740 return false;
2741}
2742
2743void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2744 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2745 unsigned AS = PtrTy.getAddressSpace();
2746 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2747 STI.ldsRequiresM0Init()) {
2748 MachineBasicBlock *BB = I.getParent();
2749
2750 // If DS instructions require M0 initialization, insert it before selecting.
2751 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2752 .addImm(-1);
2753 }
2754}
2755
2756bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2757 MachineInstr &I) const {
2758 initM0(I);
2759 return selectImpl(I, *CoverageInfo);
2760}
2761
2762static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2763 if (Reg.isPhysical())
2764 return false;
2765
2766 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2767 const unsigned Opcode = MI.getOpcode();
2768
2769 if (Opcode == AMDGPU::COPY)
2770 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2771
2772 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2773 Opcode == AMDGPU::G_XOR)
2774 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2775 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2776
2777 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2778 return GI->is(Intrinsic::amdgcn_class);
2779
2780 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2781}
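// Example of what this predicate accepts (hypothetical MIR): %a = G_ICMP ...,
// %b = G_FCMP ..., and %c = G_AND %a, %b all report true, so selectG_BRCOND
// below can use the value directly as vcc; a boolean that came from memory or
// from a G_TRUNC reports false and is first ANDed with exec.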
2782
2783bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2784 MachineBasicBlock *BB = I.getParent();
2785 MachineOperand &CondOp = I.getOperand(0);
2786 Register CondReg = CondOp.getReg();
2787 const DebugLoc &DL = I.getDebugLoc();
2788
2789 unsigned BrOpcode;
2790 Register CondPhysReg;
2791 const TargetRegisterClass *ConstrainRC;
2792
2793 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2794 // whether the branch is uniform when selecting the instruction. In
2795 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2796 // RegBankSelect knows what it's doing if the branch condition is scc, even
2797 // though it currently does not.
2798 if (!isVCC(CondReg, *MRI)) {
2799 if (MRI->getType(CondReg) != LLT::scalar(32))
2800 return false;
2801
2802 CondPhysReg = AMDGPU::SCC;
2803 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2804 ConstrainRC = &AMDGPU::SReg_32RegClass;
2805 } else {
2806 // FIXME: Should scc->vcc copies be ANDed with exec?
2807
2808 // Unless the value of CondReg is a result of a V_CMP* instruction then we
2809 // need to insert an and with exec.
2810 if (!isVCmpResult(CondReg, *MRI)) {
2811 const bool Is64 = STI.isWave64();
2812 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2813 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2814
2815 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2816 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2817 .addReg(CondReg)
2818 .addReg(Exec)
2819 .setOperandDead(3); // Dead scc
2820 CondReg = TmpReg;
2821 }
2822
2823 CondPhysReg = TRI.getVCC();
2824 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2825 ConstrainRC = TRI.getBoolRC();
2826 }
2827
2828 if (!MRI->getRegClassOrNull(CondReg))
2829 MRI->setRegClass(CondReg, ConstrainRC);
2830
2831 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2832 .addReg(CondReg);
2833 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2834 .addMBB(I.getOperand(1).getMBB());
2835
2836 I.eraseFromParent();
2837 return true;
2838}
2839
2840bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2841 MachineInstr &I) const {
2842 Register DstReg = I.getOperand(0).getReg();
2843 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2844 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2845 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2846 if (IsVGPR)
2847 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2848
2849 return RBI.constrainGenericRegister(
2850 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2851}
2852
2853bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2854 Register DstReg = I.getOperand(0).getReg();
2855 Register SrcReg = I.getOperand(1).getReg();
2856 Register MaskReg = I.getOperand(2).getReg();
2857 LLT Ty = MRI->getType(DstReg);
2858 LLT MaskTy = MRI->getType(MaskReg);
2859 MachineBasicBlock *BB = I.getParent();
2860 const DebugLoc &DL = I.getDebugLoc();
2861
2862 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2863 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2864 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2865 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2866 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2867 return false;
2868
2869 // Try to avoid emitting a bit operation when we only need to touch half of
2870 // the 64-bit pointer.
2871 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2872 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2873 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2874
2875 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2876 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
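 // Illustrative mask (not from any particular workload): with
 // MaskReg == 0xffff'ffff'ffff'f000 the high 32 bits are all known ones, so
 // CanCopyHi32 is true, the high half is just copied, and only the low half
 // needs an AND before the two halves are recombined with REG_SEQUENCE.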
2877
2878 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2879 !CanCopyLow32 && !CanCopyHi32) {
2880 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2881 .addReg(SrcReg)
2882 .addReg(MaskReg)
2883 .setOperandDead(3); // Dead scc
2884 I.eraseFromParent();
2885 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2886 }
2887
2888 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2889 const TargetRegisterClass &RegRC
2890 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2891
2892 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2893 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2894 const TargetRegisterClass *MaskRC =
2895 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2896
2897 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2898 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2899 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2900 return false;
2901
2902 if (Ty.getSizeInBits() == 32) {
2903 assert(MaskTy.getSizeInBits() == 32 &&
2904 "ptrmask should have been narrowed during legalize");
2905
2906 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2907 .addReg(SrcReg)
2908 .addReg(MaskReg);
2909
2910 if (!IsVGPR)
2911 NewOp.setOperandDead(3); // Dead scc
2912 I.eraseFromParent();
2913 return true;
2914 }
2915
2916 Register HiReg = MRI->createVirtualRegister(&RegRC);
2917 Register LoReg = MRI->createVirtualRegister(&RegRC);
2918
2919 // Extract the subregisters from the source pointer.
2920 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2921 .addReg(SrcReg, 0, AMDGPU::sub0);
2922 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2923 .addReg(SrcReg, 0, AMDGPU::sub1);
2924
2925 Register MaskedLo, MaskedHi;
2926
2927 if (CanCopyLow32) {
2928 // If all the bits in the low half are 1, we only need a copy for it.
2929 MaskedLo = LoReg;
2930 } else {
2931 // Extract the mask subregister and apply the and.
2932 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2933 MaskedLo = MRI->createVirtualRegister(&RegRC);
2934
2935 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2936 .addReg(MaskReg, 0, AMDGPU::sub0);
2937 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2938 .addReg(LoReg)
2939 .addReg(MaskLo);
2940 }
2941
2942 if (CanCopyHi32) {
2943 // If all the bits in the high half are 1, we only need a copy for it.
2944 MaskedHi = HiReg;
2945 } else {
2946 Register MaskHi = MRI->createVirtualRegister(&RegRC);
2947 MaskedHi = MRI->createVirtualRegister(&RegRC);
2948
2949 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2950 .addReg(MaskReg, 0, AMDGPU::sub1);
2951 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2952 .addReg(HiReg)
2953 .addReg(MaskHi);
2954 }
2955
2956 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2957 .addReg(MaskedLo)
2958 .addImm(AMDGPU::sub0)
2959 .addReg(MaskedHi)
2960 .addImm(AMDGPU::sub1);
2961 I.eraseFromParent();
2962 return true;
2963}
2964
2965/// Return the register to use for the index value, and the subregister to use
2966/// for the indirectly accessed register.
2967static std::pair<Register, unsigned>
2968computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
2969 const TargetRegisterClass *SuperRC, Register IdxReg,
2970 unsigned EltSize, GISelKnownBits &KnownBits) {
2971 Register IdxBaseReg;
2972 int Offset;
2973
2974 std::tie(IdxBaseReg, Offset) =
2976 if (IdxBaseReg == AMDGPU::NoRegister) {
2977 // This will happen if the index is a known constant. This should ordinarily
2978 // be legalized out, but handle it as a register just in case.
2979 assert(Offset == 0);
2980 IdxBaseReg = IdxReg;
2981 }
2982
2983 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2984
2985 // Skip out of bounds offsets, or else we would end up using an undefined
2986 // register.
2987 if (static_cast<unsigned>(Offset) >= SubRegs.size())
2988 return std::pair(IdxReg, SubRegs[0]);
2989 return std::pair(IdxBaseReg, SubRegs[Offset]);
2990}
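// Rough worked example (register names invented): for a 128-bit SuperRC with
// EltSize == 4, getRegSplitParts returns {sub0, sub1, sub2, sub3}. If the
// index is %base + 2, the constant is split off and the function returns
// {%base, sub2}, so the MOVREL reads relative to sub2; an offset of 4 or more
// would be out of bounds and falls back to {IdxReg, sub0}.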
2991
2992bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2993 MachineInstr &MI) const {
2994 Register DstReg = MI.getOperand(0).getReg();
2995 Register SrcReg = MI.getOperand(1).getReg();
2996 Register IdxReg = MI.getOperand(2).getReg();
2997
2998 LLT DstTy = MRI->getType(DstReg);
2999 LLT SrcTy = MRI->getType(SrcReg);
3000
3001 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3002 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3003 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3004
3005 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3006 // into a waterfall loop.
3007 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3008 return false;
3009
3010 const TargetRegisterClass *SrcRC =
3011 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3012 const TargetRegisterClass *DstRC =
3013 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3014 if (!SrcRC || !DstRC)
3015 return false;
3016 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3017 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3018 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3019 return false;
3020
3021 MachineBasicBlock *BB = MI.getParent();
3022 const DebugLoc &DL = MI.getDebugLoc();
3023 const bool Is64 = DstTy.getSizeInBits() == 64;
3024
3025 unsigned SubReg;
3026 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3027 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3028
3029 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3030 if (DstTy.getSizeInBits() != 32 && !Is64)
3031 return false;
3032
3033 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3034 .addReg(IdxReg);
3035
3036 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3037 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3038 .addReg(SrcReg, 0, SubReg)
3039 .addReg(SrcReg, RegState::Implicit);
3040 MI.eraseFromParent();
3041 return true;
3042 }
3043
3044 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3045 return false;
3046
3047 if (!STI.useVGPRIndexMode()) {
3048 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3049 .addReg(IdxReg);
3050 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3051 .addReg(SrcReg, 0, SubReg)
3052 .addReg(SrcReg, RegState::Implicit);
3053 MI.eraseFromParent();
3054 return true;
3055 }
3056
3057 const MCInstrDesc &GPRIDXDesc =
3058 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3059 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3060 .addReg(SrcReg)
3061 .addReg(IdxReg)
3062 .addImm(SubReg);
3063
3064 MI.eraseFromParent();
3065 return true;
3066}
3067
3068// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3069bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3070 MachineInstr &MI) const {
3071 Register DstReg = MI.getOperand(0).getReg();
3072 Register VecReg = MI.getOperand(1).getReg();
3073 Register ValReg = MI.getOperand(2).getReg();
3074 Register IdxReg = MI.getOperand(3).getReg();
3075
3076 LLT VecTy = MRI->getType(DstReg);
3077 LLT ValTy = MRI->getType(ValReg);
3078 unsigned VecSize = VecTy.getSizeInBits();
3079 unsigned ValSize = ValTy.getSizeInBits();
3080
3081 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3082 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3083 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3084
3085 assert(VecTy.getElementType() == ValTy);
3086
3087 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3088 // into a waterfall loop.
3089 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3090 return false;
3091
3092 const TargetRegisterClass *VecRC =
3093 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3094 const TargetRegisterClass *ValRC =
3095 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3096
3097 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3098 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3099 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3100 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3101 return false;
3102
3103 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3104 return false;
3105
3106 unsigned SubReg;
3107 std::tie(IdxReg, SubReg) =
3108 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3109
3110 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3111 STI.useVGPRIndexMode();
3112
3113 MachineBasicBlock *BB = MI.getParent();
3114 const DebugLoc &DL = MI.getDebugLoc();
3115
3116 if (!IndexMode) {
3117 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3118 .addReg(IdxReg);
3119
3120 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3121 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3122 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3123 .addReg(VecReg)
3124 .addReg(ValReg)
3125 .addImm(SubReg);
3126 MI.eraseFromParent();
3127 return true;
3128 }
3129
3130 const MCInstrDesc &GPRIDXDesc =
3131 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3132 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3133 .addReg(VecReg)
3134 .addReg(ValReg)
3135 .addReg(IdxReg)
3136 .addImm(SubReg);
3137
3138 MI.eraseFromParent();
3139 return true;
3140}
3141
3142bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3144 unsigned Opc;
3145 unsigned Size = MI.getOperand(3).getImm();
3146
3147 // The struct intrinsic variants add one additional operand over raw.
3148 const bool HasVIndex = MI.getNumOperands() == 9;
3149 Register VIndex;
3150 int OpOffset = 0;
3151 if (HasVIndex) {
3152 VIndex = MI.getOperand(4).getReg();
3153 OpOffset = 1;
3154 }
3155
3156 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3157 std::optional<ValueAndVReg> MaybeVOffset =
3158 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3159 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3160
3161 switch (Size) {
3162 default:
3163 return false;
3164 case 1:
3165 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3166 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3167 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3168 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3169 break;
3170 case 2:
3171 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3172 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3173 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3174 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3175 break;
3176 case 4:
3177 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3178 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3179 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3180 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3181 break;
3182 }
3183
3184 MachineBasicBlock *MBB = MI.getParent();
3185 const DebugLoc &DL = MI.getDebugLoc();
3186 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3187 .add(MI.getOperand(2));
3188
3189 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3190
3191 if (HasVIndex && HasVOffset) {
3192 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3193 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3194 .addReg(VIndex)
3195 .addImm(AMDGPU::sub0)
3196 .addReg(VOffset)
3197 .addImm(AMDGPU::sub1);
3198
3199 MIB.addReg(IdxReg);
3200 } else if (HasVIndex) {
3201 MIB.addReg(VIndex);
3202 } else if (HasVOffset) {
3203 MIB.addReg(VOffset);
3204 }
3205
3206 MIB.add(MI.getOperand(1)); // rsrc
3207 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3208 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3209 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3210 MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3211 MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3212
3213 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3214 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3215 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3216 MachinePointerInfo StorePtrI = LoadPtrI;
3217 StorePtrI.V = nullptr;
3218 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3219
3220 auto F = LoadMMO->getFlags() &
3221 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3222 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3223 Size, LoadMMO->getBaseAlign());
3224
3225 MachineMemOperand *StoreMMO =
3226 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3227 sizeof(int32_t), LoadMMO->getBaseAlign());
3228
3229 MIB.setMemRefs({LoadMMO, StoreMMO});
3230
3231 MI.eraseFromParent();
3232 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3233}
3234
3235/// Match a zero extend from a 32-bit value to 64-bits.
3236static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3237 Register ZExtSrc;
3238 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3239 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3240
3241 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3242 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3243 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3244 return Register();
3245
3246 assert(Def->getNumOperands() == 3 &&
3247 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3248 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3249 return Def->getOperand(1).getReg();
3250 }
3251
3252 return Register();
3253}
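// The two shapes this helper recognizes, in rough generic MIR (names are
// illustrative): %v:_(s64) = G_ZEXT %x:_(s32) returns %x, and the legalized
// form %v:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32) with %zero == 0
// also returns %x; anything else yields an invalid Register, and the caller
// then keeps the whole address in a VGPR.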
3254
3255bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3256 unsigned Opc;
3257 unsigned Size = MI.getOperand(3).getImm();
3258
3259 switch (Size) {
3260 default:
3261 return false;
3262 case 1:
3263 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3264 break;
3265 case 2:
3266 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3267 break;
3268 case 4:
3269 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3270 break;
3271 }
3272
3273 MachineBasicBlock *MBB = MI.getParent();
3274 const DebugLoc &DL = MI.getDebugLoc();
3275 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3276 .add(MI.getOperand(2));
3277
3278 Register Addr = MI.getOperand(1).getReg();
3279 Register VOffset;
3280 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3281 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3282 if (!isSGPR(Addr)) {
3283 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3284 if (isSGPR(AddrDef->Reg)) {
3285 Addr = AddrDef->Reg;
3286 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3287 Register SAddr =
3288 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3289 if (isSGPR(SAddr)) {
3290 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3291 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3292 Addr = SAddr;
3293 VOffset = Off;
3294 }
3295 }
3296 }
3297 }
3298
3299 if (isSGPR(Addr)) {
3300 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3301 if (!VOffset) {
3302 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3303 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3304 .addImm(0);
3305 }
3306 }
3307
3308 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3309 .addReg(Addr);
3310
3311 if (isSGPR(Addr))
3312 MIB.addReg(VOffset);
3313
3314 MIB.add(MI.getOperand(4)) // offset
3315 .add(MI.getOperand(5)); // cpol
3316
3317 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3318 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3319 LoadPtrI.Offset = MI.getOperand(4).getImm();
3320 MachinePointerInfo StorePtrI = LoadPtrI;
3321 StorePtrI.V = nullptr;
3322 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3323 auto F = LoadMMO->getFlags() &
3324 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3325 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3326 Size, LoadMMO->getBaseAlign());
3327 MachineMemOperand *StoreMMO =
3328 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3329 sizeof(int32_t), Align(4));
3330
3331 MIB.setMemRefs({LoadMMO, StoreMMO});
3332
3333 MI.eraseFromParent();
3334 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3335}
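// Illustrative SADDR split performed above (register names hypothetical):
// for %addr:_(p1) = G_PTR_ADD %sgpr_base, (G_ZEXT %voff:_(s32)), the SGPR
// base becomes the instruction's saddr and %voff its VGPR offset; when no
// such offset can be peeled off, a zero voffset is materialized with
// V_MOV_B32.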
3336
3337bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3338 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3339 MI.removeOperand(1);
3340 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3341 return true;
3342}
3343
3344bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3345 unsigned Opc;
3346 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3347 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3348 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3349 break;
3350 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3351 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3352 break;
3353 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3354 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3355 break;
3356 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3357 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3358 break;
3359 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3360 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3361 break;
3362 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3363 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3364 break;
3365 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3366 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3367 break;
3368 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3369 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3370 break;
3371 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3372 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3373 break;
3374 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3375 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3376 break;
3377 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3378 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3379 break;
3380 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3381 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3382 break;
3383 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3384 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3385 break;
3386 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3387 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3388 break;
3389 default:
3390 llvm_unreachable("unhandled smfmac intrinsic");
3391 }
3392
3393 auto VDst_In = MI.getOperand(4);
3394
3395 MI.setDesc(TII.get(Opc));
3396 MI.removeOperand(4); // VDst_In
3397 MI.removeOperand(1); // Intrinsic ID
3398 MI.addOperand(VDst_In); // Readd VDst_In to the end
3399 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3400 return true;
3401}
3402
3403bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3404 Register DstReg = MI.getOperand(0).getReg();
3405 Register SrcReg = MI.getOperand(1).getReg();
3406 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3407 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3408 MachineBasicBlock *MBB = MI.getParent();
3409 const DebugLoc &DL = MI.getDebugLoc();
3410
3411 if (IsVALU) {
3412 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3413 .addImm(Subtarget->getWavefrontSizeLog2())
3414 .addReg(SrcReg);
3415 } else {
3416 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3417 .addReg(SrcReg)
3418 .addImm(Subtarget->getWavefrontSizeLog2())
3419 .setOperandDead(3); // Dead scc
3420 }
3421
3422 const TargetRegisterClass &RC =
3423 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3424 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3425 return false;
3426
3427 MI.eraseFromParent();
3428 return true;
3429}
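// G_AMDGPU_WAVE_ADDRESS is lowered here to a logical shift right by
// log2(wavefront size), i.e. 5 for wave32 or 6 for wave64, using
// V_LSHRREV_B32 for a VGPR result and S_LSHR_B32 (with dead scc) otherwise.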
3430
3431bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3432 Register SrcReg = MI.getOperand(0).getReg();
3433 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3434 return false;
3435
3436 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3437 Register SP =
3438 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3439 Register WaveAddr = getWaveAddress(DefMI);
3440 MachineBasicBlock *MBB = MI.getParent();
3441 const DebugLoc &DL = MI.getDebugLoc();
3442
3443 if (!WaveAddr) {
3444 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3445 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3446 .addReg(SrcReg)
3447 .addImm(Subtarget->getWavefrontSizeLog2())
3448 .setOperandDead(3); // Dead scc
3449 }
3450
3451 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3452 .addReg(WaveAddr);
3453
3454 MI.eraseFromParent();
3455 return true;
3456}
3457
3458bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3459
3460 if (!I.isPreISelOpcode()) {
3461 if (I.isCopy())
3462 return selectCOPY(I);
3463 return true;
3464 }
3465
3466 switch (I.getOpcode()) {
3467 case TargetOpcode::G_AND:
3468 case TargetOpcode::G_OR:
3469 case TargetOpcode::G_XOR:
3470 if (selectImpl(I, *CoverageInfo))
3471 return true;
3472 return selectG_AND_OR_XOR(I);
3473 case TargetOpcode::G_ADD:
3474 case TargetOpcode::G_SUB:
3475 case TargetOpcode::G_PTR_ADD:
3476 if (selectImpl(I, *CoverageInfo))
3477 return true;
3478 return selectG_ADD_SUB(I);
3479 case TargetOpcode::G_UADDO:
3480 case TargetOpcode::G_USUBO:
3481 case TargetOpcode::G_UADDE:
3482 case TargetOpcode::G_USUBE:
3483 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3484 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3485 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3486 return selectG_AMDGPU_MAD_64_32(I);
3487 case TargetOpcode::G_INTTOPTR:
3488 case TargetOpcode::G_BITCAST:
3489 case TargetOpcode::G_PTRTOINT:
3490 case TargetOpcode::G_FREEZE:
3491 return selectCOPY(I);
3492 case TargetOpcode::G_FNEG:
3493 if (selectImpl(I, *CoverageInfo))
3494 return true;
3495 return selectG_FNEG(I);
3496 case TargetOpcode::G_FABS:
3497 if (selectImpl(I, *CoverageInfo))
3498 return true;
3499 return selectG_FABS(I);
3500 case TargetOpcode::G_EXTRACT:
3501 return selectG_EXTRACT(I);
3502 case TargetOpcode::G_MERGE_VALUES:
3503 case TargetOpcode::G_CONCAT_VECTORS:
3504 return selectG_MERGE_VALUES(I);
3505 case TargetOpcode::G_UNMERGE_VALUES:
3506 return selectG_UNMERGE_VALUES(I);
3507 case TargetOpcode::G_BUILD_VECTOR:
3508 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3509 return selectG_BUILD_VECTOR(I);
3510 case TargetOpcode::G_IMPLICIT_DEF:
3511 return selectG_IMPLICIT_DEF(I);
3512 case TargetOpcode::G_INSERT:
3513 return selectG_INSERT(I);
3514 case TargetOpcode::G_INTRINSIC:
3515 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3516 return selectG_INTRINSIC(I);
3517 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3518 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3519 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3520 case TargetOpcode::G_ICMP:
3521 case TargetOpcode::G_FCMP:
3522 if (selectG_ICMP_or_FCMP(I))
3523 return true;
3524 return selectImpl(I, *CoverageInfo);
3525 case TargetOpcode::G_LOAD:
3526 case TargetOpcode::G_STORE:
3527 case TargetOpcode::G_ATOMIC_CMPXCHG:
3528 case TargetOpcode::G_ATOMICRMW_XCHG:
3529 case TargetOpcode::G_ATOMICRMW_ADD:
3530 case TargetOpcode::G_ATOMICRMW_SUB:
3531 case TargetOpcode::G_ATOMICRMW_AND:
3532 case TargetOpcode::G_ATOMICRMW_OR:
3533 case TargetOpcode::G_ATOMICRMW_XOR:
3534 case TargetOpcode::G_ATOMICRMW_MIN:
3535 case TargetOpcode::G_ATOMICRMW_MAX:
3536 case TargetOpcode::G_ATOMICRMW_UMIN:
3537 case TargetOpcode::G_ATOMICRMW_UMAX:
3538 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3539 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3540 case TargetOpcode::G_ATOMICRMW_FADD:
3541 case TargetOpcode::G_ATOMICRMW_FMIN:
3542 case TargetOpcode::G_ATOMICRMW_FMAX:
3543 return selectG_LOAD_STORE_ATOMICRMW(I);
3544 case TargetOpcode::G_SELECT:
3545 return selectG_SELECT(I);
3546 case TargetOpcode::G_TRUNC:
3547 return selectG_TRUNC(I);
3548 case TargetOpcode::G_SEXT:
3549 case TargetOpcode::G_ZEXT:
3550 case TargetOpcode::G_ANYEXT:
3551 case TargetOpcode::G_SEXT_INREG:
3552 // This is a workaround. For extension from type i1, `selectImpl()` uses
3553 // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
3554 // i1 can only be held in an SGPR class.
3555 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3556 selectImpl(I, *CoverageInfo))
3557 return true;
3558 return selectG_SZA_EXT(I);
3559 case TargetOpcode::G_FPEXT:
3560 if (selectG_FPEXT(I))
3561 return true;
3562 return selectImpl(I, *CoverageInfo);
3563 case TargetOpcode::G_BRCOND:
3564 return selectG_BRCOND(I);
3565 case TargetOpcode::G_GLOBAL_VALUE:
3566 return selectG_GLOBAL_VALUE(I);
3567 case TargetOpcode::G_PTRMASK:
3568 return selectG_PTRMASK(I);
3569 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3570 return selectG_EXTRACT_VECTOR_ELT(I);
3571 case TargetOpcode::G_INSERT_VECTOR_ELT:
3572 return selectG_INSERT_VECTOR_ELT(I);
3573 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3574 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3575 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3576 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3577 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3578 const AMDGPU::ImageDimIntrinsicInfo *Intr =
3579 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
3580 assert(Intr && "not an image intrinsic with image pseudo");
3581 return selectImageIntrinsic(I, Intr);
3582 }
3583 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3584 return selectBVHIntrinsic(I);
3585 case AMDGPU::G_SBFX:
3586 case AMDGPU::G_UBFX:
3587 return selectG_SBFX_UBFX(I);
3588 case AMDGPU::G_SI_CALL:
3589 I.setDesc(TII.get(AMDGPU::SI_CALL));
3590 return true;
3591 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3592 return selectWaveAddress(I);
3593 case AMDGPU::G_STACKRESTORE:
3594 return selectStackRestore(I);
3595 case AMDGPU::G_PHI:
3596 return selectPHI(I);
3597 case TargetOpcode::G_CONSTANT:
3598 case TargetOpcode::G_FCONSTANT:
3599 default:
3600 return selectImpl(I, *CoverageInfo);
3601 }
3602 return false;
3603}
3604
3606AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3607 return {{
3608 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3609 }};
3610
3611}
3612
3613std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
3614 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
3615 unsigned Mods = 0;
3616 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3617
3618 if (MI->getOpcode() == AMDGPU::G_FNEG) {
3619 Src = MI->getOperand(1).getReg();
3620 Mods |= SISrcMods::NEG;
3621 MI = getDefIgnoringCopies(Src, *MRI);
3622 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3623 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3624 // denormal mode, but we're implicitly canonicalizing in a source operand.
3625 const ConstantFP *LHS =
3626 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3627 if (LHS && LHS->isZero()) {
3628 Mods |= SISrcMods::NEG;
3629 Src = MI->getOperand(2).getReg();
3630 }
3631 }
3632
3633 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3634 Src = MI->getOperand(1).getReg();
3635 Mods |= SISrcMods::ABS;
3636 }
3637
3638 if (OpSel)
3639 Mods |= SISrcMods::OP_SEL_0;
3640
3641 return std::pair(Src, Mods);
3642}
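// Example of the source-modifier folding above (registers hypothetical):
// selecting %b where %a = G_FABS %x and %b = G_FNEG %a yields Src = %x with
// Mods = NEG | ABS; a G_FSUB from +/-0.0 is folded like G_FNEG only when
// IsCanonicalizing is set.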
3643
3644Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3645 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3646 bool ForceVGPR) const {
3647 if ((Mods != 0 || ForceVGPR) &&
3648 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3649
3650 // If we looked through copies to find source modifiers on an SGPR operand,
3651 // we now have an SGPR register source. To avoid potentially violating the
3652 // constant bus restriction, we need to insert a copy to a VGPR.
3653 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3654 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3655 TII.get(AMDGPU::COPY), VGPRSrc)
3656 .addReg(Src);
3657 Src = VGPRSrc;
3658 }
3659
3660 return Src;
3661}
3662
3663///
3664/// This will select either an SGPR or VGPR operand and will save us from
3665/// having to write an extra tablegen pattern.
3667AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3668 return {{
3669 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3670 }};
3671}
3672
3674AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3675 Register Src;
3676 unsigned Mods;
3677 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
3678
3679 return {{
3680 [=](MachineInstrBuilder &MIB) {
3681 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3682 },
3683 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3684 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3685 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3686 }};
3687}
3688
3690AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3691 Register Src;
3692 unsigned Mods;
3693 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
3694 /*IsCanonicalizing=*/true,
3695 /*AllowAbs=*/false);
3696
3697 return {{
3698 [=](MachineInstrBuilder &MIB) {
3699 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3700 },
3701 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3702 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3703 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3704 }};
3705}
3706
3708AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3709 return {{
3710 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3711 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3712 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3713 }};
3714}
3715
3717AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3718 Register Src;
3719 unsigned Mods;
3720 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
3721
3722 return {{
3723 [=](MachineInstrBuilder &MIB) {
3724 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3725 },
3726 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3727 }};
3728}
3729
3731AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3732 MachineOperand &Root) const {
3733 Register Src;
3734 unsigned Mods;
3735 std::tie(Src, Mods) =
3736 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
3737
3738 return {{
3739 [=](MachineInstrBuilder &MIB) {
3740 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3741 },
3742 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3743 }};
3744}
3745
3747AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3748 Register Src;
3749 unsigned Mods;
3750 std::tie(Src, Mods) =
3751 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
3752 /*AllowAbs=*/false);
3753
3754 return {{
3755 [=](MachineInstrBuilder &MIB) {
3756 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3757 },
3758 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3759 }};
3760}
3761
3763AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3764 Register Reg = Root.getReg();
3765 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3766 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3767 return {};
3768 return {{
3769 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3770 }};
3771}
3772
3773std::pair<Register, unsigned>
3774AMDGPUInstructionSelector::selectVOP3PModsImpl(
3775 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3776 unsigned Mods = 0;
3777 MachineInstr *MI = MRI.getVRegDef(Src);
3778
3779 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3780 // It's possible to see an f32 fneg here, but unlikely.
3781 // TODO: Treat f32 fneg as only high bit.
3782 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3783 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3784 Src = MI->getOperand(1).getReg();
3785 MI = MRI.getVRegDef(Src);
3786 }
3787
3788 // TODO: Handle G_FSUB 0 as fneg
3789
3790 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3791 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3792
3793 // Packed instructions do not have abs modifiers.
3794 Mods |= SISrcMods::OP_SEL_1;
3795
3796 return std::pair(Src, Mods);
3797}
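// For packed (VOP3P) sources the returned modifier always includes OP_SEL_1,
// the default hi-half selection; a v2f16 G_FNEG toggles the NEG bits, and no
// ABS bits are ever produced since packed operands lack abs modifiers.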
3798
3799InstructionSelector::ComplexRendererFns
3800AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3801 MachineRegisterInfo &MRI
3802 = Root.getParent()->getParent()->getParent()->getRegInfo();
3803
3804 Register Src;
3805 unsigned Mods;
3806 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3807
3808 return {{
3809 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3810 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3811 }};
3812}
3813
3814InstructionSelector::ComplexRendererFns
3815AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3816 MachineRegisterInfo &MRI
3817 = Root.getParent()->getParent()->getParent()->getRegInfo();
3818
3819 Register Src;
3820 unsigned Mods;
3821 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3822
3823 return {{
3824 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3825 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3826 }};
3827}
3828
3830AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
3831 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3832 // Value is in Imm operand as i1 sign extended to int64_t.
3833 // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
3834 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3835 "expected i1 value");
3836 unsigned Mods = SISrcMods::OP_SEL_1;
3837 if (Root.getImm() == -1)
3838 Mods ^= SISrcMods::NEG;
3839 return {{
3840 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3841 }};
3842}
3843
3845AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3846 MachineOperand &Root) const {
3847 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3848 "expected i1 value");
3849 unsigned Mods = SISrcMods::OP_SEL_1;
3850 if (Root.getImm() != 0)
3851 Mods |= SISrcMods::OP_SEL_0;
3852
3853 return {{
3854 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3855 }};
3856}
3857
3858static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
3859 MachineInstr *InsertPt,
3860 MachineRegisterInfo &MRI) {
3861 const TargetRegisterClass *DstRegClass;
3862 switch (Elts.size()) {
3863 case 8:
3864 DstRegClass = &AMDGPU::VReg_256RegClass;
3865 break;
3866 case 4:
3867 DstRegClass = &AMDGPU::VReg_128RegClass;
3868 break;
3869 case 2:
3870 DstRegClass = &AMDGPU::VReg_64RegClass;
3871 break;
3872 default:
3873 llvm_unreachable("unhandled Reg sequence size");
3874 }
3875
3876 MachineIRBuilder B(*InsertPt);
3877 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
3878 .addDef(MRI.createVirtualRegister(DstRegClass));
3879 for (unsigned i = 0; i < Elts.size(); ++i) {
3880 MIB.addReg(Elts[i]);
3881 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
3882 }
3883 return MIB->getOperand(0).getReg();
3884}
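// Illustrative result for a four-element input (registers hypothetical):
//   %seq:vreg_128 = REG_SEQUENCE %e0, %subreg.sub0, %e1, %subreg.sub1,
//                                %e2, %subreg.sub2, %e3, %subreg.sub3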
3885
3886static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3887 SmallVectorImpl<Register> &Elts, Register &Src,
3888 MachineInstr *InsertPt,
3889 MachineRegisterInfo &MRI) {
3890 if (ModOpcode == TargetOpcode::G_FNEG) {
3891 Mods |= SISrcMods::NEG;
3892 // Check if all elements also have abs modifier
3893 SmallVector<Register, 8> NegAbsElts;
3894 for (auto El : Elts) {
3895 Register FabsSrc;
3896 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
3897 break;
3898 NegAbsElts.push_back(FabsSrc);
3899 }
3900 if (Elts.size() != NegAbsElts.size()) {
3901 // Neg
3902 Src = buildRegSequence(Elts, InsertPt, MRI);
3903 } else {
3904 // Neg and Abs
3905 Mods |= SISrcMods::NEG_HI;
3906 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
3907 }
3908 } else {
3909 assert(ModOpcode == TargetOpcode::G_FABS);
3910 // Abs
3911 Mods |= SISrcMods::NEG_HI;
3912 Src = buildRegSequence(Elts, InsertPt, MRI);
3913 }
3914}
3915
3917AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
3918 Register Src = Root.getReg();
3919 unsigned Mods = SISrcMods::OP_SEL_1;
3920 SmallVector<Register, 8> EltsF32;
3921
3922 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
3923 assert(BV->getNumSources() > 0);
3924 // Based on first element decide which mod we match, neg or abs
3925 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
3926 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
3927 ? AMDGPU::G_FNEG
3928 : AMDGPU::G_FABS;
3929 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
3930 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
3931 if (ElF32->getOpcode() != ModOpcode)
3932 break;
3933 EltsF32.push_back(ElF32->getOperand(1).getReg());
3934 }
3935
3936 // All elements had ModOpcode modifier
3937 if (BV->getNumSources() == EltsF32.size()) {
3938 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
3939 *MRI);
3940 }
3941 }
3942
3943 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3944 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
3945}
3946
3948AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
3949 Register Src = Root.getReg();
3950 unsigned Mods = SISrcMods::OP_SEL_1;
3951 SmallVector<Register, 8> EltsV2F16;
3952
3953 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
3954 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
3955 Register FNegSrc;
3956 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
3957 break;
3958 EltsV2F16.push_back(FNegSrc);
3959 }
3960
3961 // All elements had ModOpcode modifier
3962 if (CV->getNumSources() == EltsV2F16.size()) {
3963 Mods |= SISrcMods::NEG;
3964 Mods |= SISrcMods::NEG_HI;
3965 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
3966 }
3967 }
3968
3969 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3970 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
3971}
3972
3974AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
3975 Register Src = Root.getReg();
3976 unsigned Mods = SISrcMods::OP_SEL_1;
3977 SmallVector<Register, 8> EltsV2F16;
3978
3979 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
3980 assert(CV->getNumSources() > 0);
3981 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
3982 // Based on first element decide which mod we match, neg or abs
3983 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
3984 ? AMDGPU::G_FNEG
3985 : AMDGPU::G_FABS;
3986
3987 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
3988 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
3989 if (ElV2F16->getOpcode() != ModOpcode)
3990 break;
3991 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
3992 }
3993
3994 // All elements had ModOpcode modifier
3995 if (CV->getNumSources() == EltsV2F16.size()) {
3996 MachineIRBuilder B(*Root.getParent());
3997 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
3998 *MRI);
3999 }
4000 }
4001
4002 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4003 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4004}
4005
4007AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4008 std::optional<FPValueAndVReg> FPValReg;
4009 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4010 if (TII.isInlineConstant(FPValReg->Value)) {
4011 return {{[=](MachineInstrBuilder &MIB) {
4012 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4013 }}};
4014 }
4015 // Non-inlineable splat floats should not fall-through for integer immediate
4016 // checks.
4017 return {};
4018 }
4019
4020 APInt ICst;
4021 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4022 if (TII.isInlineConstant(ICst)) {
4023 return {
4024 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4025 }
4026 }
4027
4028 return {};
4029}
4030
4032AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4033 Register Src =
4034 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4035 unsigned Key = 0;
4036
4037 Register ShiftSrc;
4038 std::optional<ValueAndVReg> ShiftAmt;
4039 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4040 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4041 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4042 Key = ShiftAmt->Value.getZExtValue() / 8;
4043 Src = ShiftSrc;
4044 }
4045
4046 return {{
4047 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4048 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4049 }};
4050}
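// Example: with the index operand defined as (hypothetical registers)
//   %idx:_(s32) = G_LSHR %packed:_(s32), 16
// the selector uses %packed directly and emits index_key = 16 / 8 = 2; if no
// byte-aligned shift is found, the source is used as-is with index_key 0.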
4051
4053AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4054
4055 Register Src =
4056 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4057 unsigned Key = 0;
4058
4059 Register ShiftSrc;
4060 std::optional<ValueAndVReg> ShiftAmt;
4061 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4062 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4063 ShiftAmt->Value.getZExtValue() == 16) {
4064 Src = ShiftSrc;
4065 Key = 1;
4066 }
4067
4068 return {{
4069 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4070 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4071 }};
4072}
4073
4075AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4076 Register Src;
4077 unsigned Mods;
4078 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4079
4080 // FIXME: Handle op_sel
4081 return {{
4082 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4083 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4084 }};
4085}
4086
4088AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4089 Register Src;
4090 unsigned Mods;
4091 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4092 /*IsCanonicalizing=*/true,
4093 /*AllowAbs=*/false,
4094 /*OpSel=*/false);
4095
4096 return {{
4097 [=](MachineInstrBuilder &MIB) {
4098 MIB.addReg(
4099 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4100 },
4101 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4102 }};
4103}
4104
4106AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4107 Register Src;
4108 unsigned Mods;
4109 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4110 /*IsCanonicalizing=*/true,
4111 /*AllowAbs=*/false,
4112 /*OpSel=*/true);
4113
4114 return {{
4115 [=](MachineInstrBuilder &MIB) {
4116 MIB.addReg(
4117 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4118 },
4119 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4120 }};
4121}
4122
4123bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4124 Register &Base,
4125 Register *SOffset,
4126 int64_t *Offset) const {
4127 MachineInstr *MI = Root.getParent();
4128 MachineBasicBlock *MBB = MI->getParent();
4129
4130 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4131 // then we can select all ptr + 32-bit offsets.
4132 SmallVector<GEPInfo, 4> AddrInfo;
4133 getAddrModeInfo(*MI, *MRI, AddrInfo);
4134
4135 if (AddrInfo.empty())
4136 return false;
4137
4138 const GEPInfo &GEPI = AddrInfo[0];
4139 std::optional<int64_t> EncodedImm;
4140
4141 if (SOffset && Offset) {
4142 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4143 /*HasSOffset=*/true);
4144 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4145 AddrInfo.size() > 1) {
4146 const GEPInfo &GEPI2 = AddrInfo[1];
4147 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4148 if (Register OffsetReg =
4149 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4150 Base = GEPI2.SgprParts[0];
4151 *SOffset = OffsetReg;
4152 *Offset = *EncodedImm;
4153 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4154 return true;
4155
4156 // For unbuffered smem loads, it is illegal for the Immediate Offset
4157 // to be negative if the resulting (Offset + (M0 or SOffset or zero)
4158 // is negative. Handle the case where the Immediate Offset + SOffset
4159 // is negative.
4160 auto SKnown = KB->getKnownBits(*SOffset);
4161 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4162 return false;
4163
4164 return true;
4165 }
4166 }
4167 }
4168 return false;
4169 }
4170
4171 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4172 /*HasSOffset=*/false);
4173 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4174 Base = GEPI.SgprParts[0];
4175 *Offset = *EncodedImm;
4176 return true;
4177 }
4178
4179 // SGPR offset is unsigned.
4180 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4181 GEPI.Imm != 0) {
4182 // If we make it this far we have a load with an 32-bit immediate offset.
4183 // It is OK to select this using a sgpr offset, because we have already
4184 // failed trying to select this load into one of the _IMM variants since
4185 // the _IMM Patterns are considered before the _SGPR patterns.
4186 Base = GEPI.SgprParts[0];
4187 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4188 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4189 .addImm(GEPI.Imm);
4190 return true;
4191 }
4192
4193 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4194 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4195 Base = GEPI.SgprParts[0];
4196 *SOffset = OffsetReg;
4197 return true;
4198 }
4199 }
4200
4201 return false;
4202}
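// selectSmrdOffset serves three SMRD addressing forms depending on which out
// parameters are requested: base + encoded immediate, base + 32-bit SGPR
// offset, or base + SGPR offset + immediate; the KnownBits check above
// rejects the combined form when a negative immediate plus the SGPR offset
// could become negative on targets with signed SMRD immediates.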
4203
4205AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4206 Register Base;
4207 int64_t Offset;
4208 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4209 return std::nullopt;
4210
4211 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4212 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4213}
4214
4216AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4217 SmallVector<GEPInfo, 4> AddrInfo;
4218 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4219
4220 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4221 return std::nullopt;
4222
4223 const GEPInfo &GEPInfo = AddrInfo[0];
4224 Register PtrReg = GEPInfo.SgprParts[0];
4225 std::optional<int64_t> EncodedImm =
4226 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4227 if (!EncodedImm)
4228 return std::nullopt;
4229
4230 return {{
4231 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4232 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4233 }};
4234}
4235
4237AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4238 Register Base, SOffset;
4239 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4240 return std::nullopt;
4241
4242 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4243 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4244}
4245
4247AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4248 Register Base, SOffset;
4249 int64_t Offset;
4250 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4251 return std::nullopt;
4252
4253 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4254 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4255 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4256}
4257
4258std::pair<Register, int>
4259AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4260 uint64_t FlatVariant) const {
4261 MachineInstr *MI = Root.getParent();
4262
4263 auto Default = std::pair(Root.getReg(), 0);
4264
4265 if (!STI.hasFlatInstOffsets())
4266 return Default;
4267
4268 Register PtrBase;
4269 int64_t ConstOffset;
4270 std::tie(PtrBase, ConstOffset) =
4271 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4272
4273 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4274 !isFlatScratchBaseLegal(Root.getReg())))
4275 return Default;
4276
4277 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4278 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4279 return Default;
4280
4281 return std::pair(PtrBase, ConstOffset);
4282}
4283
4285AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4286 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4287
4288 return {{
4289 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4290 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4291 }};
4292}
4293
4295AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4296 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4297
4298 return {{
4299 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4300 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4301 }};
4302}
4303
4305AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4306 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4307
4308 return {{
4309 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4310 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4311 }};
4312}
4313
4314// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4316AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4317 Register Addr = Root.getReg();
4318 Register PtrBase;
4319 int64_t ConstOffset;
4320 int64_t ImmOffset = 0;
4321
4322 // Match the immediate offset first, which canonically is moved as low as
4323 // possible.
4324 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4325
4326 if (ConstOffset != 0) {
4327 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4328 SIInstrFlags::FlatGlobal)) {
4329 Addr = PtrBase;
4330 ImmOffset = ConstOffset;
4331 } else {
4332 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4333 if (isSGPR(PtrBaseDef->Reg)) {
4334 if (ConstOffset > 0) {
4335 // Offset is too large.
4336 //
4337 // saddr + large_offset -> saddr +
4338 // (voffset = large_offset & ~MaxOffset) +
4339 // (large_offset & MaxOffset);
4340 int64_t SplitImmOffset, RemainderOffset;
4341 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4342 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4343
4344 if (isUInt<32>(RemainderOffset)) {
4345 MachineInstr *MI = Root.getParent();
4346 MachineBasicBlock *MBB = MI->getParent();
4347 Register HighBits =
4348 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4349
4350 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4351 HighBits)
4352 .addImm(RemainderOffset);
4353
4354 return {{
4355 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4356 [=](MachineInstrBuilder &MIB) {
4357 MIB.addReg(HighBits);
4358 }, // voffset
4359 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4360 }};
4361 }
4362 }
4363
4364 // We are adding a 64 bit SGPR and a constant. If constant bus limit
4365 // is 1 we would need to perform 1 or 2 extra moves for each half of
4366 // the constant and it is better to do a scalar add and then issue a
4367 // single VALU instruction to materialize zero. Otherwise it is less
4368 // instructions to perform VALU adds with immediates or inline literals.
4369 unsigned NumLiterals =
4370 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4371 !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4372 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4373 return std::nullopt;
4374 }
4375 }
4376 }
4377
4378 // Match the variable offset.
4379 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4380 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4381 // Look through the SGPR->VGPR copy.
4382 Register SAddr =
4383 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4384
4385 if (isSGPR(SAddr)) {
4386 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4387
4388 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4389 // inserted later.
4390 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4391 return {{[=](MachineInstrBuilder &MIB) { // saddr
4392 MIB.addReg(SAddr);
4393 },
4394 [=](MachineInstrBuilder &MIB) { // voffset
4395 MIB.addReg(VOffset);
4396 },
4397 [=](MachineInstrBuilder &MIB) { // offset
4398 MIB.addImm(ImmOffset);
4399 }}};
4400 }
4401 }
4402 }
4403
4404 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4405 // drop this.
4406 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4407 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4408 return std::nullopt;
4409
4410 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4411 // moves required to copy a 64-bit SGPR to VGPR.
4412 MachineInstr *MI = Root.getParent();
4413 MachineBasicBlock *MBB = MI->getParent();
4414 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4415
4416 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4417 .addImm(0);
4418
4419 return {{
4420 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4421 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4422 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4423 }};
4424}
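// In short, the (saddr, voffset, offset) triple above is produced in one of
// three ways: splitting an oversized constant offset into a V_MOV_B32
// voffset plus a legal immediate, peeling a zero-extended 32-bit VGPR offset
// off a G_PTR_ADD with an SGPR base, or materializing a zero voffset for a
// plain SGPR address.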
4425
4427AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4428 Register Addr = Root.getReg();
4429 Register PtrBase;
4430 int64_t ConstOffset;
4431 int64_t ImmOffset = 0;
4432
4433 // Match the immediate offset first, which canonically is moved as low as
4434 // possible.
4435 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4436
4437 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4438 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4439 SIInstrFlags::FlatScratch)) {
4440 Addr = PtrBase;
4441 ImmOffset = ConstOffset;
4442 }
4443
4444 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4445 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4446 int FI = AddrDef->MI->getOperand(1).getIndex();
4447 return {{
4448 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4449 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4450 }};
4451 }
4452
4453 Register SAddr = AddrDef->Reg;
4454
4455 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4456 Register LHS = AddrDef->MI->getOperand(1).getReg();
4457 Register RHS = AddrDef->MI->getOperand(2).getReg();
4458 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4459 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4460
4461 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4462 isSGPR(RHSDef->Reg)) {
4463 int FI = LHSDef->MI->getOperand(1).getIndex();
4464 MachineInstr &I = *Root.getParent();
4465 MachineBasicBlock *BB = I.getParent();
4466 const DebugLoc &DL = I.getDebugLoc();
4467 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4468
4469 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4470 .addFrameIndex(FI)
4471 .addReg(RHSDef->Reg)
4472 .setOperandDead(3); // Dead scc
4473 }
4474 }
4475
4476 if (!isSGPR(SAddr))
4477 return std::nullopt;
4478
4479 return {{
4480 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4481 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4482 }};
4483}
4484
4485// Check whether the flat scratch SVS swizzle bug affects this access.
4486bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4487 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4488 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4489 return false;
4490
4491 // The bug affects the swizzling of SVS accesses if there is any carry out
4492 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4493 // voffset to (soffset + inst_offset).
4494 auto VKnown = KB->getKnownBits(VAddr);
4495 auto SKnown = KnownBits::add(KB->getKnownBits(SAddr),
4496 KnownBits::makeConstant(APInt(32, ImmOffset)));
4497 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4498 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4499 return (VMax & 3) + (SMax & 3) >= 4;
4500}
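// Worked example: if the known maximum of vaddr ends in binary 10
// (VMax & 3 == 2) and the known maximum of saddr + inst_offset ends in 11
// (SMax & 3 == 3), then 2 + 3 >= 4, so a carry out of bit 1 cannot be ruled
// out and the SVS form must be avoided.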
4501
4503AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4504 Register Addr = Root.getReg();
4505 Register PtrBase;
4506 int64_t ConstOffset;
4507 int64_t ImmOffset = 0;
4508
4509 // Match the immediate offset first, which canonically is moved as low as
4510 // possible.
4511 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4512
4513 Register OrigAddr = Addr;
4514 if (ConstOffset != 0 &&
4515 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4516 Addr = PtrBase;
4517 ImmOffset = ConstOffset;
4518 }
4519
4520 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4521 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4522 return std::nullopt;
4523
4524 Register RHS = AddrDef->MI->getOperand(2).getReg();
4525 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4526 return std::nullopt;
4527
4528 Register LHS = AddrDef->MI->getOperand(1).getReg();
4529 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4530
4531 if (OrigAddr != Addr) {
4532 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4533 return std::nullopt;
4534 } else {
4535 if (!isFlatScratchBaseLegalSV(OrigAddr))
4536 return std::nullopt;
4537 }
4538
4539 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4540 return std::nullopt;
4541
4542 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4543 int FI = LHSDef->MI->getOperand(1).getIndex();
4544 return {{
4545 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4546 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4547 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4548 }};
4549 }
4550
4551 if (!isSGPR(LHS))
4552 return std::nullopt;
4553
4554 return {{
4555 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4556 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4557 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4558 }};
4559}
4560
4562AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4563 MachineInstr *MI = Root.getParent();
4564 MachineBasicBlock *MBB = MI->getParent();
4565 MachineFunction *MF = MBB->getParent();
4566 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4567
4568 int64_t Offset = 0;
4569 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4570 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
4571 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4572
4573 // TODO: Should this be inside the render function? The iterator seems to
4574 // move.
4575 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4576 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4577 HighBits)
4578 .addImm(Offset & ~MaxOffset);
4579
4580 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4581 MIB.addReg(Info->getScratchRSrcReg());
4582 },
4583 [=](MachineInstrBuilder &MIB) { // vaddr
4584 MIB.addReg(HighBits);
4585 },
4586 [=](MachineInstrBuilder &MIB) { // soffset
4587 // Use constant zero for soffset and rely on eliminateFrameIndex
4588 // to choose the appropriate frame register if need be.
4589 MIB.addImm(0);
4590 },
4591 [=](MachineInstrBuilder &MIB) { // offset
4592 MIB.addImm(Offset & MaxOffset);
4593 }}};
4594 }
4595
4596 assert(Offset == 0 || Offset == -1);
4597
4598 // Try to fold a frame index directly into the MUBUF vaddr field, and any
4599 // offsets.
4600 std::optional<int> FI;
4601 Register VAddr = Root.getReg();
4602 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4603 Register PtrBase;
4604 int64_t ConstOffset;
4605 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4606 if (ConstOffset != 0) {
4607 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
4608 (!STI.privateMemoryResourceIsRangeChecked() ||
4609 KB->signBitIsZero(PtrBase))) {
4610 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4611 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4612 FI = PtrBaseDef->getOperand(1).getIndex();
4613 else
4614 VAddr = PtrBase;
4615 Offset = ConstOffset;
4616 }
4617 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4618 FI = RootDef->getOperand(1).getIndex();
4619 }
4620 }
4621
4622 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4623 MIB.addReg(Info->getScratchRSrcReg());
4624 },
4625 [=](MachineInstrBuilder &MIB) { // vaddr
4626 if (FI)
4627 MIB.addFrameIndex(*FI);
4628 else
4629 MIB.addReg(VAddr);
4630 },
4631 [=](MachineInstrBuilder &MIB) { // soffset
4632 // Use constant zero for soffset and rely on eliminateFrameIndex
4633 // to choose the appropriate frame register if need be.
4634 MIB.addImm(0);
4635 },
4636 [=](MachineInstrBuilder &MIB) { // offset
4637 MIB.addImm(Offset);
4638 }}};
4639}
4640
4641bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4642 int64_t Offset) const {
4643 if (!isUInt<16>(Offset))
4644 return false;
4645
4646 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4647 return true;
4648
4649 // On Southern Islands, instructions with a negative base value and an
4650 // offset don't seem to work.
4651 return KB->signBitIsZero(Base);
4652}
4653
4654bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4655 int64_t Offset1,
4656 unsigned Size) const {
4657 if (Offset0 % Size != 0 || Offset1 % Size != 0)
4658 return false;
4659 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4660 return false;
4661
4662 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4663 return true;
4664
4665 // On Southern Islands, instructions with a negative base value and an
4666 // offset don't seem to work.
4667 return KB->signBitIsZero(Base);
4668}
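// Example for Size == 4 (64-bit DS access pairs): both offsets must be
// multiples of 4 whose quotients fit in 8 bits, so the largest accepted pair
// is offset0 == 1016 and offset1 == 1020 (i.e. 254 * 4 and 255 * 4).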
4669
4670// Return whether the operation has NoUnsignedWrap property.
4671static bool isNoUnsignedWrap(MachineInstr *Addr) {
4672 return Addr->getOpcode() == TargetOpcode::G_OR ||
4673 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4674 Addr->getFlag(MachineInstr::NoUWrap));
4675}
4676
4677// Check that the base address of flat scratch load/store in the form of `base +
4678// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
4679// requirement). We always treat the first operand as the base address here.
4680bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
4681 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4682
4683 if (isNoUnsignedWrap(AddrMI))
4684 return true;
4685
4686 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4687 // values.
4688 if (STI.hasSignedScratchOffsets())
4689 return true;
4690
4691 Register LHS = AddrMI->getOperand(1).getReg();
4692 Register RHS = AddrMI->getOperand(2).getReg();
4693
4694 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
4695 std::optional<ValueAndVReg> RhsValReg =
4696 getIConstantVRegValWithLookThrough(RHS, *MRI);
4697 // If the immediate offset is negative and within certain range, the base
4698 // address cannot also be negative. If the base is also negative, the sum
4699 // would be either negative or much larger than the valid range of scratch
4700 // memory a thread can access.
4701 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
4702 RhsValReg->Value.getSExtValue() > -0x40000000)
4703 return true;
4704 }
4705
4706 return KB->signBitIsZero(LHS);
4707}
4708
4709// Check address value in SGPR/VGPR are legal for flat scratch in the form
4710// of: SGPR + VGPR.
4711bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
4712 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4713
4714 if (isNoUnsignedWrap(AddrMI))
4715 return true;
4716
4717 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4718 // values.
4719 if (STI.hasSignedScratchOffsets())
4720 return true;
4721
4722 Register LHS = AddrMI->getOperand(1).getReg();
4723 Register RHS = AddrMI->getOperand(2).getReg();
4724 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4725}
4726
4727// Check address value in SGPR/VGPR are legal for flat scratch in the form
4728// of: SGPR + VGPR + Imm.
4729bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
4730 Register Addr) const {
4731 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4732 // values.
4733 if (STI.hasSignedScratchOffsets())
4734 return true;
4735
4736 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4737 Register Base = AddrMI->getOperand(1).getReg();
4738 std::optional<DefinitionAndSourceRegister> BaseDef =
4739 getDefSrcRegIgnoringCopies(Base, *MRI);
4740 std::optional<ValueAndVReg> RHSOffset =
4741 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
4742 assert(RHSOffset);
4743
4744 // If the immediate offset is negative and within certain range, the base
4745 // address cannot also be negative. If the base is also negative, the sum
4746 // would be either negative or much larger than the valid range of scratch
4747 // memory a thread can access.
4748 if (isNoUnsignedWrap(BaseDef->MI) &&
4749 (isNoUnsignedWrap(AddrMI) ||
4750 (RHSOffset->Value.getSExtValue() < 0 &&
4751 RHSOffset->Value.getSExtValue() > -0x40000000)))
4752 return true;
4753
4754 Register LHS = BaseDef->MI->getOperand(1).getReg();
4755 Register RHS = BaseDef->MI->getOperand(2).getReg();
4756 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4757}
4758
4759bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4760 unsigned ShAmtBits) const {
4761 assert(MI.getOpcode() == TargetOpcode::G_AND);
4762
4763 std::optional<APInt> RHS =
4764 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4765 if (!RHS)
4766 return false;
4767
4768 if (RHS->countr_one() >= ShAmtBits)
4769 return true;
4770
4771 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
4772 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
4773}
4774
4776AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4777 MachineOperand &Root) const {
4778 Register Reg = Root.getReg();
4779 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4780
4781 std::optional<DefinitionAndSourceRegister> Def =
4782 getDefSrcRegIgnoringCopies(Reg, *MRI);
4783 assert(Def && "this shouldn't be an optional result");
4784 Reg = Def->Reg;
4785
4786 if (Register WaveBase = getWaveAddress(Def->MI)) {
4787 return {{
4788 [=](MachineInstrBuilder &MIB) { // rsrc
4789 MIB.addReg(Info->getScratchRSrcReg());
4790 },
4791 [=](MachineInstrBuilder &MIB) { // soffset
4792 MIB.addReg(WaveBase);
4793 },
4794 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4795 }};
4796 }
4797
4798 int64_t Offset = 0;
4799
4800 // FIXME: Copy check is a hack
4801 Register BasePtr;
4802 if (mi_match(Reg, *MRI,
4803 m_GPtrAdd(m_Reg(BasePtr),
4804 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
4805 if (!TII.isLegalMUBUFImmOffset(Offset))
4806 return {};
4807 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
4808 Register WaveBase = getWaveAddress(BasePtrDef);
4809 if (!WaveBase)
4810 return {};
4811
4812 return {{
4813 [=](MachineInstrBuilder &MIB) { // rsrc
4814 MIB.addReg(Info->getScratchRSrcReg());
4815 },
4816 [=](MachineInstrBuilder &MIB) { // soffset
4817 MIB.addReg(WaveBase);
4818 },
4819 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4820 }};
4821 }
4822
4823 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
4824 !TII.isLegalMUBUFImmOffset(Offset))
4825 return {};
4826
4827 return {{
4828 [=](MachineInstrBuilder &MIB) { // rsrc
4829 MIB.addReg(Info->getScratchRSrcReg());
4830 },
4831 [=](MachineInstrBuilder &MIB) { // soffset
4832 MIB.addImm(0);
4833 },
4834 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4835 }};
4836}
4837
4838std::pair<Register, unsigned>
4839AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4840 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4841 if (!RootDef)
4842 return std::pair(Root.getReg(), 0);
4843
4844 int64_t ConstAddr = 0;
4845
4846 Register PtrBase;
4847 int64_t Offset;
4848 std::tie(PtrBase, Offset) =
4849 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4850
4851 if (Offset) {
4852 if (isDSOffsetLegal(PtrBase, Offset)) {
4853 // (add n0, c0)
4854 return std::pair(PtrBase, Offset);
4855 }
4856 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4857 // TODO
4858
4859
4860 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4861 // TODO
4862
4863 }
4864
4865 return std::pair(Root.getReg(), 0);
4866}
4867
4869AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4870 Register Reg;
4871 unsigned Offset;
4872 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
4873 return {{
4874 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4875 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
4876 }};
4877}
4878
4880AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4881 return selectDSReadWrite2(Root, 4);
4882}
4883
4885AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4886 return selectDSReadWrite2(Root, 8);
4887}
4888
4890AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4891 unsigned Size) const {
4892 Register Reg;
4893 unsigned Offset;
4894 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
4895 return {{
4896 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4897 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4898 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4899 }};
4900}
4901
4902std::pair<Register, unsigned>
4903AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4904 unsigned Size) const {
4905 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4906 if (!RootDef)
4907 return std::pair(Root.getReg(), 0);
4908
4909 int64_t ConstAddr = 0;
4910
4911 Register PtrBase;
4912 int64_t Offset;
4913 std::tie(PtrBase, Offset) =
4914 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4915
4916 if (Offset) {
4917 int64_t OffsetValue0 = Offset;
4918 int64_t OffsetValue1 = Offset + Size;
4919 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
4920 // (add n0, c0)
4921 return std::pair(PtrBase, OffsetValue0 / Size);
4922 }
4923 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4924 // TODO
4925
4926 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4927 // TODO
4928
4929 }
4930
4931 return std::pair(Root.getReg(), 0);
4932}
4933
4934/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4935/// the base value with the constant offset. There may be intervening copies
4936/// between \p Root and the identified constant. Returns \p Root, 0 if this does
4937/// not match the pattern.
4938std::pair<Register, int64_t>
4939AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4940 Register Root, const MachineRegisterInfo &MRI) const {
4941 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4942 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4943 return {Root, 0};
4944
4945 MachineOperand &RHS = RootI->getOperand(2);
4946 std::optional<ValueAndVReg> MaybeOffset =
4947 getIConstantVRegValWithLookThrough(RHS.getReg(), *MRI);
4948 if (!MaybeOffset)
4949 return {Root, 0};
4950 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
4951}
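// Illustrative match (registers hypothetical): given
//   %c:_(s64) = G_CONSTANT i64 16
//   %ptr:_(p1) = G_PTR_ADD %base, %c
// getPtrBaseWithConstantOffset(%ptr) returns {%base, 16}; anything that is
// not a G_PTR_ADD with a constant right-hand side returns {%ptr, 0}.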
4952
4953static void addZeroImm(MachineInstrBuilder &MIB) {
4954 MIB.addImm(0);
4955}
4956
4957/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
4958/// BasePtr is not valid, a null base pointer will be used.
4959static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4960 uint32_t FormatLo, uint32_t FormatHi,
4961 Register BasePtr) {
4962 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4963 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4964 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4965 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
4966
4967 B.buildInstr(AMDGPU::S_MOV_B32)
4968 .addDef(RSrc2)
4969 .addImm(FormatLo);
4970 B.buildInstr(AMDGPU::S_MOV_B32)
4971 .addDef(RSrc3)
4972 .addImm(FormatHi);
4973
4974 // Build the half of the subregister with the constants before building the
4975 // full 128-bit register. If we are building multiple resource descriptors,
4976 // this will allow CSEing of the 2-component register.
4977 B.buildInstr(AMDGPU::REG_SEQUENCE)
4978 .addDef(RSrcHi)
4979 .addReg(RSrc2)
4980 .addImm(AMDGPU::sub0)
4981 .addReg(RSrc3)
4982 .addImm(AMDGPU::sub1);
4983
4984 Register RSrcLo = BasePtr;
4985 if (!BasePtr) {
4986 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4987 B.buildInstr(AMDGPU::S_MOV_B64)
4988 .addDef(RSrcLo)
4989 .addImm(0);
4990 }
4991
4992 B.buildInstr(AMDGPU::REG_SEQUENCE)
4993 .addDef(RSrc)
4994 .addReg(RSrcLo)
4995 .addImm(AMDGPU::sub0_sub1)
4996 .addReg(RSrcHi)
4997 .addImm(AMDGPU::sub2_sub3);
4998
4999 return RSrc;
5000}
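// Layout of the resulting 128-bit SRD built above: sub0_sub1 holds the 64-bit
// base pointer (or an S_MOV_B64 0 when BasePtr is invalid), and sub2_sub3
// holds the two 32-bit format words FormatLo/FormatHi. Emitting the constant
// sub2_sub3 REG_SEQUENCE first lets CSE share it between descriptors that
// differ only in their base pointer.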
5001
5002static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5003 const SIInstrInfo &TII, Register BasePtr) {
5004 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5005
5006 // FIXME: Why are half the "default" bits ignored based on the addressing
5007 // mode?
5008 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5009}
5010
5011static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5012 const SIInstrInfo &TII, Register BasePtr) {
5013 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5014
5015 // FIXME: Why are half the "default" bits ignored based on the addressing
5016 // mode?
5017 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5018}
5019
5020AMDGPUInstructionSelector::MUBUFAddressData
5021AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5022 MUBUFAddressData Data;
5023 Data.N0 = Src;
5024
5025 Register PtrBase;
5026 int64_t Offset;
5027
5028 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5029 if (isUInt<32>(Offset)) {
5030 Data.N0 = PtrBase;
5031 Data.Offset = Offset;
5032 }
5033
5034 if (MachineInstr *InputAdd
5035 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5036 Data.N2 = InputAdd->getOperand(1).getReg();
5037 Data.N3 = InputAdd->getOperand(2).getReg();
5038
5039 // FIXME: Need to fix extra SGPR->VGPR copies inserted
5040 // FIXME: Should not assume the value is defined by operand 0 of its def
5041 //
5042 // TODO: Remove this when we have copy folding optimizations after
5043 // RegBankSelect.
5044 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5045 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5046 }
5047
5048 return Data;
5049}
5050
5051/// Return true if the addr64 MUBUF mode should be used for the given address.
5052bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5053 // (ptr_add N2, N3) -> addr64, or
5054 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5055 if (Addr.N2)
5056 return true;
5057
5058 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5059 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5060}
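// In short: addr64 is chosen when the parsed address contains an inner
// (ptr_add N2, N3) component, or when the remaining base N0 is assigned to
// the VGPR bank, i.e. the address is divergent and cannot serve as the
// uniform SRD base.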
5061
5062/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5063/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5064/// component.
5065void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5066 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5067 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5068 return;
5069
5070 // Illegal offset, store it in soffset.
5071 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5072 B.buildInstr(AMDGPU::S_MOV_B32)
5073 .addDef(SOffset)
5074 .addImm(ImmOffset);
5075 ImmOffset = 0;
5076}
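// For example, if ImmOffset fails TII.isLegalMUBUFImmOffset (say, a byte
// offset larger than the immediate field the subtarget supports), the whole
// value is materialized into a fresh SGPR with S_MOV_B32, SOffset is set to
// that register, and ImmOffset is reset to 0.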
5077
5078bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5079 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5080 Register &SOffset, int64_t &Offset) const {
5081 // FIXME: Predicates should stop this from reaching here.
5082 // addr64 bit was removed for volcanic islands.
5083 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5084 return false;
5085
5086 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5087 if (!shouldUseAddr64(AddrData))
5088 return false;
5089
5090 Register N0 = AddrData.N0;
5091 Register N2 = AddrData.N2;
5092 Register N3 = AddrData.N3;
5093 Offset = AddrData.Offset;
5094
5095 // Base pointer for the SRD.
5096 Register SRDPtr;
5097
5098 if (N2) {
5099 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5100 assert(N3);
5101 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5102 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5103 // addr64, and construct the default resource from a 0 address.
5104 VAddr = N0;
5105 } else {
5106 SRDPtr = N3;
5107 VAddr = N2;
5108 }
5109 } else {
5110 // N2 is not divergent.
5111 SRDPtr = N2;
5112 VAddr = N3;
5113 }
5114 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5115 // Use the default null pointer in the resource
5116 VAddr = N0;
5117 } else {
5118 // N0 -> offset, or
5119 // (N0 + C1) -> offset
5120 SRDPtr = N0;
5121 }
5122
5123 MachineIRBuilder B(*Root.getParent());
5124 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5125 splitIllegalMUBUFOffset(B, SOffset, Offset);
5126 return true;
5127}
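// Summary of the cases above (N0 = full address, N2/N3 = operands of the
// inner ptr_add, if any):
//   N2 and N3 both divergent : vaddr = N0, SRD base = null (0)
//   only N2 divergent        : vaddr = N2, SRD base = N3
//   N2 uniform               : vaddr = N3, SRD base = N2
//   no inner add, N0 in VGPRs: vaddr = N0, SRD base = null
//   otherwise                : uniform N0 becomes the SRD base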
5128
5129bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5130 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5131 int64_t &Offset) const {
5132
5133 // FIXME: Pattern should not reach here.
5134 if (STI.useFlatForGlobal())
5135 return false;
5136
5137 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5138 if (shouldUseAddr64(AddrData))
5139 return false;
5140
5141 // N0 -> offset, or
5142 // (N0 + C1) -> offset
5143 Register SRDPtr = AddrData.N0;
5144 Offset = AddrData.Offset;
5145
5146 // TODO: Look through extensions for 32-bit soffset.
5147 MachineIRBuilder B(*Root.getParent());
5148
5149 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5150 splitIllegalMUBUFOffset(B, SOffset, Offset);
5151 return true;
5152}
5153
5154InstructionSelector::ComplexRendererFns
5155AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5156 Register VAddr;
5157 Register RSrcReg;
5158 Register SOffset;
5159 int64_t Offset = 0;
5160
5161 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5162 return {};
5163
5164 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5165 // pattern.
5166 return {{
5167 [=](MachineInstrBuilder &MIB) { // rsrc
5168 MIB.addReg(RSrcReg);
5169 },
5170 [=](MachineInstrBuilder &MIB) { // vaddr
5171 MIB.addReg(VAddr);
5172 },
5173 [=](MachineInstrBuilder &MIB) { // soffset
5174 if (SOffset)
5175 MIB.addReg(SOffset);
5176 else if (STI.hasRestrictedSOffset())
5177 MIB.addReg(AMDGPU::SGPR_NULL);
5178 else
5179 MIB.addImm(0);
5180 },
5181 [=](MachineInstrBuilder &MIB) { // offset
5182 MIB.addImm(Offset);
5183 },
5184 addZeroImm, // cpol
5185 addZeroImm, // tfe
5186 addZeroImm // swz
5187 }};
5188}
5189
5190InstructionSelector::ComplexRendererFns
5191AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5192 Register RSrcReg;
5193 Register SOffset;
5194 int64_t Offset = 0;
5195
5196 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5197 return {};
5198
5199 return {{
5200 [=](MachineInstrBuilder &MIB) { // rsrc
5201 MIB.addReg(RSrcReg);
5202 },
5203 [=](MachineInstrBuilder &MIB) { // soffset
5204 if (SOffset)
5205 MIB.addReg(SOffset);
5206 else if (STI.hasRestrictedSOffset())
5207 MIB.addReg(AMDGPU::SGPR_NULL);
5208 else
5209 MIB.addImm(0);
5210 },
5211 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5212 addZeroImm, // cpol
5213 addZeroImm, // tfe
5214 addZeroImm, // swz
5215 }};
5216}
5217
5218InstructionSelector::ComplexRendererFns
5219AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5220
5221 Register SOffset = Root.getReg();
5222
5223 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5224 SOffset = AMDGPU::SGPR_NULL;
5225
5226 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5227}
5228
5229/// Get an immediate that must be 32-bits, and treated as zero extended.
5230static std::optional<uint64_t>
5231getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5232 // getIConstantVRegVal sexts any values, so see if that matters.
5233 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5234 if (!OffsetVal || !isInt<32>(*OffsetVal))
5235 return std::nullopt;
5236 return Lo_32(*OffsetVal);
5237}
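// Example of why the sign extension matters: a 32-bit G_CONSTANT of
// 0xFFFFFFFF is reported by getIConstantVRegSExtVal as -1, which still
// passes isInt<32>, and Lo_32 then recovers the intended zero-extended
// value 0xFFFFFFFF.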
5238
5239InstructionSelector::ComplexRendererFns
5240AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5241 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5242 if (!OffsetVal)
5243 return {};
5244
5245 std::optional<int64_t> EncodedImm =
5246 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5247 if (!EncodedImm)
5248 return {};
5249
5250 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5251}
5252
5253InstructionSelector::ComplexRendererFns
5254AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5256
5257 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5258 if (!OffsetVal)
5259 return {};
5260
5261 std::optional<int64_t> EncodedImm =
5262 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5263 if (!EncodedImm)
5264 return {};
5265
5266 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5267}
5268
5269InstructionSelector::ComplexRendererFns
5270AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5271 // Match the (soffset + offset) pair as a 32-bit register base and
5272 // an immediate offset.
5273 Register SOffset;
5274 unsigned Offset;
5275 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5276 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5277 if (!SOffset)
5278 return std::nullopt;
5279
5280 std::optional<int64_t> EncodedOffset =
5281 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5282 if (!EncodedOffset)
5283 return std::nullopt;
5284
5285 assert(MRI->getType(SOffset) == LLT::scalar(32));
5286 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5287 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5288}
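// Roughly speaking: a root such as (G_ADD %soffset:sgpr(s32), 64) with no
// unsigned wrap is split into the register base %soffset plus the constant
// 64, which is then emitted as the encoded immediate; the exact encoding is
// subtarget-dependent via getSMRDEncodedOffset.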
5289
5290std::pair<Register, unsigned>
5291AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5292 bool &Matched) const {
5293 Matched = false;
5294
5295 Register Src;
5296 unsigned Mods;
5297 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5298
5299 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
5300 assert(MRI->getType(Src) == LLT::scalar(16));
5301
5302 // Only change Src if a source modifier could be gained. In that case the
5303 // new Src could be an SGPR, but this does not violate the constant bus
5304 // restriction for the instruction being selected.
5305 // Note: Src is not changed when there is only a simple SGPR-to-VGPR copy,
5306 // since that could violate the constant bus restriction.
5307 Register PeekSrc = stripCopy(Src, *MRI);
5308
5309 const auto CheckAbsNeg = [&]() {
5310 // Be careful about folding modifiers if we already have an abs. fneg is
5311 // applied last, so we don't want to apply an earlier fneg.
5312 if ((Mods & SISrcMods::ABS) == 0) {
5313 unsigned ModsTmp;
5314 std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
5315
5316 if ((ModsTmp & SISrcMods::NEG) != 0) {
5317 Mods ^= SISrcMods::NEG;
5318 Src = PeekSrc;
5319 }
5320
5321 if ((ModsTmp & SISrcMods::ABS) != 0) {
5322 Mods |= SISrcMods::ABS;
5323 Src = PeekSrc;
5324 }
5325 }
5326 };
5327
5328 CheckAbsNeg();
5329
5330 // op_sel/op_sel_hi decide the source type and source.
5331 // If the source's op_sel_hi is set, it indicates a conversion from fp16.
5332 // If the source's op_sel is set, it picks the high half of the source
5333 // register.
5334
5335 Mods |= SISrcMods::OP_SEL_1;
5336
5337 if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
5338 Src = PeekSrc;
5339 Mods |= SISrcMods::OP_SEL_0;
5340 CheckAbsNeg();
5341 }
5342
5343 Matched = true;
5344 }
5345
5346 return {Src, Mods};
5347}
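// Illustrative folding, assuming %h is a 16-bit value: for a root of
//   %f:_(s32) = G_FPEXT (G_FNEG %h)
// the G_FPEXT marks the operand as fp16 (OP_SEL_1), the G_FNEG is folded
// into the NEG source modifier, and if %h is itself the high half of a
// 32-bit register, OP_SEL_0 is also set so the instruction reads that
// high half.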
5348
5349InstructionSelector::ComplexRendererFns
5350AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5351 MachineOperand &Root) const {
5352 Register Src;
5353 unsigned Mods;
5354 bool Matched;
5355 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5356 if (!Matched)
5357 return {};
5358
5359 return {{
5360 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5361 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5362 }};
5363}
5364
5365InstructionSelector::ComplexRendererFns
5366AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5367 Register Src;
5368 unsigned Mods;
5369 bool Matched;
5370 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5371
5372 return {{
5373 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5374 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5375 }};
5376}
5377
5378bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5379 MachineInstr &I, Intrinsic::ID IntrID) const {
5380 MachineBasicBlock *MBB = I.getParent();
5381 const DebugLoc &DL = I.getDebugLoc();
5382 Register CCReg = I.getOperand(0).getReg();
5383
5384 bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
5385
5386 if (HasM0) {
5387 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5388 .addReg(I.getOperand(2).getReg());
5389 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
5390 if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
5391 return false;
5392 } else {
5393 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5394 .addImm(I.getOperand(2).getImm());
5395 }
5396
5397 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5398
5399 I.eraseFromParent();
5400 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5401 *MRI);
5402}
5403
5404unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5405 if (HasInlineConst) {
5406 switch (IntrID) {
5407 default:
5408 llvm_unreachable("not a named barrier op");
5409 case Intrinsic::amdgcn_s_barrier_init:
5410 return AMDGPU::S_BARRIER_INIT_IMM;
5411 case Intrinsic::amdgcn_s_barrier_join:
5412 return AMDGPU::S_BARRIER_JOIN_IMM;
5413 case Intrinsic::amdgcn_s_wakeup_barrier:
5414 return AMDGPU::S_WAKEUP_BARRIER_IMM;
5415 case Intrinsic::amdgcn_s_get_barrier_state:
5416 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5417 };
5418 } else {
5419 switch (IntrID) {
5420 default:
5421 llvm_unreachable("not a named barrier op");
5422 case Intrinsic::amdgcn_s_barrier_init:
5423 return AMDGPU::S_BARRIER_INIT_M0;
5424 case Intrinsic::amdgcn_s_barrier_join:
5425 return AMDGPU::S_BARRIER_JOIN_M0;
5426 case Intrinsic::amdgcn_s_wakeup_barrier:
5427 return AMDGPU::S_WAKEUP_BARRIER_M0;
5428 case Intrinsic::amdgcn_s_get_barrier_state:
5429 return AMDGPU::S_GET_BARRIER_STATE_M0;
5430 };
5431 }
5432}
5433
5434bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5435 MachineInstr &I, Intrinsic::ID IntrID) const {
5436 MachineBasicBlock *MBB = I.getParent();
5437 const DebugLoc &DL = I.getDebugLoc();
5438 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
5439 ? I.getOperand(2)
5440 : I.getOperand(1);
5441 std::optional<int64_t> BarValImm =
5442 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5443 Register M0Val;
5444 Register TmpReg0;
5445
5446 // For S_BARRIER_INIT, member count will always be read from M0[16:22]
5447 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5448 Register MemberCount = I.getOperand(2).getReg();
5449 TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5450 // TODO: This should be expanded during legalization so that the S_LSHL
5451 // and S_OR can be constant-folded.
5452 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
5453 .addImm(16)
5454 .addReg(MemberCount);
5455 M0Val = TmpReg0;
5456 }
5457
5458 // If not inlinable, get reference to barrier depending on the instruction
5459 if (!BarValImm) {
5460 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5461 // If the reference to the barrier id is not an inlinable constant, it must
5462 // be referenced through M0[4:0]. Perform an OR with the member count to
5463 // include it in M0 for S_BARRIER_INIT.
5464 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5465 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
5466 .addReg(BarOp.getReg())
5467 .addReg(TmpReg0);
5468 M0Val = TmpReg1;
5469 } else {
5470 M0Val = BarOp.getReg();
5471 }
5472 }
5473
5474 // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
5475 if (M0Val) {
5476 auto CopyMIB =
5477 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
5478 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5479 }
5480
5481 MachineInstrBuilder MIB;
5482 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5483 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5484
5485 if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
5486 MIB.addDef(I.getOperand(0).getReg());
5487
5488 if (BarValImm)
5489 MIB.addImm(*BarValImm);
5490
5491 I.eraseFromParent();
5492 return true;
5493}
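// M0 usage assumed by the code above: when the barrier id is not an inline
// constant it is read from M0[4:0]; for S_BARRIER_INIT the member count is
// always placed in M0[16:22] via the S_LSHL_B32/S_OR_B32 sequence. An
// inline-constant barrier id is instead emitted as an immediate operand of
// the *_IMM form.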
5494
5495bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
5496 MachineBasicBlock *BB = I.getParent();
5497 const DebugLoc &DL = I.getDebugLoc();
5498 Register CCReg = I.getOperand(0).getReg();
5499
5500 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
5501 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5502
5503 I.eraseFromParent();
5504 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5505 *MRI);
5506}
5507
5508void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5509 const MachineInstr &MI,
5510 int OpIdx) const {
5511 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5512 "Expected G_CONSTANT");
5513 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5514}
5515
5516void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5517 const MachineInstr &MI,
5518 int OpIdx) const {
5519 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5520 "Expected G_CONSTANT");
5521 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5522}
5523
5524void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
5525 const MachineInstr &MI,
5526 int OpIdx) const {
5527 const MachineOperand &Op = MI.getOperand(1);
5528 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
5529 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5530}
5531
5532void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5533 const MachineInstr &MI,
5534 int OpIdx) const {
5535 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5536 "Expected G_CONSTANT");
5537 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5538}
5539
5540/// This only really exists to satisfy DAG type checking machinery, so is a
5541/// no-op here.
5542void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
5543 const MachineInstr &MI,
5544 int OpIdx) const {
5545 MIB.addImm(MI.getOperand(OpIdx).getImm());
5546}
5547
5548void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
5549 const MachineInstr &MI,
5550 int OpIdx) const {
5551 assert(OpIdx >= 0 && "expected to match an immediate operand");
5552 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
5553}
5554
5555void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
5556 const MachineInstr &MI,
5557 int OpIdx) const {
5558 assert(OpIdx >= 0 && "expected to match an immediate operand");
5559 MIB.addImm(MI.getOperand(OpIdx).getImm() &
5560 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5561 : AMDGPU::CPol::ALL_pregfx12));
5562}
5563
5564void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
5565 const MachineInstr &MI,
5566 int OpIdx) const {
5567 assert(OpIdx >= 0 && "expected to match an immediate operand");
5568 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
5569 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
5570 : AMDGPU::CPol::SWZ_pregfx12);
5571 MIB.addImm(Swizzle);
5572}
5573
5574void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
5575 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
5576 assert(OpIdx >= 0 && "expected to match an immediate operand");
5577 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
5578 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5579 : AMDGPU::CPol::ALL_pregfx12);
5580 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
5581}
5582
5583void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
5584 const MachineInstr &MI,
5585 int OpIdx) const {
5586 MIB.addFrameIndex(MI.getOperand(1).getIndex());
5587}
5588
5589void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
5590 const MachineInstr &MI,
5591 int OpIdx) const {
5592 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
5593 int ExpVal = APF.getExactLog2Abs();
5594 assert(ExpVal != INT_MIN);
5595 MIB.addImm(ExpVal);
5596}
5597
5598bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
5599 return TII.isInlineConstant(Imm);
5600}
5601
5602bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
5603 return TII.isInlineConstant(Imm);
5604}