1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
23#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
24#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
25#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26#include "llvm/CodeGen/MachineFrameInfo.h"
27#include "llvm/IR/DiagnosticInfo.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45    const AMDGPUTargetMachine &TM)
46    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47      STI(STI),
48      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
49#define GET_GLOBALISEL_PREDICATES_INIT
50#include "AMDGPUGenGlobalISel.inc"
51#undef GET_GLOBALISEL_PREDICATES_INIT
52#define GET_GLOBALISEL_TEMPORARIES_INIT
53#include "AMDGPUGenGlobalISel.inc"
54#undef GET_GLOBALISEL_TEMPORARIES_INIT
55{
56}
57
58const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
59
60void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
61                                        CodeGenCoverage *CoverageInfo,
62                                        ProfileSummaryInfo *PSI,
63                                        BlockFrequencyInfo *BFI) {
64  MRI = &MF.getRegInfo();
65  Subtarget = &MF.getSubtarget<GCNSubtarget>();
66  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
67}
68
69// Return the wave level SGPR base address if this is a wave address.
70static Register getWaveAddress(const MachineInstr *Def) {
71  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
72 ? Def->getOperand(1).getReg()
73 : Register();
74}
75
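// Return true if Reg will be selected into the wave-sized lane mask (VCC)
// register bank rather than as a 32-bit scalar boolean.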
76bool AMDGPUInstructionSelector::isVCC(Register Reg,
77 const MachineRegisterInfo &MRI) const {
78 // The verifier is oblivious to s1 being a valid value for wavesize registers.
79 if (Reg.isPhysical())
80 return false;
81
82 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
83 const TargetRegisterClass *RC =
84 RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
85 if (RC) {
86 const LLT Ty = MRI.getType(Reg);
87 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
88 return false;
89 // G_TRUNC s1 result is never vcc.
90 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
91 RC->hasSuperClassEq(TRI.getBoolRC());
92 }
93
94 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
95 return RB->getID() == AMDGPU::VCCRegBankID;
96}
97
98bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
99 unsigned NewOpc) const {
100 MI.setDesc(TII.get(NewOpc));
101 MI.removeOperand(1); // Remove intrinsic ID.
102 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
103
104 MachineOperand &Dst = MI.getOperand(0);
105 MachineOperand &Src = MI.getOperand(1);
106
107 // TODO: This should be legalized to s32 if needed
108 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
109 return false;
110
111 const TargetRegisterClass *DstRC
112 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
113 const TargetRegisterClass *SrcRC
114 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
115 if (!DstRC || DstRC != SrcRC)
116 return false;
117
118 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
119 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
120}
121
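// Select a generic COPY. Copies of a non-boolean source into a lane-mask
// (VCC bank) destination are expanded to mask bit 0 and compare against zero.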
122bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
123 const DebugLoc &DL = I.getDebugLoc();
124 MachineBasicBlock *BB = I.getParent();
125 I.setDesc(TII.get(TargetOpcode::COPY));
126
127 const MachineOperand &Src = I.getOperand(1);
128 MachineOperand &Dst = I.getOperand(0);
129 Register DstReg = Dst.getReg();
130 Register SrcReg = Src.getReg();
131
132 if (isVCC(DstReg, *MRI)) {
133 if (SrcReg == AMDGPU::SCC) {
134 const TargetRegisterClass *RC
135 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
136 if (!RC)
137 return true;
138 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
139 }
140
141 if (!isVCC(SrcReg, *MRI)) {
142 // TODO: Should probably leave the copy and let copyPhysReg expand it.
143 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
144 return false;
145
146 const TargetRegisterClass *SrcRC
147 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
148
149 std::optional<ValueAndVReg> ConstVal =
150 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
151 if (ConstVal) {
152 unsigned MovOpc =
153 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
154 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
155 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
156 } else {
157 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
158
159 // We can't trust the high bits at this point, so clear them.
160
161 // TODO: Skip masking high bits if def is known boolean.
162
163 bool IsSGPR = TRI.isSGPRClass(SrcRC);
164 unsigned AndOpc =
165 IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
166 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
167 .addImm(1)
168 .addReg(SrcReg);
169 if (IsSGPR)
170 And.setOperandDead(3); // Dead scc
171
172 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
173 .addImm(0)
174 .addReg(MaskedReg);
175 }
176
177 if (!MRI->getRegClassOrNull(SrcReg))
178 MRI->setRegClass(SrcReg, SrcRC);
179 I.eraseFromParent();
180 return true;
181 }
182
183    const TargetRegisterClass *RC =
184        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
185 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
186 return false;
187
188 return true;
189 }
190
191 for (const MachineOperand &MO : I.operands()) {
192 if (MO.getReg().isPhysical())
193 continue;
194
195    const TargetRegisterClass *RC =
196        TRI.getConstrainedRegClassForOperand(MO, *MRI);
197 if (!RC)
198 continue;
199 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
200 }
201 return true;
202}
203
204bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
205 const Register DefReg = I.getOperand(0).getReg();
206 const LLT DefTy = MRI->getType(DefReg);
207
208 // S1 G_PHIs should not be selected in instruction-select, instead:
209 // - divergent S1 G_PHI should go through lane mask merging algorithm
210 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
211 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
212 if (DefTy == LLT::scalar(1))
213 return false;
214
215 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
216
217 const RegClassOrRegBank &RegClassOrBank =
218 MRI->getRegClassOrRegBank(DefReg);
219
220 const TargetRegisterClass *DefRC
221 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
222 if (!DefRC) {
223 if (!DefTy.isValid()) {
224 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
225 return false;
226 }
227
228 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
229 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
230 if (!DefRC) {
231 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
232 return false;
233 }
234 }
235
236 // TODO: Verify that all registers have the same bank
237 I.setDesc(TII.get(TargetOpcode::PHI));
238 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
239}
240
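// Extract the requested 32-bit half of a 64-bit operand: register operands get
// a subregister COPY into SubRC, immediates are split into low/high 32 bits.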
241MachineOperand
242AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
243 const TargetRegisterClass &SubRC,
244 unsigned SubIdx) const {
245
246 MachineInstr *MI = MO.getParent();
247  MachineBasicBlock *BB = MO.getParent()->getParent();
248  Register DstReg = MRI->createVirtualRegister(&SubRC);
249
250 if (MO.isReg()) {
251 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
252 Register Reg = MO.getReg();
253 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
254 .addReg(Reg, 0, ComposedSubIdx);
255
256 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
257 MO.isKill(), MO.isDead(), MO.isUndef(),
258 MO.isEarlyClobber(), 0, MO.isDebug(),
259 MO.isInternalRead());
260 }
261
262 assert(MO.isImm());
263
264 APInt Imm(64, MO.getImm());
265
266 switch (SubIdx) {
267 default:
268 llvm_unreachable("do not know to split immediate with this sub index.");
269 case AMDGPU::sub0:
270 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
271 case AMDGPU::sub1:
272 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
273 }
274}
275
276static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
277 switch (Opc) {
278 case AMDGPU::G_AND:
279 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
280 case AMDGPU::G_OR:
281 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
282 case AMDGPU::G_XOR:
283 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
284 default:
285 llvm_unreachable("not a bit op");
286 }
287}
288
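// Select scalar G_AND/G_OR/G_XOR to the 32- or 64-bit SALU bit operation and
// attach the dead implicit SCC def.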
289bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
290 Register DstReg = I.getOperand(0).getReg();
291 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
292
293 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
294 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
295 DstRB->getID() != AMDGPU::VCCRegBankID)
296 return false;
297
298 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
299 STI.isWave64());
300 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
301
302 // Dead implicit-def of scc
303 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
304 true, // isImp
305 false, // isKill
306 true)); // isDead
307 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
308}
309
310bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
311 MachineBasicBlock *BB = I.getParent();
312  MachineFunction *MF = BB->getParent();
313  Register DstReg = I.getOperand(0).getReg();
314 const DebugLoc &DL = I.getDebugLoc();
315 LLT Ty = MRI->getType(DstReg);
316 if (Ty.isVector())
317 return false;
318
319 unsigned Size = Ty.getSizeInBits();
320 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
321 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
322 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
323
324 if (Size == 32) {
325 if (IsSALU) {
326 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
327      MachineInstr *Add =
328          BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
329 .add(I.getOperand(1))
330 .add(I.getOperand(2))
331 .setOperandDead(3); // Dead scc
332 I.eraseFromParent();
333 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
334 }
335
336 if (STI.hasAddNoCarry()) {
337 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
338 I.setDesc(TII.get(Opc));
339 I.addOperand(*MF, MachineOperand::CreateImm(0));
340 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
341 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
342 }
343
344 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
345
346 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
347    MachineInstr *Add
348        = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
349 .addDef(UnusedCarry, RegState::Dead)
350 .add(I.getOperand(1))
351 .add(I.getOperand(2))
352 .addImm(0);
353 I.eraseFromParent();
354 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
355 }
356
357 assert(!Sub && "illegal sub should not reach here");
358
359 const TargetRegisterClass &RC
360 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
361 const TargetRegisterClass &HalfRC
362 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
363
364 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
365 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
366 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
367 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
368
369 Register DstLo = MRI->createVirtualRegister(&HalfRC);
370 Register DstHi = MRI->createVirtualRegister(&HalfRC);
371
372 if (IsSALU) {
373 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
374 .add(Lo1)
375 .add(Lo2);
376 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
377 .add(Hi1)
378 .add(Hi2)
379 .setOperandDead(3); // Dead scc
380 } else {
381 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
382 Register CarryReg = MRI->createVirtualRegister(CarryRC);
383 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
384 .addDef(CarryReg)
385 .add(Lo1)
386 .add(Lo2)
387 .addImm(0);
388 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
389 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
390 .add(Hi1)
391 .add(Hi2)
392 .addReg(CarryReg, RegState::Kill)
393 .addImm(0);
394
395 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
396 return false;
397 }
398
399 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
400 .addReg(DstLo)
401 .addImm(AMDGPU::sub0)
402 .addReg(DstHi)
403 .addImm(AMDGPU::sub1);
404
405
406 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
407 return false;
408
409 I.eraseFromParent();
410 return true;
411}
412
413bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
414 MachineInstr &I) const {
415 MachineBasicBlock *BB = I.getParent();
416  MachineFunction *MF = BB->getParent();
417  const DebugLoc &DL = I.getDebugLoc();
418 Register Dst0Reg = I.getOperand(0).getReg();
419 Register Dst1Reg = I.getOperand(1).getReg();
420 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
421 I.getOpcode() == AMDGPU::G_UADDE;
422 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
423 I.getOpcode() == AMDGPU::G_USUBE;
424
425 if (isVCC(Dst1Reg, *MRI)) {
426 unsigned NoCarryOpc =
427 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
428 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
429 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
430 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
431 I.addOperand(*MF, MachineOperand::CreateImm(0));
432 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
433 }
434
435 Register Src0Reg = I.getOperand(2).getReg();
436 Register Src1Reg = I.getOperand(3).getReg();
437
438 if (HasCarryIn) {
439 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
440 .addReg(I.getOperand(4).getReg());
441 }
442
443 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
444 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
445
446 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
447 .add(I.getOperand(2))
448 .add(I.getOperand(3));
449
450 if (MRI->use_nodbg_empty(Dst1Reg)) {
451 CarryInst.setOperandDead(3); // Dead scc
452 } else {
453 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
454 .addReg(AMDGPU::SCC);
455 if (!MRI->getRegClassOrNull(Dst1Reg))
456 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
457 }
458
459 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
460 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
461 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
462 return false;
463
464 if (HasCarryIn &&
465 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
466 AMDGPU::SReg_32RegClass, *MRI))
467 return false;
468
469 I.eraseFromParent();
470 return true;
471}
472
473bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
474 MachineInstr &I) const {
475 MachineBasicBlock *BB = I.getParent();
476  MachineFunction *MF = BB->getParent();
477  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
478
479 unsigned Opc;
480 if (Subtarget->hasMADIntraFwdBug())
481 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
482 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
483 else
484 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
485 I.setDesc(TII.get(Opc));
486 I.addOperand(*MF, MachineOperand::CreateImm(0));
487 I.addImplicitDefUseOperands(*MF);
488 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
489}
490
491// TODO: We should probably legalize these to only using 32-bit results.
492bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
493 MachineBasicBlock *BB = I.getParent();
494 Register DstReg = I.getOperand(0).getReg();
495 Register SrcReg = I.getOperand(1).getReg();
496 LLT DstTy = MRI->getType(DstReg);
497 LLT SrcTy = MRI->getType(SrcReg);
498 const unsigned SrcSize = SrcTy.getSizeInBits();
499 unsigned DstSize = DstTy.getSizeInBits();
500
501 // TODO: Should handle any multiple of 32 offset.
502 unsigned Offset = I.getOperand(2).getImm();
503 if (Offset % 32 != 0 || DstSize > 128)
504 return false;
505
506 // 16-bit operations really use 32-bit registers.
507 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
508 if (DstSize == 16)
509 DstSize = 32;
510
511 const TargetRegisterClass *DstRC =
512 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
513 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
514 return false;
515
516 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
517 const TargetRegisterClass *SrcRC =
518 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
519 if (!SrcRC)
520 return false;
521  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
522                                                         DstSize / 32);
523 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
524 if (!SrcRC)
525 return false;
526
527 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
528 *SrcRC, I.getOperand(1));
529 const DebugLoc &DL = I.getDebugLoc();
530 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
531 .addReg(SrcReg, 0, SubReg);
532
533 I.eraseFromParent();
534 return true;
535}
536
537bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
538 MachineBasicBlock *BB = MI.getParent();
539 Register DstReg = MI.getOperand(0).getReg();
540 LLT DstTy = MRI->getType(DstReg);
541 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
542
543 const unsigned SrcSize = SrcTy.getSizeInBits();
544 if (SrcSize < 32)
545 return selectImpl(MI, *CoverageInfo);
546
547 const DebugLoc &DL = MI.getDebugLoc();
548 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
549 const unsigned DstSize = DstTy.getSizeInBits();
550 const TargetRegisterClass *DstRC =
551 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
552 if (!DstRC)
553 return false;
554
555 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
556  MachineInstrBuilder MIB =
557      BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
558 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
559 MachineOperand &Src = MI.getOperand(I + 1);
560 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
561 MIB.addImm(SubRegs[I]);
562
563 const TargetRegisterClass *SrcRC
564 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
565 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
566 return false;
567 }
568
569 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
570 return false;
571
572 MI.eraseFromParent();
573 return true;
574}
575
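// Lower G_UNMERGE_VALUES of a wide register into one subregister COPY per
// destination.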
576bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
577 MachineBasicBlock *BB = MI.getParent();
578 const int NumDst = MI.getNumOperands() - 1;
579
580 MachineOperand &Src = MI.getOperand(NumDst);
581
582 Register SrcReg = Src.getReg();
583 Register DstReg0 = MI.getOperand(0).getReg();
584 LLT DstTy = MRI->getType(DstReg0);
585 LLT SrcTy = MRI->getType(SrcReg);
586
587 const unsigned DstSize = DstTy.getSizeInBits();
588 const unsigned SrcSize = SrcTy.getSizeInBits();
589 const DebugLoc &DL = MI.getDebugLoc();
590 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
591
592 const TargetRegisterClass *SrcRC =
593 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
594 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
595 return false;
596
597 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
598 // source, and this relies on the fact that the same subregister indices are
599 // used for both.
600 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
601 for (int I = 0, E = NumDst; I != E; ++I) {
602 MachineOperand &Dst = MI.getOperand(I);
603 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
604 .addReg(SrcReg, 0, SubRegs[I]);
605
606 // Make sure the subregister index is valid for the source register.
607 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
608 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
609 return false;
610
611    const TargetRegisterClass *DstRC =
612        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
613 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
614 return false;
615 }
616
617 MI.eraseFromParent();
618 return true;
619}
620
621bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
622 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
623 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
624
625 Register Src0 = MI.getOperand(1).getReg();
626 Register Src1 = MI.getOperand(2).getReg();
627 LLT SrcTy = MRI->getType(Src0);
628 const unsigned SrcSize = SrcTy.getSizeInBits();
629
630 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
631 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
632 return selectG_MERGE_VALUES(MI);
633 }
634
635 // Selection logic below is for V2S16 only.
636 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
637 Register Dst = MI.getOperand(0).getReg();
638 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
639 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
640 SrcTy != LLT::scalar(32)))
641 return selectImpl(MI, *CoverageInfo);
642
643 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
644 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
645 return false;
646
647 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
648 DstBank->getID() == AMDGPU::VGPRRegBankID);
649 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
650
651 const DebugLoc &DL = MI.getDebugLoc();
652 MachineBasicBlock *BB = MI.getParent();
653
654 // First, before trying TableGen patterns, check if both sources are
655 // constants. In those cases, we can trivially compute the final constant
656 // and emit a simple move.
657 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
658 if (ConstSrc1) {
659 auto ConstSrc0 =
660 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
661 if (ConstSrc0) {
662 const int64_t K0 = ConstSrc0->Value.getSExtValue();
663 const int64_t K1 = ConstSrc1->Value.getSExtValue();
664 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
665 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
666 uint32_t Imm = Lo16 | (Hi16 << 16);
667
668 // VALU
669 if (IsVector) {
670 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
671 MI.eraseFromParent();
672 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
673 }
674
675 // SALU
676 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
677 MI.eraseFromParent();
678 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
679 }
680 }
681
682 // Now try TableGen patterns.
683 if (selectImpl(MI, *CoverageInfo))
684 return true;
685
686 // TODO: This should probably be a combine somewhere
687 // (build_vector $src0, undef) -> copy $src0
688 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
689 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
690 MI.setDesc(TII.get(AMDGPU::COPY));
691 MI.removeOperand(2);
692 const auto &RC =
693 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
694 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
695 RBI.constrainGenericRegister(Src0, RC, *MRI);
696 }
697
698 // TODO: Can be improved?
699 if (IsVector) {
700 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
701 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
702 .addImm(0xFFFF)
703 .addReg(Src0);
704 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
705 return false;
706
707 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
708 .addReg(Src1)
709 .addImm(16)
710 .addReg(TmpReg);
711 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
712 return false;
713
714 MI.eraseFromParent();
715 return true;
716 }
717
718 Register ShiftSrc0;
719 Register ShiftSrc1;
720
721 // With multiple uses of the shift, this will duplicate the shift and
722 // increase register pressure.
723 //
724 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
725 // => (S_PACK_HH_B32_B16 $src0, $src1)
726 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
727 // => (S_PACK_HL_B32_B16 $src0, $src1)
728 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
729 // => (S_PACK_LH_B32_B16 $src0, $src1)
730 // (build_vector $src0, $src1)
731 // => (S_PACK_LL_B32_B16 $src0, $src1)
732
733 bool Shift0 = mi_match(
734 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
735
736 bool Shift1 = mi_match(
737 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
738
739 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
740 if (Shift0 && Shift1) {
741 Opc = AMDGPU::S_PACK_HH_B32_B16;
742 MI.getOperand(1).setReg(ShiftSrc0);
743 MI.getOperand(2).setReg(ShiftSrc1);
744 } else if (Shift1) {
745 Opc = AMDGPU::S_PACK_LH_B32_B16;
746 MI.getOperand(2).setReg(ShiftSrc1);
747 } else if (Shift0) {
748 auto ConstSrc1 =
749 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
750 if (ConstSrc1 && ConstSrc1->Value == 0) {
751 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
752 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
753 .addReg(ShiftSrc0)
754 .addImm(16)
755 .setOperandDead(3); // Dead scc
756
757 MI.eraseFromParent();
758 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
759 }
760 if (STI.hasSPackHL()) {
761 Opc = AMDGPU::S_PACK_HL_B32_B16;
762 MI.getOperand(1).setReg(ShiftSrc0);
763 }
764 }
765
766 MI.setDesc(TII.get(Opc));
767 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
768}
769
770bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
771 return selectG_ADD_SUB(I);
772}
773
774bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
775 const MachineOperand &MO = I.getOperand(0);
776
777 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
778 // regbank check here is to know why getConstrainedRegClassForOperand failed.
779  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
780  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
781 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
782 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
783 return true;
784 }
785
786 return false;
787}
788
789bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
790 MachineBasicBlock *BB = I.getParent();
791
792 Register DstReg = I.getOperand(0).getReg();
793 Register Src0Reg = I.getOperand(1).getReg();
794 Register Src1Reg = I.getOperand(2).getReg();
795 LLT Src1Ty = MRI->getType(Src1Reg);
796
797 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
798 unsigned InsSize = Src1Ty.getSizeInBits();
799
800 int64_t Offset = I.getOperand(3).getImm();
801
802 // FIXME: These cases should have been illegal and unnecessary to check here.
803 if (Offset % 32 != 0 || InsSize % 32 != 0)
804 return false;
805
806 // Currently not handled by getSubRegFromChannel.
807 if (InsSize > 128)
808 return false;
809
810 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
811 if (SubReg == AMDGPU::NoSubRegister)
812 return false;
813
814 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
815 const TargetRegisterClass *DstRC =
816 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
817 if (!DstRC)
818 return false;
819
820 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
821 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
822 const TargetRegisterClass *Src0RC =
823 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
824 const TargetRegisterClass *Src1RC =
825 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
826
827 // Deal with weird cases where the class only partially supports the subreg
828 // index.
829 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
830 if (!Src0RC || !Src1RC)
831 return false;
832
833 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
834 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
835 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
836 return false;
837
838 const DebugLoc &DL = I.getDebugLoc();
839 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
840 .addReg(Src0Reg)
841 .addReg(Src1Reg)
842 .addImm(SubReg);
843
844 I.eraseFromParent();
845 return true;
846}
847
848bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
849 Register DstReg = MI.getOperand(0).getReg();
850 Register SrcReg = MI.getOperand(1).getReg();
851 Register OffsetReg = MI.getOperand(2).getReg();
852 Register WidthReg = MI.getOperand(3).getReg();
853
854 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
855 "scalar BFX instructions are expanded in regbankselect");
856 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
857 "64-bit vector BFX instructions are expanded in regbankselect");
858
859 const DebugLoc &DL = MI.getDebugLoc();
860 MachineBasicBlock *MBB = MI.getParent();
861
862 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
863 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
864 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
865 .addReg(SrcReg)
866 .addReg(OffsetReg)
867 .addReg(WidthReg);
868 MI.eraseFromParent();
869 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
870}
871
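// Manually select llvm.amdgcn.interp.p1.f16 on subtargets with a 16-bank LDS;
// the expansion needs two instructions that both read M0.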
872bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
873 if (STI.getLDSBankCount() != 16)
874 return selectImpl(MI, *CoverageInfo);
875
876 Register Dst = MI.getOperand(0).getReg();
877 Register Src0 = MI.getOperand(2).getReg();
878 Register M0Val = MI.getOperand(6).getReg();
879 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
880 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
881 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
882 return false;
883
884 // This requires 2 instructions. It is possible to write a pattern to support
885 // this, but the generated isel emitter doesn't correctly deal with multiple
886 // output instructions using the same physical register input. The copy to m0
887 // is incorrectly placed before the second instruction.
888 //
889 // TODO: Match source modifiers.
890
891 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
892 const DebugLoc &DL = MI.getDebugLoc();
893 MachineBasicBlock *MBB = MI.getParent();
894
895 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
896 .addReg(M0Val);
897 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
898 .addImm(2)
899 .addImm(MI.getOperand(4).getImm()) // $attr
900 .addImm(MI.getOperand(3).getImm()); // $attrchan
901
902 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
903 .addImm(0) // $src0_modifiers
904 .addReg(Src0) // $src0
905 .addImm(MI.getOperand(4).getImm()) // $attr
906 .addImm(MI.getOperand(3).getImm()) // $attrchan
907 .addImm(0) // $src2_modifiers
908 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
909 .addImm(MI.getOperand(5).getImm()) // $high
910 .addImm(0) // $clamp
911 .addImm(0); // $omod
912
913 MI.eraseFromParent();
914 return true;
915}
916
917// Writelane is special in that it can use SGPR and M0 (which would normally
918// count as using the constant bus twice - but in this case it is allowed since
919// the lane selector doesn't count as a use of the constant bus). However, it is
920// still required to abide by the 1 SGPR rule. Fix this up if we might have
921// multiple SGPRs.
922bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
923 // With a constant bus limit of at least 2, there's no issue.
924 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
925 return selectImpl(MI, *CoverageInfo);
926
927 MachineBasicBlock *MBB = MI.getParent();
928 const DebugLoc &DL = MI.getDebugLoc();
929 Register VDst = MI.getOperand(0).getReg();
930 Register Val = MI.getOperand(2).getReg();
931 Register LaneSelect = MI.getOperand(3).getReg();
932 Register VDstIn = MI.getOperand(4).getReg();
933
934 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
935
936 std::optional<ValueAndVReg> ConstSelect =
937 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
938 if (ConstSelect) {
939 // The selector has to be an inline immediate, so we can use whatever for
940 // the other operands.
941 MIB.addReg(Val);
942 MIB.addImm(ConstSelect->Value.getSExtValue() &
943 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
944 } else {
945    std::optional<ValueAndVReg> ConstVal =
946        getIConstantVRegValWithLookThrough(Val, *MRI);
947
948 // If the value written is an inline immediate, we can get away without a
949 // copy to m0.
950 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
951 STI.hasInv2PiInlineImm())) {
952 MIB.addImm(ConstVal->Value.getSExtValue());
953 MIB.addReg(LaneSelect);
954 } else {
955 MIB.addReg(Val);
956
957 // If the lane selector was originally in a VGPR and copied with
958 // readfirstlane, there's a hazard to read the same SGPR from the
959 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
960 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
961
962 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
963 .addReg(LaneSelect);
964 MIB.addReg(AMDGPU::M0);
965 }
966 }
967
968 MIB.addReg(VDstIn);
969
970 MI.eraseFromParent();
971 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
972}
973
974// We need to handle this here because tablegen doesn't support matching
975// instructions with multiple outputs.
976bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
977 Register Dst0 = MI.getOperand(0).getReg();
978 Register Dst1 = MI.getOperand(1).getReg();
979
980 LLT Ty = MRI->getType(Dst0);
981 unsigned Opc;
982 if (Ty == LLT::scalar(32))
983 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
984 else if (Ty == LLT::scalar(64))
985 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
986 else
987 return false;
988
989 // TODO: Match source modifiers.
990
991 const DebugLoc &DL = MI.getDebugLoc();
992 MachineBasicBlock *MBB = MI.getParent();
993
994 Register Numer = MI.getOperand(3).getReg();
995 Register Denom = MI.getOperand(4).getReg();
996 unsigned ChooseDenom = MI.getOperand(5).getImm();
997
998 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
999
1000 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1001 .addDef(Dst1)
1002 .addImm(0) // $src0_modifiers
1003 .addUse(Src0) // $src0
1004 .addImm(0) // $src1_modifiers
1005 .addUse(Denom) // $src1
1006 .addImm(0) // $src2_modifiers
1007 .addUse(Numer) // $src2
1008 .addImm(0) // $clamp
1009 .addImm(0); // $omod
1010
1011 MI.eraseFromParent();
1012 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1013}
1014
1015bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1016 unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1017 switch (IntrinsicID) {
1018 case Intrinsic::amdgcn_if_break: {
1019 MachineBasicBlock *BB = I.getParent();
1020
1021 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1022 // SelectionDAG uses for wave32 vs wave64.
1023 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1024 .add(I.getOperand(0))
1025 .add(I.getOperand(2))
1026 .add(I.getOperand(3));
1027
1028 Register DstReg = I.getOperand(0).getReg();
1029 Register Src0Reg = I.getOperand(2).getReg();
1030 Register Src1Reg = I.getOperand(3).getReg();
1031
1032 I.eraseFromParent();
1033
1034 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1035 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1036
1037 return true;
1038 }
1039 case Intrinsic::amdgcn_interp_p1_f16:
1040 return selectInterpP1F16(I);
1041 case Intrinsic::amdgcn_wqm:
1042 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1043 case Intrinsic::amdgcn_softwqm:
1044 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1045 case Intrinsic::amdgcn_strict_wwm:
1046 case Intrinsic::amdgcn_wwm:
1047 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1048 case Intrinsic::amdgcn_strict_wqm:
1049 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1050 case Intrinsic::amdgcn_writelane:
1051 return selectWritelane(I);
1052 case Intrinsic::amdgcn_div_scale:
1053 return selectDivScale(I);
1054 case Intrinsic::amdgcn_icmp:
1055 case Intrinsic::amdgcn_fcmp:
1056 if (selectImpl(I, *CoverageInfo))
1057 return true;
1058 return selectIntrinsicCmp(I);
1059 case Intrinsic::amdgcn_ballot:
1060 return selectBallot(I);
1061 case Intrinsic::amdgcn_inverse_ballot:
1062 return selectInverseBallot(I);
1063 case Intrinsic::amdgcn_reloc_constant:
1064 return selectRelocConstant(I);
1065 case Intrinsic::amdgcn_groupstaticsize:
1066 return selectGroupStaticSize(I);
1067 case Intrinsic::returnaddress:
1068 return selectReturnAddress(I);
1069 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1070 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1071 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1072 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1073 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1074 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1075 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1076 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1077 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1078 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1079 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1080 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1081 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1082 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1083 return selectSMFMACIntrin(I);
1084 default:
1085 return selectImpl(I, *CoverageInfo);
1086 }
1087}
1088
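// Return the VALU (VOPC _e64) compare opcode for the given predicate and
// operand size, or -1 if the subtarget has no suitable instruction.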
1089static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1090                          const GCNSubtarget &ST) {
1091 if (Size != 16 && Size != 32 && Size != 64)
1092 return -1;
1093
1094 if (Size == 16 && !ST.has16BitInsts())
1095 return -1;
1096
1097 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
1098 unsigned S64Opc) {
1099 if (Size == 16)
1100 return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
1101 if (Size == 32)
1102 return S32Opc;
1103 return S64Opc;
1104 };
1105
1106 switch (P) {
1107 default:
1108 llvm_unreachable("Unknown condition code!");
1109 case CmpInst::ICMP_NE:
1110 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1111 AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
1112 case CmpInst::ICMP_EQ:
1113 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1114 AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
1115 case CmpInst::ICMP_SGT:
1116 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1117 AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
1118 case CmpInst::ICMP_SGE:
1119 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1120 AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
1121 case CmpInst::ICMP_SLT:
1122 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1123 AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
1124 case CmpInst::ICMP_SLE:
1125 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1126 AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
1127 case CmpInst::ICMP_UGT:
1128 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1129 AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
1130 case CmpInst::ICMP_UGE:
1131 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1132 AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
1133 case CmpInst::ICMP_ULT:
1134 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1135 AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
1136 case CmpInst::ICMP_ULE:
1137 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1138 AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);
1139
1140 case CmpInst::FCMP_OEQ:
1141 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1142 AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
1143 case CmpInst::FCMP_OGT:
1144 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1145 AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
1146 case CmpInst::FCMP_OGE:
1147 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1148 AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
1149 case CmpInst::FCMP_OLT:
1150 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1151 AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
1152 case CmpInst::FCMP_OLE:
1153 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1154 AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
1155 case CmpInst::FCMP_ONE:
1156 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1157 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1158 case CmpInst::FCMP_ORD:
1159 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1160 AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
1161 case CmpInst::FCMP_UNO:
1162 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1163 AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
1164 case CmpInst::FCMP_UEQ:
1165 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1166 AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
1167 case CmpInst::FCMP_UGT:
1168 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1169 AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
1170 case CmpInst::FCMP_UGE:
1171 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1172 AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
1173 case CmpInst::FCMP_ULT:
1174 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1175 AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
1176 case CmpInst::FCMP_ULE:
1177 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1178 AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
1179 case CmpInst::FCMP_UNE:
1180 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1181 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1182 case CmpInst::FCMP_TRUE:
1183 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1184 AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
1185  case CmpInst::FCMP_FALSE:
1186    return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1187 AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
1188 }
1189}
1190
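// Return the SALU compare opcode for the given predicate and operand size, or
// -1 if there is no scalar form on this subtarget.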
1191int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1192 unsigned Size) const {
1193 if (Size == 64) {
1194 if (!STI.hasScalarCompareEq64())
1195 return -1;
1196
1197 switch (P) {
1198 case CmpInst::ICMP_NE:
1199 return AMDGPU::S_CMP_LG_U64;
1200 case CmpInst::ICMP_EQ:
1201 return AMDGPU::S_CMP_EQ_U64;
1202 default:
1203 return -1;
1204 }
1205 }
1206
1207 if (Size == 32) {
1208 switch (P) {
1209 case CmpInst::ICMP_NE:
1210 return AMDGPU::S_CMP_LG_U32;
1211 case CmpInst::ICMP_EQ:
1212 return AMDGPU::S_CMP_EQ_U32;
1213 case CmpInst::ICMP_SGT:
1214 return AMDGPU::S_CMP_GT_I32;
1215 case CmpInst::ICMP_SGE:
1216 return AMDGPU::S_CMP_GE_I32;
1217 case CmpInst::ICMP_SLT:
1218 return AMDGPU::S_CMP_LT_I32;
1219 case CmpInst::ICMP_SLE:
1220 return AMDGPU::S_CMP_LE_I32;
1221 case CmpInst::ICMP_UGT:
1222 return AMDGPU::S_CMP_GT_U32;
1223 case CmpInst::ICMP_UGE:
1224 return AMDGPU::S_CMP_GE_U32;
1225 case CmpInst::ICMP_ULT:
1226 return AMDGPU::S_CMP_LT_U32;
1227 case CmpInst::ICMP_ULE:
1228 return AMDGPU::S_CMP_LE_U32;
1229 case CmpInst::FCMP_OEQ:
1230 return AMDGPU::S_CMP_EQ_F32;
1231 case CmpInst::FCMP_OGT:
1232 return AMDGPU::S_CMP_GT_F32;
1233 case CmpInst::FCMP_OGE:
1234 return AMDGPU::S_CMP_GE_F32;
1235 case CmpInst::FCMP_OLT:
1236 return AMDGPU::S_CMP_LT_F32;
1237 case CmpInst::FCMP_OLE:
1238 return AMDGPU::S_CMP_LE_F32;
1239 case CmpInst::FCMP_ONE:
1240 return AMDGPU::S_CMP_LG_F32;
1241 case CmpInst::FCMP_ORD:
1242 return AMDGPU::S_CMP_O_F32;
1243 case CmpInst::FCMP_UNO:
1244 return AMDGPU::S_CMP_U_F32;
1245 case CmpInst::FCMP_UEQ:
1246 return AMDGPU::S_CMP_NLG_F32;
1247 case CmpInst::FCMP_UGT:
1248 return AMDGPU::S_CMP_NLE_F32;
1249 case CmpInst::FCMP_UGE:
1250 return AMDGPU::S_CMP_NLT_F32;
1251 case CmpInst::FCMP_ULT:
1252 return AMDGPU::S_CMP_NGE_F32;
1253 case CmpInst::FCMP_ULE:
1254 return AMDGPU::S_CMP_NGT_F32;
1255 case CmpInst::FCMP_UNE:
1256 return AMDGPU::S_CMP_NEQ_F32;
1257 default:
1258 llvm_unreachable("Unknown condition code!");
1259 }
1260 }
1261
1262 if (Size == 16) {
1263 if (!STI.hasSALUFloatInsts())
1264 return -1;
1265
1266 switch (P) {
1267 case CmpInst::FCMP_OEQ:
1268 return AMDGPU::S_CMP_EQ_F16;
1269 case CmpInst::FCMP_OGT:
1270 return AMDGPU::S_CMP_GT_F16;
1271 case CmpInst::FCMP_OGE:
1272 return AMDGPU::S_CMP_GE_F16;
1273 case CmpInst::FCMP_OLT:
1274 return AMDGPU::S_CMP_LT_F16;
1275 case CmpInst::FCMP_OLE:
1276 return AMDGPU::S_CMP_LE_F16;
1277 case CmpInst::FCMP_ONE:
1278 return AMDGPU::S_CMP_LG_F16;
1279 case CmpInst::FCMP_ORD:
1280 return AMDGPU::S_CMP_O_F16;
1281 case CmpInst::FCMP_UNO:
1282 return AMDGPU::S_CMP_U_F16;
1283 case CmpInst::FCMP_UEQ:
1284 return AMDGPU::S_CMP_NLG_F16;
1285 case CmpInst::FCMP_UGT:
1286 return AMDGPU::S_CMP_NLE_F16;
1287 case CmpInst::FCMP_UGE:
1288 return AMDGPU::S_CMP_NLT_F16;
1289 case CmpInst::FCMP_ULT:
1290 return AMDGPU::S_CMP_NGE_F16;
1291 case CmpInst::FCMP_ULE:
1292 return AMDGPU::S_CMP_NGT_F16;
1293 case CmpInst::FCMP_UNE:
1294 return AMDGPU::S_CMP_NEQ_F16;
1295 default:
1296 llvm_unreachable("Unknown condition code!");
1297 }
1298 }
1299
1300 return -1;
1301}
1302
1303bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1304
1305 MachineBasicBlock *BB = I.getParent();
1306 const DebugLoc &DL = I.getDebugLoc();
1307
1308 Register SrcReg = I.getOperand(2).getReg();
1309 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1310
1311 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1312
1313 Register CCReg = I.getOperand(0).getReg();
1314 if (!isVCC(CCReg, *MRI)) {
1315 int Opcode = getS_CMPOpcode(Pred, Size);
1316 if (Opcode == -1)
1317 return false;
1318 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1319 .add(I.getOperand(2))
1320 .add(I.getOperand(3));
1321 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1322 .addReg(AMDGPU::SCC);
1323 bool Ret =
1324 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1325 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1326 I.eraseFromParent();
1327 return Ret;
1328 }
1329
1330 if (I.getOpcode() == AMDGPU::G_FCMP)
1331 return false;
1332
1333 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1334 if (Opcode == -1)
1335 return false;
1336
1337 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1338 I.getOperand(0).getReg())
1339 .add(I.getOperand(2))
1340 .add(I.getOperand(3));
1341  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1342                               *TRI.getBoolRC(), *MRI);
1343 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1344 I.eraseFromParent();
1345 return Ret;
1346}
1347
1348bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1349 Register Dst = I.getOperand(0).getReg();
1350 if (isVCC(Dst, *MRI))
1351 return false;
1352
1353 LLT DstTy = MRI->getType(Dst);
1354 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1355 return false;
1356
1357 MachineBasicBlock *BB = I.getParent();
1358 const DebugLoc &DL = I.getDebugLoc();
1359 Register SrcReg = I.getOperand(2).getReg();
1360 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1361
1362 // i1 inputs are not supported in GlobalISel.
1363 if (Size == 1)
1364 return false;
1365
1366 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1367 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1368 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1369 I.eraseFromParent();
1370 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1371 }
1372
1373 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1374 if (Opcode == -1)
1375 return false;
1376
1377 MachineInstrBuilder SelectedMI;
1378 MachineOperand &LHS = I.getOperand(2);
1379 MachineOperand &RHS = I.getOperand(3);
1380 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
1381 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
1382 Register Src0Reg =
1383 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1384 Register Src1Reg =
1385 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1386 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1387 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1388 SelectedMI.addImm(Src0Mods);
1389 SelectedMI.addReg(Src0Reg);
1390 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1391 SelectedMI.addImm(Src1Mods);
1392 SelectedMI.addReg(Src1Reg);
1393 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1394 SelectedMI.addImm(0); // clamp
1395 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1396 SelectedMI.addImm(0); // op_sel
1397
1398 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1399 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1400 return false;
1401
1402 I.eraseFromParent();
1403 return true;
1404}
1405
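// Lower llvm.amdgcn.ballot: a constant 0 argument becomes an immediate zero
// move, a constant -1 becomes a copy of EXEC, and anything else copies the
// source mask. In wave32 mode an i64 result is zero-extended via REG_SEQUENCE.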
1406bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1407 MachineBasicBlock *BB = I.getParent();
1408 const DebugLoc &DL = I.getDebugLoc();
1409 Register DstReg = I.getOperand(0).getReg();
1410 const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1411 const bool Is64 = Size == 64;
1412 const bool IsWave32 = (STI.getWavefrontSize() == 32);
1413
1414 // In the common case, the return type matches the wave size.
1415 // However we also support emitting i64 ballots in wave32 mode.
1416 if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1417 return false;
1418
1419 std::optional<ValueAndVReg> Arg =
1420 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1421
1422 const auto BuildCopy = [&](Register SrcReg) {
1423 if (Size == STI.getWavefrontSize()) {
1424 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1425 .addReg(SrcReg);
1426 return;
1427 }
1428
1429 // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
1430 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1431 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1432 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1433 .addReg(SrcReg)
1434 .addImm(AMDGPU::sub0)
1435 .addReg(HiReg)
1436 .addImm(AMDGPU::sub1);
1437 };
1438
1439 if (Arg) {
1440 const int64_t Value = Arg->Value.getSExtValue();
1441 if (Value == 0) {
1442 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1443 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1444 } else if (Value == -1) // all ones
1445 BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1446 else
1447 return false;
1448 } else
1449 BuildCopy(I.getOperand(2).getReg());
1450
1451 I.eraseFromParent();
1452 return true;
1453}
1454
1455bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
1456 MachineBasicBlock *BB = I.getParent();
1457 const DebugLoc &DL = I.getDebugLoc();
1458 const Register DstReg = I.getOperand(0).getReg();
1459 const Register MaskReg = I.getOperand(2).getReg();
1460
1461 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg);
1462 I.eraseFromParent();
1463 return true;
1464}
1465
1466bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1467 Register DstReg = I.getOperand(0).getReg();
1468 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1469 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1470 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1471 return false;
1472
1473 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1474
1474
1475  Module *M = MF->getFunction().getParent();
1476  const MDNode *Metadata = I.getOperand(2).getMetadata();
1477 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1478 auto RelocSymbol = cast<GlobalVariable>(
1479 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1480
1481 MachineBasicBlock *BB = I.getParent();
1482 BuildMI(*BB, &I, I.getDebugLoc(),
1483 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1484      .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1485
1486 I.eraseFromParent();
1487 return true;
1488}
1489
1490bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1491  Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1492
1493 Register DstReg = I.getOperand(0).getReg();
1494 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1495 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1496 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1497
1498 MachineBasicBlock *MBB = I.getParent();
1499 const DebugLoc &DL = I.getDebugLoc();
1500
1501 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1502
1503  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1504    const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1505    MIB.addImm(MFI->getLDSSize());
1506  } else {
1507    Module *M = MF->getFunction().getParent();
1508    const GlobalValue *GV
1509      = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1510    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1511  }
1512
1513 I.eraseFromParent();
1514 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1515}
1516
1517bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1518 MachineBasicBlock *MBB = I.getParent();
1519  MachineFunction &MF = *MBB->getParent();
1520  const DebugLoc &DL = I.getDebugLoc();
1521
1522 MachineOperand &Dst = I.getOperand(0);
1523 Register DstReg = Dst.getReg();
1524 unsigned Depth = I.getOperand(2).getImm();
1525
1526 const TargetRegisterClass *RC
1527 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1528 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1529 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1530 return false;
1531
1532 // Check for kernel and shader functions
1533 if (Depth != 0 ||
1534      MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1535    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1536 .addImm(0);
1537 I.eraseFromParent();
1538 return true;
1539 }
1540
1541  MachineFrameInfo &MFI = MF.getFrameInfo();
1542  // There is a call to @llvm.returnaddress in this function
1543 MFI.setReturnAddressIsTaken(true);
1544
1545 // Get the return address reg and mark it as an implicit live-in
1546 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1547 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1548 AMDGPU::SReg_64RegClass, DL);
1549 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1550 .addReg(LiveIn);
1551 I.eraseFromParent();
1552 return true;
1553}
1554
1555bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1556 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1557 // SelectionDAG uses for wave32 vs wave64.
1558 MachineBasicBlock *BB = MI.getParent();
1559 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1560 .add(MI.getOperand(1));
1561
1562 Register Reg = MI.getOperand(1).getReg();
1563 MI.eraseFromParent();
1564
1565 if (!MRI->getRegClassOrNull(Reg))
1566 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1567 return true;
1568}
1569
1570bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1571 MachineInstr &MI, Intrinsic::ID IntrID) const {
1572 MachineBasicBlock *MBB = MI.getParent();
1573  MachineFunction *MF = MBB->getParent();
1574  const DebugLoc &DL = MI.getDebugLoc();
1575
1576 unsigned IndexOperand = MI.getOperand(7).getImm();
1577 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1578 bool WaveDone = MI.getOperand(9).getImm() != 0;
1579
1580 if (WaveDone && !WaveRelease)
1581 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1582
1583 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1584 IndexOperand &= ~0x3f;
1585 unsigned CountDw = 0;
1586
1587  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1588    CountDw = (IndexOperand >> 24) & 0xf;
1589 IndexOperand &= ~(0xf << 24);
1590
1591 if (CountDw < 1 || CountDw > 4) {
1592      report_fatal_error(
1593          "ds_ordered_count: dword count must be between 1 and 4");
1594 }
1595 }
1596
1597 if (IndexOperand)
1598 report_fatal_error("ds_ordered_count: bad index operand");
1599
1600 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1601 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1602
1603 unsigned Offset0 = OrderedCountIndex << 2;
1604 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1605
1606  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1607    Offset1 |= (CountDw - 1) << 6;
1608
1609  if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1610    Offset1 |= ShaderType << 2;
1611
1612 unsigned Offset = Offset0 | (Offset1 << 8);
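  // The packed DS offset: bits [7:2] hold the ordered-count index, bit 8 is
  // wave_release, bit 9 is wave_done, and the remaining Offset1 fields
  // (shader type, instruction, dword count) occupy the higher bits.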
1613
1614 Register M0Val = MI.getOperand(2).getReg();
1615 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1616 .addReg(M0Val);
1617
1618 Register DstReg = MI.getOperand(0).getReg();
1619 Register ValReg = MI.getOperand(3).getReg();
1620  MachineInstrBuilder DS =
1621      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1622 .addReg(ValReg)
1623 .addImm(Offset)
1624 .cloneMemRefs(MI);
1625
1626 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1627 return false;
1628
1629 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1630 MI.eraseFromParent();
1631 return Ret;
1632}
1633
1634static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1635 switch (IntrID) {
1636 case Intrinsic::amdgcn_ds_gws_init:
1637 return AMDGPU::DS_GWS_INIT;
1638 case Intrinsic::amdgcn_ds_gws_barrier:
1639 return AMDGPU::DS_GWS_BARRIER;
1640 case Intrinsic::amdgcn_ds_gws_sema_v:
1641 return AMDGPU::DS_GWS_SEMA_V;
1642 case Intrinsic::amdgcn_ds_gws_sema_br:
1643 return AMDGPU::DS_GWS_SEMA_BR;
1644 case Intrinsic::amdgcn_ds_gws_sema_p:
1645 return AMDGPU::DS_GWS_SEMA_P;
1646 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1647 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1648 default:
1649 llvm_unreachable("not a gws intrinsic");
1650 }
1651}
1652
1653bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1654 Intrinsic::ID IID) const {
1655 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1656 !STI.hasGWSSemaReleaseAll()))
1657 return false;
1658
1659 // intrinsic ID, vsrc, offset
1660 const bool HasVSrc = MI.getNumOperands() == 3;
1661 assert(HasVSrc || MI.getNumOperands() == 2);
1662
1663 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1664 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1665 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1666 return false;
1667
1668 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1669 unsigned ImmOffset;
1670
1671 MachineBasicBlock *MBB = MI.getParent();
1672 const DebugLoc &DL = MI.getDebugLoc();
1673
1674 MachineInstr *Readfirstlane = nullptr;
1675
1676 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1677 // incoming offset, in case there's an add of a constant. We'll have to put it
1678 // back later.
1679 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1680 Readfirstlane = OffsetDef;
1681 BaseOffset = OffsetDef->getOperand(1).getReg();
1682 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1683 }
1684
1685 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1686 // If we have a constant offset, try to use the 0 in m0 as the base.
1687 // TODO: Look into changing the default m0 initialization value. If the
1688 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1689 // the immediate offset.
1690
1691 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1692 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1693 .addImm(0);
1694 } else {
1695 std::tie(BaseOffset, ImmOffset) =
1696 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1697
1698 if (Readfirstlane) {
1699 // We have the constant offset now, so put the readfirstlane back on the
1700 // variable component.
1701 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1702 return false;
1703
1704 Readfirstlane->getOperand(1).setReg(BaseOffset);
1705 BaseOffset = Readfirstlane->getOperand(0).getReg();
1706 } else {
1707 if (!RBI.constrainGenericRegister(BaseOffset,
1708 AMDGPU::SReg_32RegClass, *MRI))
1709 return false;
1710 }
1711
1712 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1713 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1714 .addReg(BaseOffset)
1715 .addImm(16)
1716 .setOperandDead(3); // Dead scc
1717
1718 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1719 .addReg(M0Base);
1720 }
1721
1722 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1723 // offset field) % 64. Some versions of the programming guide omit the m0
1724 // part, or claim it's from offset 0.
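// For illustration: the variable part of the offset computed above is shifted
// left by 16 so its low bits land in M0[21:16], while any constant part peeled
// off by getBaseWithConstantOffset travels in the instruction's immediate
// offset field (M0 is simply zeroed when the whole offset is a constant).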
1725 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1726
1727 if (HasVSrc) {
1728 Register VSrc = MI.getOperand(1).getReg();
1729 MIB.addReg(VSrc);
1730
1731 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1732 return false;
1733 }
1734
1735 MIB.addImm(ImmOffset)
1736 .cloneMemRefs(MI);
1737
1738 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1739
1740 MI.eraseFromParent();
1741 return true;
1742}
1743
1744bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1745 bool IsAppend) const {
1746 Register PtrBase = MI.getOperand(2).getReg();
1747 LLT PtrTy = MRI->getType(PtrBase);
1748 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1749
1750 unsigned Offset;
1751 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1752
1753 // TODO: Should this try to look through readfirstlane like GWS?
1754 if (!isDSOffsetLegal(PtrBase, Offset)) {
1755 PtrBase = MI.getOperand(2).getReg();
1756 Offset = 0;
1757 }
1758
1759 MachineBasicBlock *MBB = MI.getParent();
1760 const DebugLoc &DL = MI.getDebugLoc();
1761 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1762
1763 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1764 .addReg(PtrBase);
1765 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1766 return false;
1767
1768 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1769 .addImm(Offset)
1770 .addImm(IsGDS ? -1 : 0)
1771 .cloneMemRefs(MI);
1772 MI.eraseFromParent();
1773 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1774}
1775
1776bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1777 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1778 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1779 if (WGSize <= STI.getWavefrontSize()) {
1780 MachineBasicBlock *MBB = MI.getParent();
1781 const DebugLoc &DL = MI.getDebugLoc();
1782 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1783 MI.eraseFromParent();
1784 return true;
1785 }
1786 }
1787
1788 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1789 if (STI.hasSplitBarriers()) {
1790 MachineBasicBlock *MBB = MI.getParent();
1791 const DebugLoc &DL = MI.getDebugLoc();
1792 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1793 .addImm(AMDGPU::Barrier::WORKGROUP);
1794 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1795 .addImm(AMDGPU::Barrier::WORKGROUP);
1796 MI.eraseFromParent();
1797 return true;
1798 }
1799
1800 return selectImpl(MI, *CoverageInfo);
1801}
1802
1803static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1804 bool &IsTexFail) {
1805 if (TexFailCtrl)
1806 IsTexFail = true;
1807
1808 TFE = (TexFailCtrl & 0x1) ? true : false;
1809 TexFailCtrl &= ~(uint64_t)0x1;
1810 LWE = (TexFailCtrl & 0x2) ? true : false;
1811 TexFailCtrl &= ~(uint64_t)0x2;
1812
1813 return TexFailCtrl == 0;
1814}
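// For illustration: a TexFailCtrl value of 1 requests TFE only, 2 requests LWE
// only, and 3 requests both; any other set bit makes parseTexFail() return
// false, and the caller then rejects the intrinsic.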
1815
1816bool AMDGPUInstructionSelector::selectImageIntrinsic(
1817 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1818 MachineBasicBlock *MBB = MI.getParent();
1819 const DebugLoc &DL = MI.getDebugLoc();
1820
1821 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1822 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1823
1824 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1825 unsigned IntrOpcode = Intr->BaseOpcode;
1826 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1827 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1828 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1829
1830 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1831
1832 Register VDataIn, VDataOut;
1833 LLT VDataTy;
1834 int NumVDataDwords = -1;
1835 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1836 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1837
1838 bool Unorm;
1839 if (!BaseOpcode->Sampler)
1840 Unorm = true;
1841 else
1842 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1843
1844 bool TFE;
1845 bool LWE;
1846 bool IsTexFail = false;
1847 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1848 TFE, LWE, IsTexFail))
1849 return false;
1850
1851 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1852 const bool IsA16 = (Flags & 1) != 0;
1853 const bool IsG16 = (Flags & 2) != 0;
1854
1855 // A16 implies 16 bit gradients if subtarget doesn't support G16
1856 if (IsA16 && !STI.hasG16() && !IsG16)
1857 return false;
1858
1859 unsigned DMask = 0;
1860 unsigned DMaskLanes = 0;
1861
1862 if (BaseOpcode->Atomic) {
1863 VDataOut = MI.getOperand(0).getReg();
1864 VDataIn = MI.getOperand(2).getReg();
1865 LLT Ty = MRI->getType(VDataIn);
1866
1867 // Be careful to allow atomic swap on 16-bit element vectors.
1868 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1869 Ty.getSizeInBits() == 128 :
1870 Ty.getSizeInBits() == 64;
1871
1872 if (BaseOpcode->AtomicX2) {
1873 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1874
1875 DMask = Is64Bit ? 0xf : 0x3;
1876 NumVDataDwords = Is64Bit ? 4 : 2;
1877 } else {
1878 DMask = Is64Bit ? 0x3 : 0x1;
1879 NumVDataDwords = Is64Bit ? 2 : 1;
1880 }
1881 } else {
1882 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1883 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1884
1885 if (BaseOpcode->Store) {
1886 VDataIn = MI.getOperand(1).getReg();
1887 VDataTy = MRI->getType(VDataIn);
1888 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1889 } else {
1890 VDataOut = MI.getOperand(0).getReg();
1891 VDataTy = MRI->getType(VDataOut);
1892 NumVDataDwords = DMaskLanes;
1893
1894 if (IsD16 && !STI.hasUnpackedD16VMem())
1895 NumVDataDwords = (DMaskLanes + 1) / 2;
1896 }
1897 }
1898
1899 // Set G16 opcode
1900 if (Subtarget->hasG16() && IsG16) {
1901 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1902 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1903 assert(G16MappingInfo);
1904 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1905 }
1906
1907 // TODO: Check this in verifier.
1908 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1909
1910 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1911 if (BaseOpcode->Atomic)
1912 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1913 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
1914 AMDGPU::CPol::VOLATILE))
1915 return false;
1916
1917 int NumVAddrRegs = 0;
1918 int NumVAddrDwords = 0;
1919 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1920 // Skip the $noregs and 0s inserted during legalization.
1921 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1922 if (!AddrOp.isReg())
1923 continue; // XXX - Break?
1924
1925 Register Addr = AddrOp.getReg();
1926 if (!Addr)
1927 break;
1928
1929 ++NumVAddrRegs;
1930 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1931 }
1932
1933 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1934 // NSA, these should have been packed into a single value in the first
1935 // address register
1936 const bool UseNSA =
1937 NumVAddrRegs != 1 &&
1938 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
1939 : NumVAddrDwords == NumVAddrRegs);
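// For illustration: three separate 32-bit address registers (3 regs, 3 dwords)
// select an NSA encoding, while a single packed 96-bit register (1 reg,
// 3 dwords) selects the non-NSA form; with partial NSA support a mixed case
// such as 2 registers totalling 3 dwords still qualifies.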
1940 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1941 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1942 return false;
1943 }
1944
1945 if (IsTexFail)
1946 ++NumVDataDwords;
1947
1948 int Opcode = -1;
1949 if (IsGFX12Plus) {
1950 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
1951 NumVDataDwords, NumVAddrDwords);
1952 } else if (IsGFX11Plus) {
1953 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1954 UseNSA ? AMDGPU::MIMGEncGfx11NSA
1955 : AMDGPU::MIMGEncGfx11Default,
1956 NumVDataDwords, NumVAddrDwords);
1957 } else if (IsGFX10Plus) {
1958 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1959 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1960 : AMDGPU::MIMGEncGfx10Default,
1961 NumVDataDwords, NumVAddrDwords);
1962 } else {
1963 if (Subtarget->hasGFX90AInsts()) {
1964 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
1965 NumVDataDwords, NumVAddrDwords);
1966 if (Opcode == -1) {
1967 LLVM_DEBUG(
1968 dbgs()
1969 << "requested image instruction is not supported on this GPU\n");
1970 return false;
1971 }
1972 }
1973 if (Opcode == -1 &&
1974 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1975 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1976 NumVDataDwords, NumVAddrDwords);
1977 if (Opcode == -1)
1978 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1979 NumVDataDwords, NumVAddrDwords);
1980 }
1981 if (Opcode == -1)
1982 return false;
1983
1984 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1985 .cloneMemRefs(MI);
1986
1987 if (VDataOut) {
1988 if (BaseOpcode->AtomicX2) {
1989 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1990
1991 Register TmpReg = MRI->createVirtualRegister(
1992 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1993 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1994
1995 MIB.addDef(TmpReg);
1996 if (!MRI->use_empty(VDataOut)) {
1997 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1998 .addReg(TmpReg, RegState::Kill, SubReg);
1999 }
2000
2001 } else {
2002 MIB.addDef(VDataOut); // vdata output
2003 }
2004 }
2005
2006 if (VDataIn)
2007 MIB.addReg(VDataIn); // vdata input
2008
2009 for (int I = 0; I != NumVAddrRegs; ++I) {
2010 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2011 if (SrcOp.isReg()) {
2012 assert(SrcOp.getReg() != 0);
2013 MIB.addReg(SrcOp.getReg());
2014 }
2015 }
2016
2017 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2018 if (BaseOpcode->Sampler)
2019 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2020
2021 MIB.addImm(DMask); // dmask
2022
2023 if (IsGFX10Plus)
2024 MIB.addImm(DimInfo->Encoding);
2025 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2026 MIB.addImm(Unorm);
2027
2028 MIB.addImm(CPol);
2029 MIB.addImm(IsA16 && // a16 or r128
2030 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2031 if (IsGFX10Plus)
2032 MIB.addImm(IsA16 ? -1 : 0);
2033
2034 if (!Subtarget->hasGFX90AInsts()) {
2035 MIB.addImm(TFE); // tfe
2036 } else if (TFE) {
2037 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2038 return false;
2039 }
2040
2041 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2042 MIB.addImm(LWE); // lwe
2043 if (!IsGFX10Plus)
2044 MIB.addImm(DimInfo->DA ? -1 : 0);
2045 if (BaseOpcode->HasD16)
2046 MIB.addImm(IsD16 ? -1 : 0);
2047
2048 MI.eraseFromParent();
2049 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2050 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2051 return true;
2052}
2053
2054// We need to handle this here because tablegen doesn't support matching
2055// instructions with multiple outputs.
2056bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2057 MachineInstr &MI) const {
2058 Register Dst0 = MI.getOperand(0).getReg();
2059 Register Dst1 = MI.getOperand(1).getReg();
2060
2061 const DebugLoc &DL = MI.getDebugLoc();
2062 MachineBasicBlock *MBB = MI.getParent();
2063
2064 Register Addr = MI.getOperand(3).getReg();
2065 Register Data0 = MI.getOperand(4).getReg();
2066 Register Data1 = MI.getOperand(5).getReg();
2067 unsigned Offset = MI.getOperand(6).getImm();
2068
2069 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2070 .addDef(Dst1)
2071 .addUse(Addr)
2072 .addUse(Data0)
2073 .addUse(Data1)
2074 .addImm(Offset)
2075 .cloneMemRefs(MI);
2076
2077 MI.eraseFromParent();
2078 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2079}
2080
2081bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2082 MachineInstr &I) const {
2083 unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2084 switch (IntrinsicID) {
2085 case Intrinsic::amdgcn_end_cf:
2086 return selectEndCfIntrinsic(I);
2087 case Intrinsic::amdgcn_ds_ordered_add:
2088 case Intrinsic::amdgcn_ds_ordered_swap:
2089 return selectDSOrderedIntrinsic(I, IntrinsicID);
2090 case Intrinsic::amdgcn_ds_gws_init:
2091 case Intrinsic::amdgcn_ds_gws_barrier:
2092 case Intrinsic::amdgcn_ds_gws_sema_v:
2093 case Intrinsic::amdgcn_ds_gws_sema_br:
2094 case Intrinsic::amdgcn_ds_gws_sema_p:
2095 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2096 return selectDSGWSIntrinsic(I, IntrinsicID);
2097 case Intrinsic::amdgcn_ds_append:
2098 return selectDSAppendConsume(I, true);
2099 case Intrinsic::amdgcn_ds_consume:
2100 return selectDSAppendConsume(I, false);
2101 case Intrinsic::amdgcn_s_barrier:
2102 return selectSBarrier(I);
2103 case Intrinsic::amdgcn_raw_buffer_load_lds:
2104 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2105 case Intrinsic::amdgcn_struct_buffer_load_lds:
2106 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2107 return selectBufferLoadLds(I);
2108 case Intrinsic::amdgcn_global_load_lds:
2109 return selectGlobalLoadLds(I);
2110 case Intrinsic::amdgcn_exp_compr:
2111 if (!STI.hasCompressedExport()) {
2112 Function &F = I.getMF()->getFunction();
2114 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2115 F.getContext().diagnose(NoFpRet);
2116 return false;
2117 }
2118 break;
2119 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2120 return selectDSBvhStackIntrinsic(I);
2121 case Intrinsic::amdgcn_s_barrier_init:
2122 case Intrinsic::amdgcn_s_barrier_join:
2123 case Intrinsic::amdgcn_s_wakeup_barrier:
2124 case Intrinsic::amdgcn_s_get_barrier_state:
2125 return selectNamedBarrierInst(I, IntrinsicID);
2126 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2127 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
2128 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2129 case Intrinsic::amdgcn_s_barrier_leave:
2130 return selectSBarrierLeave(I);
2131 }
2132 return selectImpl(I, *CoverageInfo);
2133}
2134
2135bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2136 if (selectImpl(I, *CoverageInfo))
2137 return true;
2138
2139 MachineBasicBlock *BB = I.getParent();
2140 const DebugLoc &DL = I.getDebugLoc();
2141
2142 Register DstReg = I.getOperand(0).getReg();
2143 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2144 assert(Size <= 32 || Size == 64);
2145 const MachineOperand &CCOp = I.getOperand(1);
2146 Register CCReg = CCOp.getReg();
2147 if (!isVCC(CCReg, *MRI)) {
2148 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2149 AMDGPU::S_CSELECT_B32;
2150 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2151 .addReg(CCReg);
2152
2153 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2154 // bank, because it does not cover the register class used to represent the scc
2155 // bank, so we need to set the register class manually here.
2156 if (!MRI->getRegClassOrNull(CCReg))
2157 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2158 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2159 .add(I.getOperand(2))
2160 .add(I.getOperand(3));
2161
2162 bool Ret = false;
2163 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2164 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2165 I.eraseFromParent();
2166 return Ret;
2167 }
2168
2169 // Wide VGPR select should have been split in RegBankSelect.
2170 if (Size > 32)
2171 return false;
2172
2173 MachineInstr *Select =
2174 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2175 .addImm(0)
2176 .add(I.getOperand(3))
2177 .addImm(0)
2178 .add(I.getOperand(2))
2179 .add(I.getOperand(1));
2180
2181 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2182 I.eraseFromParent();
2183 return Ret;
2184}
2185
2186static int sizeToSubRegIndex(unsigned Size) {
2187 switch (Size) {
2188 case 32:
2189 return AMDGPU::sub0;
2190 case 64:
2191 return AMDGPU::sub0_sub1;
2192 case 96:
2193 return AMDGPU::sub0_sub1_sub2;
2194 case 128:
2195 return AMDGPU::sub0_sub1_sub2_sub3;
2196 case 256:
2197 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2198 default:
2199 if (Size < 32)
2200 return AMDGPU::sub0;
2201 if (Size > 256)
2202 return -1;
2203 return sizeToSubRegIndex(llvm::bit_ceil(Size));
2204 }
2205}
2206
2207bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2208 Register DstReg = I.getOperand(0).getReg();
2209 Register SrcReg = I.getOperand(1).getReg();
2210 const LLT DstTy = MRI->getType(DstReg);
2211 const LLT SrcTy = MRI->getType(SrcReg);
2212 const LLT S1 = LLT::scalar(1);
2213
2214 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2215 const RegisterBank *DstRB;
2216 if (DstTy == S1) {
2217 // This is a special case. We don't treat s1 for legalization artifacts as
2218 // vcc booleans.
2219 DstRB = SrcRB;
2220 } else {
2221 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2222 if (SrcRB != DstRB)
2223 return false;
2224 }
2225
2226 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2227
2228 unsigned DstSize = DstTy.getSizeInBits();
2229 unsigned SrcSize = SrcTy.getSizeInBits();
2230
2231 const TargetRegisterClass *SrcRC =
2232 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2233 const TargetRegisterClass *DstRC =
2234 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2235 if (!SrcRC || !DstRC)
2236 return false;
2237
2238 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2239 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2240 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2241 return false;
2242 }
2243
2244 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2245 MachineBasicBlock *MBB = I.getParent();
2246 const DebugLoc &DL = I.getDebugLoc();
2247
2248 Register LoReg = MRI->createVirtualRegister(DstRC);
2249 Register HiReg = MRI->createVirtualRegister(DstRC);
2250 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2251 .addReg(SrcReg, 0, AMDGPU::sub0);
2252 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2253 .addReg(SrcReg, 0, AMDGPU::sub1);
2254
2255 if (IsVALU && STI.hasSDWA()) {
2256 // Write the low 16-bits of the high element into the high 16-bits of the
2257 // low element.
2258 MachineInstr *MovSDWA =
2259 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2260 .addImm(0) // $src0_modifiers
2261 .addReg(HiReg) // $src0
2262 .addImm(0) // $clamp
2263 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2264 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2265 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2266 .addReg(LoReg, RegState::Implicit);
2267 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2268 } else {
2269 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2270 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2271 Register ImmReg = MRI->createVirtualRegister(DstRC);
2272 if (IsVALU) {
2273 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2274 .addImm(16)
2275 .addReg(HiReg);
2276 } else {
2277 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2278 .addReg(HiReg)
2279 .addImm(16)
2280 .setOperandDead(3); // Dead scc
2281 }
2282
2283 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2284 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2285 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2286
2287 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2288 .addImm(0xffff);
2289 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2290 .addReg(LoReg)
2291 .addReg(ImmReg);
2292 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2293 .addReg(TmpReg0)
2294 .addReg(TmpReg1);
2295
2296 if (!IsVALU) {
2297 And.setOperandDead(3); // Dead scc
2298 Or.setOperandDead(3); // Dead scc
2299 }
2300 }
2301
2302 I.eraseFromParent();
2303 return true;
2304 }
2305
2306 if (!DstTy.isScalar())
2307 return false;
2308
2309 if (SrcSize > 32) {
2310 int SubRegIdx = sizeToSubRegIndex(DstSize);
2311 if (SubRegIdx == -1)
2312 return false;
2313
2314 // Deal with weird cases where the class only partially supports the subreg
2315 // index.
2316 const TargetRegisterClass *SrcWithSubRC
2317 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2318 if (!SrcWithSubRC)
2319 return false;
2320
2321 if (SrcWithSubRC != SrcRC) {
2322 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2323 return false;
2324 }
2325
2326 I.getOperand(1).setSubReg(SubRegIdx);
2327 }
2328
2329 I.setDesc(TII.get(TargetOpcode::COPY));
2330 return true;
2331}
2332
2333/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2334static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2335 Mask = maskTrailingOnes<unsigned>(Size);
2336 int SignedMask = static_cast<int>(Mask);
2337 return SignedMask >= -16 && SignedMask <= 64;
2338}
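// For illustration: Size = 6 yields a mask of 0x3f (63, an inline constant),
// Size = 16 yields 0xffff (65535, which would need a literal), and Size = 32
// yields 0xffffffff, which reads back as -1 and is inline again.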
2339
2340// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2341const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2342 Register Reg, const MachineRegisterInfo &MRI,
2343 const TargetRegisterInfo &TRI) const {
2344 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2345 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2346 return RB;
2347
2348 // Ignore the type, since we don't use vcc in artifacts.
2349 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2350 return &RBI.getRegBankFromRegClass(*RC, LLT());
2351 return nullptr;
2352}
2353
2354bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2355 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2356 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2357 const DebugLoc &DL = I.getDebugLoc();
2358 MachineBasicBlock &MBB = *I.getParent();
2359 const Register DstReg = I.getOperand(0).getReg();
2360 const Register SrcReg = I.getOperand(1).getReg();
2361
2362 const LLT DstTy = MRI->getType(DstReg);
2363 const LLT SrcTy = MRI->getType(SrcReg);
2364 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2365 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2366 const unsigned DstSize = DstTy.getSizeInBits();
2367 if (!DstTy.isScalar())
2368 return false;
2369
2370 // Artifact casts should never use vcc.
2371 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2372
2373 // FIXME: This should probably be illegal and split earlier.
2374 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2375 if (DstSize <= 32)
2376 return selectCOPY(I);
2377
2378 const TargetRegisterClass *SrcRC =
2379 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2380 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2381 const TargetRegisterClass *DstRC =
2382 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2383
2384 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2385 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2386 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2387 .addReg(SrcReg)
2388 .addImm(AMDGPU::sub0)
2389 .addReg(UndefReg)
2390 .addImm(AMDGPU::sub1);
2391 I.eraseFromParent();
2392
2393 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2394 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2395 }
2396
2397 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2398 // 64-bit should have been split up in RegBankSelect
2399
2400 // Try to use an and with a mask if it will save code size.
2401 unsigned Mask;
2402 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2403 MachineInstr *ExtI =
2404 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2405 .addImm(Mask)
2406 .addReg(SrcReg);
2407 I.eraseFromParent();
2408 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2409 }
2410
2411 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2412 MachineInstr *ExtI =
2413 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2414 .addReg(SrcReg)
2415 .addImm(0) // Offset
2416 .addImm(SrcSize); // Width
2417 I.eraseFromParent();
2418 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2419 }
2420
2421 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2422 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2423 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2424 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2425 return false;
2426
2427 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2428 const unsigned SextOpc = SrcSize == 8 ?
2429 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2430 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2431 .addReg(SrcReg);
2432 I.eraseFromParent();
2433 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2434 }
2435
2436 // Using a single 32-bit SALU to calculate the high half is smaller than
2437 // S_BFE with a literal constant operand.
2438 if (DstSize > 32 && SrcSize == 32) {
2439 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2440 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2441 if (Signed) {
2442 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2443 .addReg(SrcReg, 0, SubReg)
2444 .addImm(31)
2445 .setOperandDead(3); // Dead scc
2446 } else {
2447 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2448 .addImm(0);
2449 }
2450 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2451 .addReg(SrcReg, 0, SubReg)
2452 .addImm(AMDGPU::sub0)
2453 .addReg(HiReg)
2454 .addImm(AMDGPU::sub1);
2455 I.eraseFromParent();
2456 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2457 *MRI);
2458 }
2459
2460 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2461 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2462
2463 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
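// For example, SrcSize = 8 produces the immediate 8 << 16 = 0x80000,
// i.e. offset 0 and width 8.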
2464 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2465 // We need a 64-bit register source, but the high bits don't matter.
2466 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2467 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2468 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2469
2470 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2471 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2472 .addReg(SrcReg, 0, SubReg)
2473 .addImm(AMDGPU::sub0)
2474 .addReg(UndefReg)
2475 .addImm(AMDGPU::sub1);
2476
2477 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2478 .addReg(ExtReg)
2479 .addImm(SrcSize << 16);
2480
2481 I.eraseFromParent();
2482 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2483 }
2484
2485 unsigned Mask;
2486 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2487 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2488 .addReg(SrcReg)
2489 .addImm(Mask)
2490 .setOperandDead(3); // Dead scc
2491 } else {
2492 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2493 .addReg(SrcReg)
2494 .addImm(SrcSize << 16);
2495 }
2496
2497 I.eraseFromParent();
2498 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2499 }
2500
2501 return false;
2502}
2503
2504 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2505 Register &Out) {
2506 Register LShlSrc;
2507 if (mi_match(In, MRI,
2508 m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2509 Out = LShlSrc;
2510 return true;
2511 }
2512 return false;
2513}
2514
2515bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2516 if (!Subtarget->hasSALUFloatInsts())
2517 return false;
2518
2519 Register Dst = I.getOperand(0).getReg();
2520 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2521 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2522 return false;
2523
2524 Register Src = I.getOperand(1).getReg();
2525
2526 if (MRI->getType(Dst) == LLT::scalar(32) &&
2527 MRI->getType(Src) == LLT::scalar(16)) {
2528 if (isExtractHiElt(*MRI, Src, Src)) {
2529 MachineBasicBlock *BB = I.getParent();
2530 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2531 .addUse(Src);
2532 I.eraseFromParent();
2533 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2534 }
2535 }
2536
2537 return false;
2538}
2539
2540bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2541 MachineBasicBlock *BB = I.getParent();
2542 MachineOperand &ImmOp = I.getOperand(1);
2543 Register DstReg = I.getOperand(0).getReg();
2544 unsigned Size = MRI->getType(DstReg).getSizeInBits();
2545 bool IsFP = false;
2546
2547 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2548 if (ImmOp.isFPImm()) {
2549 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2550 ImmOp.ChangeToImmediate(Imm.getZExtValue());
2551 IsFP = true;
2552 } else if (ImmOp.isCImm()) {
2553 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2554 } else {
2555 llvm_unreachable("Not supported by g_constants");
2556 }
2557
2558 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2559 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2560
2561 unsigned Opcode;
2562 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2563 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2564 } else if (Size == 64 &&
2565 AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
2566 Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
2567 I.setDesc(TII.get(Opcode));
2568 I.addImplicitDefUseOperands(*MF);
2569 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2570 } else {
2571 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2572
2573 // We should never produce s1 values on banks other than VCC. If the user of
2574 // this already constrained the register, we may incorrectly think it's VCC
2575 // if it wasn't originally.
2576 if (Size == 1)
2577 return false;
2578 }
2579
2580 if (Size != 64) {
2581 I.setDesc(TII.get(Opcode));
2582 I.addImplicitDefUseOperands(*MF);
2583 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2584 }
2585
2586 const DebugLoc &DL = I.getDebugLoc();
2587
2588 APInt Imm(Size, I.getOperand(1).getImm());
2589
2590 MachineInstr *ResInst;
2591 if (IsSgpr && TII.isInlineConstant(Imm)) {
2592 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2593 .addImm(I.getOperand(1).getImm());
2594 } else {
2595 const TargetRegisterClass *RC = IsSgpr ?
2596 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2597 Register LoReg = MRI->createVirtualRegister(RC);
2598 Register HiReg = MRI->createVirtualRegister(RC);
2599
2600 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2601 .addImm(Imm.trunc(32).getZExtValue());
2602
2603 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2604 .addImm(Imm.ashr(32).getZExtValue());
2605
2606 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2607 .addReg(LoReg)
2608 .addImm(AMDGPU::sub0)
2609 .addReg(HiReg)
2610 .addImm(AMDGPU::sub1);
2611 }
2612
2613 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2614 // work for target independent opcodes
2615 I.eraseFromParent();
2616 const TargetRegisterClass *DstRC =
2617 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2618 if (!DstRC)
2619 return true;
2620 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2621}
2622
2623bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2624 // Only manually handle the f64 SGPR case.
2625 //
2626 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2627 // the bit ops theoretically have a second result due to the implicit def of
2628 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2629 // that is easy by disabling the check. The result works, but uses a
2630 // nonsensical sreg32orlds_and_sreg_1 regclass.
2631 //
2632 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2633 // the variadic REG_SEQUENCE operands.
2634
2635 Register Dst = MI.getOperand(0).getReg();
2636 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2637 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2638 MRI->getType(Dst) != LLT::scalar(64))
2639 return false;
2640
2641 Register Src = MI.getOperand(1).getReg();
2642 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2643 if (Fabs)
2644 Src = Fabs->getOperand(1).getReg();
2645
2646 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2647 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2648 return false;
2649
2650 MachineBasicBlock *BB = MI.getParent();
2651 const DebugLoc &DL = MI.getDebugLoc();
2652 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2653 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2654 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2655 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2656
2657 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2658 .addReg(Src, 0, AMDGPU::sub0);
2659 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2660 .addReg(Src, 0, AMDGPU::sub1);
2661 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2662 .addImm(0x80000000);
2663
2664 // Set or toggle sign bit.
2665 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2666 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2667 .addReg(HiReg)
2668 .addReg(ConstReg)
2669 .setOperandDead(3); // Dead scc
2670 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2671 .addReg(LoReg)
2672 .addImm(AMDGPU::sub0)
2673 .addReg(OpReg)
2674 .addImm(AMDGPU::sub1);
2675 MI.eraseFromParent();
2676 return true;
2677}
2678
2679// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2680bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2681 Register Dst = MI.getOperand(0).getReg();
2682 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2683 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2684 MRI->getType(Dst) != LLT::scalar(64))
2685 return false;
2686
2687 Register Src = MI.getOperand(1).getReg();
2688 MachineBasicBlock *BB = MI.getParent();
2689 const DebugLoc &DL = MI.getDebugLoc();
2690 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2691 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2692 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2693 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2694
2695 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2696 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2697 return false;
2698
2699 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2700 .addReg(Src, 0, AMDGPU::sub0);
2701 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2702 .addReg(Src, 0, AMDGPU::sub1);
2703 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2704 .addImm(0x7fffffff);
2705
2706 // Clear sign bit.
2707 // TODO: Should this use S_BITSET0_*?
2708 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2709 .addReg(HiReg)
2710 .addReg(ConstReg)
2711 .setOperandDead(3); // Dead scc
2712 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2713 .addReg(LoReg)
2714 .addImm(AMDGPU::sub0)
2715 .addReg(OpReg)
2716 .addImm(AMDGPU::sub1);
2717
2718 MI.eraseFromParent();
2719 return true;
2720}
2721
2722static bool isConstant(const MachineInstr &MI) {
2723 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2724}
2725
2726void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2727 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2728
2729 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2730 const MachineInstr *PtrMI =
2731 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2732
2733 assert(PtrMI);
2734
2735 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2736 return;
2737
2738 GEPInfo GEPInfo;
2739
2740 for (unsigned i = 1; i != 3; ++i) {
2741 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2742 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2743 assert(OpDef);
2744 if (i == 2 && isConstant(*OpDef)) {
2745 // TODO: Could handle constant base + variable offset, but a combine
2746 // probably should have commuted it.
2747 assert(GEPInfo.Imm == 0);
2748 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2749 continue;
2750 }
2751 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2752 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2753 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2754 else
2755 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2756 }
2757
2758 AddrInfo.push_back(GEPInfo);
2759 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2760}
2761
2762bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2763 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2764}
2765
2766bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2767 if (!MI.hasOneMemOperand())
2768 return false;
2769
2770 const MachineMemOperand *MMO = *MI.memoperands_begin();
2771 const Value *Ptr = MMO->getValue();
2772
2773 // UndefValue means this is a load of a kernel input. These are uniform.
2774 // Sometimes LDS instructions have constant pointers.
2775 // If Ptr is null, then that means this mem operand contains a
2776 // PseudoSourceValue like GOT.
2777 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2778 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2779 return true;
2780
2781 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2782 return true;
2783
2784 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2785 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2786 AMDGPU::SGPRRegBankID;
2787
2788 const Instruction *I = dyn_cast<Instruction>(Ptr);
2789 return I && I->getMetadata("amdgpu.uniform");
2790}
2791
2792bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2793 for (const GEPInfo &GEPInfo : AddrInfo) {
2794 if (!GEPInfo.VgprParts.empty())
2795 return true;
2796 }
2797 return false;
2798}
2799
2800void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2801 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2802 unsigned AS = PtrTy.getAddressSpace();
2803 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2804 STI.ldsRequiresM0Init()) {
2805 MachineBasicBlock *BB = I.getParent();
2806
2807 // If DS instructions require M0 initialization, insert it before selecting.
2808 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2809 .addImm(-1);
2810 }
2811}
2812
2813bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2814 MachineInstr &I) const {
2815 initM0(I);
2816 return selectImpl(I, *CoverageInfo);
2817}
2818
2819 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2820 if (Reg.isPhysical())
2821 return false;
2822
2823 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2824 const unsigned Opcode = MI.getOpcode();
2825
2826 if (Opcode == AMDGPU::COPY)
2827 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2828
2829 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2830 Opcode == AMDGPU::G_XOR)
2831 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2832 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2833
2834 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2835 return GI->is(Intrinsic::amdgcn_class);
2836
2837 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2838}
2839
2840bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2841 MachineBasicBlock *BB = I.getParent();
2842 MachineOperand &CondOp = I.getOperand(0);
2843 Register CondReg = CondOp.getReg();
2844 const DebugLoc &DL = I.getDebugLoc();
2845
2846 unsigned BrOpcode;
2847 Register CondPhysReg;
2848 const TargetRegisterClass *ConstrainRC;
2849
2850 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2851 // whether the branch is uniform when selecting the instruction. In
2852 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2853 // RegBankSelect knows what it's doing if the branch condition is scc, even
2854 // though it currently does not.
2855 if (!isVCC(CondReg, *MRI)) {
2856 if (MRI->getType(CondReg) != LLT::scalar(32))
2857 return false;
2858
2859 CondPhysReg = AMDGPU::SCC;
2860 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2861 ConstrainRC = &AMDGPU::SReg_32RegClass;
2862 } else {
2863 // FIXME: Should scc->vcc copies be ANDed with exec?
2864
2865 // Unless the value of CondReg is a result of a V_CMP* instruction then we
2866 // need to insert an and with exec.
2867 if (!isVCmpResult(CondReg, *MRI)) {
2868 const bool Is64 = STI.isWave64();
2869 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2870 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2871
2872 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2873 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2874 .addReg(CondReg)
2875 .addReg(Exec)
2876 .setOperandDead(3); // Dead scc
2877 CondReg = TmpReg;
2878 }
2879
2880 CondPhysReg = TRI.getVCC();
2881 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2882 ConstrainRC = TRI.getBoolRC();
2883 }
2884
2885 if (!MRI->getRegClassOrNull(CondReg))
2886 MRI->setRegClass(CondReg, ConstrainRC);
2887
2888 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2889 .addReg(CondReg);
2890 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2891 .addMBB(I.getOperand(1).getMBB());
2892
2893 I.eraseFromParent();
2894 return true;
2895}
2896
2897bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2898 MachineInstr &I) const {
2899 Register DstReg = I.getOperand(0).getReg();
2900 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2901 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2902 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2903 if (IsVGPR)
2904 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2905
2906 return RBI.constrainGenericRegister(
2907 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2908}
2909
2910bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2911 Register DstReg = I.getOperand(0).getReg();
2912 Register SrcReg = I.getOperand(1).getReg();
2913 Register MaskReg = I.getOperand(2).getReg();
2914 LLT Ty = MRI->getType(DstReg);
2915 LLT MaskTy = MRI->getType(MaskReg);
2916 MachineBasicBlock *BB = I.getParent();
2917 const DebugLoc &DL = I.getDebugLoc();
2918
2919 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2920 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2921 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2922 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2923 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2924 return false;
2925
2926 // Try to avoid emitting a bit operation when we only need to touch half of
2927 // the 64-bit pointer.
2928 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2929 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2930 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2931
2932 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2933 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
2934
2935 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2936 !CanCopyLow32 && !CanCopyHi32) {
2937 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2938 .addReg(SrcReg)
2939 .addReg(MaskReg)
2940 .setOperandDead(3); // Dead scc
2941 I.eraseFromParent();
2942 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2943 }
2944
2945 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2946 const TargetRegisterClass &RegRC
2947 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2948
2949 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2950 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2951 const TargetRegisterClass *MaskRC =
2952 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2953
2954 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2955 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2956 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2957 return false;
2958
2959 if (Ty.getSizeInBits() == 32) {
2960 assert(MaskTy.getSizeInBits() == 32 &&
2961 "ptrmask should have been narrowed during legalize");
2962
2963 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2964 .addReg(SrcReg)
2965 .addReg(MaskReg);
2966
2967 if (!IsVGPR)
2968 NewOp.setOperandDead(3); // Dead scc
2969 I.eraseFromParent();
2970 return true;
2971 }
2972
2973 Register HiReg = MRI->createVirtualRegister(&RegRC);
2974 Register LoReg = MRI->createVirtualRegister(&RegRC);
2975
2976 // Extract the subregisters from the source pointer.
2977 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2978 .addReg(SrcReg, 0, AMDGPU::sub0);
2979 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2980 .addReg(SrcReg, 0, AMDGPU::sub1);
2981
2982 Register MaskedLo, MaskedHi;
2983
2984 if (CanCopyLow32) {
2985 // If all the bits in the low half are 1, we only need a copy for it.
2986 MaskedLo = LoReg;
2987 } else {
2988 // Extract the mask subregister and apply the and.
2989 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2990 MaskedLo = MRI->createVirtualRegister(&RegRC);
2991
2992 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2993 .addReg(MaskReg, 0, AMDGPU::sub0);
2994 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2995 .addReg(LoReg)
2996 .addReg(MaskLo);
2997 }
2998
2999 if (CanCopyHi32) {
3000 // If all the bits in the high half are 1, we only need a copy for it.
3001 MaskedHi = HiReg;
3002 } else {
3003 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3004 MaskedHi = MRI->createVirtualRegister(&RegRC);
3005
3006 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3007 .addReg(MaskReg, 0, AMDGPU::sub1);
3008 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3009 .addReg(HiReg)
3010 .addReg(MaskHi);
3011 }
3012
3013 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3014 .addReg(MaskedLo)
3015 .addImm(AMDGPU::sub0)
3016 .addReg(MaskedHi)
3017 .addImm(AMDGPU::sub1);
3018 I.eraseFromParent();
3019 return true;
3020}
3021
3022/// Return the register to use for the index value, and the subregister to use
3023/// for the indirectly accessed register.
3024static std::pair<Register, unsigned>
3025 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3026 const TargetRegisterClass *SuperRC, Register IdxReg,
3027 unsigned EltSize, GISelKnownBits &KnownBits) {
3028 Register IdxBaseReg;
3029 int Offset;
3030
3031 std::tie(IdxBaseReg, Offset) =
3032 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3033 if (IdxBaseReg == AMDGPU::NoRegister) {
3034 // This will happen if the index is a known constant. This should ordinarily
3035 // be legalized out, but handle it as a register just in case.
3036 assert(Offset == 0);
3037 IdxBaseReg = IdxReg;
3038 }
3039
3040 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3041
3042 // Skip out of bounds offsets, or else we would end up using an undefined
3043 // register.
3044 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3045 return std::pair(IdxReg, SubRegs[0]);
3046 return std::pair(IdxBaseReg, SubRegs[Offset]);
3047}
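// For illustration (hypothetical %base): indexing 32-bit elements
// (EltSize = 4) of a 128-bit super-register with IdxReg = %base + 2 returns
// {%base, sub2}: the constant part becomes the sub-register index and the
// variable part stays in the index register.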
3048
3049bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3050 MachineInstr &MI) const {
3051 Register DstReg = MI.getOperand(0).getReg();
3052 Register SrcReg = MI.getOperand(1).getReg();
3053 Register IdxReg = MI.getOperand(2).getReg();
3054
3055 LLT DstTy = MRI->getType(DstReg);
3056 LLT SrcTy = MRI->getType(SrcReg);
3057
3058 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3059 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3060 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3061
3062 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3063 // into a waterfall loop.
3064 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3065 return false;
3066
3067 const TargetRegisterClass *SrcRC =
3068 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3069 const TargetRegisterClass *DstRC =
3070 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3071 if (!SrcRC || !DstRC)
3072 return false;
3073 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3074 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3075 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3076 return false;
3077
3078 MachineBasicBlock *BB = MI.getParent();
3079 const DebugLoc &DL = MI.getDebugLoc();
3080 const bool Is64 = DstTy.getSizeInBits() == 64;
3081
3082 unsigned SubReg;
3083 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3084 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3085
3086 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3087 if (DstTy.getSizeInBits() != 32 && !Is64)
3088 return false;
3089
3090 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3091 .addReg(IdxReg);
3092
3093 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3094 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3095 .addReg(SrcReg, 0, SubReg)
3096 .addReg(SrcReg, RegState::Implicit);
3097 MI.eraseFromParent();
3098 return true;
3099 }
3100
3101 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3102 return false;
3103
3104 if (!STI.useVGPRIndexMode()) {
3105 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3106 .addReg(IdxReg);
3107 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3108 .addReg(SrcReg, 0, SubReg)
3109 .addReg(SrcReg, RegState::Implicit);
3110 MI.eraseFromParent();
3111 return true;
3112 }
3113
3114 const MCInstrDesc &GPRIDXDesc =
3115 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3116 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3117 .addReg(SrcReg)
3118 .addReg(IdxReg)
3119 .addImm(SubReg);
3120
3121 MI.eraseFromParent();
3122 return true;
3123}
3124
3125// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3126bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3127 MachineInstr &MI) const {
3128 Register DstReg = MI.getOperand(0).getReg();
3129 Register VecReg = MI.getOperand(1).getReg();
3130 Register ValReg = MI.getOperand(2).getReg();
3131 Register IdxReg = MI.getOperand(3).getReg();
3132
3133 LLT VecTy = MRI->getType(DstReg);
3134 LLT ValTy = MRI->getType(ValReg);
3135 unsigned VecSize = VecTy.getSizeInBits();
3136 unsigned ValSize = ValTy.getSizeInBits();
3137
3138 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3139 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3140 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3141
3142 assert(VecTy.getElementType() == ValTy);
3143
3144 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3145 // into a waterfall loop.
3146 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3147 return false;
3148
3149 const TargetRegisterClass *VecRC =
3150 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3151 const TargetRegisterClass *ValRC =
3152 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3153
3154 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3155 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3156 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3157 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3158 return false;
3159
3160 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3161 return false;
3162
3163 unsigned SubReg;
3164 std::tie(IdxReg, SubReg) =
3165 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3166
3167 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3168 STI.useVGPRIndexMode();
3169
3170 MachineBasicBlock *BB = MI.getParent();
3171 const DebugLoc &DL = MI.getDebugLoc();
3172
3173 if (!IndexMode) {
3174 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3175 .addReg(IdxReg);
3176
3177 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3178 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3179 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3180 .addReg(VecReg)
3181 .addReg(ValReg)
3182 .addImm(SubReg);
3183 MI.eraseFromParent();
3184 return true;
3185 }
3186
3187 const MCInstrDesc &GPRIDXDesc =
3188 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3189 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3190 .addReg(VecReg)
3191 .addReg(ValReg)
3192 .addReg(IdxReg)
3193 .addImm(SubReg);
3194
3195 MI.eraseFromParent();
3196 return true;
3197}
3198
3199bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3201 unsigned Opc;
3202 unsigned Size = MI.getOperand(3).getImm();
3203
3204 // The struct intrinsic variants add one additional operand over raw.
3205 const bool HasVIndex = MI.getNumOperands() == 9;
3206 Register VIndex;
3207 int OpOffset = 0;
3208 if (HasVIndex) {
3209 VIndex = MI.getOperand(4).getReg();
3210 OpOffset = 1;
3211 }
3212
3213 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3214 std::optional<ValueAndVReg> MaybeVOffset =
3215 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3216 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3217
3218 switch (Size) {
3219 default:
3220 return false;
3221 case 1:
3222 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3223 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3224 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3225 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3226 break;
3227 case 2:
3228 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3229 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3230 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3231 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3232 break;
3233 case 4:
3234 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3235 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3236 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3237 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3238 break;
3239 }
3240
3241 MachineBasicBlock *MBB = MI.getParent();
3242 const DebugLoc &DL = MI.getDebugLoc();
3243 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3244 .add(MI.getOperand(2));
3245
3246 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3247
3248 if (HasVIndex && HasVOffset) {
3249 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3250 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3251 .addReg(VIndex)
3252 .addImm(AMDGPU::sub0)
3253 .addReg(VOffset)
3254 .addImm(AMDGPU::sub1);
3255
3256 MIB.addReg(IdxReg);
3257 } else if (HasVIndex) {
3258 MIB.addReg(VIndex);
3259 } else if (HasVOffset) {
3260 MIB.addReg(VOffset);
3261 }
3262
3263 MIB.add(MI.getOperand(1)); // rsrc
3264 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3265 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3266 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3267 MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3268 MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3269
3270 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3271 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3272 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3273 MachinePointerInfo StorePtrI = LoadPtrI;
3274 StorePtrI.V = nullptr;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3276
3277 auto F = LoadMMO->getFlags() &
      ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3279 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3280 Size, LoadMMO->getBaseAlign());
3281
3282 MachineMemOperand *StoreMMO =
      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3284 sizeof(int32_t), LoadMMO->getBaseAlign());
3285
3286 MIB.setMemRefs({LoadMMO, StoreMMO});
3287
3288 MI.eraseFromParent();
3289 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3290}
3291
3292/// Match a zero extend from a 32-bit value to 64-bits.
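/// For example (illustrative MIR; virtual register names are arbitrary), both
///   %r:_(s64) = G_ZEXT %x:_(s32)
/// and the already-legalized
///   %r:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32)  ; %zero = G_CONSTANT i32 0
/// return %x; anything else returns an invalid Register.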
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3294 Register ZExtSrc;
3295 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3296 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3297
3298 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3299 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3300 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3301 return Register();
3302
3303 assert(Def->getNumOperands() == 3 &&
3304 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3305 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3306 return Def->getOperand(1).getReg();
3307 }
3308
3309 return Register();
3310}
3311
3312bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3313 unsigned Opc;
3314 unsigned Size = MI.getOperand(3).getImm();
3315
3316 switch (Size) {
3317 default:
3318 return false;
3319 case 1:
3320 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3321 break;
3322 case 2:
3323 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3324 break;
3325 case 4:
3326 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3327 break;
3328 }
3329
3330 MachineBasicBlock *MBB = MI.getParent();
3331 const DebugLoc &DL = MI.getDebugLoc();
3332 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3333 .add(MI.getOperand(2));
3334
3335 Register Addr = MI.getOperand(1).getReg();
3336 Register VOffset;
3337 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3338 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
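 // For example (illustrative MIR), with %sbase a 64-bit SGPR pointer and %off
 // a 32-bit value:
 //   %ext:_(s64)  = G_ZEXT %off:_(s32)
 //   %addr:_(p1)  = G_PTR_ADD %sbase, %ext
 // splits into Addr = %sbase and VOffset = %off, so the SADDR form of the
 // load can be used below.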
3339 if (!isSGPR(Addr)) {
3340 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3341 if (isSGPR(AddrDef->Reg)) {
3342 Addr = AddrDef->Reg;
3343 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3344 Register SAddr =
3345 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3346 if (isSGPR(SAddr)) {
3347 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3348 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3349 Addr = SAddr;
3350 VOffset = Off;
3351 }
3352 }
3353 }
3354 }
3355
3356 if (isSGPR(Addr)) {
3357 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3358 if (!VOffset) {
3359 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3360 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3361 .addImm(0);
3362 }
3363 }
3364
3365 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3366 .addReg(Addr);
3367
3368 if (isSGPR(Addr))
3369 MIB.addReg(VOffset);
3370
3371 MIB.add(MI.getOperand(4)) // offset
3372 .add(MI.getOperand(5)); // cpol
3373
3374 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3375 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3376 LoadPtrI.Offset = MI.getOperand(4).getImm();
3377 MachinePointerInfo StorePtrI = LoadPtrI;
 StorePtrI.V = nullptr;
 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3380 auto F = LoadMMO->getFlags() &
      ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3382 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3383 Size, LoadMMO->getBaseAlign());
3384 MachineMemOperand *StoreMMO =
      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3386 sizeof(int32_t), Align(4));
3387
3388 MIB.setMemRefs({LoadMMO, StoreMMO});
3389
3390 MI.eraseFromParent();
3391 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3392}
3393
3394bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3395 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3396 MI.removeOperand(1);
3397 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3398 return true;
3399}
3400
3401bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3402 unsigned Opc;
3403 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3404 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3405 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3406 break;
3407 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3408 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3409 break;
3410 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3411 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3412 break;
3413 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3414 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3415 break;
3416 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3417 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3418 break;
3419 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3420 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3421 break;
3422 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3423 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3424 break;
3425 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3426 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3427 break;
3428 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3429 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3430 break;
3431 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3432 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3433 break;
3434 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3435 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3436 break;
3437 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3438 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3439 break;
3440 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3441 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3442 break;
3443 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3444 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3445 break;
3446 default:
3447 llvm_unreachable("unhandled smfmac intrinsic");
3448 }
3449
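 // Operand rewrite performed below (sketch): the generic form
 //   dst, <intrinsic id>, srcA, srcB, VDst_In, <remaining operands>
 // becomes
 //   dst, srcA, srcB, <remaining operands>, VDst_In
 // i.e. the intrinsic ID is dropped and VDst_In is moved to the end, matching
 // the _e64 pseudo's operand order.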
3450 auto VDst_In = MI.getOperand(4);
3451
3452 MI.setDesc(TII.get(Opc));
3453 MI.removeOperand(4); // VDst_In
3454 MI.removeOperand(1); // Intrinsic ID
3455 MI.addOperand(VDst_In); // Readd VDst_In to the end
3456 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3457 return true;
3458}
3459
3460bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3461 Register DstReg = MI.getOperand(0).getReg();
3462 Register SrcReg = MI.getOperand(1).getReg();
3463 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3464 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3465 MachineBasicBlock *MBB = MI.getParent();
3466 const DebugLoc &DL = MI.getDebugLoc();
3467
3468 if (IsVALU) {
3469 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3470 .addImm(Subtarget->getWavefrontSizeLog2())
3471 .addReg(SrcReg);
3472 } else {
3473 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3474 .addReg(SrcReg)
3475 .addImm(Subtarget->getWavefrontSizeLog2())
3476 .setOperandDead(3); // Dead scc
3477 }
3478
3479 const TargetRegisterClass &RC =
3480 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3481 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3482 return false;
3483
3484 MI.eraseFromParent();
3485 return true;
3486}
3487
3488bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3489 Register SrcReg = MI.getOperand(0).getReg();
3490 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3491 return false;
3492
3493 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3494 Register SP =
      Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3496 Register WaveAddr = getWaveAddress(DefMI);
3497 MachineBasicBlock *MBB = MI.getParent();
3498 const DebugLoc &DL = MI.getDebugLoc();
3499
3500 if (!WaveAddr) {
3501 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3502 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3503 .addReg(SrcReg)
3504 .addImm(Subtarget->getWavefrontSizeLog2())
3505 .setOperandDead(3); // Dead scc
3506 }
3507
3508 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3509 .addReg(WaveAddr);
3510
3511 MI.eraseFromParent();
3512 return true;
3513}
3514
3516bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3517 if (!I.isPreISelOpcode()) {
3518 if (I.isCopy())
3519 return selectCOPY(I);
3520 return true;
3521 }
3522
3523 switch (I.getOpcode()) {
3524 case TargetOpcode::G_AND:
3525 case TargetOpcode::G_OR:
3526 case TargetOpcode::G_XOR:
3527 if (selectImpl(I, *CoverageInfo))
3528 return true;
3529 return selectG_AND_OR_XOR(I);
3530 case TargetOpcode::G_ADD:
3531 case TargetOpcode::G_SUB:
3532 if (selectImpl(I, *CoverageInfo))
3533 return true;
3534 return selectG_ADD_SUB(I);
3535 case TargetOpcode::G_UADDO:
3536 case TargetOpcode::G_USUBO:
3537 case TargetOpcode::G_UADDE:
3538 case TargetOpcode::G_USUBE:
3539 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3540 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3541 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3542 return selectG_AMDGPU_MAD_64_32(I);
3543 case TargetOpcode::G_INTTOPTR:
3544 case TargetOpcode::G_BITCAST:
3545 case TargetOpcode::G_PTRTOINT:
3546 return selectCOPY(I);
3547 case TargetOpcode::G_CONSTANT:
3548 case TargetOpcode::G_FCONSTANT:
3549 return selectG_CONSTANT(I);
3550 case TargetOpcode::G_FNEG:
3551 if (selectImpl(I, *CoverageInfo))
3552 return true;
3553 return selectG_FNEG(I);
3554 case TargetOpcode::G_FABS:
3555 if (selectImpl(I, *CoverageInfo))
3556 return true;
3557 return selectG_FABS(I);
3558 case TargetOpcode::G_EXTRACT:
3559 return selectG_EXTRACT(I);
3560 case TargetOpcode::G_MERGE_VALUES:
3561 case TargetOpcode::G_CONCAT_VECTORS:
3562 return selectG_MERGE_VALUES(I);
3563 case TargetOpcode::G_UNMERGE_VALUES:
3564 return selectG_UNMERGE_VALUES(I);
3565 case TargetOpcode::G_BUILD_VECTOR:
3566 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3567 return selectG_BUILD_VECTOR(I);
3568 case TargetOpcode::G_PTR_ADD:
3569 if (selectImpl(I, *CoverageInfo))
3570 return true;
3571 return selectG_PTR_ADD(I);
3572 case TargetOpcode::G_IMPLICIT_DEF:
3573 return selectG_IMPLICIT_DEF(I);
3574 case TargetOpcode::G_FREEZE:
3575 return selectCOPY(I);
3576 case TargetOpcode::G_INSERT:
3577 return selectG_INSERT(I);
3578 case TargetOpcode::G_INTRINSIC:
3579 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3580 return selectG_INTRINSIC(I);
3581 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3582 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3583 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3584 case TargetOpcode::G_ICMP:
3585 case TargetOpcode::G_FCMP:
3586 if (selectG_ICMP_or_FCMP(I))
3587 return true;
3588 return selectImpl(I, *CoverageInfo);
3589 case TargetOpcode::G_LOAD:
3590 case TargetOpcode::G_STORE:
3591 case TargetOpcode::G_ATOMIC_CMPXCHG:
3592 case TargetOpcode::G_ATOMICRMW_XCHG:
3593 case TargetOpcode::G_ATOMICRMW_ADD:
3594 case TargetOpcode::G_ATOMICRMW_SUB:
3595 case TargetOpcode::G_ATOMICRMW_AND:
3596 case TargetOpcode::G_ATOMICRMW_OR:
3597 case TargetOpcode::G_ATOMICRMW_XOR:
3598 case TargetOpcode::G_ATOMICRMW_MIN:
3599 case TargetOpcode::G_ATOMICRMW_MAX:
3600 case TargetOpcode::G_ATOMICRMW_UMIN:
3601 case TargetOpcode::G_ATOMICRMW_UMAX:
3602 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3603 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3604 case TargetOpcode::G_ATOMICRMW_FADD:
3605 case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3606 case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3607 return selectG_LOAD_STORE_ATOMICRMW(I);
3608 case TargetOpcode::G_SELECT:
3609 return selectG_SELECT(I);
3610 case TargetOpcode::G_TRUNC:
3611 return selectG_TRUNC(I);
3612 case TargetOpcode::G_SEXT:
3613 case TargetOpcode::G_ZEXT:
3614 case TargetOpcode::G_ANYEXT:
3615 case TargetOpcode::G_SEXT_INREG:
3616 // This is a workaround. For extension from type i1, `selectImpl()` uses
3617 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as type
3618 // i1 can only be held in an SGPR class.
3619 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3620 selectImpl(I, *CoverageInfo))
3621 return true;
3622 return selectG_SZA_EXT(I);
3623 case TargetOpcode::G_FPEXT:
3624 if (selectG_FPEXT(I))
3625 return true;
3626 return selectImpl(I, *CoverageInfo);
3627 case TargetOpcode::G_BRCOND:
3628 return selectG_BRCOND(I);
3629 case TargetOpcode::G_GLOBAL_VALUE:
3630 return selectG_GLOBAL_VALUE(I);
3631 case TargetOpcode::G_PTRMASK:
3632 return selectG_PTRMASK(I);
3633 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3634 return selectG_EXTRACT_VECTOR_ELT(I);
3635 case TargetOpcode::G_INSERT_VECTOR_ELT:
3636 return selectG_INSERT_VECTOR_ELT(I);
3637 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3638 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3639 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3640 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
 const AMDGPU::ImageDimIntrinsicInfo *Intr =
      AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
3643 assert(Intr && "not an image intrinsic with image pseudo");
3644 return selectImageIntrinsic(I, Intr);
3645 }
3646 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3647 return selectBVHIntrinsic(I);
3648 case AMDGPU::G_SBFX:
3649 case AMDGPU::G_UBFX:
3650 return selectG_SBFX_UBFX(I);
3651 case AMDGPU::G_SI_CALL:
3652 I.setDesc(TII.get(AMDGPU::SI_CALL));
3653 return true;
3654 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3655 return selectWaveAddress(I);
3656 case AMDGPU::G_STACKRESTORE:
3657 return selectStackRestore(I);
3658 case AMDGPU::G_PHI:
3659 return selectPHI(I);
3660 default:
3661 return selectImpl(I, *CoverageInfo);
3662 }
3663 return false;
3664}
3665
3667AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3668 return {{
3669 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3670 }};
3671
3672}
3673
3674std::pair<Register, unsigned>
3675AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3676 bool IsCanonicalizing,
3677 bool AllowAbs, bool OpSel) const {
3678 Register Src = Root.getReg();
3679 unsigned Mods = 0;
3680 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3681
3682 if (MI->getOpcode() == AMDGPU::G_FNEG) {
3683 Src = MI->getOperand(1).getReg();
3684 Mods |= SISrcMods::NEG;
3685 MI = getDefIgnoringCopies(Src, *MRI);
3686 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3687 // Fold fsub [+-]0 into fneg. This may not have been folded depending on the
3688 // denormal mode, but we're implicitly canonicalizing in a source operand.
3689 const ConstantFP *LHS =
3690 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3691 if (LHS && LHS->isZero()) {
3692 Mods |= SISrcMods::NEG;
3693 Src = MI->getOperand(2).getReg();
3694 }
3695 }
3696
3697 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3698 Src = MI->getOperand(1).getReg();
3699 Mods |= SISrcMods::ABS;
3700 }
3701
3702 if (OpSel)
3703 Mods |= SISrcMods::OP_SEL_0;
3704
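 // For example (illustrative): if Root is %v where
 //   %a:_(s32) = G_FABS %x
 //   %v:_(s32) = G_FNEG %a
 // and AllowAbs is true, this returns {%x, SISrcMods::NEG | SISrcMods::ABS}.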
3705 return std::pair(Src, Mods);
3706}
3707
3708Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3709 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3710 bool ForceVGPR) const {
3711 if ((Mods != 0 || ForceVGPR) &&
3712 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3713
3714 // If we looked through copies to find source modifiers on an SGPR operand,
3715 // we now have an SGPR register source. To avoid potentially violating the
3716 // constant bus restriction, we need to insert a copy to a VGPR.
3717 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3718 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3719 TII.get(AMDGPU::COPY), VGPRSrc)
3720 .addReg(Src);
3721 Src = VGPRSrc;
3722 }
3723
3724 return Src;
3725}
3726
3727///
3728/// This will select either an SGPR or VGPR operand and will save us from
3729/// having to write an extra tablegen pattern.
3731AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3732 return {{
3733 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3734 }};
3735}
3736
3738AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3739 Register Src;
3740 unsigned Mods;
3741 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3742
3743 return {{
3744 [=](MachineInstrBuilder &MIB) {
3745 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3746 },
3747 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3748 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3749 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3750 }};
3751}
3752
3754AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3755 Register Src;
3756 unsigned Mods;
3757 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3758 /*IsCanonicalizing=*/true,
3759 /*AllowAbs=*/false);
3760
3761 return {{
3762 [=](MachineInstrBuilder &MIB) {
3763 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3764 },
3765 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3766 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3767 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3768 }};
3769}
3770
3772AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3773 return {{
3774 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3775 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3776 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3777 }};
3778}
3779
3781AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3782 Register Src;
3783 unsigned Mods;
3784 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3785
3786 return {{
3787 [=](MachineInstrBuilder &MIB) {
3788 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3789 },
3790 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3791 }};
3792}
3793
3795AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3796 MachineOperand &Root) const {
3797 Register Src;
3798 unsigned Mods;
3799 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3800
3801 return {{
3802 [=](MachineInstrBuilder &MIB) {
3803 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3804 },
3805 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3806 }};
3807}
3808
3810AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3811 Register Src;
3812 unsigned Mods;
3813 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3814 /*AllowAbs=*/false);
3815
3816 return {{
3817 [=](MachineInstrBuilder &MIB) {
3818 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3819 },
3820 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3821 }};
3822}
3823
3825AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3826 Register Reg = Root.getReg();
3827 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3828 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3829 return {};
3830 return {{
3831 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3832 }};
3833}
3834
3835std::pair<Register, unsigned>
3836AMDGPUInstructionSelector::selectVOP3PModsImpl(
3837 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3838 unsigned Mods = 0;
3839 MachineInstr *MI = MRI.getVRegDef(Src);
3840
3841 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3842 // It's possible to see an f32 fneg here, but unlikely.
3843 // TODO: Treat f32 fneg as only high bit.
3844 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
      Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3846 Src = MI->getOperand(1).getReg();
3847 MI = MRI.getVRegDef(Src);
3848 }
3849
3850 // TODO: Handle G_FSUB 0 as fneg
3851
3852 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3853 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3854
3855 // Packed instructions do not have abs modifiers.
3856 Mods |= SISrcMods::OP_SEL_1;
3857
3858 return std::pair(Src, Mods);
3859}
3860
3862AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
 MachineRegisterInfo &MRI
3864 = Root.getParent()->getParent()->getParent()->getRegInfo();
3865
3866 Register Src;
3867 unsigned Mods;
3868 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3869
3870 return {{
3871 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3872 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3873 }};
3874}
3875
3877AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
 MachineRegisterInfo &MRI
3879 = Root.getParent()->getParent()->getParent()->getRegInfo();
3880
3881 Register Src;
3882 unsigned Mods;
3883 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3884
3885 return {{
3886 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3887 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3888 }};
3889}
3890
3892AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
3893 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3894 // Value is in Imm operand as i1 sign extended to int64_t.
3895 // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
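 // e.g. an immediate of -1 here yields OP_SEL_1 | NEG, while 0 yields just
 // OP_SEL_1 (the default for packed operands).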
3896 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3897 "expected i1 value");
3898 unsigned Mods = SISrcMods::OP_SEL_1;
3899 if (Root.getImm() == -1)
3900 Mods ^= SISrcMods::NEG;
3901 return {{
3902 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3903 }};
3904}
3905
3907AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3908 MachineOperand &Root) const {
3909 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3910 "expected i1 value");
3911 unsigned Mods = SISrcMods::OP_SEL_1;
3912 if (Root.getImm() != 0)
3913 Mods |= SISrcMods::OP_SEL_0;
3914
3915 return {{
3916 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3917 }};
3918}
3919
static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
3921 MachineInstr *InsertPt,
      MachineRegisterInfo &MRI) {
3923 const TargetRegisterClass *DstRegClass;
3924 switch (Elts.size()) {
3925 case 8:
3926 DstRegClass = &AMDGPU::VReg_256RegClass;
3927 break;
3928 case 4:
3929 DstRegClass = &AMDGPU::VReg_128RegClass;
3930 break;
3931 case 2:
3932 DstRegClass = &AMDGPU::VReg_64RegClass;
3933 break;
3934 default:
3935 llvm_unreachable("unhandled Reg sequence size");
3936 }
3937
3938 MachineIRBuilder B(*InsertPt);
3939 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
3940 .addDef(MRI.createVirtualRegister(DstRegClass));
3941 for (unsigned i = 0; i < Elts.size(); ++i) {
3942 MIB.addReg(Elts[i]);
      MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
3944 }
3945 return MIB->getOperand(0).getReg();
3946}
3947
3948static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
      SmallVectorImpl<Register> &Elts, Register &Src,
3950 MachineInstr *InsertPt,
      MachineRegisterInfo &MRI) {
3952 if (ModOpcode == TargetOpcode::G_FNEG) {
3953 Mods |= SISrcMods::NEG;
3954 // Check if all elements also have abs modifier
3955 SmallVector<Register, 8> NegAbsElts;
3956 for (auto El : Elts) {
3957 Register FabsSrc;
3958 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
3959 break;
3960 NegAbsElts.push_back(FabsSrc);
3961 }
3962 if (Elts.size() != NegAbsElts.size()) {
3963 // Neg
3964 Src = buildRegSequence(Elts, InsertPt, MRI);
3965 } else {
3966 // Neg and Abs
3967 Mods |= SISrcMods::NEG_HI;
3968 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
3969 }
3970 } else {
3971 assert(ModOpcode == TargetOpcode::G_FABS);
3972 // Abs
3973 Mods |= SISrcMods::NEG_HI;
3974 Src = buildRegSequence(Elts, InsertPt, MRI);
3975 }
3976}
3977
3979AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
3980 Register Src = Root.getReg();
3981 unsigned Mods = SISrcMods::OP_SEL_1;
 SmallVector<Register, 8> EltsF32;
3983
3984 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
3985 assert(BV->getNumSources() > 0);
3986 // Based on first element decide which mod we match, neg or abs
3987 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
3988 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
3989 ? AMDGPU::G_FNEG
3990 : AMDGPU::G_FABS;
3991 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
3992 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
3993 if (ElF32->getOpcode() != ModOpcode)
3994 break;
3995 EltsF32.push_back(ElF32->getOperand(1).getReg());
3996 }
3997
3998 // All elements had ModOpcode modifier
3999 if (BV->getNumSources() == EltsF32.size()) {
4000 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
4001 *MRI);
4002 }
4003 }
4004
4005 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4006 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4007}
4008
4010AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
4011 Register Src = Root.getReg();
4012 unsigned Mods = SISrcMods::OP_SEL_1;
4013 SmallVector<Register, 8> EltsV2F16;
4014
4015 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4016 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4017 Register FNegSrc;
4018 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4019 break;
4020 EltsV2F16.push_back(FNegSrc);
4021 }
4022
4023 // All elements had ModOpcode modifier
4024 if (CV->getNumSources() == EltsV2F16.size()) {
4025 Mods |= SISrcMods::NEG;
4026 Mods |= SISrcMods::NEG_HI;
4027 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4028 }
4029 }
4030
4031 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4032 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4033}
4034
4036AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4037 Register Src = Root.getReg();
4038 unsigned Mods = SISrcMods::OP_SEL_1;
4039 SmallVector<Register, 8> EltsV2F16;
4040
4041 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4042 assert(CV->getNumSources() > 0);
4043 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4044 // Based on first element decide which mod we match, neg or abs
4045 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4046 ? AMDGPU::G_FNEG
4047 : AMDGPU::G_FABS;
4048
4049 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4050 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4051 if (ElV2F16->getOpcode() != ModOpcode)
4052 break;
4053 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4054 }
4055
4056 // All elements had ModOpcode modifier
4057 if (CV->getNumSources() == EltsV2F16.size()) {
4058 MachineIRBuilder B(*Root.getParent());
4059 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4060 *MRI);
4061 }
4062 }
4063
4064 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4065 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4066}
4067
4069AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4070 std::optional<FPValueAndVReg> FPValReg;
4071 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4072 if (TII.isInlineConstant(FPValReg->Value)) {
4073 return {{[=](MachineInstrBuilder &MIB) {
4074 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4075 }}};
4076 }
4077 // Non-inlineable splat floats should not fall through to the integer immediate
4078 // checks.
4079 return {};
4080 }
4081
4082 APInt ICst;
4083 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4084 if (TII.isInlineConstant(ICst)) {
4085 return {
4086 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4087 }
4088 }
4089
4090 return {};
4091}
4092
4094AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4095 Register Src =
4096 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4097 unsigned Key = 0;
4098
4099 Register ShiftSrc;
4100 std::optional<ValueAndVReg> ShiftAmt;
4101 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4102 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4103 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4104 Key = ShiftAmt->Value.getZExtValue() / 8;
4105 Src = ShiftSrc;
4106 }
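 // For example (illustrative): an index of the form
 //   %src:_(s32) = G_LSHR %packed:_(s32), 16
 // selects Src = %packed with index_key = 2, so the instruction reads the
 // third byte of the packed index register.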
4107
4108 return {{
4109 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4110 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4111 }};
4112}
4113
4115AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4116
4117 Register Src =
4118 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4119 unsigned Key = 0;
4120
4121 Register ShiftSrc;
4122 std::optional<ValueAndVReg> ShiftAmt;
4123 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4124 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4125 ShiftAmt->Value.getZExtValue() == 16) {
4126 Src = ShiftSrc;
4127 Key = 1;
4128 }
4129
4130 return {{
4131 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4132 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4133 }};
4134}
4135
4137AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4138 Register Src;
4139 unsigned Mods;
4140 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4141
4142 // FIXME: Handle op_sel
4143 return {{
4144 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4145 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4146 }};
4147}
4148
4150AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4151 Register Src;
4152 unsigned Mods;
4153 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4154 /*IsCanonicalizing=*/true,
4155 /*AllowAbs=*/false,
4156 /*OpSel=*/false);
4157
4158 return {{
4159 [=](MachineInstrBuilder &MIB) {
4160 MIB.addReg(
4161 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4162 },
4163 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4164 }};
4165}
4166
4168AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4169 Register Src;
4170 unsigned Mods;
4171 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4172 /*IsCanonicalizing=*/true,
4173 /*AllowAbs=*/false,
4174 /*OpSel=*/true);
4175
4176 return {{
4177 [=](MachineInstrBuilder &MIB) {
4178 MIB.addReg(
4179 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4180 },
4181 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4182 }};
4183}
4184
4185bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4186 Register &Base,
4187 Register *SOffset,
4188 int64_t *Offset) const {
4189 MachineInstr *MI = Root.getParent();
4190 MachineBasicBlock *MBB = MI->getParent();
4191
4192 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4193 // then we can select all ptr + 32-bit offsets.
4194 SmallVector<GEPInfo, 4> AddrInfo;
4195 getAddrModeInfo(*MI, *MRI, AddrInfo);
4196
4197 if (AddrInfo.empty())
4198 return false;
4199
4200 const GEPInfo &GEPI = AddrInfo[0];
4201 std::optional<int64_t> EncodedImm =
4202 AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
4203
4204 if (SOffset && Offset) {
4205 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4206 AddrInfo.size() > 1) {
4207 const GEPInfo &GEPI2 = AddrInfo[1];
4208 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4209 if (Register OffsetReg =
4210 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4211 Base = GEPI2.SgprParts[0];
4212 *SOffset = OffsetReg;
4213 *Offset = *EncodedImm;
4214 return true;
4215 }
4216 }
4217 }
4218 return false;
4219 }
4220
4221 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4222 Base = GEPI.SgprParts[0];
4223 *Offset = *EncodedImm;
4224 return true;
4225 }
4226
4227 // SGPR offset is unsigned.
4228 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4229 GEPI.Imm != 0) {
4230 // If we make it this far we have a load with a 32-bit immediate offset.
4231 // It is OK to select this using an SGPR offset, because we have already
4232 // failed trying to select this load into one of the _IMM variants since
4233 // the _IMM patterns are considered before the _SGPR patterns.
4234 Base = GEPI.SgprParts[0];
4235 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4236 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4237 .addImm(GEPI.Imm);
4238 return true;
4239 }
4240
4241 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4242 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4243 Base = GEPI.SgprParts[0];
4244 *SOffset = OffsetReg;
4245 return true;
4246 }
4247 }
4248
4249 return false;
4250}
4251
4253AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4254 Register Base;
4255 int64_t Offset;
4256 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4257 return std::nullopt;
4258
4259 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4260 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4261}
4262
4264AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4265 SmallVector<GEPInfo, 4> AddrInfo;
4266 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4267
4268 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4269 return std::nullopt;
4270
4271 const GEPInfo &GEPInfo = AddrInfo[0];
4272 Register PtrReg = GEPInfo.SgprParts[0];
4273 std::optional<int64_t> EncodedImm =
4274 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4275 if (!EncodedImm)
4276 return std::nullopt;
4277
4278 return {{
4279 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4280 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4281 }};
4282}
4283
4285AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4286 Register Base, SOffset;
4287 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4288 return std::nullopt;
4289
4290 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4291 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4292}
4293
4295AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4296 Register Base, SOffset;
4297 int64_t Offset;
4298 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4299 return std::nullopt;
4300
4301 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4302 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4303 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4304}
4305
4306std::pair<Register, int>
4307AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4308 uint64_t FlatVariant) const {
4309 MachineInstr *MI = Root.getParent();
4310
4311 auto Default = std::pair(Root.getReg(), 0);
4312
4313 if (!STI.hasFlatInstOffsets())
4314 return Default;
4315
4316 Register PtrBase;
4317 int64_t ConstOffset;
4318 std::tie(PtrBase, ConstOffset) =
4319 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4320
4321 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4322 !isFlatScratchBaseLegal(Root.getReg())))
4323 return Default;
4324
4325 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4326 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4327 return Default;
4328
4329 return std::pair(PtrBase, ConstOffset);
4330}
4331
4333AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4334 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4335
4336 return {{
4337 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4338 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4339 }};
4340}
4341
4343AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4344 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4345
4346 return {{
4347 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4348 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4349 }};
4350}
4351
4353AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4354 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4355
4356 return {{
4357 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4358 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4359 }};
4360}
4361
4362// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4364AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4365 Register Addr = Root.getReg();
4366 Register PtrBase;
4367 int64_t ConstOffset;
4368 int64_t ImmOffset = 0;
4369
4370 // Match the immediate offset first, which canonically is moved as low as
4371 // possible.
4372 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4373
4374 if (ConstOffset != 0) {
4375 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
      SIInstrFlags::FlatGlobal)) {
4377 Addr = PtrBase;
4378 ImmOffset = ConstOffset;
4379 } else {
4380 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4381 if (isSGPR(PtrBaseDef->Reg)) {
4382 if (ConstOffset > 0) {
4383 // Offset is too large.
4384 //
4385 // saddr + large_offset -> saddr +
4386 // (voffset = large_offset & ~MaxOffset) +
4387 // (large_offset & MaxOffset);
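 // e.g. (illustrative) if MaxOffset were 0xFFF, an offset of 0x1234 would
 // split into a VGPR remainder of 0x1000 and an immediate of 0x234.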
4388 int64_t SplitImmOffset, RemainderOffset;
4389 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
      ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4391
4392 if (isUInt<32>(RemainderOffset)) {
4393 MachineInstr *MI = Root.getParent();
4394 MachineBasicBlock *MBB = MI->getParent();
4395 Register HighBits =
4396 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4397
4398 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4399 HighBits)
4400 .addImm(RemainderOffset);
4401
4402 return {{
4403 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4404 [=](MachineInstrBuilder &MIB) {
4405 MIB.addReg(HighBits);
4406 }, // voffset
4407 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4408 }};
4409 }
4410 }
4411
4412 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4413 // is 1 we would need to perform 1 or 2 extra moves for each half of
4414 // the constant and it is better to do a scalar add and then issue a
4415 // single VALU instruction to materialize zero. Otherwise it takes fewer
4416 // instructions to perform VALU adds with immediates or inline literals.
4417 unsigned NumLiterals =
4418 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4419 !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4420 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4421 return std::nullopt;
4422 }
4423 }
4424 }
4425
4426 // Match the variable offset.
4427 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4428 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4429 // Look through the SGPR->VGPR copy.
4430 Register SAddr =
4431 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4432
4433 if (isSGPR(SAddr)) {
4434 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4435
4436 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4437 // inserted later.
4438 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4439 return {{[=](MachineInstrBuilder &MIB) { // saddr
4440 MIB.addReg(SAddr);
4441 },
4442 [=](MachineInstrBuilder &MIB) { // voffset
4443 MIB.addReg(VOffset);
4444 },
4445 [=](MachineInstrBuilder &MIB) { // offset
4446 MIB.addImm(ImmOffset);
4447 }}};
4448 }
4449 }
4450 }
4451
4452 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4453 // drop this.
4454 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4455 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4456 return std::nullopt;
4457
4458 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4459 // moves required to copy a 64-bit SGPR to VGPR.
4460 MachineInstr *MI = Root.getParent();
4461 MachineBasicBlock *MBB = MI->getParent();
4462 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4463
4464 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4465 .addImm(0);
4466
4467 return {{
4468 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4469 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4470 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4471 }};
4472}
4473
4475AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4476 Register Addr = Root.getReg();
4477 Register PtrBase;
4478 int64_t ConstOffset;
4479 int64_t ImmOffset = 0;
4480
4481 // Match the immediate offset first, which canonically is moved as low as
4482 // possible.
4483 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4484
4485 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
      TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
                            SIInstrFlags::FlatScratch)) {
4488 Addr = PtrBase;
4489 ImmOffset = ConstOffset;
4490 }
4491
4492 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4493 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4494 int FI = AddrDef->MI->getOperand(1).getIndex();
4495 return {{
4496 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4497 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4498 }};
4499 }
4500
4501 Register SAddr = AddrDef->Reg;
4502
4503 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4504 Register LHS = AddrDef->MI->getOperand(1).getReg();
4505 Register RHS = AddrDef->MI->getOperand(2).getReg();
4506 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4507 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4508
4509 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4510 isSGPR(RHSDef->Reg)) {
4511 int FI = LHSDef->MI->getOperand(1).getIndex();
4512 MachineInstr &I = *Root.getParent();
4513 MachineBasicBlock *BB = I.getParent();
4514 const DebugLoc &DL = I.getDebugLoc();
4515 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4516
4517 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4518 .addFrameIndex(FI)
4519 .addReg(RHSDef->Reg)
4520 .setOperandDead(3); // Dead scc
4521 }
4522 }
4523
4524 if (!isSGPR(SAddr))
4525 return std::nullopt;
4526
4527 return {{
4528 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4529 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4530 }};
4531}
4532
4533// Check whether the flat scratch SVS swizzle bug affects this access.
4534bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4535 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4536 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4537 return false;
4538
4539 // The bug affects the swizzling of SVS accesses if there is any carry out
4540 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4541 // voffset to (soffset + inst_offset).
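 // e.g. (illustrative) if the low two bits of voffset may be 0b11 (3) and the
 // low two bits of soffset + inst_offset may be 0b01 (1), then 3 + 1 >= 4 and
 // the access is conservatively rejected.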
4542 auto VKnown = KB->getKnownBits(VAddr);
4543 auto SKnown = KnownBits::computeForAddSub(
4544 /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KB->getKnownBits(SAddr),
4545 KnownBits::makeConstant(APInt(32, ImmOffset)));
4546 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4547 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4548 return (VMax & 3) + (SMax & 3) >= 4;
4549}
4550
4552AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4553 Register Addr = Root.getReg();
4554 Register PtrBase;
4555 int64_t ConstOffset;
4556 int64_t ImmOffset = 0;
4557
4558 // Match the immediate offset first, which canonically is moved as low as
4559 // possible.
4560 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4561
4562 Register OrigAddr = Addr;
4563 if (ConstOffset != 0 &&
4564 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4565 Addr = PtrBase;
4566 ImmOffset = ConstOffset;
4567 }
4568
4569 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4570 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4571 return std::nullopt;
4572
4573 Register RHS = AddrDef->MI->getOperand(2).getReg();
4574 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4575 return std::nullopt;
4576
4577 Register LHS = AddrDef->MI->getOperand(1).getReg();
4578 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4579
4580 if (OrigAddr != Addr) {
4581 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4582 return std::nullopt;
4583 } else {
4584 if (!isFlatScratchBaseLegalSV(OrigAddr))
4585 return std::nullopt;
4586 }
4587
4588 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4589 return std::nullopt;
4590
4591 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4592 int FI = LHSDef->MI->getOperand(1).getIndex();
4593 return {{
4594 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4595 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4596 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4597 }};
4598 }
4599
4600 if (!isSGPR(LHS))
4601 return std::nullopt;
4602
4603 return {{
4604 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4605 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4606 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4607 }};
4608}
4609
4611AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4612 MachineInstr *MI = Root.getParent();
4613 MachineBasicBlock *MBB = MI->getParent();
 MachineFunction *MF = MBB->getParent();
 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4616
4617 int64_t Offset = 0;
4618 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
      !TII.isLegalMUBUFImmOffset(Offset)) {
4620 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4621
4622 // TODO: Should this be inside the render function? The iterator seems to
4623 // move.
4624 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4625 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4626 HighBits)
4627 .addImm(Offset & ~MaxOffset);
4628
4629 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4630 MIB.addReg(Info->getScratchRSrcReg());
4631 },
4632 [=](MachineInstrBuilder &MIB) { // vaddr
4633 MIB.addReg(HighBits);
4634 },
4635 [=](MachineInstrBuilder &MIB) { // soffset
4636 // Use constant zero for soffset and rely on eliminateFrameIndex
4637 // to choose the appropriate frame register if need be.
4638 MIB.addImm(0);
4639 },
4640 [=](MachineInstrBuilder &MIB) { // offset
4641 MIB.addImm(Offset & MaxOffset);
4642 }}};
4643 }
4644
4645 assert(Offset == 0 || Offset == -1);
4646
4647 // Try to fold a frame index directly into the MUBUF vaddr field, and any
4648 // offsets.
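 // For example (illustrative MIR):
 //   %fi:_(p5)   = G_FRAME_INDEX %stack.0
 //   %addr:_(p5) = G_PTR_ADD %fi, 16
 // folds to vaddr = frame index %stack.0 with offset = 16, provided 16 is a
 // legal MUBUF immediate offset.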
4649 std::optional<int> FI;
4650 Register VAddr = Root.getReg();
4651 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4652 Register PtrBase;
4653 int64_t ConstOffset;
4654 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4655 if (ConstOffset != 0) {
4656 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
      (!STI.privateMemoryResourceIsRangeChecked() ||
4658 KB->signBitIsZero(PtrBase))) {
4659 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4660 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4661 FI = PtrBaseDef->getOperand(1).getIndex();
4662 else
4663 VAddr = PtrBase;
4664 Offset = ConstOffset;
4665 }
4666 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4667 FI = RootDef->getOperand(1).getIndex();
4668 }
4669 }
4670
4671 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4672 MIB.addReg(Info->getScratchRSrcReg());
4673 },
4674 [=](MachineInstrBuilder &MIB) { // vaddr
4675 if (FI)
4676 MIB.addFrameIndex(*FI);
4677 else
4678 MIB.addReg(VAddr);
4679 },
4680 [=](MachineInstrBuilder &MIB) { // soffset
4681 // Use constant zero for soffset and rely on eliminateFrameIndex
4682 // to choose the appropriate frame register if need be.
4683 MIB.addImm(0);
4684 },
4685 [=](MachineInstrBuilder &MIB) { // offset
4686 MIB.addImm(Offset);
4687 }}};
4688}
4689
4690bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4691 int64_t Offset) const {
4692 if (!isUInt<16>(Offset))
4693 return false;
4694
 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4696 return true;
4697
4698 // On Southern Islands, instructions with a negative base value and an offset
4699 // don't seem to work.
4700 return KB->signBitIsZero(Base);
4701}
4702
4703bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4704 int64_t Offset1,
4705 unsigned Size) const {
4706 if (Offset0 % Size != 0 || Offset1 % Size != 0)
4707 return false;
4708 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4709 return false;
4710
 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4712 return true;
4713
4714 // On Southern Islands, instructions with a negative base value and an offset
4715 // don't seem to work.
4716 return KB->signBitIsZero(Base);
4717}
4718
4719// Return whether the operation has NoUnsignedWrap property.
static bool isNoUnsignedWrap(MachineInstr *Addr) {
4721 return Addr->getOpcode() == TargetOpcode::G_OR ||
4722 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4723 Addr->getFlag(MachineInstr::NoUWrap));
4724}
4725
4726// Check that the base address of a flat scratch load/store in the form of `base +
4727// offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per the hardware
4728// requirement). We always treat the first operand as the base address here.
4729bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
4730 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4731
4732 if (isNoUnsignedWrap(AddrMI))
4733 return true;
4734
4735 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4736 // values.
4737 if (STI.hasSignedScratchOffsets())
4738 return true;
4739
4740 Register LHS = AddrMI->getOperand(1).getReg();
4741 Register RHS = AddrMI->getOperand(2).getReg();
4742
4743 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
4744 std::optional<ValueAndVReg> RhsValReg =
      getIConstantVRegValWithLookThrough(RHS, *MRI);
4746 // If the immediate offset is negative and within a certain range, the base
4747 // address cannot also be negative. If the base is also negative, the sum
4748 // would be either negative or much larger than the valid range of scratch
4749 // memory a thread can access.
4750 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
4751 RhsValReg->Value.getSExtValue() > -0x40000000)
4752 return true;
4753 }
4754
4755 return KB->signBitIsZero(LHS);
4756}
4757
4758// Check that the address values in the SGPR/VGPR are legal for flat scratch in
4759// the form of: SGPR + VGPR.
4760bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
4761 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4762
4763 if (isNoUnsignedWrap(AddrMI))
4764 return true;
4765
4766 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4767 // values.
4768 if (STI.hasSignedScratchOffsets())
4769 return true;
4770
4771 Register LHS = AddrMI->getOperand(1).getReg();
4772 Register RHS = AddrMI->getOperand(2).getReg();
4773 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4774}
4775
4776// Check that the address values in the SGPR/VGPR are legal for flat scratch in
4777// the form of: SGPR + VGPR + Imm.
4778bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
4779 Register Addr) const {
4780 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4781 // values.
4782 if (STI.hasSignedScratchOffsets())
4783 return true;
4784
4785 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4786 Register Base = AddrMI->getOperand(1).getReg();
4787 std::optional<DefinitionAndSourceRegister> BaseDef =
      getDefSrcRegIgnoringCopies(Base, *MRI);
4789 std::optional<ValueAndVReg> RHSOffset =
      getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
4791 assert(RHSOffset);
4792
4793 // If the immediate offset is negative and within a certain range, the base
4794 // address cannot also be negative. If the base is also negative, the sum
4795 // would be either negative or much larger than the valid range of scratch
4796 // memory a thread can access.
4797 if (isNoUnsignedWrap(BaseDef->MI) &&
4798 (isNoUnsignedWrap(AddrMI) ||
4799 (RHSOffset->Value.getSExtValue() < 0 &&
4800 RHSOffset->Value.getSExtValue() > -0x40000000)))
4801 return true;
4802
4803 Register LHS = BaseDef->MI->getOperand(1).getReg();
4804 Register RHS = BaseDef->MI->getOperand(2).getReg();
4805 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4806}
4807
4808bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4809 unsigned ShAmtBits) const {
4810 assert(MI.getOpcode() == TargetOpcode::G_AND);
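 // A mask is unneeded if it cannot clear any bit the shift actually reads;
 // e.g. (x & 31) feeding a 32-bit shift amount (ShAmtBits == 5) is redundant
 // because only the low 5 bits of the amount are used.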
4811
4812 std::optional<APInt> RHS =
4813 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4814 if (!RHS)
4815 return false;
4816
4817 if (RHS->countr_one() >= ShAmtBits)
4818 return true;
4819
4820 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
4821 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
4822}
4823
4825AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4826 MachineOperand &Root) const {
4827 Register Reg = Root.getReg();
 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4829
4830 std::optional<DefinitionAndSourceRegister> Def =
4831 getDefSrcRegIgnoringCopies(Reg, *MRI);
4832 assert(Def && "this shouldn't be an optional result");
4833 Reg = Def->Reg;
4834
4835 if (Register WaveBase = getWaveAddress(Def->MI)) {
4836 return {{
4837 [=](MachineInstrBuilder &MIB) { // rsrc
4838 MIB.addReg(Info->getScratchRSrcReg());
4839 },
4840 [=](MachineInstrBuilder &MIB) { // soffset
4841 MIB.addReg(WaveBase);
4842 },
4843 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4844 }};
4845 }
4846
4847 int64_t Offset = 0;
4848
4849 // FIXME: Copy check is a hack
 Register BasePtr;
4851 if (mi_match(Reg, *MRI,
4852 m_GPtrAdd(m_Reg(BasePtr),
                m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
4854 if (!TII.isLegalMUBUFImmOffset(Offset))
4855 return {};
4856 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
4857 Register WaveBase = getWaveAddress(BasePtrDef);
4858 if (!WaveBase)
4859 return {};
4860
4861 return {{
4862 [=](MachineInstrBuilder &MIB) { // rsrc
4863 MIB.addReg(Info->getScratchRSrcReg());
4864 },
4865 [=](MachineInstrBuilder &MIB) { // soffset
4866 MIB.addReg(WaveBase);
4867 },
4868 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4869 }};
4870 }
4871
4872 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
      !TII.isLegalMUBUFImmOffset(Offset))
4874 return {};
4875
4876 return {{
4877 [=](MachineInstrBuilder &MIB) { // rsrc
4878 MIB.addReg(Info->getScratchRSrcReg());
4879 },
4880 [=](MachineInstrBuilder &MIB) { // soffset
4881 MIB.addImm(0);
4882 },
4883 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4884 }};
4885}
4886
4887std::pair<Register, unsigned>
4888AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4889 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4890 if (!RootDef)
4891 return std::pair(Root.getReg(), 0);
4892
4893 int64_t ConstAddr = 0;
4894
4895 Register PtrBase;
4896 int64_t Offset;
4897 std::tie(PtrBase, Offset) =
4898 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4899
4900 if (Offset) {
4901 if (isDSOffsetLegal(PtrBase, Offset)) {
4902 // (add n0, c0)
4903 return std::pair(PtrBase, Offset);
4904 }
4905 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4906 // TODO
4907
4908
4909 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4910 // TODO
4911
4912 }
4913
4914 return std::pair(Root.getReg(), 0);
4915}
4916
4918AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4919 Register Reg;
4920 unsigned Offset;
4921 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
4922 return {{
4923 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4924 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
4925 }};
4926}
4927
4929AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4930 return selectDSReadWrite2(Root, 4);
4931}
4932
4934AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4935 return selectDSReadWrite2(Root, 8);
4936}
4937
4939AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4940 unsigned Size) const {
4941 Register Reg;
4942 unsigned Offset;
4943 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
4944 return {{
4945 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4946 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4947 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4948 }};
4949}
4950
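// The offset returned by the *Impl helper below is in units of Size bytes;
// e.g. (illustrative) a byte offset of 24 with Size == 4 gives offset0 = 6 and
// offset1 = 7 in selectDSReadWrite2 above.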
4951std::pair<Register, unsigned>
4952AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4953 unsigned Size) const {
4954 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4955 if (!RootDef)
4956 return std::pair(Root.getReg(), 0);
4957
4958 int64_t ConstAddr = 0;
4959
4960 Register PtrBase;
4961 int64_t Offset;
4962 std::tie(PtrBase, Offset) =
4963 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4964
4965 if (Offset) {
4966 int64_t OffsetValue0 = Offset;
4967 int64_t OffsetValue1 = Offset + Size;
4968 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
4969 // (add n0, c0)
4970 return std::pair(PtrBase, OffsetValue0 / Size);
4971 }
4972 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4973 // TODO
4974
4975 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4976 // TODO
4977
4978 }
4979
4980 return std::pair(Root.getReg(), 0);
4981}
4982
4983/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4984/// the base value with the constant offset. There may be intervening copies
4985/// between \p Root and the identified constant. Returns \p Root, 0 if this does
4986/// not match the pattern.
4987std::pair<Register, int64_t>
4988AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4989 Register Root, const MachineRegisterInfo &MRI) const {
4990 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4991 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4992 return {Root, 0};
4993
4994 MachineOperand &RHS = RootI->getOperand(2);
4995 std::optional<ValueAndVReg> MaybeOffset =
4996      getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4997 if (!MaybeOffset)
4998 return {Root, 0};
4999 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
5000}
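// Illustrative example (hypothetical MIR, intervening copies elided): given
//   %c:_(s64) = G_CONSTANT i64 16
//   %p:_(p1) = G_PTR_ADD %base, %c
// a query on %p returns {%base, 16}; any root that is not such a G_PTR_ADD
// returns {%p, 0}.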
5001
5002static void addZeroImm(MachineInstrBuilder &MIB) {
5003 MIB.addImm(0);
5004}
5005
5006/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5007/// BasePtr is not valid, a null base pointer will be used.
5008static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5009 uint32_t FormatLo, uint32_t FormatHi,
5010 Register BasePtr) {
5011 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5012 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5013 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5014 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5015
5016 B.buildInstr(AMDGPU::S_MOV_B32)
5017 .addDef(RSrc2)
5018 .addImm(FormatLo);
5019 B.buildInstr(AMDGPU::S_MOV_B32)
5020 .addDef(RSrc3)
5021 .addImm(FormatHi);
5022
5023 // Build the half of the subregister with the constants before building the
5024 // full 128-bit register. If we are building multiple resource descriptors,
5025 // this will allow CSEing of the 2-component register.
5026 B.buildInstr(AMDGPU::REG_SEQUENCE)
5027 .addDef(RSrcHi)
5028 .addReg(RSrc2)
5029 .addImm(AMDGPU::sub0)
5030 .addReg(RSrc3)
5031 .addImm(AMDGPU::sub1);
5032
5033 Register RSrcLo = BasePtr;
5034 if (!BasePtr) {
5035 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5036 B.buildInstr(AMDGPU::S_MOV_B64)
5037 .addDef(RSrcLo)
5038 .addImm(0);
5039 }
5040
5041 B.buildInstr(AMDGPU::REG_SEQUENCE)
5042 .addDef(RSrc)
5043 .addReg(RSrcLo)
5044 .addImm(AMDGPU::sub0_sub1)
5045 .addReg(RSrcHi)
5046 .addImm(AMDGPU::sub2_sub3);
5047
5048 return RSrc;
5049}
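// For illustration, the resulting 128-bit SRD is laid out as:
//   dwords 0-1 (sub0_sub1): BasePtr, or an S_MOV_B64 0 when no base is given
//   dword  2   (sub2):      FormatLo
//   dword  3   (sub3):      FormatHi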
5050
5051static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5052 const SIInstrInfo &TII, Register BasePtr) {
5053 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5054
5055 // FIXME: Why are half the "default" bits ignored based on the addressing
5056 // mode?
5057 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5058}
5059
5060static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5061 const SIInstrInfo &TII, Register BasePtr) {
5062 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5063
5064 // FIXME: Why are half the "default" bits ignored based on the addressing
5065 // mode?
5066 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5067}
5068
5069AMDGPUInstructionSelector::MUBUFAddressData
5070AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5071 MUBUFAddressData Data;
5072 Data.N0 = Src;
5073
5074 Register PtrBase;
5075 int64_t Offset;
5076
5077 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5078 if (isUInt<32>(Offset)) {
5079 Data.N0 = PtrBase;
5080 Data.Offset = Offset;
5081 }
5082
5083 if (MachineInstr *InputAdd
5084 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5085 Data.N2 = InputAdd->getOperand(1).getReg();
5086 Data.N3 = InputAdd->getOperand(2).getReg();
5087
5088 // FIXME: Need to fix extra SGPR->VGPR copies inserted
5089 // FIXME: Don't know that this was defined by operand 0
5090 //
5091 // TODO: Remove this when we have copy folding optimizations after
5092 // RegBankSelect.
5093 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5094 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5095 }
5096
5097 return Data;
5098}
5099
5100/// Return whether the addr64 MUBUF mode should be used for the given address.
5101bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5102 // (ptr_add N2, N3) -> addr64, or
5103 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5104 if (Addr.N2)
5105 return true;
5106
5107 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5108 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5109}
5110
5111/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5112/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5113/// component.
5114void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5115 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5116 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5117 return;
5118
5119 // Illegal offset, store it in soffset.
5120 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5121 B.buildInstr(AMDGPU::S_MOV_B32)
5122 .addDef(SOffset)
5123 .addImm(ImmOffset);
5124 ImmOffset = 0;
5125}
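// For illustration (assuming the value exceeds the subtarget's legal MUBUF
// immediate range): an incoming ImmOffset of 0x20000 would be materialized as
// SOffset = S_MOV_B32 0x20000, and ImmOffset would be rewritten to 0.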
5126
5127bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5128 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5129 Register &SOffset, int64_t &Offset) const {
5130 // FIXME: Predicates should stop this from reaching here.
5131 // addr64 bit was removed for volcanic islands.
5132 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5133 return false;
5134
5135 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5136 if (!shouldUseAddr64(AddrData))
5137 return false;
5138
5139 Register N0 = AddrData.N0;
5140 Register N2 = AddrData.N2;
5141 Register N3 = AddrData.N3;
5142 Offset = AddrData.Offset;
5143
5144 // Base pointer for the SRD.
5145 Register SRDPtr;
5146
5147 if (N2) {
5148 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5149 assert(N3);
5150 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5151 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5152 // addr64, and construct the default resource from a 0 address.
5153 VAddr = N0;
5154 } else {
5155 SRDPtr = N3;
5156 VAddr = N2;
5157 }
5158 } else {
5159 // N2 is not divergent.
5160 SRDPtr = N2;
5161 VAddr = N3;
5162 }
5163 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5164 // Use the default null pointer in the resource
5165 VAddr = N0;
5166 } else {
5167 // N0 -> offset, or
5168 // (N0 + C1) -> offset
5169 SRDPtr = N0;
5170 }
5171
5172 MachineIRBuilder B(*Root.getParent());
5173 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5174 splitIllegalMUBUFOffset(B, SOffset, Offset);
5175 return true;
5176}
5177
5178bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5179 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5180 int64_t &Offset) const {
5181
5182 // FIXME: Pattern should not reach here.
5183 if (STI.useFlatForGlobal())
5184 return false;
5185
5186 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5187 if (shouldUseAddr64(AddrData))
5188 return false;
5189
5190 // N0 -> offset, or
5191 // (N0 + C1) -> offset
5192 Register SRDPtr = AddrData.N0;
5193 Offset = AddrData.Offset;
5194
5195 // TODO: Look through extensions for 32-bit soffset.
5196 MachineIRBuilder B(*Root.getParent());
5197
5198 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5199 splitIllegalMUBUFOffset(B, SOffset, Offset);
5200 return true;
5201}
5202
5203InstructionSelector::ComplexRendererFns
5204AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5205 Register VAddr;
5206 Register RSrcReg;
5207 Register SOffset;
5208 int64_t Offset = 0;
5209
5210 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5211 return {};
5212
5213 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5214 // pattern.
5215 return {{
5216 [=](MachineInstrBuilder &MIB) { // rsrc
5217 MIB.addReg(RSrcReg);
5218 },
5219 [=](MachineInstrBuilder &MIB) { // vaddr
5220 MIB.addReg(VAddr);
5221 },
5222 [=](MachineInstrBuilder &MIB) { // soffset
5223 if (SOffset)
5224 MIB.addReg(SOffset);
5225 else if (STI.hasRestrictedSOffset())
5226 MIB.addReg(AMDGPU::SGPR_NULL);
5227 else
5228 MIB.addImm(0);
5229 },
5230 [=](MachineInstrBuilder &MIB) { // offset
5231 MIB.addImm(Offset);
5232 },
5233 addZeroImm, // cpol
5234 addZeroImm, // tfe
5235 addZeroImm // swz
5236 }};
5237}
5238
5239InstructionSelector::ComplexRendererFns
5240AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5241 Register RSrcReg;
5242 Register SOffset;
5243 int64_t Offset = 0;
5244
5245 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5246 return {};
5247
5248 return {{
5249 [=](MachineInstrBuilder &MIB) { // rsrc
5250 MIB.addReg(RSrcReg);
5251 },
5252 [=](MachineInstrBuilder &MIB) { // soffset
5253 if (SOffset)
5254 MIB.addReg(SOffset);
5255 else if (STI.hasRestrictedSOffset())
5256 MIB.addReg(AMDGPU::SGPR_NULL);
5257 else
5258 MIB.addImm(0);
5259 },
5260 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5261 addZeroImm, // cpol
5262 addZeroImm, // tfe
5263 addZeroImm, // swz
5264 }};
5265}
5266
5267InstructionSelector::ComplexRendererFns
5268AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5269
5270 Register SOffset = Root.getReg();
5271
5272 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5273 SOffset = AMDGPU::SGPR_NULL;
5274
5275 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5276}
5277
5278/// Get an immediate that must be 32-bits, and treated as zero extended.
5279static std::optional<uint64_t>
5280getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5281 // getIConstantVRegVal sexts any values, so see if that matters.
5282 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5283 if (!OffsetVal || !isInt<32>(*OffsetVal))
5284 return std::nullopt;
5285 return Lo_32(*OffsetVal);
5286}
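// For illustration: a G_CONSTANT i32 -1 comes back from getIConstantVRegSExtVal
// as the sign-extended value -1, passes the isInt<32> check, and is returned as
// Lo_32(-1) == 0xffffffff, i.e. treated as zero extended.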
5287
5288InstructionSelector::ComplexRendererFns
5289AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5290 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5291 if (!OffsetVal)
5292 return {};
5293
5294 std::optional<int64_t> EncodedImm =
5295 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5296 if (!EncodedImm)
5297 return {};
5298
5299 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5300}
5301
5302InstructionSelector::ComplexRendererFns
5303AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5304 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
5305
5306 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5307 if (!OffsetVal)
5308 return {};
5309
5310 std::optional<int64_t> EncodedImm =
5311      AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5312 if (!EncodedImm)
5313 return {};
5314
5315 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5316}
5317
5318InstructionSelector::ComplexRendererFns
5319AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5320 // Match the (soffset + offset) pair as a 32-bit register base and
5321 // an immediate offset.
5322 Register SOffset;
5323 unsigned Offset;
5324 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5325 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5326 if (!SOffset)
5327 return std::nullopt;
5328
5329 std::optional<int64_t> EncodedOffset =
5330 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5331 if (!EncodedOffset)
5332 return std::nullopt;
5333
5334 assert(MRI->getType(SOffset) == LLT::scalar(32));
5335 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5336 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5337}
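// For illustration (hypothetical MIR): a root defined as
//   %root:_(s32) = nuw G_ADD %soffset, %c   ; %c = G_CONSTANT i32 64
// is matched as the SGPR base %soffset plus the subtarget-encoded form of the
// byte offset 64, provided getSMRDEncodedOffset accepts that value.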
5338
5339// Variant of stripBitCast that returns the instruction instead of a
5340// MachineOperand.
5341static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
5342 if (MI->getOpcode() == AMDGPU::G_BITCAST)
5343 return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
5344 return MI;
5345}
5346
5347// Figure out if this is really an extract of the high 16-bits of a dword,
5348// returns nullptr if it isn't.
5349static MachineInstr *isExtractHiElt(MachineInstr *Inst,
5350                                    MachineRegisterInfo &MRI) {
5351 Inst = stripBitCast(Inst, MRI);
5352
5353 if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5354 return nullptr;
5355
5356 MachineInstr *TruncOp =
5357      getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
5358 TruncOp = stripBitCast(TruncOp, MRI);
5359
5360 // G_LSHR x, (G_CONSTANT i32 16)
5361 if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5362 auto SrlAmount = getIConstantVRegValWithLookThrough(
5363 TruncOp->getOperand(2).getReg(), MRI);
5364 if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5365 MachineInstr *SrlOp =
5366 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5367 return stripBitCast(SrlOp, MRI);
5368 }
5369 }
5370
5371 // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
5372 // 1, 0 swaps the low/high 16 bits.
5373 // 1, 1 sets the high 16 bits to be the same as the low 16.
5374 // In either case, it selects the high elts.
5375 if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5376 assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5377 LLT::fixed_vector(2, 16));
5378
5379 ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
5380 assert(Mask.size() == 2);
5381
5382 if (Mask[0] == 1 && Mask[1] <= 1) {
5383 MachineInstr *LHS =
5384 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5385 return stripBitCast(LHS, MRI);
5386 }
5387 }
5388
5389 return nullptr;
5390}
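// For illustration, a hypothetical pattern that is recognized:
//   %s:_(s32) = G_LSHR %x:_(s32), %k   ; %k = G_CONSTANT i32 16
//   %hi:_(s16) = G_TRUNC %s
// isExtractHiElt then returns the def of %x (with bitcasts/copies stripped).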
5391
5392std::pair<Register, unsigned>
5393AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5394 bool &Matched) const {
5395 Matched = false;
5396
5397 Register Src;
5398 unsigned Mods;
5399 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
5400
5401 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
5402 if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5403 MachineOperand *MO = &MI->getOperand(1);
5404 Src = MO->getReg();
5405 MI = getDefIgnoringCopies(Src, *MRI);
5406
5407 assert(MRI->getType(Src) == LLT::scalar(16));
5408
5409 // See through bitcasts.
5410 // FIXME: Would be nice to use stripBitCast here.
5411 if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5412 MO = &MI->getOperand(1);
5413 Src = MO->getReg();
5414 MI = getDefIgnoringCopies(Src, *MRI);
5415 }
5416
5417 const auto CheckAbsNeg = [&]() {
5418 // Be careful about folding modifiers if we already have an abs. fneg is
5419 // applied last, so we don't want to apply an earlier fneg.
5420 if ((Mods & SISrcMods::ABS) == 0) {
5421 unsigned ModsTmp;
5422 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
5423 MI = getDefIgnoringCopies(Src, *MRI);
5424
5425 if ((ModsTmp & SISrcMods::NEG) != 0)
5426 Mods ^= SISrcMods::NEG;
5427
5428 if ((ModsTmp & SISrcMods::ABS) != 0)
5429 Mods |= SISrcMods::ABS;
5430 }
5431 };
5432
5433 CheckAbsNeg();
5434
5435 // op_sel/op_sel_hi decide the source type and source.
5436 // If the source's op_sel_hi is set, it indicates to do a conversion from
5437 // fp16. If the source's op_sel is set, it picks the high half of the
5438 // source register.
5439
5440 Mods |= SISrcMods::OP_SEL_1;
5441
5442 if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
5443 Mods |= SISrcMods::OP_SEL_0;
5444 MI = ExtractHiEltMI;
5445 MO = &MI->getOperand(0);
5446 Src = MO->getReg();
5447
5448 CheckAbsNeg();
5449 }
5450
5451 Matched = true;
5452 }
5453
5454 return {Src, Mods};
5455}
5456
5457InstructionSelector::ComplexRendererFns
5458AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5459 MachineOperand &Root) const {
5460 Register Src;
5461 unsigned Mods;
5462 bool Matched;
5463 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5464 if (!Matched)
5465 return {};
5466
5467 return {{
5468 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5469 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5470 }};
5471}
5472
5473InstructionSelector::ComplexRendererFns
5474AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5475 Register Src;
5476 unsigned Mods;
5477 bool Matched;
5478 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5479
5480 return {{
5481 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5482 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5483 }};
5484}
5485
5486bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5487 MachineInstr &I, Intrinsic::ID IntrID) const {
5488 MachineBasicBlock *MBB = I.getParent();
5489 const DebugLoc &DL = I.getDebugLoc();
5490 Register CCReg = I.getOperand(0).getReg();
5491
5492 bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
5493
5494 if (HasM0) {
5495 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5496 .addReg(I.getOperand(2).getReg());
5497 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
5498 if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
5499 return false;
5500 } else {
5501 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5502 .addImm(I.getOperand(2).getImm());
5503 }
5504
5505 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5506
5507 I.eraseFromParent();
5508 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5509 *MRI);
5510}
5511
5512unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5513 if (HasInlineConst) {
5514 switch (IntrID) {
5515 default:
5516 llvm_unreachable("not a named barrier op");
5517 case Intrinsic::amdgcn_s_barrier_init:
5518 return AMDGPU::S_BARRIER_INIT_IMM;
5519 case Intrinsic::amdgcn_s_barrier_join:
5520 return AMDGPU::S_BARRIER_JOIN_IMM;
5521 case Intrinsic::amdgcn_s_wakeup_barrier:
5522 return AMDGPU::S_WAKEUP_BARRIER_IMM;
5523 case Intrinsic::amdgcn_s_get_barrier_state:
5524 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5525 };
5526 } else {
5527 switch (IntrID) {
5528 default:
5529 llvm_unreachable("not a named barrier op");
5530 case Intrinsic::amdgcn_s_barrier_init:
5531 return AMDGPU::S_BARRIER_INIT_M0;
5532 case Intrinsic::amdgcn_s_barrier_join:
5533 return AMDGPU::S_BARRIER_JOIN_M0;
5534 case Intrinsic::amdgcn_s_wakeup_barrier:
5535 return AMDGPU::S_WAKEUP_BARRIER_M0;
5536 case Intrinsic::amdgcn_s_get_barrier_state:
5537 return AMDGPU::S_GET_BARRIER_STATE_M0;
5538 };
5539 }
5540}
5541
5542bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5543 MachineInstr &I, Intrinsic::ID IntrID) const {
5544 MachineBasicBlock *MBB = I.getParent();
5545 const DebugLoc &DL = I.getDebugLoc();
5546 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
5547 ? I.getOperand(2)
5548 : I.getOperand(1);
5549 std::optional<int64_t> BarValImm =
5550 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5551 Register M0Val;
5552 Register TmpReg0;
5553
5554 // For S_BARRIER_INIT, member count will always be read from M0[16:22]
5555 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5556 Register MemberCount = I.getOperand(2).getReg();
5557 TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5558 // TODO: This should be expanded during legalization so that the S_LSHL
5559 // and S_OR can be constant-folded
5560 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
5561 .addImm(16)
5562 .addReg(MemberCount);
5563 M0Val = TmpReg0;
5564 }
5565
5566 // If not inlinable, get a reference to the barrier depending on the instruction
5567 if (!BarValImm) {
5568 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5569 // If reference to barrier id is not an inlinable constant then it must be
5570 // referenced with M0[4:0]. Perform an OR with the member count to include
5571 // it in M0 for S_BARRIER_INIT.
5572 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5573 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
5574 .addReg(BarOp.getReg())
5575 .addReg(TmpReg0);
5576 M0Val = TmpReg1;
5577 } else {
5578 M0Val = BarOp.getReg();
5579 }
5580 }
5581
5582 // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
5583 if (M0Val) {
5584 auto CopyMIB =
5585 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
5586 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5587 }
5588
5589 MachineInstrBuilder MIB;
5590 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5591 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5592
5593 if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
5594 MIB.addDef(I.getOperand(0).getReg());
5595
5596 if (BarValImm)
5597 MIB.addImm(*BarValImm);
5598
5599 I.eraseFromParent();
5600 return true;
5601}
5602
5603bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
5604 MachineBasicBlock *BB = I.getParent();
5605 const DebugLoc &DL = I.getDebugLoc();
5606 Register CCReg = I.getOperand(0).getReg();
5607
5608 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
5609 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5610
5611 I.eraseFromParent();
5612 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5613 *MRI);
5614}
5615
5616void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5617 const MachineInstr &MI,
5618 int OpIdx) const {
5619 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5620 "Expected G_CONSTANT");
5621 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5622}
5623
5624void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5625 const MachineInstr &MI,
5626 int OpIdx) const {
5627 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5628 "Expected G_CONSTANT");
5629 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5630}
5631
5632void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
5633 const MachineInstr &MI,
5634 int OpIdx) const {
5635 assert(OpIdx == -1);
5636
5637 const MachineOperand &Op = MI.getOperand(1);
5638 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
5639 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5640 else {
5641 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
5642 MIB.addImm(Op.getCImm()->getSExtValue());
5643 }
5644}
5645
5646void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5647 const MachineInstr &MI,
5648 int OpIdx) const {
5649 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5650 "Expected G_CONSTANT");
5651 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5652}
5653
5654/// This only really exists to satisfy DAG type checking machinery, so is a
5655/// no-op here.
5656void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
5657 const MachineInstr &MI,
5658 int OpIdx) const {
5659 MIB.addImm(MI.getOperand(OpIdx).getImm());
5660}
5661
5662void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
5663 const MachineInstr &MI,
5664 int OpIdx) const {
5665 assert(OpIdx >= 0 && "expected to match an immediate operand");
5666 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
5667}
5668
5669void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
5670 const MachineInstr &MI,
5671 int OpIdx) const {
5672 assert(OpIdx >= 0 && "expected to match an immediate operand");
5673 MIB.addImm(MI.getOperand(OpIdx).getImm() &
5674             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5675                                       : AMDGPU::CPol::ALL_pregfx12));
5676}
5677
5678void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
5679 const MachineInstr &MI,
5680 int OpIdx) const {
5681 assert(OpIdx >= 0 && "expected to match an immediate operand");
5682 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
5683                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
5684                                                 : AMDGPU::CPol::SWZ_pregfx12);
5685 MIB.addImm(Swizzle);
5686}
5687
5688void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
5689 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
5690 assert(OpIdx >= 0 && "expected to match an immediate operand");
5691 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
5692                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5693                                                  : AMDGPU::CPol::ALL_pregfx12);
5694 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
5695}
5696
5697void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
5698 const MachineInstr &MI,
5699 int OpIdx) const {
5700 MIB.addFrameIndex(MI.getOperand(1).getIndex());
5701}
5702
5703void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
5704 const MachineInstr &MI,
5705 int OpIdx) const {
5706 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
5707 int ExpVal = APF.getExactLog2Abs();
5708 assert(ExpVal != INT_MIN);
5709 MIB.addImm(ExpVal);
5710}
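// For illustration: an FP immediate of 8.0 (or -8.0) renders the exponent 3,
// and 0.5 renders -1, since getExactLog2Abs() returns the exact log2 of the
// absolute value (and the assert rejects non-power-of-two values).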
5711
5712bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
5713 return TII.isInlineConstant(Imm);
5714}
5715
5716bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
5717 return TII.isInlineConstant(Imm);
5718}