1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
59void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
60 CodeGenCoverage *CoverageInfo,
61 ProfileSummaryInfo *PSI,
62 BlockFrequencyInfo *BFI) {
63 MRI = &MF.getRegInfo();
64 Subtarget = &MF.getSubtarget<GCNSubtarget>();
65 Subtarget->checkSubtargetFeatures(MF.getFunction());
66 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
67}
68
69// Return the wave level SGPR base address if this is a wave address.
70static Register getWaveAddress(const MachineInstr *Def) {
71 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
72 ? Def->getOperand(1).getReg()
73 : Register();
74}
75
76bool AMDGPUInstructionSelector::isVCC(Register Reg,
77 const MachineRegisterInfo &MRI) const {
78 // The verifier is oblivious to s1 being a valid value for wavesize registers.
79 if (Reg.isPhysical())
80 return false;
81
82 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
83 const TargetRegisterClass *RC =
84 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
85 if (RC) {
86 const LLT Ty = MRI.getType(Reg);
87 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
88 return false;
89 // G_TRUNC s1 result is never vcc.
90 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
91 RC->hasSuperClassEq(TRI.getBoolRC());
92 }
93
94 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
95 return RB->getID() == AMDGPU::VCCRegBankID;
96}
97
98bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
99 unsigned NewOpc) const {
100 MI.setDesc(TII.get(NewOpc));
101 MI.removeOperand(1); // Remove intrinsic ID.
102 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
103
104 MachineOperand &Dst = MI.getOperand(0);
105 MachineOperand &Src = MI.getOperand(1);
106
107 // TODO: This should be legalized to s32 if needed
108 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
109 return false;
110
111 const TargetRegisterClass *DstRC
112 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
113 const TargetRegisterClass *SrcRC
114 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
115 if (!DstRC || DstRC != SrcRC)
116 return false;
117
118 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
119 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
120}
121
122bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
123 const DebugLoc &DL = I.getDebugLoc();
124 MachineBasicBlock *BB = I.getParent();
125 I.setDesc(TII.get(TargetOpcode::COPY));
126
127 const MachineOperand &Src = I.getOperand(1);
128 MachineOperand &Dst = I.getOperand(0);
129 Register DstReg = Dst.getReg();
130 Register SrcReg = Src.getReg();
131
132 if (isVCC(DstReg, *MRI)) {
133 if (SrcReg == AMDGPU::SCC) {
134 const TargetRegisterClass *RC
135 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
136 if (!RC)
137 return true;
138 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
139 }
140
141 if (!isVCC(SrcReg, *MRI)) {
142 // TODO: Should probably leave the copy and let copyPhysReg expand it.
143 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
144 return false;
145
146 const TargetRegisterClass *SrcRC
147 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
148
149 std::optional<ValueAndVReg> ConstVal =
150 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
151 if (ConstVal) {
152 unsigned MovOpc =
153 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
154 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
155 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
156 } else {
157 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
158
159 // We can't trust the high bits at this point, so clear them.
160
161 // TODO: Skip masking high bits if def is known boolean.
162
163 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
164 assert(Subtarget->useRealTrue16Insts());
165 const int64_t NoMods = 0;
166 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
167 .addImm(NoMods)
168 .addImm(1)
169 .addImm(NoMods)
170 .addReg(SrcReg)
171 .addImm(NoMods);
172 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
173 .addImm(NoMods)
174 .addImm(0)
175 .addImm(NoMods)
176 .addReg(MaskedReg)
177 .addImm(NoMods);
178 } else {
179 bool IsSGPR = TRI.isSGPRClass(SrcRC);
180 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
181 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
182 .addImm(1)
183 .addReg(SrcReg);
184 if (IsSGPR)
185 And.setOperandDead(3); // Dead scc
186
187 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
188 .addImm(0)
189 .addReg(MaskedReg);
190 }
191 }
192
193 if (!MRI->getRegClassOrNull(SrcReg))
194 MRI->setRegClass(SrcReg, SrcRC);
195 I.eraseFromParent();
196 return true;
197 }
198
199 const TargetRegisterClass *RC =
200 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
201 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
202 return false;
203
204 return true;
205 }
206
207 for (const MachineOperand &MO : I.operands()) {
208 if (MO.getReg().isPhysical())
209 continue;
210
211 const TargetRegisterClass *RC =
212 TRI.getConstrainedRegClassForOperand(MO, *MRI);
213 if (!RC)
214 continue;
215 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
216 }
217 return true;
218}
219
220bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
221 const Register DefReg = I.getOperand(0).getReg();
222 const LLT DefTy = MRI->getType(DefReg);
223
224 // S1 G_PHIs should not be selected in instruction-select, instead:
225 // - divergent S1 G_PHI should go through lane mask merging algorithm
226 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
227 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
228 if (DefTy == LLT::scalar(1))
229 return false;
230
231 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
232
233 const RegClassOrRegBank &RegClassOrBank =
234 MRI->getRegClassOrRegBank(DefReg);
235
236 const TargetRegisterClass *DefRC =
237 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
238 if (!DefRC) {
239 if (!DefTy.isValid()) {
240 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
241 return false;
242 }
243
244 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
245 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
246 if (!DefRC) {
247 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
248 return false;
249 }
250 }
251
252 // TODO: Verify that all registers have the same bank
253 I.setDesc(TII.get(TargetOpcode::PHI));
254 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
255}
256
257MachineOperand
258AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
259 const TargetRegisterClass &SubRC,
260 unsigned SubIdx) const {
261
262 MachineInstr *MI = MO.getParent();
263 MachineBasicBlock *BB = MO.getParent()->getParent();
264 Register DstReg = MRI->createVirtualRegister(&SubRC);
265
266 if (MO.isReg()) {
267 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
268 Register Reg = MO.getReg();
269 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
270 .addReg(Reg, 0, ComposedSubIdx);
271
272 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
273 MO.isKill(), MO.isDead(), MO.isUndef(),
274 MO.isEarlyClobber(), 0, MO.isDebug(),
275 MO.isInternalRead());
276 }
277
278 assert(MO.isImm());
279
280 APInt Imm(64, MO.getImm());
281
282 switch (SubIdx) {
283 default:
284 llvm_unreachable("do not know to split immediate with this sub index.");
285 case AMDGPU::sub0:
286 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
287 case AMDGPU::sub1:
288 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
289 }
290}
291
292static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
293 switch (Opc) {
294 case AMDGPU::G_AND:
295 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
296 case AMDGPU::G_OR:
297 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
298 case AMDGPU::G_XOR:
299 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
300 default:
301 llvm_unreachable("not a bit op");
302 }
303}
304
305bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
306 Register DstReg = I.getOperand(0).getReg();
307 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
308
309 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
310 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
311 DstRB->getID() != AMDGPU::VCCRegBankID)
312 return false;
313
314 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
315 STI.isWave64());
316 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
317
318 // Dead implicit-def of scc
319 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
320 true, // isImp
321 false, // isKill
322 true)); // isDead
323 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
324}
325
326bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
327 MachineBasicBlock *BB = I.getParent();
328 MachineFunction *MF = BB->getParent();
329 Register DstReg = I.getOperand(0).getReg();
330 const DebugLoc &DL = I.getDebugLoc();
331 LLT Ty = MRI->getType(DstReg);
332 if (Ty.isVector())
333 return false;
334
335 unsigned Size = Ty.getSizeInBits();
336 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
337 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
338 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
339
340 if (Size == 32) {
341 if (IsSALU) {
342 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
343 MachineInstr *Add =
344 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
345 .add(I.getOperand(1))
346 .add(I.getOperand(2))
347 .setOperandDead(3); // Dead scc
348 I.eraseFromParent();
349 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
350 }
351
352 if (STI.hasAddNoCarry()) {
353 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
354 I.setDesc(TII.get(Opc));
355 I.addOperand(*MF, MachineOperand::CreateImm(0));
356 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
357 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
358 }
359
360 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
361
362 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
363 MachineInstr *Add
364 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
365 .addDef(UnusedCarry, RegState::Dead)
366 .add(I.getOperand(1))
367 .add(I.getOperand(2))
368 .addImm(0);
369 I.eraseFromParent();
370 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
371 }
372
373 assert(!Sub && "illegal sub should not reach here");
374
375 const TargetRegisterClass &RC
376 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
377 const TargetRegisterClass &HalfRC
378 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
379
380 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
381 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
382 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
383 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
384
385 Register DstLo = MRI->createVirtualRegister(&HalfRC);
386 Register DstHi = MRI->createVirtualRegister(&HalfRC);
387
388 if (IsSALU) {
389 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
390 .add(Lo1)
391 .add(Lo2);
392 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
393 .add(Hi1)
394 .add(Hi2)
395 .setOperandDead(3); // Dead scc
396 } else {
397 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
398 Register CarryReg = MRI->createVirtualRegister(CarryRC);
399 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
400 .addDef(CarryReg)
401 .add(Lo1)
402 .add(Lo2)
403 .addImm(0);
404 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
405 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
406 .add(Hi1)
407 .add(Hi2)
408 .addReg(CarryReg, RegState::Kill)
409 .addImm(0);
410
411 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
412 return false;
413 }
414
415 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
416 .addReg(DstLo)
417 .addImm(AMDGPU::sub0)
418 .addReg(DstHi)
419 .addImm(AMDGPU::sub1);
420
421
422 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
423 return false;
424
425 I.eraseFromParent();
426 return true;
427}
428
429bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
430 MachineInstr &I) const {
431 MachineBasicBlock *BB = I.getParent();
432 MachineFunction *MF = BB->getParent();
433 const DebugLoc &DL = I.getDebugLoc();
434 Register Dst0Reg = I.getOperand(0).getReg();
435 Register Dst1Reg = I.getOperand(1).getReg();
436 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
437 I.getOpcode() == AMDGPU::G_UADDE;
438 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
439 I.getOpcode() == AMDGPU::G_USUBE;
440
441 if (isVCC(Dst1Reg, *MRI)) {
442 unsigned NoCarryOpc =
443 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
444 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
445 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
446 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
447 I.addOperand(*MF, MachineOperand::CreateImm(0));
448 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
449 }
450
451 Register Src0Reg = I.getOperand(2).getReg();
452 Register Src1Reg = I.getOperand(3).getReg();
453
454 if (HasCarryIn) {
455 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
456 .addReg(I.getOperand(4).getReg());
457 }
458
459 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
460 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
461
462 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
463 .add(I.getOperand(2))
464 .add(I.getOperand(3));
465
466 if (MRI->use_nodbg_empty(Dst1Reg)) {
467 CarryInst.setOperandDead(3); // Dead scc
468 } else {
469 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
470 .addReg(AMDGPU::SCC);
471 if (!MRI->getRegClassOrNull(Dst1Reg))
472 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
473 }
474
475 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
476 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
477 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
478 return false;
479
480 if (HasCarryIn &&
481 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
482 AMDGPU::SReg_32RegClass, *MRI))
483 return false;
484
485 I.eraseFromParent();
486 return true;
487}
488
489bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
490 MachineInstr &I) const {
491 MachineBasicBlock *BB = I.getParent();
492 MachineFunction *MF = BB->getParent();
493 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
494
495 unsigned Opc;
496 if (Subtarget->hasMADIntraFwdBug())
497 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
498 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
499 else
500 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
501 I.setDesc(TII.get(Opc));
502 I.addOperand(*MF, MachineOperand::CreateImm(0));
503 I.addImplicitDefUseOperands(*MF);
504 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
505}
506
507// TODO: We should probably legalize these to only using 32-bit results.
508bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
509 MachineBasicBlock *BB = I.getParent();
510 Register DstReg = I.getOperand(0).getReg();
511 Register SrcReg = I.getOperand(1).getReg();
512 LLT DstTy = MRI->getType(DstReg);
513 LLT SrcTy = MRI->getType(SrcReg);
514 const unsigned SrcSize = SrcTy.getSizeInBits();
515 unsigned DstSize = DstTy.getSizeInBits();
516
517 // TODO: Should handle any multiple of 32 offset.
518 unsigned Offset = I.getOperand(2).getImm();
519 if (Offset % 32 != 0 || DstSize > 128)
520 return false;
521
522 // 16-bit operations really use 32-bit registers.
523 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
524 if (DstSize == 16)
525 DstSize = 32;
526
527 const TargetRegisterClass *DstRC =
528 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
529 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
530 return false;
531
532 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
533 const TargetRegisterClass *SrcRC =
534 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
535 if (!SrcRC)
536 return false;
537 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
538 DstSize / 32);
539 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
540 if (!SrcRC)
541 return false;
542
543 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
544 *SrcRC, I.getOperand(1));
545 const DebugLoc &DL = I.getDebugLoc();
546 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
547 .addReg(SrcReg, 0, SubReg);
548
549 I.eraseFromParent();
550 return true;
551}
552
553bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
554 MachineBasicBlock *BB = MI.getParent();
555 Register DstReg = MI.getOperand(0).getReg();
556 LLT DstTy = MRI->getType(DstReg);
557 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
558
559 const unsigned SrcSize = SrcTy.getSizeInBits();
560 if (SrcSize < 32)
561 return selectImpl(MI, *CoverageInfo);
562
563 const DebugLoc &DL = MI.getDebugLoc();
564 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
565 const unsigned DstSize = DstTy.getSizeInBits();
566 const TargetRegisterClass *DstRC =
567 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
568 if (!DstRC)
569 return false;
570
571 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
572 MachineInstrBuilder MIB =
573 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
574 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
575 MachineOperand &Src = MI.getOperand(I + 1);
576 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
577 MIB.addImm(SubRegs[I]);
578
579 const TargetRegisterClass *SrcRC
580 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
581 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
582 return false;
583 }
584
585 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
586 return false;
587
588 MI.eraseFromParent();
589 return true;
590}
591
592bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
593 MachineBasicBlock *BB = MI.getParent();
594 const int NumDst = MI.getNumOperands() - 1;
595
596 MachineOperand &Src = MI.getOperand(NumDst);
597
598 Register SrcReg = Src.getReg();
599 Register DstReg0 = MI.getOperand(0).getReg();
600 LLT DstTy = MRI->getType(DstReg0);
601 LLT SrcTy = MRI->getType(SrcReg);
602
603 const unsigned DstSize = DstTy.getSizeInBits();
604 const unsigned SrcSize = SrcTy.getSizeInBits();
605 const DebugLoc &DL = MI.getDebugLoc();
606 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
607
608 const TargetRegisterClass *SrcRC =
609 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
610 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
611 return false;
612
613 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
614 // source, and this relies on the fact that the same subregister indices are
615 // used for both.
616 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
617 for (int I = 0, E = NumDst; I != E; ++I) {
618 MachineOperand &Dst = MI.getOperand(I);
619 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
620 .addReg(SrcReg, 0, SubRegs[I]);
621
622 // Make sure the subregister index is valid for the source register.
623 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
624 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
625 return false;
626
627 const TargetRegisterClass *DstRC =
628 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
629 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
630 return false;
631 }
632
633 MI.eraseFromParent();
634 return true;
635}
636
637bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
638 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
639 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
640
641 Register Src0 = MI.getOperand(1).getReg();
642 Register Src1 = MI.getOperand(2).getReg();
643 LLT SrcTy = MRI->getType(Src0);
644 const unsigned SrcSize = SrcTy.getSizeInBits();
645
646 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
647 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
648 return selectG_MERGE_VALUES(MI);
649 }
650
651 // Selection logic below is for V2S16 only.
652 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
653 Register Dst = MI.getOperand(0).getReg();
654 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
655 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
656 SrcTy != LLT::scalar(32)))
657 return selectImpl(MI, *CoverageInfo);
658
659 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
660 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
661 return false;
662
663 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
664 DstBank->getID() == AMDGPU::VGPRRegBankID);
665 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
666
667 const DebugLoc &DL = MI.getDebugLoc();
668 MachineBasicBlock *BB = MI.getParent();
669
670 // First, before trying TableGen patterns, check if both sources are
671 // constants. In those cases, we can trivially compute the final constant
672 // and emit a simple move.
673 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
674 if (ConstSrc1) {
675 auto ConstSrc0 =
676 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
677 if (ConstSrc0) {
678 const int64_t K0 = ConstSrc0->Value.getSExtValue();
679 const int64_t K1 = ConstSrc1->Value.getSExtValue();
680 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
681 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
682 uint32_t Imm = Lo16 | (Hi16 << 16);
683
684 // VALU
685 if (IsVector) {
686 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
687 MI.eraseFromParent();
688 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
689 }
690
691 // SALU
692 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
693 MI.eraseFromParent();
694 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
695 }
696 }
697
698 // Now try TableGen patterns.
699 if (selectImpl(MI, *CoverageInfo))
700 return true;
701
702 // TODO: This should probably be a combine somewhere
703 // (build_vector $src0, undef) -> copy $src0
704 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
705 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
706 MI.setDesc(TII.get(AMDGPU::COPY));
707 MI.removeOperand(2);
708 const auto &RC =
709 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
710 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
711 RBI.constrainGenericRegister(Src0, RC, *MRI);
712 }
713
714 // TODO: Can be improved?
715 if (IsVector) {
716 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
717 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
718 .addImm(0xFFFF)
719 .addReg(Src0);
720 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
721 return false;
722
723 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
724 .addReg(Src1)
725 .addImm(16)
726 .addReg(TmpReg);
727 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
728 return false;
729
730 MI.eraseFromParent();
731 return true;
732 }
733
734 Register ShiftSrc0;
735 Register ShiftSrc1;
736
737 // With multiple uses of the shift, this will duplicate the shift and
738 // increase register pressure.
739 //
740 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
741 // => (S_PACK_HH_B32_B16 $src0, $src1)
742 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
743 // => (S_PACK_HL_B32_B16 $src0, $src1)
744 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
745 // => (S_PACK_LH_B32_B16 $src0, $src1)
746 // (build_vector $src0, $src1)
747 // => (S_PACK_LL_B32_B16 $src0, $src1)
748
749 bool Shift0 = mi_match(
750 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
751
752 bool Shift1 = mi_match(
753 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
754
755 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
756 if (Shift0 && Shift1) {
757 Opc = AMDGPU::S_PACK_HH_B32_B16;
758 MI.getOperand(1).setReg(ShiftSrc0);
759 MI.getOperand(2).setReg(ShiftSrc1);
760 } else if (Shift1) {
761 Opc = AMDGPU::S_PACK_LH_B32_B16;
762 MI.getOperand(2).setReg(ShiftSrc1);
763 } else if (Shift0) {
764 auto ConstSrc1 =
765 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
766 if (ConstSrc1 && ConstSrc1->Value == 0) {
767 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
768 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
769 .addReg(ShiftSrc0)
770 .addImm(16)
771 .setOperandDead(3); // Dead scc
772
773 MI.eraseFromParent();
774 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
775 }
776 if (STI.hasSPackHL()) {
777 Opc = AMDGPU::S_PACK_HL_B32_B16;
778 MI.getOperand(1).setReg(ShiftSrc0);
779 }
780 }
781
782 MI.setDesc(TII.get(Opc));
783 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
784}
785
786bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
787 const MachineOperand &MO = I.getOperand(0);
788
789 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
790 // regbank check here is to know why getConstrainedRegClassForOperand failed.
791 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
792 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
793 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
794 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
795 return true;
796 }
797
798 return false;
799}
800
801bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
802 MachineBasicBlock *BB = I.getParent();
803
804 Register DstReg = I.getOperand(0).getReg();
805 Register Src0Reg = I.getOperand(1).getReg();
806 Register Src1Reg = I.getOperand(2).getReg();
807 LLT Src1Ty = MRI->getType(Src1Reg);
808
809 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
810 unsigned InsSize = Src1Ty.getSizeInBits();
811
812 int64_t Offset = I.getOperand(3).getImm();
813
814 // FIXME: These cases should have been illegal and unnecessary to check here.
815 if (Offset % 32 != 0 || InsSize % 32 != 0)
816 return false;
817
818 // Currently not handled by getSubRegFromChannel.
819 if (InsSize > 128)
820 return false;
821
822 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
823 if (SubReg == AMDGPU::NoSubRegister)
824 return false;
825
826 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
827 const TargetRegisterClass *DstRC =
828 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
829 if (!DstRC)
830 return false;
831
832 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
833 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
834 const TargetRegisterClass *Src0RC =
835 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
836 const TargetRegisterClass *Src1RC =
837 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
838
839 // Deal with weird cases where the class only partially supports the subreg
840 // index.
841 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
842 if (!Src0RC || !Src1RC)
843 return false;
844
845 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
846 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
847 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
848 return false;
849
850 const DebugLoc &DL = I.getDebugLoc();
851 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
852 .addReg(Src0Reg)
853 .addReg(Src1Reg)
854 .addImm(SubReg);
855
856 I.eraseFromParent();
857 return true;
858}
859
860bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
861 Register DstReg = MI.getOperand(0).getReg();
862 Register SrcReg = MI.getOperand(1).getReg();
863 Register OffsetReg = MI.getOperand(2).getReg();
864 Register WidthReg = MI.getOperand(3).getReg();
865
866 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
867 "scalar BFX instructions are expanded in regbankselect");
868 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
869 "64-bit vector BFX instructions are expanded in regbankselect");
870
871 const DebugLoc &DL = MI.getDebugLoc();
872 MachineBasicBlock *MBB = MI.getParent();
873
874 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
875 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
876 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
877 .addReg(SrcReg)
878 .addReg(OffsetReg)
879 .addReg(WidthReg);
880 MI.eraseFromParent();
881 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
882}
883
884bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
885 if (STI.getLDSBankCount() != 16)
886 return selectImpl(MI, *CoverageInfo);
887
888 Register Dst = MI.getOperand(0).getReg();
889 Register Src0 = MI.getOperand(2).getReg();
890 Register M0Val = MI.getOperand(6).getReg();
891 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
892 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
893 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
894 return false;
895
896 // This requires 2 instructions. It is possible to write a pattern to support
897 // this, but the generated isel emitter doesn't correctly deal with multiple
898 // output instructions using the same physical register input. The copy to m0
899 // is incorrectly placed before the second instruction.
900 //
901 // TODO: Match source modifiers.
902
903 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
904 const DebugLoc &DL = MI.getDebugLoc();
905 MachineBasicBlock *MBB = MI.getParent();
906
907 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
908 .addReg(M0Val);
909 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
910 .addImm(2)
911 .addImm(MI.getOperand(4).getImm()) // $attr
912 .addImm(MI.getOperand(3).getImm()); // $attrchan
913
914 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
915 .addImm(0) // $src0_modifiers
916 .addReg(Src0) // $src0
917 .addImm(MI.getOperand(4).getImm()) // $attr
918 .addImm(MI.getOperand(3).getImm()) // $attrchan
919 .addImm(0) // $src2_modifiers
920 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
921 .addImm(MI.getOperand(5).getImm()) // $high
922 .addImm(0) // $clamp
923 .addImm(0); // $omod
924
925 MI.eraseFromParent();
926 return true;
927}
928
929// Writelane is special in that it can use SGPR and M0 (which would normally
930// count as using the constant bus twice - but in this case it is allowed since
931// the lane selector doesn't count as a use of the constant bus). However, it is
932// still required to abide by the 1 SGPR rule. Fix this up if we might have
933// multiple SGPRs.
934bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
935 // With a constant bus limit of at least 2, there's no issue.
936 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
937 return selectImpl(MI, *CoverageInfo);
938
939 MachineBasicBlock *MBB = MI.getParent();
940 const DebugLoc &DL = MI.getDebugLoc();
941 Register VDst = MI.getOperand(0).getReg();
942 Register Val = MI.getOperand(2).getReg();
943 Register LaneSelect = MI.getOperand(3).getReg();
944 Register VDstIn = MI.getOperand(4).getReg();
945
946 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
947
948 std::optional<ValueAndVReg> ConstSelect =
949 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
950 if (ConstSelect) {
951 // The selector has to be an inline immediate, so we can use whatever for
952 // the other operands.
953 MIB.addReg(Val);
954 MIB.addImm(ConstSelect->Value.getSExtValue() &
955 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
956 } else {
957 std::optional<ValueAndVReg> ConstVal =
958 getIConstantVRegValWithLookThrough(Val, *MRI);
959
960 // If the value written is an inline immediate, we can get away without a
961 // copy to m0.
962 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
963 STI.hasInv2PiInlineImm())) {
964 MIB.addImm(ConstVal->Value.getSExtValue());
965 MIB.addReg(LaneSelect);
966 } else {
967 MIB.addReg(Val);
968
969 // If the lane selector was originally in a VGPR and copied with
970 // readfirstlane, there's a hazard to read the same SGPR from the
971 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
972 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
973
974 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
975 .addReg(LaneSelect);
976 MIB.addReg(AMDGPU::M0);
977 }
978 }
979
980 MIB.addReg(VDstIn);
981
982 MI.eraseFromParent();
983 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
984}
985
986// We need to handle this here because tablegen doesn't support matching
987// instructions with multiple outputs.
988bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
989 Register Dst0 = MI.getOperand(0).getReg();
990 Register Dst1 = MI.getOperand(1).getReg();
991
992 LLT Ty = MRI->getType(Dst0);
993 unsigned Opc;
994 if (Ty == LLT::scalar(32))
995 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
996 else if (Ty == LLT::scalar(64))
997 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
998 else
999 return false;
1000
1001 // TODO: Match source modifiers.
1002
1003 const DebugLoc &DL = MI.getDebugLoc();
1004 MachineBasicBlock *MBB = MI.getParent();
1005
1006 Register Numer = MI.getOperand(3).getReg();
1007 Register Denom = MI.getOperand(4).getReg();
1008 unsigned ChooseDenom = MI.getOperand(5).getImm();
1009
1010 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1011
1012 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1013 .addDef(Dst1)
1014 .addImm(0) // $src0_modifiers
1015 .addUse(Src0) // $src0
1016 .addImm(0) // $src1_modifiers
1017 .addUse(Denom) // $src1
1018 .addImm(0) // $src2_modifiers
1019 .addUse(Numer) // $src2
1020 .addImm(0) // $clamp
1021 .addImm(0); // $omod
1022
1023 MI.eraseFromParent();
1024 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1025}
1026
1027bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1028 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1029 switch (IntrinsicID) {
1030 case Intrinsic::amdgcn_if_break: {
1031 MachineBasicBlock *BB = I.getParent();
1032
1033 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1034 // SelectionDAG uses for wave32 vs wave64.
1035 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1036 .add(I.getOperand(0))
1037 .add(I.getOperand(2))
1038 .add(I.getOperand(3));
1039
1040 Register DstReg = I.getOperand(0).getReg();
1041 Register Src0Reg = I.getOperand(2).getReg();
1042 Register Src1Reg = I.getOperand(3).getReg();
1043
1044 I.eraseFromParent();
1045
1046 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1047 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1048
1049 return true;
1050 }
1051 case Intrinsic::amdgcn_interp_p1_f16:
1052 return selectInterpP1F16(I);
1053 case Intrinsic::amdgcn_wqm:
1054 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1055 case Intrinsic::amdgcn_softwqm:
1056 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1057 case Intrinsic::amdgcn_strict_wwm:
1058 case Intrinsic::amdgcn_wwm:
1059 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1060 case Intrinsic::amdgcn_strict_wqm:
1061 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1062 case Intrinsic::amdgcn_writelane:
1063 return selectWritelane(I);
1064 case Intrinsic::amdgcn_div_scale:
1065 return selectDivScale(I);
1066 case Intrinsic::amdgcn_icmp:
1067 case Intrinsic::amdgcn_fcmp:
1068 if (selectImpl(I, *CoverageInfo))
1069 return true;
1070 return selectIntrinsicCmp(I);
1071 case Intrinsic::amdgcn_ballot:
1072 return selectBallot(I);
1073 case Intrinsic::amdgcn_reloc_constant:
1074 return selectRelocConstant(I);
1075 case Intrinsic::amdgcn_groupstaticsize:
1076 return selectGroupStaticSize(I);
1077 case Intrinsic::returnaddress:
1078 return selectReturnAddress(I);
1079 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1080 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1081 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1082 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1083 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1084 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1085 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1086 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1087 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1088 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1089 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1090 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1091 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1092 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1093 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1094 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1095 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1096 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1097 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1098 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1099 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1100 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1101 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1102 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1103 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1104 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1105 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1106 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1107 return selectSMFMACIntrin(I);
1108 case Intrinsic::amdgcn_permlane16_swap:
1109 case Intrinsic::amdgcn_permlane32_swap:
1110 return selectPermlaneSwapIntrin(I, IntrinsicID);
1111 default:
1112 return selectImpl(I, *CoverageInfo);
1113 }
1114}
1115
1116static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1117 const GCNSubtarget &ST) {
1118 if (Size != 16 && Size != 32 && Size != 64)
1119 return -1;
1120
1121 if (Size == 16 && !ST.has16BitInsts())
1122 return -1;
1123
1124 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1125 unsigned FakeS16Opc, unsigned S32Opc,
1126 unsigned S64Opc) {
1127 if (Size == 16)
1128 // FIXME-TRUE16 use TrueS16Opc when realtrue16 is supported for CMP code
1129 return ST.hasTrue16BitInsts()
1130 ? ST.useRealTrue16Insts() ? FakeS16Opc : FakeS16Opc
1131 : S16Opc;
1132 if (Size == 32)
1133 return S32Opc;
1134 return S64Opc;
1135 };
1136
1137 switch (P) {
1138 default:
1139 llvm_unreachable("Unknown condition code!");
1140 case CmpInst::ICMP_NE:
1141 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1142 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1143 AMDGPU::V_CMP_NE_U64_e64);
1144 case CmpInst::ICMP_EQ:
1145 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1146 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1147 AMDGPU::V_CMP_EQ_U64_e64);
1148 case CmpInst::ICMP_SGT:
1149 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1150 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1151 AMDGPU::V_CMP_GT_I64_e64);
1152 case CmpInst::ICMP_SGE:
1153 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1154 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1155 AMDGPU::V_CMP_GE_I64_e64);
1156 case CmpInst::ICMP_SLT:
1157 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1158 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1159 AMDGPU::V_CMP_LT_I64_e64);
1160 case CmpInst::ICMP_SLE:
1161 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1162 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1163 AMDGPU::V_CMP_LE_I64_e64);
1164 case CmpInst::ICMP_UGT:
1165 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1166 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1167 AMDGPU::V_CMP_GT_U64_e64);
1168 case CmpInst::ICMP_UGE:
1169 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1170 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1171 AMDGPU::V_CMP_GE_U64_e64);
1172 case CmpInst::ICMP_ULT:
1173 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1174 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1175 AMDGPU::V_CMP_LT_U64_e64);
1176 case CmpInst::ICMP_ULE:
1177 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1178 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1179 AMDGPU::V_CMP_LE_U64_e64);
1180
1181 case CmpInst::FCMP_OEQ:
1182 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1183 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1184 AMDGPU::V_CMP_EQ_F64_e64);
1185 case CmpInst::FCMP_OGT:
1186 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1187 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1188 AMDGPU::V_CMP_GT_F64_e64);
1189 case CmpInst::FCMP_OGE:
1190 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1191 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1192 AMDGPU::V_CMP_GE_F64_e64);
1193 case CmpInst::FCMP_OLT:
1194 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1195 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1196 AMDGPU::V_CMP_LT_F64_e64);
1197 case CmpInst::FCMP_OLE:
1198 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1199 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1200 AMDGPU::V_CMP_LE_F64_e64);
1201 case CmpInst::FCMP_ONE:
1202 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1203 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1204 AMDGPU::V_CMP_NEQ_F64_e64);
1205 case CmpInst::FCMP_ORD:
1206 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1207 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1208 AMDGPU::V_CMP_O_F64_e64);
1209 case CmpInst::FCMP_UNO:
1210 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1211 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1212 AMDGPU::V_CMP_U_F64_e64);
1213 case CmpInst::FCMP_UEQ:
1214 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1215 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1216 AMDGPU::V_CMP_NLG_F64_e64);
1217 case CmpInst::FCMP_UGT:
1218 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1219 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1220 AMDGPU::V_CMP_NLE_F64_e64);
1221 case CmpInst::FCMP_UGE:
1222 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1223 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1224 AMDGPU::V_CMP_NLT_F64_e64);
1225 case CmpInst::FCMP_ULT:
1226 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1227 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1228 AMDGPU::V_CMP_NGE_F64_e64);
1229 case CmpInst::FCMP_ULE:
1230 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1231 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1232 AMDGPU::V_CMP_NGT_F64_e64);
1233 case CmpInst::FCMP_UNE:
1234 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1235 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1236 AMDGPU::V_CMP_NEQ_F64_e64);
1237 case CmpInst::FCMP_TRUE:
1238 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1239 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1240 AMDGPU::V_CMP_TRU_F64_e64);
1241 case CmpInst::FCMP_FALSE:
1242 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1243 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1244 AMDGPU::V_CMP_F_F64_e64);
1245 }
1246}
1247
1248int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1249 unsigned Size) const {
1250 if (Size == 64) {
1251 if (!STI.hasScalarCompareEq64())
1252 return -1;
1253
1254 switch (P) {
1255 case CmpInst::ICMP_NE:
1256 return AMDGPU::S_CMP_LG_U64;
1257 case CmpInst::ICMP_EQ:
1258 return AMDGPU::S_CMP_EQ_U64;
1259 default:
1260 return -1;
1261 }
1262 }
1263
1264 if (Size == 32) {
1265 switch (P) {
1266 case CmpInst::ICMP_NE:
1267 return AMDGPU::S_CMP_LG_U32;
1268 case CmpInst::ICMP_EQ:
1269 return AMDGPU::S_CMP_EQ_U32;
1270 case CmpInst::ICMP_SGT:
1271 return AMDGPU::S_CMP_GT_I32;
1272 case CmpInst::ICMP_SGE:
1273 return AMDGPU::S_CMP_GE_I32;
1274 case CmpInst::ICMP_SLT:
1275 return AMDGPU::S_CMP_LT_I32;
1276 case CmpInst::ICMP_SLE:
1277 return AMDGPU::S_CMP_LE_I32;
1278 case CmpInst::ICMP_UGT:
1279 return AMDGPU::S_CMP_GT_U32;
1280 case CmpInst::ICMP_UGE:
1281 return AMDGPU::S_CMP_GE_U32;
1282 case CmpInst::ICMP_ULT:
1283 return AMDGPU::S_CMP_LT_U32;
1284 case CmpInst::ICMP_ULE:
1285 return AMDGPU::S_CMP_LE_U32;
1286 case CmpInst::FCMP_OEQ:
1287 return AMDGPU::S_CMP_EQ_F32;
1288 case CmpInst::FCMP_OGT:
1289 return AMDGPU::S_CMP_GT_F32;
1290 case CmpInst::FCMP_OGE:
1291 return AMDGPU::S_CMP_GE_F32;
1292 case CmpInst::FCMP_OLT:
1293 return AMDGPU::S_CMP_LT_F32;
1294 case CmpInst::FCMP_OLE:
1295 return AMDGPU::S_CMP_LE_F32;
1296 case CmpInst::FCMP_ONE:
1297 return AMDGPU::S_CMP_LG_F32;
1298 case CmpInst::FCMP_ORD:
1299 return AMDGPU::S_CMP_O_F32;
1300 case CmpInst::FCMP_UNO:
1301 return AMDGPU::S_CMP_U_F32;
1302 case CmpInst::FCMP_UEQ:
1303 return AMDGPU::S_CMP_NLG_F32;
1304 case CmpInst::FCMP_UGT:
1305 return AMDGPU::S_CMP_NLE_F32;
1306 case CmpInst::FCMP_UGE:
1307 return AMDGPU::S_CMP_NLT_F32;
1308 case CmpInst::FCMP_ULT:
1309 return AMDGPU::S_CMP_NGE_F32;
1310 case CmpInst::FCMP_ULE:
1311 return AMDGPU::S_CMP_NGT_F32;
1312 case CmpInst::FCMP_UNE:
1313 return AMDGPU::S_CMP_NEQ_F32;
1314 default:
1315 llvm_unreachable("Unknown condition code!");
1316 }
1317 }
1318
1319 if (Size == 16) {
1320 if (!STI.hasSALUFloatInsts())
1321 return -1;
1322
1323 switch (P) {
1324 case CmpInst::FCMP_OEQ:
1325 return AMDGPU::S_CMP_EQ_F16;
1326 case CmpInst::FCMP_OGT:
1327 return AMDGPU::S_CMP_GT_F16;
1328 case CmpInst::FCMP_OGE:
1329 return AMDGPU::S_CMP_GE_F16;
1330 case CmpInst::FCMP_OLT:
1331 return AMDGPU::S_CMP_LT_F16;
1332 case CmpInst::FCMP_OLE:
1333 return AMDGPU::S_CMP_LE_F16;
1334 case CmpInst::FCMP_ONE:
1335 return AMDGPU::S_CMP_LG_F16;
1336 case CmpInst::FCMP_ORD:
1337 return AMDGPU::S_CMP_O_F16;
1338 case CmpInst::FCMP_UNO:
1339 return AMDGPU::S_CMP_U_F16;
1340 case CmpInst::FCMP_UEQ:
1341 return AMDGPU::S_CMP_NLG_F16;
1342 case CmpInst::FCMP_UGT:
1343 return AMDGPU::S_CMP_NLE_F16;
1344 case CmpInst::FCMP_UGE:
1345 return AMDGPU::S_CMP_NLT_F16;
1346 case CmpInst::FCMP_ULT:
1347 return AMDGPU::S_CMP_NGE_F16;
1348 case CmpInst::FCMP_ULE:
1349 return AMDGPU::S_CMP_NGT_F16;
1350 case CmpInst::FCMP_UNE:
1351 return AMDGPU::S_CMP_NEQ_F16;
1352 default:
1353 llvm_unreachable("Unknown condition code!");
1354 }
1355 }
1356
1357 return -1;
1358}
1359
1360bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1361
1362 MachineBasicBlock *BB = I.getParent();
1363 const DebugLoc &DL = I.getDebugLoc();
1364
1365 Register SrcReg = I.getOperand(2).getReg();
1366 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1367
1368 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1369
1370 Register CCReg = I.getOperand(0).getReg();
1371 if (!isVCC(CCReg, *MRI)) {
1372 int Opcode = getS_CMPOpcode(Pred, Size);
1373 if (Opcode == -1)
1374 return false;
1375 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1376 .add(I.getOperand(2))
1377 .add(I.getOperand(3));
1378 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1379 .addReg(AMDGPU::SCC);
1380 bool Ret =
1381 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1382 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1383 I.eraseFromParent();
1384 return Ret;
1385 }
1386
1387 if (I.getOpcode() == AMDGPU::G_FCMP)
1388 return false;
1389
1390 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1391 if (Opcode == -1)
1392 return false;
1393
1394 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1395 I.getOperand(0).getReg())
1396 .add(I.getOperand(2))
1397 .add(I.getOperand(3));
1398 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1399 *TRI.getBoolRC(), *MRI);
1400 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1401 I.eraseFromParent();
1402 return Ret;
1403}
1404
1405bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1406 Register Dst = I.getOperand(0).getReg();
1407 if (isVCC(Dst, *MRI))
1408 return false;
1409
1410 LLT DstTy = MRI->getType(Dst);
1411 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1412 return false;
1413
1414 MachineBasicBlock *BB = I.getParent();
1415 const DebugLoc &DL = I.getDebugLoc();
1416 Register SrcReg = I.getOperand(2).getReg();
1417 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1418
1419 // i1 inputs are not supported in GlobalISel.
1420 if (Size == 1)
1421 return false;
1422
1423 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1424 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1425 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1426 I.eraseFromParent();
1427 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1428 }
1429
1430 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1431 if (Opcode == -1)
1432 return false;
1433
1434 MachineInstrBuilder SelectedMI;
1435 MachineOperand &LHS = I.getOperand(2);
1436 MachineOperand &RHS = I.getOperand(3);
1437 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1438 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1439 Register Src0Reg =
1440 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1441 Register Src1Reg =
1442 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1443 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1444 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1445 SelectedMI.addImm(Src0Mods);
1446 SelectedMI.addReg(Src0Reg);
1447 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1448 SelectedMI.addImm(Src1Mods);
1449 SelectedMI.addReg(Src1Reg);
1450 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1451 SelectedMI.addImm(0); // clamp
1452 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1453 SelectedMI.addImm(0); // op_sel
1454
1455 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1456 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1457 return false;
1458
1459 I.eraseFromParent();
1460 return true;
1461}
1462
1463// Ballot has to zero the bits in the input lane mask that are zero in the
1464// current exec; this is done as an AND with exec. For inputs produced by an
1465// instruction that implicitly uses the same exec (e.g. a compare in the same
1466// basic block, or an SCC-to-VCC copy), a plain COPY is used instead.
1467static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1468 MachineBasicBlock *MBB) {
1469 MachineInstr *MI = MRI.getVRegDef(Reg);
1470 if (MI->getParent() != MBB)
1471 return false;
1472
1473 // Lane mask generated by SCC to VCC copy.
1474 if (MI->getOpcode() == AMDGPU::COPY) {
1475 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1476 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1477 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1478 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1479 return true;
1480 }
1481
1482 // Lane mask generated using compare with same exec.
1483 if (isa<GAnyCmp>(MI))
1484 return true;
1485
1486 Register LHS, RHS;
1487 // Look through AND.
1488 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1489 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1490 isLaneMaskFromSameBlock(RHS, MRI, MBB);
1491
1492 return false;
1493}
1494
1495bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1496 MachineBasicBlock *BB = I.getParent();
1497 const DebugLoc &DL = I.getDebugLoc();
1498 Register DstReg = I.getOperand(0).getReg();
1499 Register SrcReg = I.getOperand(2).getReg();
1500 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1501 const unsigned WaveSize = STI.getWavefrontSize();
1502
1503 // In the common case, the return type matches the wave size.
1504 // However we also support emitting i64 ballots in wave32 mode.
1505 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1506 return false;
1507
1508 std::optional<ValueAndVReg> Arg =
1509 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1510
1511 Register Dst = DstReg;
1512 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1513 if (BallotSize != WaveSize) {
1514 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1515 }
1516
1517 if (Arg) {
1518 const int64_t Value = Arg->Value.getZExtValue();
1519 if (Value == 0) {
1520 // Dst = S_MOV 0
1521 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1522 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1523 } else {
1524 // Dst = COPY EXEC
1525 assert(Value == 1);
1526 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1527 }
1528 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1529 return false;
1530 } else {
1531 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1532 // Dst = COPY SrcReg
1533 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1534 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1535 return false;
1536 } else {
1537 // Dst = S_AND SrcReg, EXEC
1538 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1539 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1540 .addReg(SrcReg)
1541 .addReg(TRI.getExec())
1542 .setOperandDead(3); // Dead scc
1543 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1544 return false;
1545 }
1546 }
1547
1548 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1549 if (BallotSize != WaveSize) {
1550 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1551 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1552 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1553 .addReg(Dst)
1554 .addImm(AMDGPU::sub0)
1555 .addReg(HiReg)
1556 .addImm(AMDGPU::sub1);
1557 }
1558
1559 I.eraseFromParent();
1560 return true;
1561}
1562
1563bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1564 Register DstReg = I.getOperand(0).getReg();
1565 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1566 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1567 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1568 return false;
1569
1570 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1571
1572 Module *M = MF->getFunction().getParent();
1573 const MDNode *Metadata = I.getOperand(2).getMetadata();
1574 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1575 auto *RelocSymbol = cast<GlobalVariable>(
1576 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1577
1578 MachineBasicBlock *BB = I.getParent();
1579 BuildMI(*BB, &I, I.getDebugLoc(),
1580 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1581 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1582
1583 I.eraseFromParent();
1584 return true;
1585}
1586
1587bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1588 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1589
1590 Register DstReg = I.getOperand(0).getReg();
1591 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1592 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1593 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1594
1595 MachineBasicBlock *MBB = I.getParent();
1596 const DebugLoc &DL = I.getDebugLoc();
1597
1598 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1599
1600 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1601 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1602 MIB.addImm(MFI->getLDSSize());
1603 } else {
1604 Module *M = MF->getFunction().getParent();
1605 const GlobalValue *GV =
1606 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1607 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1608 }
1609
1610 I.eraseFromParent();
1611 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1612}
1613
1614bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1615 MachineBasicBlock *MBB = I.getParent();
1616 MachineFunction &MF = *MBB->getParent();
1617 const DebugLoc &DL = I.getDebugLoc();
1618
1619 MachineOperand &Dst = I.getOperand(0);
1620 Register DstReg = Dst.getReg();
1621 unsigned Depth = I.getOperand(2).getImm();
1622
1623 const TargetRegisterClass *RC
1624 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1625 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1626 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1627 return false;
1628
1629 // Check for kernel and shader functions
1630 if (Depth != 0 ||
1631 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1632 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1633 .addImm(0);
1634 I.eraseFromParent();
1635 return true;
1636 }
1637
1638 MachineFrameInfo &MFI = MF.getFrameInfo();
1639 // There is a call to @llvm.returnaddress in this function
1640 MFI.setReturnAddressIsTaken(true);
1641
1642 // Get the return address reg and mark it as an implicit live-in
1643 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1644 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1645 AMDGPU::SReg_64RegClass, DL);
1646 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1647 .addReg(LiveIn);
1648 I.eraseFromParent();
1649 return true;
1650}
1651
1652bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1653 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1654 // SelectionDAG uses for wave32 vs wave64.
1655 MachineBasicBlock *BB = MI.getParent();
1656 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1657 .add(MI.getOperand(1));
1658
1659 Register Reg = MI.getOperand(1).getReg();
1660 MI.eraseFromParent();
1661
1662 if (!MRI->getRegClassOrNull(Reg))
1663 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1664 return true;
1665}
1666
1667bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1668 MachineInstr &MI, Intrinsic::ID IntrID) const {
1669 MachineBasicBlock *MBB = MI.getParent();
1670 MachineFunction *MF = MBB->getParent();
1671 const DebugLoc &DL = MI.getDebugLoc();
1672
1673 unsigned IndexOperand = MI.getOperand(7).getImm();
1674 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1675 bool WaveDone = MI.getOperand(9).getImm() != 0;
1676
1677 if (WaveDone && !WaveRelease)
1678 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1679
1680 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1681 IndexOperand &= ~0x3f;
1682 unsigned CountDw = 0;
1683
1684 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1685 CountDw = (IndexOperand >> 24) & 0xf;
1686 IndexOperand &= ~(0xf << 24);
1687
1688 if (CountDw < 1 || CountDw > 4) {
1690 "ds_ordered_count: dword count must be between 1 and 4");
1691 }
1692 }
1693
1694 if (IndexOperand)
1695 report_fatal_error("ds_ordered_count: bad index operand");
1696
1697 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1698 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1699
1700 unsigned Offset0 = OrderedCountIndex << 2;
1701 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1702
1703 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1704 Offset1 |= (CountDw - 1) << 6;
1705
1706 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1707 Offset1 |= ShaderType << 2;
1708
1709 unsigned Offset = Offset0 | (Offset1 << 8);
1710
1711 Register M0Val = MI.getOperand(2).getReg();
1712 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1713 .addReg(M0Val);
1714
1715 Register DstReg = MI.getOperand(0).getReg();
1716 Register ValReg = MI.getOperand(3).getReg();
1717 MachineInstrBuilder DS =
1718 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1719 .addReg(ValReg)
1720 .addImm(Offset)
1721 .cloneMemRefs(MI);
1722
1723 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1724 return false;
1725
1726 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1727 MI.eraseFromParent();
1728 return Ret;
1729}
1730
1731static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1732 switch (IntrID) {
1733 case Intrinsic::amdgcn_ds_gws_init:
1734 return AMDGPU::DS_GWS_INIT;
1735 case Intrinsic::amdgcn_ds_gws_barrier:
1736 return AMDGPU::DS_GWS_BARRIER;
1737 case Intrinsic::amdgcn_ds_gws_sema_v:
1738 return AMDGPU::DS_GWS_SEMA_V;
1739 case Intrinsic::amdgcn_ds_gws_sema_br:
1740 return AMDGPU::DS_GWS_SEMA_BR;
1741 case Intrinsic::amdgcn_ds_gws_sema_p:
1742 return AMDGPU::DS_GWS_SEMA_P;
1743 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1744 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1745 default:
1746 llvm_unreachable("not a gws intrinsic");
1747 }
1748}
1749
1750bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1751 Intrinsic::ID IID) const {
1752 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1753 !STI.hasGWSSemaReleaseAll()))
1754 return false;
1755
1756 // intrinsic ID, vsrc, offset
1757 const bool HasVSrc = MI.getNumOperands() == 3;
1758 assert(HasVSrc || MI.getNumOperands() == 2);
1759
1760 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1761 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1762 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1763 return false;
1764
1765 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1766 unsigned ImmOffset;
1767
1768 MachineBasicBlock *MBB = MI.getParent();
1769 const DebugLoc &DL = MI.getDebugLoc();
1770
1771 MachineInstr *Readfirstlane = nullptr;
1772
1773 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1774 // incoming offset, in case there's an add of a constant. We'll have to put it
1775 // back later.
1776 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1777 Readfirstlane = OffsetDef;
1778 BaseOffset = OffsetDef->getOperand(1).getReg();
1779 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1780 }
1781
1782 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1783 // If we have a constant offset, try to use the 0 in m0 as the base.
1784 // TODO: Look into changing the default m0 initialization value. If the
1785 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1786 // the immediate offset.
1787
1788 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1789 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1790 .addImm(0);
1791 } else {
1792 std::tie(BaseOffset, ImmOffset) =
1793 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1794
1795 if (Readfirstlane) {
1796 // We have the constant offset now, so put the readfirstlane back on the
1797 // variable component.
1798 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1799 return false;
1800
1801 Readfirstlane->getOperand(1).setReg(BaseOffset);
1802 BaseOffset = Readfirstlane->getOperand(0).getReg();
1803 } else {
1804 if (!RBI.constrainGenericRegister(BaseOffset,
1805 AMDGPU::SReg_32RegClass, *MRI))
1806 return false;
1807 }
1808
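// The variable part of the GWS resource id is read from M0[21:16], so shift
// the wave-uniform base offset into that position before copying it to m0.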
1809 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1810 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1811 .addReg(BaseOffset)
1812 .addImm(16)
1813 .setOperandDead(3); // Dead scc
1814
1815 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1816 .addReg(M0Base);
1817 }
1818
1819 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1820 // offset field) % 64. Some versions of the programming guide omit the m0
1821 // part, or claim it's from offset 0.
1822 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1823
1824 if (HasVSrc) {
1825 Register VSrc = MI.getOperand(1).getReg();
1826 MIB.addReg(VSrc);
1827
1828 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1829 return false;
1830 }
1831
1832 MIB.addImm(ImmOffset)
1833 .cloneMemRefs(MI);
1834
1835 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1836
1837 MI.eraseFromParent();
1838 return true;
1839}
1840
1841bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1842 bool IsAppend) const {
1843 Register PtrBase = MI.getOperand(2).getReg();
1844 LLT PtrTy = MRI->getType(PtrBase);
1845 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1846
1847 unsigned Offset;
1848 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1849
1850 // TODO: Should this try to look through readfirstlane like GWS?
1851 if (!isDSOffsetLegal(PtrBase, Offset)) {
1852 PtrBase = MI.getOperand(2).getReg();
1853 Offset = 0;
1854 }
1855
1856 MachineBasicBlock *MBB = MI.getParent();
1857 const DebugLoc &DL = MI.getDebugLoc();
1858 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1859
1860 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1861 .addReg(PtrBase);
1862 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1863 return false;
1864
1865 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1866 .addImm(Offset)
1867 .addImm(IsGDS ? -1 : 0)
1868 .cloneMemRefs(MI);
1869 MI.eraseFromParent();
1870 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1871}
1872
1873bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
1874 MachineFunction *MF = MI.getParent()->getParent();
1875 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1876
1877 MFInfo->setInitWholeWave();
1878 return selectImpl(MI, *CoverageInfo);
1879}
1880
1881bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1882 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1883 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1884 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1885 if (WGSize <= STI.getWavefrontSize()) {
1886 // If the workgroup fits in a wave, remove s_barrier_signal and lower
1887 // s_barrier/s_barrier_wait to wave_barrier.
1888 if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
1889 IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
1890 MachineBasicBlock *MBB = MI.getParent();
1891 const DebugLoc &DL = MI.getDebugLoc();
1892 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1893 }
1894 MI.eraseFromParent();
1895 return true;
1896 }
1897 }
1898
1899 if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
1900 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1901 MachineBasicBlock *MBB = MI.getParent();
1902 const DebugLoc &DL = MI.getDebugLoc();
1903 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1904 .addImm(AMDGPU::Barrier::WORKGROUP);
1905 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1906 .addImm(AMDGPU::Barrier::WORKGROUP);
1907 MI.eraseFromParent();
1908 return true;
1909 }
1910
1911 return selectImpl(MI, *CoverageInfo);
1912}
1913
1914static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1915 bool &IsTexFail) {
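// Only the TFE (bit 0) and LWE (bit 1) flags are valid; any other bit left
// set after stripping them makes this return false.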
1916 if (TexFailCtrl)
1917 IsTexFail = true;
1918
1919 TFE = (TexFailCtrl & 0x1) ? true : false;
1920 TexFailCtrl &= ~(uint64_t)0x1;
1921 LWE = (TexFailCtrl & 0x2) ? true : false;
1922 TexFailCtrl &= ~(uint64_t)0x2;
1923
1924 return TexFailCtrl == 0;
1925}
1926
1927 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1928 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1929 MachineBasicBlock *MBB = MI.getParent();
1930 const DebugLoc &DL = MI.getDebugLoc();
1931
1932 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1933 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1934
1935 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1936 unsigned IntrOpcode = Intr->BaseOpcode;
1937 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1938 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1939 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1940
1941 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1942
1943 Register VDataIn, VDataOut;
1944 LLT VDataTy;
1945 int NumVDataDwords = -1;
1946 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1947 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1948
1949 bool Unorm;
1950 if (!BaseOpcode->Sampler)
1951 Unorm = true;
1952 else
1953 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1954
1955 bool TFE;
1956 bool LWE;
1957 bool IsTexFail = false;
1958 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1959 TFE, LWE, IsTexFail))
1960 return false;
1961
1962 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1963 const bool IsA16 = (Flags & 1) != 0;
1964 const bool IsG16 = (Flags & 2) != 0;
1965
1966 // A16 implies 16 bit gradients if subtarget doesn't support G16
1967 if (IsA16 && !STI.hasG16() && !IsG16)
1968 return false;
1969
1970 unsigned DMask = 0;
1971 unsigned DMaskLanes = 0;
1972
1973 if (BaseOpcode->Atomic) {
1974 VDataOut = MI.getOperand(0).getReg();
1975 VDataIn = MI.getOperand(2).getReg();
1976 LLT Ty = MRI->getType(VDataIn);
1977
1978 // Be careful to allow atomic swap on 16-bit element vectors.
1979 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1980 Ty.getSizeInBits() == 128 :
1981 Ty.getSizeInBits() == 64;
1982
1983 if (BaseOpcode->AtomicX2) {
1984 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1985
1986 DMask = Is64Bit ? 0xf : 0x3;
1987 NumVDataDwords = Is64Bit ? 4 : 2;
1988 } else {
1989 DMask = Is64Bit ? 0x3 : 0x1;
1990 NumVDataDwords = Is64Bit ? 2 : 1;
1991 }
1992 } else {
1993 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1994 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1995
1996 if (BaseOpcode->Store) {
1997 VDataIn = MI.getOperand(1).getReg();
1998 VDataTy = MRI->getType(VDataIn);
1999 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2000 } else if (BaseOpcode->NoReturn) {
2001 NumVDataDwords = 0;
2002 } else {
2003 VDataOut = MI.getOperand(0).getReg();
2004 VDataTy = MRI->getType(VDataOut);
2005 NumVDataDwords = DMaskLanes;
2006
2007 if (IsD16 && !STI.hasUnpackedD16VMem())
2008 NumVDataDwords = (DMaskLanes + 1) / 2;
2009 }
2010 }
2011
2012 // Set G16 opcode
2013 if (Subtarget->hasG16() && IsG16) {
2014 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2015 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2016 assert(G16MappingInfo);
2017 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2018 }
2019
2020 // TODO: Check this in verifier.
2021 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2022
2023 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2024 if (BaseOpcode->Atomic)
2025 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
2026 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2027 AMDGPU::CPol::VOLATILE))
2028 return false;
2029
2030 int NumVAddrRegs = 0;
2031 int NumVAddrDwords = 0;
2032 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2033 // Skip the $noregs and 0s inserted during legalization.
2034 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2035 if (!AddrOp.isReg())
2036 continue; // XXX - Break?
2037
2038 Register Addr = AddrOp.getReg();
2039 if (!Addr)
2040 break;
2041
2042 ++NumVAddrRegs;
2043 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2044 }
2045
2046 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2047 // NSA, these should have been packed into a single value in the first
2048 // address register
2049 const bool UseNSA =
2050 NumVAddrRegs != 1 &&
2051 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2052 : NumVAddrDwords == NumVAddrRegs);
2053 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2054 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2055 return false;
2056 }
2057
2058 if (IsTexFail)
2059 ++NumVDataDwords;
2060
2061 int Opcode = -1;
2062 if (IsGFX12Plus) {
2063 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2064 NumVDataDwords, NumVAddrDwords);
2065 } else if (IsGFX11Plus) {
2066 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2067 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2068 : AMDGPU::MIMGEncGfx11Default,
2069 NumVDataDwords, NumVAddrDwords);
2070 } else if (IsGFX10Plus) {
2071 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2072 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2073 : AMDGPU::MIMGEncGfx10Default,
2074 NumVDataDwords, NumVAddrDwords);
2075 } else {
2076 if (Subtarget->hasGFX90AInsts()) {
2077 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2078 NumVDataDwords, NumVAddrDwords);
2079 if (Opcode == -1) {
2080 LLVM_DEBUG(
2081 dbgs()
2082 << "requested image instruction is not supported on this GPU\n");
2083 return false;
2084 }
2085 }
2086 if (Opcode == -1 &&
2087 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2088 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2089 NumVDataDwords, NumVAddrDwords);
2090 if (Opcode == -1)
2091 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2092 NumVDataDwords, NumVAddrDwords);
2093 }
2094 if (Opcode == -1)
2095 return false;
2096
2097 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2098 .cloneMemRefs(MI);
2099
2100 if (VDataOut) {
2101 if (BaseOpcode->AtomicX2) {
2102 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2103
2104 Register TmpReg = MRI->createVirtualRegister(
2105 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2106 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2107
2108 MIB.addDef(TmpReg);
2109 if (!MRI->use_empty(VDataOut)) {
2110 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2111 .addReg(TmpReg, RegState::Kill, SubReg);
2112 }
2113
2114 } else {
2115 MIB.addDef(VDataOut); // vdata output
2116 }
2117 }
2118
2119 if (VDataIn)
2120 MIB.addReg(VDataIn); // vdata input
2121
2122 for (int I = 0; I != NumVAddrRegs; ++I) {
2123 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2124 if (SrcOp.isReg()) {
2125 assert(SrcOp.getReg() != 0);
2126 MIB.addReg(SrcOp.getReg());
2127 }
2128 }
2129
2130 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2131 if (BaseOpcode->Sampler)
2132 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2133
2134 MIB.addImm(DMask); // dmask
2135
2136 if (IsGFX10Plus)
2137 MIB.addImm(DimInfo->Encoding);
2138 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2139 MIB.addImm(Unorm);
2140
2141 MIB.addImm(CPol);
2142 MIB.addImm(IsA16 && // a16 or r128
2143 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2144 if (IsGFX10Plus)
2145 MIB.addImm(IsA16 ? -1 : 0);
2146
2147 if (!Subtarget->hasGFX90AInsts()) {
2148 MIB.addImm(TFE); // tfe
2149 } else if (TFE) {
2150 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2151 return false;
2152 }
2153
2154 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2155 MIB.addImm(LWE); // lwe
2156 if (!IsGFX10Plus)
2157 MIB.addImm(DimInfo->DA ? -1 : 0);
2158 if (BaseOpcode->HasD16)
2159 MIB.addImm(IsD16 ? -1 : 0);
2160
2161 MI.eraseFromParent();
2162 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2163 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2164 return true;
2165}
2166
2167// We need to handle this here because tablegen doesn't support matching
2168// instructions with multiple outputs.
2169bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2170 MachineInstr &MI) const {
2171 Register Dst0 = MI.getOperand(0).getReg();
2172 Register Dst1 = MI.getOperand(1).getReg();
2173
2174 const DebugLoc &DL = MI.getDebugLoc();
2175 MachineBasicBlock *MBB = MI.getParent();
2176
2177 Register Addr = MI.getOperand(3).getReg();
2178 Register Data0 = MI.getOperand(4).getReg();
2179 Register Data1 = MI.getOperand(5).getReg();
2180 unsigned Offset = MI.getOperand(6).getImm();
2181
2182 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2183 .addDef(Dst1)
2184 .addUse(Addr)
2185 .addUse(Data0)
2186 .addUse(Data1)
2187 .addImm(Offset)
2188 .cloneMemRefs(MI);
2189
2190 MI.eraseFromParent();
2191 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2192}
2193
2194bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2195 MachineInstr &I) const {
2196 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2197 switch (IntrinsicID) {
2198 case Intrinsic::amdgcn_end_cf:
2199 return selectEndCfIntrinsic(I);
2200 case Intrinsic::amdgcn_ds_ordered_add:
2201 case Intrinsic::amdgcn_ds_ordered_swap:
2202 return selectDSOrderedIntrinsic(I, IntrinsicID);
2203 case Intrinsic::amdgcn_ds_gws_init:
2204 case Intrinsic::amdgcn_ds_gws_barrier:
2205 case Intrinsic::amdgcn_ds_gws_sema_v:
2206 case Intrinsic::amdgcn_ds_gws_sema_br:
2207 case Intrinsic::amdgcn_ds_gws_sema_p:
2208 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2209 return selectDSGWSIntrinsic(I, IntrinsicID);
2210 case Intrinsic::amdgcn_ds_append:
2211 return selectDSAppendConsume(I, true);
2212 case Intrinsic::amdgcn_ds_consume:
2213 return selectDSAppendConsume(I, false);
2214 case Intrinsic::amdgcn_init_whole_wave:
2215 return selectInitWholeWave(I);
2216 case Intrinsic::amdgcn_s_barrier:
2217 case Intrinsic::amdgcn_s_barrier_signal:
2218 case Intrinsic::amdgcn_s_barrier_wait:
2219 return selectSBarrier(I);
2220 case Intrinsic::amdgcn_raw_buffer_load_lds:
2221 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2222 case Intrinsic::amdgcn_struct_buffer_load_lds:
2223 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2224 return selectBufferLoadLds(I);
2225 case Intrinsic::amdgcn_global_load_lds:
2226 return selectGlobalLoadLds(I);
2227 case Intrinsic::amdgcn_exp_compr:
2228 if (!STI.hasCompressedExport()) {
2229 Function &F = I.getMF()->getFunction();
2231 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2232 F.getContext().diagnose(NoFpRet);
2233 return false;
2234 }
2235 break;
2236 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2237 return selectDSBvhStackIntrinsic(I);
2238 case Intrinsic::amdgcn_s_barrier_init:
2239 case Intrinsic::amdgcn_s_barrier_signal_var:
2240 return selectNamedBarrierInit(I, IntrinsicID);
2241 case Intrinsic::amdgcn_s_barrier_join:
2242 case Intrinsic::amdgcn_s_wakeup_barrier:
2243 case Intrinsic::amdgcn_s_get_named_barrier_state:
2244 return selectNamedBarrierInst(I, IntrinsicID);
2245 case Intrinsic::amdgcn_s_get_barrier_state:
2246 return selectSGetBarrierState(I, IntrinsicID);
2247 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2248 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2249 }
2250 return selectImpl(I, *CoverageInfo);
2251}
2252
2253bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2254 if (selectImpl(I, *CoverageInfo))
2255 return true;
2256
2257 MachineBasicBlock *BB = I.getParent();
2258 const DebugLoc &DL = I.getDebugLoc();
2259
2260 Register DstReg = I.getOperand(0).getReg();
2261 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2262 assert(Size <= 32 || Size == 64);
2263 const MachineOperand &CCOp = I.getOperand(1);
2264 Register CCReg = CCOp.getReg();
2265 if (!isVCC(CCReg, *MRI)) {
2266 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2267 AMDGPU::S_CSELECT_B32;
2268 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2269 .addReg(CCReg);
2270
2271 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2272 // bank, because it does not cover the register class we use to represent it.
2273 // So we need to set the register class manually here.
2274 if (!MRI->getRegClassOrNull(CCReg))
2275 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2276 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2277 .add(I.getOperand(2))
2278 .add(I.getOperand(3));
2279
2280 bool Ret = false;
2281 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2282 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2283 I.eraseFromParent();
2284 return Ret;
2285 }
2286
2287 // Wide VGPR select should have been split in RegBankSelect.
2288 if (Size > 32)
2289 return false;
2290
2292 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2293 .addImm(0)
2294 .add(I.getOperand(3))
2295 .addImm(0)
2296 .add(I.getOperand(2))
2297 .add(I.getOperand(1));
2298
2299 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2300 I.eraseFromParent();
2301 return Ret;
2302}
2303
2304bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2305 Register DstReg = I.getOperand(0).getReg();
2306 Register SrcReg = I.getOperand(1).getReg();
2307 const LLT DstTy = MRI->getType(DstReg);
2308 const LLT SrcTy = MRI->getType(SrcReg);
2309 const LLT S1 = LLT::scalar(1);
2310
2311 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2312 const RegisterBank *DstRB;
2313 if (DstTy == S1) {
2314 // This is a special case. We don't treat s1 for legalization artifacts as
2315 // vcc booleans.
2316 DstRB = SrcRB;
2317 } else {
2318 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2319 if (SrcRB != DstRB)
2320 return false;
2321 }
2322
2323 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2324
2325 unsigned DstSize = DstTy.getSizeInBits();
2326 unsigned SrcSize = SrcTy.getSizeInBits();
2327
2328 const TargetRegisterClass *SrcRC =
2329 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2330 const TargetRegisterClass *DstRC =
2331 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2332 if (!SrcRC || !DstRC)
2333 return false;
2334
2335 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2336 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2337 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2338 return false;
2339 }
2340
2341 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2342 assert(STI.useRealTrue16Insts());
2343 const DebugLoc &DL = I.getDebugLoc();
2344 MachineBasicBlock *MBB = I.getParent();
2345 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2346 .addReg(SrcReg, 0, AMDGPU::lo16);
2347 I.eraseFromParent();
2348 return true;
2349 }
2350
2351 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2352 MachineBasicBlock *MBB = I.getParent();
2353 const DebugLoc &DL = I.getDebugLoc();
2354
2355 Register LoReg = MRI->createVirtualRegister(DstRC);
2356 Register HiReg = MRI->createVirtualRegister(DstRC);
2357 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2358 .addReg(SrcReg, 0, AMDGPU::sub0);
2359 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2360 .addReg(SrcReg, 0, AMDGPU::sub1);
2361
2362 if (IsVALU && STI.hasSDWA()) {
2363 // Write the low 16-bits of the high element into the high 16-bits of the
2364 // low element.
2365 MachineInstr *MovSDWA =
2366 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2367 .addImm(0) // $src0_modifiers
2368 .addReg(HiReg) // $src0
2369 .addImm(0) // $clamp
2370 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2371 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2372 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2373 .addReg(LoReg, RegState::Implicit);
2374 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2375 } else {
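// No SDWA available: materialize (Hi << 16) | (Lo & 0xffff) explicitly.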
2376 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2377 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2378 Register ImmReg = MRI->createVirtualRegister(DstRC);
2379 if (IsVALU) {
2380 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2381 .addImm(16)
2382 .addReg(HiReg);
2383 } else {
2384 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2385 .addReg(HiReg)
2386 .addImm(16)
2387 .setOperandDead(3); // Dead scc
2388 }
2389
2390 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2391 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2392 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2393
2394 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2395 .addImm(0xffff);
2396 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2397 .addReg(LoReg)
2398 .addReg(ImmReg);
2399 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2400 .addReg(TmpReg0)
2401 .addReg(TmpReg1);
2402
2403 if (!IsVALU) {
2404 And.setOperandDead(3); // Dead scc
2405 Or.setOperandDead(3); // Dead scc
2406 }
2407 }
2408
2409 I.eraseFromParent();
2410 return true;
2411 }
2412
2413 if (!DstTy.isScalar())
2414 return false;
2415
2416 if (SrcSize > 32) {
2417 unsigned SubRegIdx =
2418 DstSize < 32 ? AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32);
2419 if (SubRegIdx == AMDGPU::NoSubRegister)
2420 return false;
2421
2422 // Deal with weird cases where the class only partially supports the subreg
2423 // index.
2424 const TargetRegisterClass *SrcWithSubRC
2425 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2426 if (!SrcWithSubRC)
2427 return false;
2428
2429 if (SrcWithSubRC != SrcRC) {
2430 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2431 return false;
2432 }
2433
2434 I.getOperand(1).setSubReg(SubRegIdx);
2435 }
2436
2437 I.setDesc(TII.get(TargetOpcode::COPY));
2438 return true;
2439}
2440
2441/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2442static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2443 Mask = maskTrailingOnes<unsigned>(Size);
2444 int SignedMask = static_cast<int>(Mask);
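// The AMDGPU inline-immediate range for integers is [-16, 64].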
2445 return SignedMask >= -16 && SignedMask <= 64;
2446}
2447
2448// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2449const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2450 Register Reg, const MachineRegisterInfo &MRI,
2451 const TargetRegisterInfo &TRI) const {
2452 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2453 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2454 return RB;
2455
2456 // Ignore the type, since we don't use vcc in artifacts.
2457 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2458 return &RBI.getRegBankFromRegClass(*RC, LLT());
2459 return nullptr;
2460}
2461
2462bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2463 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2464 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2465 const DebugLoc &DL = I.getDebugLoc();
2466 MachineBasicBlock &MBB = *I.getParent();
2467 const Register DstReg = I.getOperand(0).getReg();
2468 const Register SrcReg = I.getOperand(1).getReg();
2469
2470 const LLT DstTy = MRI->getType(DstReg);
2471 const LLT SrcTy = MRI->getType(SrcReg);
2472 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2473 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2474 const unsigned DstSize = DstTy.getSizeInBits();
2475 if (!DstTy.isScalar())
2476 return false;
2477
2478 // Artifact casts should never use vcc.
2479 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2480
2481 // FIXME: This should probably be illegal and split earlier.
2482 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2483 if (DstSize <= 32)
2484 return selectCOPY(I);
2485
2486 const TargetRegisterClass *SrcRC =
2487 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2488 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2489 const TargetRegisterClass *DstRC =
2490 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2491
2492 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2493 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2494 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2495 .addReg(SrcReg)
2496 .addImm(AMDGPU::sub0)
2497 .addReg(UndefReg)
2498 .addImm(AMDGPU::sub1);
2499 I.eraseFromParent();
2500
2501 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2502 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2503 }
2504
2505 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2506 // 64-bit should have been split up in RegBankSelect
2507
2508 // Try to use an and with a mask if it will save code size.
2509 unsigned Mask;
2510 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2511 MachineInstr *ExtI =
2512 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2513 .addImm(Mask)
2514 .addReg(SrcReg);
2515 I.eraseFromParent();
2516 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2517 }
2518
2519 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2520 MachineInstr *ExtI =
2521 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2522 .addReg(SrcReg)
2523 .addImm(0) // Offset
2524 .addImm(SrcSize); // Width
2525 I.eraseFromParent();
2526 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2527 }
2528
2529 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2530 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2531 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2532 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2533 return false;
2534
2535 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2536 const unsigned SextOpc = SrcSize == 8 ?
2537 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2538 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2539 .addReg(SrcReg);
2540 I.eraseFromParent();
2541 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2542 }
2543
2544 // Using a single 32-bit SALU to calculate the high half is smaller than
2545 // S_BFE with a literal constant operand.
2546 if (DstSize > 32 && SrcSize == 32) {
2547 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2548 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2549 if (Signed) {
2550 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2551 .addReg(SrcReg, 0, SubReg)
2552 .addImm(31)
2553 .setOperandDead(3); // Dead scc
2554 } else {
2555 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2556 .addImm(0);
2557 }
2558 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2559 .addReg(SrcReg, 0, SubReg)
2560 .addImm(AMDGPU::sub0)
2561 .addReg(HiReg)
2562 .addImm(AMDGPU::sub1);
2563 I.eraseFromParent();
2564 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2565 *MRI);
2566 }
2567
2568 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2569 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2570
2571 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2572 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2573 // We need a 64-bit register source, but the high bits don't matter.
2574 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2575 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2576 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2577
2578 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2579 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2580 .addReg(SrcReg, 0, SubReg)
2581 .addImm(AMDGPU::sub0)
2582 .addReg(UndefReg)
2583 .addImm(AMDGPU::sub1);
2584
2585 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2586 .addReg(ExtReg)
2587 .addImm(SrcSize << 16);
2588
2589 I.eraseFromParent();
2590 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2591 }
2592
2593 unsigned Mask;
2594 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2595 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2596 .addReg(SrcReg)
2597 .addImm(Mask)
2598 .setOperandDead(3); // Dead scc
2599 } else {
2600 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2601 .addReg(SrcReg)
2602 .addImm(SrcSize << 16);
2603 }
2604
2605 I.eraseFromParent();
2606 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2607 }
2608
2609 return false;
2610}
2611
2612 static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2613 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2614}
2615
2616 static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2617 Register BitcastSrc;
2618 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2619 Reg = BitcastSrc;
2620 return Reg;
2621}
2622
2623 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2624 Register &Out) {
2625 Register Trunc;
2626 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2627 return false;
2628
2629 Register LShlSrc;
2630 Register Cst;
2631 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2632 Cst = stripCopy(Cst, MRI);
2633 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2634 Out = stripBitCast(LShlSrc, MRI);
2635 return true;
2636 }
2637 }
2638
2639 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2640 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2641 return false;
2642
2643 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2644 LLT::fixed_vector(2, 16));
2645
2646 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2647 assert(Mask.size() == 2);
2648
2649 if (Mask[0] == 1 && Mask[1] <= 1) {
2650 Out = Shuffle->getOperand(0).getReg();
2651 return true;
2652 }
2653
2654 return false;
2655}
2656
2657bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2658 if (!Subtarget->hasSALUFloatInsts())
2659 return false;
2660
2661 Register Dst = I.getOperand(0).getReg();
2662 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2663 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2664 return false;
2665
2666 Register Src = I.getOperand(1).getReg();
2667
2668 if (MRI->getType(Dst) == LLT::scalar(32) &&
2669 MRI->getType(Src) == LLT::scalar(16)) {
2670 if (isExtractHiElt(*MRI, Src, Src)) {
2671 MachineBasicBlock *BB = I.getParent();
2672 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2673 .addUse(Src);
2674 I.eraseFromParent();
2675 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2676 }
2677 }
2678
2679 return false;
2680}
2681
2682bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2683 // Only manually handle the f64 SGPR case.
2684 //
2685 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2686 // the bit ops theoretically have a second result due to the implicit def of
2687 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2688 // that is easy by disabling the check. The result works, but uses a
2689 // nonsensical sreg32orlds_and_sreg_1 regclass.
2690 //
2691 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2692 // the variadic REG_SEQUENCE operands.
2693
2694 Register Dst = MI.getOperand(0).getReg();
2695 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2696 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2697 MRI->getType(Dst) != LLT::scalar(64))
2698 return false;
2699
2700 Register Src = MI.getOperand(1).getReg();
2701 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2702 if (Fabs)
2703 Src = Fabs->getOperand(1).getReg();
2704
2705 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2706 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2707 return false;
2708
2709 MachineBasicBlock *BB = MI.getParent();
2710 const DebugLoc &DL = MI.getDebugLoc();
2711 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2712 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2713 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2714 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2715
2716 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2717 .addReg(Src, 0, AMDGPU::sub0);
2718 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2719 .addReg(Src, 0, AMDGPU::sub1);
2720 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2721 .addImm(0x80000000);
2722
2723 // Set or toggle sign bit.
2724 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2725 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2726 .addReg(HiReg)
2727 .addReg(ConstReg)
2728 .setOperandDead(3); // Dead scc
2729 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2730 .addReg(LoReg)
2731 .addImm(AMDGPU::sub0)
2732 .addReg(OpReg)
2733 .addImm(AMDGPU::sub1);
2734 MI.eraseFromParent();
2735 return true;
2736}
2737
2738// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2739bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2740 Register Dst = MI.getOperand(0).getReg();
2741 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2742 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2743 MRI->getType(Dst) != LLT::scalar(64))
2744 return false;
2745
2746 Register Src = MI.getOperand(1).getReg();
2747 MachineBasicBlock *BB = MI.getParent();
2748 const DebugLoc &DL = MI.getDebugLoc();
2749 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2750 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2751 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2752 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2753
2754 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2755 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2756 return false;
2757
2758 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2759 .addReg(Src, 0, AMDGPU::sub0);
2760 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2761 .addReg(Src, 0, AMDGPU::sub1);
2762 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2763 .addImm(0x7fffffff);
2764
2765 // Clear sign bit.
2766 // TODO: Should this use S_BITSET0_*?
2767 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2768 .addReg(HiReg)
2769 .addReg(ConstReg)
2770 .setOperandDead(3); // Dead scc
2771 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2772 .addReg(LoReg)
2773 .addImm(AMDGPU::sub0)
2774 .addReg(OpReg)
2775 .addImm(AMDGPU::sub1);
2776
2777 MI.eraseFromParent();
2778 return true;
2779}
2780
2781static bool isConstant(const MachineInstr &MI) {
2782 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2783}
2784
2785void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2786 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2787
2788 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2789 const MachineInstr *PtrMI =
2790 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2791
2792 assert(PtrMI);
2793
2794 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2795 return;
2796
2797 GEPInfo GEPInfo;
2798
2799 for (unsigned i = 1; i != 3; ++i) {
2800 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2801 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2802 assert(OpDef);
2803 if (i == 2 && isConstant(*OpDef)) {
2804 // TODO: Could handle constant base + variable offset, but a combine
2805 // probably should have commuted it.
2806 assert(GEPInfo.Imm == 0);
2807 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2808 continue;
2809 }
2810 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2811 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2812 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2813 else
2814 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2815 }
2816
2817 AddrInfo.push_back(GEPInfo);
2818 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2819}
2820
2821bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2822 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2823}
2824
2825bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2826 if (!MI.hasOneMemOperand())
2827 return false;
2828
2829 const MachineMemOperand *MMO = *MI.memoperands_begin();
2830 const Value *Ptr = MMO->getValue();
2831
2832 // UndefValue means this is a load of a kernel input. These are uniform.
2833 // Sometimes LDS instructions have constant pointers.
2834 // If Ptr is null, then that means this mem operand contains a
2835 // PseudoSourceValue like GOT.
2836 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2837 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2838 return true;
2839
2840 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2841 return true;
2842
2843 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2844 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2845 AMDGPU::SGPRRegBankID;
2846
2847 const Instruction *I = dyn_cast<Instruction>(Ptr);
2848 return I && I->getMetadata("amdgpu.uniform");
2849}
2850
2851bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2852 for (const GEPInfo &GEPInfo : AddrInfo) {
2853 if (!GEPInfo.VgprParts.empty())
2854 return true;
2855 }
2856 return false;
2857}
2858
2859void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2860 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2861 unsigned AS = PtrTy.getAddressSpace();
2862 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2863 STI.ldsRequiresM0Init()) {
2864 MachineBasicBlock *BB = I.getParent();
2865
2866 // If DS instructions require M0 initialization, insert it before selecting.
2867 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2868 .addImm(-1);
2869 }
2870}
2871
2872bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2873 MachineInstr &I) const {
2874 initM0(I);
2875 return selectImpl(I, *CoverageInfo);
2876}
2877
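// Return true if Reg is known to hold a lane mask produced by VALU compares
// (possibly combined with and/or/xor), so it doesn't need to be ANDed with exec.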
2878 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2879 if (Reg.isPhysical())
2880 return false;
2881
2882 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2883 const unsigned Opcode = MI.getOpcode();
2884
2885 if (Opcode == AMDGPU::COPY)
2886 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2887
2888 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2889 Opcode == AMDGPU::G_XOR)
2890 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2891 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2892
2893 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2894 return GI->is(Intrinsic::amdgcn_class);
2895
2896 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2897}
2898
2899bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2900 MachineBasicBlock *BB = I.getParent();
2901 MachineOperand &CondOp = I.getOperand(0);
2902 Register CondReg = CondOp.getReg();
2903 const DebugLoc &DL = I.getDebugLoc();
2904
2905 unsigned BrOpcode;
2906 Register CondPhysReg;
2907 const TargetRegisterClass *ConstrainRC;
2908
2909 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2910 // whether the branch is uniform when selecting the instruction. In
2911 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2912 // RegBankSelect knows what it's doing if the branch condition is scc, even
2913 // though it currently does not.
2914 if (!isVCC(CondReg, *MRI)) {
2915 if (MRI->getType(CondReg) != LLT::scalar(32))
2916 return false;
2917
2918 CondPhysReg = AMDGPU::SCC;
2919 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2920 ConstrainRC = &AMDGPU::SReg_32RegClass;
2921 } else {
2922 // FIXME: Should scc->vcc copies be ANDed with exec?
2923
2924 // Unless the value of CondReg is a result of a V_CMP* instruction then we
2925 // need to insert an and with exec.
2926 if (!isVCmpResult(CondReg, *MRI)) {
2927 const bool Is64 = STI.isWave64();
2928 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2929 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2930
2931 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2932 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2933 .addReg(CondReg)
2934 .addReg(Exec)
2935 .setOperandDead(3); // Dead scc
2936 CondReg = TmpReg;
2937 }
2938
2939 CondPhysReg = TRI.getVCC();
2940 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2941 ConstrainRC = TRI.getBoolRC();
2942 }
2943
2944 if (!MRI->getRegClassOrNull(CondReg))
2945 MRI->setRegClass(CondReg, ConstrainRC);
2946
2947 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2948 .addReg(CondReg);
2949 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2950 .addMBB(I.getOperand(1).getMBB());
2951
2952 I.eraseFromParent();
2953 return true;
2954}
2955
2956bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2957 MachineInstr &I) const {
2958 Register DstReg = I.getOperand(0).getReg();
2959 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2960 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2961 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2962 if (IsVGPR)
2963 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2964
2965 return RBI.constrainGenericRegister(
2966 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2967}
2968
2969bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2970 Register DstReg = I.getOperand(0).getReg();
2971 Register SrcReg = I.getOperand(1).getReg();
2972 Register MaskReg = I.getOperand(2).getReg();
2973 LLT Ty = MRI->getType(DstReg);
2974 LLT MaskTy = MRI->getType(MaskReg);
2975 MachineBasicBlock *BB = I.getParent();
2976 const DebugLoc &DL = I.getDebugLoc();
2977
2978 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2979 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2980 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2981 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2982 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2983 return false;
2984
2985 // Try to avoid emitting a bit operation when we only need to touch half of
2986 // the 64-bit pointer.
2987 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2988 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2989 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2990
2991 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2992 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
2993
2994 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2995 !CanCopyLow32 && !CanCopyHi32) {
2996 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2997 .addReg(SrcReg)
2998 .addReg(MaskReg)
2999 .setOperandDead(3); // Dead scc
3000 I.eraseFromParent();
3001 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3002 }
3003
3004 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3005 const TargetRegisterClass &RegRC
3006 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3007
3008 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3009 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3010 const TargetRegisterClass *MaskRC =
3011 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3012
3013 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3014 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3015 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3016 return false;
3017
3018 if (Ty.getSizeInBits() == 32) {
3019 assert(MaskTy.getSizeInBits() == 32 &&
3020 "ptrmask should have been narrowed during legalize");
3021
3022 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3023 .addReg(SrcReg)
3024 .addReg(MaskReg);
3025
3026 if (!IsVGPR)
3027 NewOp.setOperandDead(3); // Dead scc
3028 I.eraseFromParent();
3029 return true;
3030 }
3031
3032 Register HiReg = MRI->createVirtualRegister(&RegRC);
3033 Register LoReg = MRI->createVirtualRegister(&RegRC);
3034
3035 // Extract the subregisters from the source pointer.
3036 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3037 .addReg(SrcReg, 0, AMDGPU::sub0);
3038 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3039 .addReg(SrcReg, 0, AMDGPU::sub1);
3040
3041 Register MaskedLo, MaskedHi;
3042
3043 if (CanCopyLow32) {
3044 // If all the bits in the low half are 1, we only need a copy for it.
3045 MaskedLo = LoReg;
3046 } else {
3047 // Extract the mask subregister and apply the and.
3048 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3049 MaskedLo = MRI->createVirtualRegister(&RegRC);
3050
3051 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3052 .addReg(MaskReg, 0, AMDGPU::sub0);
3053 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3054 .addReg(LoReg)
3055 .addReg(MaskLo);
3056 }
3057
3058 if (CanCopyHi32) {
3059 // If all the bits in the high half are 1, we only need a copy for it.
3060 MaskedHi = HiReg;
3061 } else {
3062 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3063 MaskedHi = MRI->createVirtualRegister(&RegRC);
3064
3065 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3066 .addReg(MaskReg, 0, AMDGPU::sub1);
3067 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3068 .addReg(HiReg)
3069 .addReg(MaskHi);
3070 }
3071
3072 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3073 .addReg(MaskedLo)
3074 .addImm(AMDGPU::sub0)
3075 .addReg(MaskedHi)
3076 .addImm(AMDGPU::sub1);
3077 I.eraseFromParent();
3078 return true;
3079}
3080
3081/// Return the register to use for the index value, and the subregister to use
3082/// for the indirectly accessed register.
3083static std::pair<Register, unsigned>
3084 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3085 const TargetRegisterClass *SuperRC, Register IdxReg,
3086 unsigned EltSize, GISelKnownBits &KnownBits) {
3087 Register IdxBaseReg;
3088 int Offset;
3089
3090 std::tie(IdxBaseReg, Offset) =
3091 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3092 if (IdxBaseReg == AMDGPU::NoRegister) {
3093 // This will happen if the index is a known constant. This should ordinarily
3094 // be legalized out, but handle it as a register just in case.
3095 assert(Offset == 0);
3096 IdxBaseReg = IdxReg;
3097 }
3098
3099 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3100
3101 // Skip out of bounds offsets, or else we would end up using an undefined
3102 // register.
3103 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3104 return std::pair(IdxReg, SubRegs[0]);
3105 return std::pair(IdxBaseReg, SubRegs[Offset]);
3106}
3107
3108bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3109 MachineInstr &MI) const {
3110 Register DstReg = MI.getOperand(0).getReg();
3111 Register SrcReg = MI.getOperand(1).getReg();
3112 Register IdxReg = MI.getOperand(2).getReg();
3113
3114 LLT DstTy = MRI->getType(DstReg);
3115 LLT SrcTy = MRI->getType(SrcReg);
3116
3117 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3118 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3119 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3120
3121 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3122 // into a waterfall loop.
3123 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3124 return false;
3125
3126 const TargetRegisterClass *SrcRC =
3127 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3128 const TargetRegisterClass *DstRC =
3129 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3130 if (!SrcRC || !DstRC)
3131 return false;
3132 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3133 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3134 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3135 return false;
3136
3137 MachineBasicBlock *BB = MI.getParent();
3138 const DebugLoc &DL = MI.getDebugLoc();
3139 const bool Is64 = DstTy.getSizeInBits() == 64;
3140
3141 unsigned SubReg;
3142 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3143 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3144
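// For an SGPR vector, write the index to M0 and use S_MOVRELS; for a VGPR
// vector, use either M0-indexed V_MOVRELS or VGPR index mode below.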
3145 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3146 if (DstTy.getSizeInBits() != 32 && !Is64)
3147 return false;
3148
3149 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3150 .addReg(IdxReg);
3151
3152 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3153 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3154 .addReg(SrcReg, 0, SubReg)
3155 .addReg(SrcReg, RegState::Implicit);
3156 MI.eraseFromParent();
3157 return true;
3158 }
3159
3160 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3161 return false;
3162
3163 if (!STI.useVGPRIndexMode()) {
3164 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3165 .addReg(IdxReg);
3166 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3167 .addReg(SrcReg, 0, SubReg)
3168 .addReg(SrcReg, RegState::Implicit);
3169 MI.eraseFromParent();
3170 return true;
3171 }
3172
3173 const MCInstrDesc &GPRIDXDesc =
3174 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3175 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3176 .addReg(SrcReg)
3177 .addReg(IdxReg)
3178 .addImm(SubReg);
3179
3180 MI.eraseFromParent();
3181 return true;
3182}
3183
3184// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3185bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3186 MachineInstr &MI) const {
3187 Register DstReg = MI.getOperand(0).getReg();
3188 Register VecReg = MI.getOperand(1).getReg();
3189 Register ValReg = MI.getOperand(2).getReg();
3190 Register IdxReg = MI.getOperand(3).getReg();
3191
3192 LLT VecTy = MRI->getType(DstReg);
3193 LLT ValTy = MRI->getType(ValReg);
3194 unsigned VecSize = VecTy.getSizeInBits();
3195 unsigned ValSize = ValTy.getSizeInBits();
3196
3197 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3198 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3199 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3200
3201 assert(VecTy.getElementType() == ValTy);
3202
3203 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3204 // into a waterfall loop.
3205 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3206 return false;
3207
3208 const TargetRegisterClass *VecRC =
3209 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3210 const TargetRegisterClass *ValRC =
3211 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3212
3213 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3214 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3215 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3216 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3217 return false;
3218
3219 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3220 return false;
3221
3222 unsigned SubReg;
3223 std::tie(IdxReg, SubReg) =
3224 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3225
3226 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3227 STI.useVGPRIndexMode();
3228
3229 MachineBasicBlock *BB = MI.getParent();
3230 const DebugLoc &DL = MI.getDebugLoc();
3231
3232 if (!IndexMode) {
3233 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3234 .addReg(IdxReg);
3235
3236 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3237 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3238 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3239 .addReg(VecReg)
3240 .addReg(ValReg)
3241 .addImm(SubReg);
3242 MI.eraseFromParent();
3243 return true;
3244 }
3245
3246 const MCInstrDesc &GPRIDXDesc =
3247 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3248 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3249 .addReg(VecReg)
3250 .addReg(ValReg)
3251 .addReg(IdxReg)
3252 .addImm(SubReg);
3253
3254 MI.eraseFromParent();
3255 return true;
3256}
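// Illustrative sketch only: the non-index-mode path above copies the index
// into M0 and uses the MovRel indirect-write pseudo returned by
// TII.getIndirectRegWriteMovRelPseudo(), roughly
//   $m0 = COPY %idx
//   %dst = <indirect-reg-write pseudo> %vec, %val, <subreg>
// which later expands to a V_MOVRELD/S_MOVRELD style write of a single lane.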
3257
3258bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3260 unsigned Opc;
3261 unsigned Size = MI.getOperand(3).getImm();
3262
3263 // The struct intrinsic variants add one additional operand over raw.
3264 const bool HasVIndex = MI.getNumOperands() == 9;
3265 Register VIndex;
3266 int OpOffset = 0;
3267 if (HasVIndex) {
3268 VIndex = MI.getOperand(4).getReg();
3269 OpOffset = 1;
3270 }
3271
3272 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3273 std::optional<ValueAndVReg> MaybeVOffset =
3274     getIConstantVRegValWithLookThrough(VOffset, *MRI);
3275 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3276
3277 switch (Size) {
3278 default:
3279 return false;
3280 case 1:
3281 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3282 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3283 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3284 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3285 break;
3286 case 2:
3287 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3288 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3289 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3290 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3291 break;
3292 case 4:
3293 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3294 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3295 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3296 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3297 break;
3298 case 12:
3299 if (!Subtarget->hasLDSLoadB96_B128())
3300 return false;
3301
3302 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3303 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3304 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3305 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3306 break;
3307 case 16:
3308 if (!Subtarget->hasLDSLoadB96_B128())
3309 return false;
3310
3311 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3312 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3313 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3314 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3315 break;
3316 }
3317
3318 MachineBasicBlock *MBB = MI.getParent();
3319 const DebugLoc &DL = MI.getDebugLoc();
3320 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3321 .add(MI.getOperand(2));
3322
3323 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3324
3325 if (HasVIndex && HasVOffset) {
3326 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3327 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3328 .addReg(VIndex)
3329 .addImm(AMDGPU::sub0)
3330 .addReg(VOffset)
3331 .addImm(AMDGPU::sub1);
3332
3333 MIB.addReg(IdxReg);
3334 } else if (HasVIndex) {
3335 MIB.addReg(VIndex);
3336 } else if (HasVOffset) {
3337 MIB.addReg(VOffset);
3338 }
3339
3340 MIB.add(MI.getOperand(1)); // rsrc
3341 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3342 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3343 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3344 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3345 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3346 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3347 MIB.addImm(
3348 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3349 ? 1
3350 : 0); // swz
3351
3352 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3353 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3354 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3355 MachinePointerInfo StorePtrI = LoadPtrI;
3356 StorePtrI.V = nullptr;
3357 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3358
3359 auto F = LoadMMO->getFlags() &
3360          ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3361 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3362 Size, LoadMMO->getBaseAlign());
3363
3364 MachineMemOperand *StoreMMO =
3365     MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3366 sizeof(int32_t), LoadMMO->getBaseAlign());
3367
3368 MIB.setMemRefs({LoadMMO, StoreMMO});
3369
3370 MI.eraseFromParent();
3371 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3372}
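// Opcode choice above, summarized for reference:
//   vindex  voffset   addressing form
//   yes     yes       *_LDS_BOTHEN (vindex/voffset packed into a VGPR pair)
//   yes     no        *_LDS_IDXEN
//   no      yes       *_LDS_OFFEN
//   no      no        *_LDS_OFFSET (immediate offset only)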
3373
3374/// Match a zero extend from a 32-bit value to 64-bits.
3375static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3376 Register ZExtSrc;
3377 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3378 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3379
3380 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3381 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3382 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3383 return Register();
3384
3385 assert(Def->getNumOperands() == 3 &&
3386 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3387 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3388 return Def->getOperand(1).getReg();
3389 }
3390
3391 return Register();
3392}
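// For illustration, both of these generic MIR forms yield %x:
//   %r:_(s64) = G_ZEXT %x:_(s32)
//   %zero:_(s32) = G_CONSTANT i32 0
//   %r:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32)
// Any other definition of the 64-bit value returns an invalid Register().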
3393
3394bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3395 unsigned Opc;
3396 unsigned Size = MI.getOperand(3).getImm();
3397
3398 switch (Size) {
3399 default:
3400 return false;
3401 case 1:
3402 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3403 break;
3404 case 2:
3405 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3406 break;
3407 case 4:
3408 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3409 break;
3410 case 12:
3411 if (!Subtarget->hasLDSLoadB96_B128())
3412 return false;
3413 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3414 break;
3415 case 16:
3416 if (!Subtarget->hasLDSLoadB96_B128())
3417 return false;
3418 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3419 break;
3420 }
3421
3422 MachineBasicBlock *MBB = MI.getParent();
3423 const DebugLoc &DL = MI.getDebugLoc();
3424 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3425 .add(MI.getOperand(2));
3426
3427 Register Addr = MI.getOperand(1).getReg();
3428 Register VOffset;
3429 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3430 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3431 if (!isSGPR(Addr)) {
3432 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3433 if (isSGPR(AddrDef->Reg)) {
3434 Addr = AddrDef->Reg;
3435 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3436 Register SAddr =
3437 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3438 if (isSGPR(SAddr)) {
3439 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3440 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3441 Addr = SAddr;
3442 VOffset = Off;
3443 }
3444 }
3445 }
3446 }
3447
3448 if (isSGPR(Addr)) {
3449 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3450 if (!VOffset) {
3451 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3452 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3453 .addImm(0);
3454 }
3455 }
3456
3457 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3458 .addReg(Addr);
3459
3460 if (isSGPR(Addr))
3461 MIB.addReg(VOffset);
3462
3463 MIB.add(MI.getOperand(4)) // offset
3464 .add(MI.getOperand(5)); // cpol
3465
3466 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3467 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3468 LoadPtrI.Offset = MI.getOperand(4).getImm();
3469 MachinePointerInfo StorePtrI = LoadPtrI;
3472 auto F = LoadMMO->getFlags() &
3473          ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3474 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3475 Size, LoadMMO->getBaseAlign());
3476 MachineMemOperand *StoreMMO =
3477     MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3478 sizeof(int32_t), Align(4));
3479
3480 MIB.setMemRefs({LoadMMO, StoreMMO});
3481
3482 MI.eraseFromParent();
3483 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3484}
3485
3486bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
3487 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3488 MI.removeOperand(1);
3489 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3490 return true;
3491}
3492
3493// FIXME: This should be removed and the patterns allowed to select. We just
3494// need the AGPR/VGPR combination versions.
3495bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3496 unsigned Opc;
3497 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3498 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3499 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3500 break;
3501 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3502 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3503 break;
3504 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3505 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3506 break;
3507 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3508 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3509 break;
3510 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3511 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3512 break;
3513 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3514 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3515 break;
3516 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3517 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3518 break;
3519 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3520 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3521 break;
3522 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3523 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3524 break;
3525 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3526 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3527 break;
3528 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3529 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3530 break;
3531 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3532 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3533 break;
3534 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3535 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3536 break;
3537 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3538 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3539 break;
3540 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3541 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3542 break;
3543 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3544 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3545 break;
3546 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3547 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3548 break;
3549 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3550 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3551 break;
3552 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3553 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3554 break;
3555 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3556 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3557 break;
3558 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3559 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3560 break;
3561 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3562 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3563 break;
3564 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3565 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3566 break;
3567 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3568 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3569 break;
3570 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3571 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3572 break;
3573 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3574 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3575 break;
3576 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3577 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3578 break;
3579 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3580 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3581 break;
3582 default:
3583 llvm_unreachable("unhandled smfmac intrinsic");
3584 }
3585
3586 auto VDst_In = MI.getOperand(4);
3587
3588 MI.setDesc(TII.get(Opc));
3589 MI.removeOperand(4); // VDst_In
3590 MI.removeOperand(1); // Intrinsic ID
3591 MI.addOperand(VDst_In); // Readd VDst_In to the end
3592 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3593 return true;
3594}
3595
3596bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3597 MachineInstr &MI, Intrinsic::ID IntrID) const {
3598 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3599 !Subtarget->hasPermlane16Swap())
3600 return false;
3601 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3602 !Subtarget->hasPermlane32Swap())
3603 return false;
3604
3605 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3606 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3607 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3608
3609 MI.removeOperand(2);
3610 MI.setDesc(TII.get(Opcode));
3611 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3612
3613 MachineOperand &FI = MI.getOperand(4);
3615
3616 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3617}
3618
3619bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3620 Register DstReg = MI.getOperand(0).getReg();
3621 Register SrcReg = MI.getOperand(1).getReg();
3622 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3623 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3624 MachineBasicBlock *MBB = MI.getParent();
3625 const DebugLoc &DL = MI.getDebugLoc();
3626
3627 if (IsVALU) {
3628 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3629 .addImm(Subtarget->getWavefrontSizeLog2())
3630 .addReg(SrcReg);
3631 } else {
3632 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3633 .addReg(SrcReg)
3634 .addImm(Subtarget->getWavefrontSizeLog2())
3635 .setOperandDead(3); // Dead scc
3636 }
3637
3638 const TargetRegisterClass &RC =
3639 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3640 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3641 return false;
3642
3643 MI.eraseFromParent();
3644 return true;
3645}
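// For illustration: on a wave64 subtarget getWavefrontSizeLog2() == 6, so a
// per-lane scratch byte offset in SrcReg becomes the per-wave address
// DstReg = SrcReg >> 6, using V_LSHRREV_B32 on the VALU path and S_LSHR_B32
// otherwise.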
3646
3647// Match a BITOP3 operation and return the number of matched instructions plus
3648// the truth table.
3649static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3650                                              SmallVectorImpl<Register> &Src,
3651                                              const MachineRegisterInfo &MRI) {
3652 unsigned NumOpcodes = 0;
3653 uint8_t LHSBits, RHSBits;
3654
3655 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3656 // Define truth table given Src0, Src1, Src2 bits permutations:
3657 // 0 0 0
3658 // 0 0 1
3659 // 0 1 0
3660 // 0 1 1
3661 // 1 0 0
3662 // 1 0 1
3663 // 1 1 0
3664 // 1 1 1
3665 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3666
3667 if (mi_match(Op, MRI, m_AllOnesInt())) {
3668 Bits = 0xff;
3669 return true;
3670 }
3671 if (mi_match(Op, MRI, m_ZeroInt())) {
3672 Bits = 0;
3673 return true;
3674 }
3675
3676 for (unsigned I = 0; I < Src.size(); ++I) {
3677 // Try to find an existing reused operand.
3678 if (Src[I] == Op) {
3679 Bits = SrcBits[I];
3680 return true;
3681 }
3682 // Try to replace the parent operator.
3683 if (Src[I] == R) {
3684 Bits = SrcBits[I];
3685 Src[I] = Op;
3686 return true;
3687 }
3688 }
3689
3690 if (Src.size() == 3) {
3691 // No room left for operands. Try one last time; there can be a 'not' of
3692 // one of our source operands. In this case we can compute the bits
3693 // without growing the Src vector.
3694 Register LHS;
3695 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3696   LHS = getSrcRegIgnoringCopies(LHS, MRI);
3697 for (unsigned I = 0; I < Src.size(); ++I) {
3698 if (Src[I] == LHS) {
3699 Bits = ~SrcBits[I];
3700 return true;
3701 }
3702 }
3703 }
3704
3705 return false;
3706 }
3707
3708 Bits = SrcBits[Src.size()];
3709 Src.push_back(Op);
3710 return true;
3711 };
3712
3713 MachineInstr *MI = MRI.getVRegDef(R);
3714 switch (MI->getOpcode()) {
3715 case TargetOpcode::G_AND:
3716 case TargetOpcode::G_OR:
3717 case TargetOpcode::G_XOR: {
3718 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3719 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3720
3721 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3722 if (!getOperandBits(LHS, LHSBits) ||
3723 !getOperandBits(RHS, RHSBits)) {
3724 Src = Backup;
3725 return std::make_pair(0, 0);
3726 }
3727
3728 // Recursion is naturally limited by the size of the operand vector.
3729 auto Op = BitOp3_Op(LHS, Src, MRI);
3730 if (Op.first) {
3731 NumOpcodes += Op.first;
3732 LHSBits = Op.second;
3733 }
3734
3735 Op = BitOp3_Op(RHS, Src, MRI);
3736 if (Op.first) {
3737 NumOpcodes += Op.first;
3738 RHSBits = Op.second;
3739 }
3740 break;
3741 }
3742 default:
3743 return std::make_pair(0, 0);
3744 }
3745
3746 uint8_t TTbl;
3747 switch (MI->getOpcode()) {
3748 case TargetOpcode::G_AND:
3749 TTbl = LHSBits & RHSBits;
3750 break;
3751 case TargetOpcode::G_OR:
3752 TTbl = LHSBits | RHSBits;
3753 break;
3754 case TargetOpcode::G_XOR:
3755 TTbl = LHSBits ^ RHSBits;
3756 break;
3757 default:
3758 break;
3759 }
3760
3761 return std::make_pair(NumOpcodes + 1, TTbl);
3762}
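// Worked example (illustrative): matching (a & b) | c. The OR records
// Src = [a&b, c]; recursing into the AND replaces the a&b slot with a and
// appends b, giving Src = [a, c, b] with SrcBits 0xf0, 0xcc, 0xaa. The truth
// table is then
//   a & b       = 0xf0 & 0xaa = 0xa0
//   (a & b) | c = 0xa0 | 0xcc = 0xec
// so BitOp3_Op returns {2, 0xec} for the operand order (a, c, b).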
3763
3764bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
3765 if (!Subtarget->hasBitOp3Insts())
3766 return false;
3767
3768 Register DstReg = MI.getOperand(0).getReg();
3769 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3770 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3771 if (!IsVALU)
3772 return false;
3773
3774 SmallVector<Register, 3> Src;
3775 uint8_t TTbl;
3776 unsigned NumOpcodes;
3777
3778 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
3779
3780 // The Src.empty() case can happen if all operands are zero or all ones.
3781 // Normally it should have been optimized out before reaching this point.
3782 if (NumOpcodes < 2 || Src.empty())
3783 return false;
3784
3785 // For a uniform case the threshold should be higher to account for moves
3786 // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two can
3787 // be in SGPRs, with a readfirstlane after.
3788 if (NumOpcodes < 4)
3789 return false;
3790
3791 bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
3792 if (NumOpcodes == 2 && IsB32) {
3793 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3794 // asm more readable. This cannot be modeled with AddedComplexity because
3795 // the selector does not know how many operations we matched.
3796 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
3797 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
3798 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
3799 return false;
3800 }
3801
3802 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
3803 unsigned CBL = STI.getConstantBusLimit(Opc);
3804 MachineBasicBlock *MBB = MI.getParent();
3805 const DebugLoc &DL = MI.getDebugLoc();
3806
3807 for (unsigned I = 0; I < Src.size(); ++I) {
3808 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
3809 if (RB->getID() != AMDGPU::SGPRRegBankID)
3810 continue;
3811 if (CBL > 0) {
3812 --CBL;
3813 continue;
3814 }
3815 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3816 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
3817 .addReg(Src[I]);
3818 Src[I] = NewReg;
3819 }
3820
3821 // The last operand can be ignored, turning a ternary operation into a binary
3822 // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3823 // 'c' with 'a' here without changing the answer. In some pathological
3824 // cases it is even possible to end up with an operation that has a single
3825 // operand, if the optimizer did not catch it.
3826 while (Src.size() < 3)
3827 Src.push_back(Src[0]);
3828
3829 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
3830 if (!IsB32)
3831 MIB.addImm(0); // src_mod0
3832 MIB.addReg(Src[0]);
3833 if (!IsB32)
3834 MIB.addImm(0); // src_mod1
3835 MIB.addReg(Src[1]);
3836 if (!IsB32)
3837 MIB.addImm(0); // src_mod2
3838 MIB.addReg(Src[2])
3839 .addImm(TTbl);
3840 if (!IsB32)
3841 MIB.addImm(0); // op_sel
3842
3843 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3844 MI.eraseFromParent();
3845
3846 return true;
3847}
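// Sketch of the final emission above (illustrative): for a matched expression
// with operands Src = [s0, s1, s2] and truth table TTbl, the 32-bit case
// builds roughly
//   %dst = V_BITOP3_B32_e64 %s0, %s1, %s2, TTbl
// after copying SGPR inputs to VGPRs as required by the constant bus limit;
// the 16-bit form adds the src_mod and op_sel immediates around the sources.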
3848
3849bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3850 Register SrcReg = MI.getOperand(0).getReg();
3851 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3852 return false;
3853
3854 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3855 Register SP =
3856     Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3857 Register WaveAddr = getWaveAddress(DefMI);
3858 MachineBasicBlock *MBB = MI.getParent();
3859 const DebugLoc &DL = MI.getDebugLoc();
3860
3861 if (!WaveAddr) {
3862 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3863 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3864 .addReg(SrcReg)
3865 .addImm(Subtarget->getWavefrontSizeLog2())
3866 .setOperandDead(3); // Dead scc
3867 }
3868
3869 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3870 .addReg(WaveAddr);
3871
3872 MI.eraseFromParent();
3873 return true;
3874}
3875
3877bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3878 if (!I.isPreISelOpcode()) {
3879 if (I.isCopy())
3880 return selectCOPY(I);
3881 return true;
3882 }
3883
3884 switch (I.getOpcode()) {
3885 case TargetOpcode::G_AND:
3886 case TargetOpcode::G_OR:
3887 case TargetOpcode::G_XOR:
3888 if (selectBITOP3(I))
3889 return true;
3890 if (selectImpl(I, *CoverageInfo))
3891 return true;
3892 return selectG_AND_OR_XOR(I);
3893 case TargetOpcode::G_ADD:
3894 case TargetOpcode::G_SUB:
3895 case TargetOpcode::G_PTR_ADD:
3896 if (selectImpl(I, *CoverageInfo))
3897 return true;
3898 return selectG_ADD_SUB(I);
3899 case TargetOpcode::G_UADDO:
3900 case TargetOpcode::G_USUBO:
3901 case TargetOpcode::G_UADDE:
3902 case TargetOpcode::G_USUBE:
3903 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3904 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3905 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3906 return selectG_AMDGPU_MAD_64_32(I);
3907 case TargetOpcode::G_INTTOPTR:
3908 case TargetOpcode::G_BITCAST:
3909 case TargetOpcode::G_PTRTOINT:
3910 case TargetOpcode::G_FREEZE:
3911 return selectCOPY(I);
3912 case TargetOpcode::G_FNEG:
3913 if (selectImpl(I, *CoverageInfo))
3914 return true;
3915 return selectG_FNEG(I);
3916 case TargetOpcode::G_FABS:
3917 if (selectImpl(I, *CoverageInfo))
3918 return true;
3919 return selectG_FABS(I);
3920 case TargetOpcode::G_EXTRACT:
3921 return selectG_EXTRACT(I);
3922 case TargetOpcode::G_MERGE_VALUES:
3923 case TargetOpcode::G_CONCAT_VECTORS:
3924 return selectG_MERGE_VALUES(I);
3925 case TargetOpcode::G_UNMERGE_VALUES:
3926 return selectG_UNMERGE_VALUES(I);
3927 case TargetOpcode::G_BUILD_VECTOR:
3928 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3929 return selectG_BUILD_VECTOR(I);
3930 case TargetOpcode::G_IMPLICIT_DEF:
3931 return selectG_IMPLICIT_DEF(I);
3932 case TargetOpcode::G_INSERT:
3933 return selectG_INSERT(I);
3934 case TargetOpcode::G_INTRINSIC:
3935 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3936 return selectG_INTRINSIC(I);
3937 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3938 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3939 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3940 case TargetOpcode::G_ICMP:
3941 case TargetOpcode::G_FCMP:
3942 if (selectG_ICMP_or_FCMP(I))
3943 return true;
3944 return selectImpl(I, *CoverageInfo);
3945 case TargetOpcode::G_LOAD:
3946 case TargetOpcode::G_ZEXTLOAD:
3947 case TargetOpcode::G_SEXTLOAD:
3948 case TargetOpcode::G_STORE:
3949 case TargetOpcode::G_ATOMIC_CMPXCHG:
3950 case TargetOpcode::G_ATOMICRMW_XCHG:
3951 case TargetOpcode::G_ATOMICRMW_ADD:
3952 case TargetOpcode::G_ATOMICRMW_SUB:
3953 case TargetOpcode::G_ATOMICRMW_AND:
3954 case TargetOpcode::G_ATOMICRMW_OR:
3955 case TargetOpcode::G_ATOMICRMW_XOR:
3956 case TargetOpcode::G_ATOMICRMW_MIN:
3957 case TargetOpcode::G_ATOMICRMW_MAX:
3958 case TargetOpcode::G_ATOMICRMW_UMIN:
3959 case TargetOpcode::G_ATOMICRMW_UMAX:
3960 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3961 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3962 case TargetOpcode::G_ATOMICRMW_FADD:
3963 case TargetOpcode::G_ATOMICRMW_FMIN:
3964 case TargetOpcode::G_ATOMICRMW_FMAX:
3965 return selectG_LOAD_STORE_ATOMICRMW(I);
3966 case TargetOpcode::G_SELECT:
3967 return selectG_SELECT(I);
3968 case TargetOpcode::G_TRUNC:
3969 return selectG_TRUNC(I);
3970 case TargetOpcode::G_SEXT:
3971 case TargetOpcode::G_ZEXT:
3972 case TargetOpcode::G_ANYEXT:
3973 case TargetOpcode::G_SEXT_INREG:
3974 // This is a workaround. For extension from type i1, `selectImpl()` uses
3975 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
3976 // type i1 can only be held in an SGPR class.
3977 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3978 selectImpl(I, *CoverageInfo))
3979 return true;
3980 return selectG_SZA_EXT(I);
3981 case TargetOpcode::G_FPEXT:
3982 if (selectG_FPEXT(I))
3983 return true;
3984 return selectImpl(I, *CoverageInfo);
3985 case TargetOpcode::G_BRCOND:
3986 return selectG_BRCOND(I);
3987 case TargetOpcode::G_GLOBAL_VALUE:
3988 return selectG_GLOBAL_VALUE(I);
3989 case TargetOpcode::G_PTRMASK:
3990 return selectG_PTRMASK(I);
3991 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3992 return selectG_EXTRACT_VECTOR_ELT(I);
3993 case TargetOpcode::G_INSERT_VECTOR_ELT:
3994 return selectG_INSERT_VECTOR_ELT(I);
3995 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3996 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3997 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3998 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3999 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4000   const AMDGPU::ImageDimIntrinsicInfo *Intr =
4001       AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4002 assert(Intr && "not an image intrinsic with image pseudo");
4003 return selectImageIntrinsic(I, Intr);
4004 }
4005 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
4006 return selectBVHIntrinsic(I);
4007 case AMDGPU::G_SBFX:
4008 case AMDGPU::G_UBFX:
4009 return selectG_SBFX_UBFX(I);
4010 case AMDGPU::G_SI_CALL:
4011 I.setDesc(TII.get(AMDGPU::SI_CALL));
4012 return true;
4013 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4014 return selectWaveAddress(I);
4015 case AMDGPU::G_STACKRESTORE:
4016 return selectStackRestore(I);
4017 case AMDGPU::G_PHI:
4018 return selectPHI(I);
4019 case TargetOpcode::G_CONSTANT:
4020 case TargetOpcode::G_FCONSTANT:
4021 default:
4022 return selectImpl(I, *CoverageInfo);
4023 }
4024 return false;
4025}
4026
4028AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4029 return {{
4030 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4031 }};
4032
4033}
4034
4035std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4036 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4037 unsigned Mods = 0;
4038 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4039
4040 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4041 Src = MI->getOperand(1).getReg();
4042 Mods |= SISrcMods::NEG;
4043 MI = getDefIgnoringCopies(Src, *MRI);
4044 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4045 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4046 // denormal mode, but we're implicitly canonicalizing in a source operand.
4047 const ConstantFP *LHS =
4048 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4049 if (LHS && LHS->isZero()) {
4050 Mods |= SISrcMods::NEG;
4051 Src = MI->getOperand(2).getReg();
4052 }
4053 }
4054
4055 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4056 Src = MI->getOperand(1).getReg();
4057 Mods |= SISrcMods::ABS;
4058 }
4059
4060 if (OpSel)
4061 Mods |= SISrcMods::OP_SEL_0;
4062
4063 return std::pair(Src, Mods);
4064}
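// For illustration, with the default flags a source defined as
//   %a:_(s32) = G_FABS %x
//   %s:_(s32) = G_FNEG %a
// folds to Src = %x with Mods = SISrcMods::NEG | SISrcMods::ABS; the callers
// below then render Mods as the src_mods immediate next to the register.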
4065
4066Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4067 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4068 bool ForceVGPR) const {
4069 if ((Mods != 0 || ForceVGPR) &&
4070 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4071
4072 // If we looked through copies to find source modifiers on an SGPR operand,
4073 // we now have an SGPR register source. To avoid potentially violating the
4074 // constant bus restriction, we need to insert a copy to a VGPR.
4075 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4076 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4077 TII.get(AMDGPU::COPY), VGPRSrc)
4078 .addReg(Src);
4079 Src = VGPRSrc;
4080 }
4081
4082 return Src;
4083}
4084
4085///
4086/// This will select either an SGPR or VGPR operand and will save us from
4087/// having to write an extra tablegen pattern.
4089AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4090 return {{
4091 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4092 }};
4093}
4094
4096AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4097 Register Src;
4098 unsigned Mods;
4099 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4100
4101 return {{
4102 [=](MachineInstrBuilder &MIB) {
4103 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4104 },
4105 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4106 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4107 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4108 }};
4109}
4110
4112AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4113 Register Src;
4114 unsigned Mods;
4115 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4116 /*IsCanonicalizing=*/true,
4117 /*AllowAbs=*/false);
4118
4119 return {{
4120 [=](MachineInstrBuilder &MIB) {
4121 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4122 },
4123 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4124 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4125 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4126 }};
4127}
4128
4130AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4131 return {{
4132 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4133 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4134 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4135 }};
4136}
4137
4139AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4140 Register Src;
4141 unsigned Mods;
4142 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4143
4144 return {{
4145 [=](MachineInstrBuilder &MIB) {
4146 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4147 },
4148 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4149 }};
4150}
4151
4153AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4154 MachineOperand &Root) const {
4155 Register Src;
4156 unsigned Mods;
4157 std::tie(Src, Mods) =
4158 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4159
4160 return {{
4161 [=](MachineInstrBuilder &MIB) {
4162 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4163 },
4164 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4165 }};
4166}
4167
4169AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4170 Register Src;
4171 unsigned Mods;
4172 std::tie(Src, Mods) =
4173 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4174 /*AllowAbs=*/false);
4175
4176 return {{
4177 [=](MachineInstrBuilder &MIB) {
4178 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4179 },
4180 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4181 }};
4182}
4183
4185AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4186 Register Reg = Root.getReg();
4187 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4188 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4189 return {};
4190 return {{
4191 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4192 }};
4193}
4194
4195std::pair<Register, unsigned>
4196AMDGPUInstructionSelector::selectVOP3PModsImpl(
4197 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
4198 unsigned Mods = 0;
4199 MachineInstr *MI = MRI.getVRegDef(Src);
4200
4201 if (MI->getOpcode() == AMDGPU::G_FNEG &&
4202 // It's possible to see an f32 fneg here, but unlikely.
4203 // TODO: Treat f32 fneg as only high bit.
4204 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
4205   Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4206 Src = MI->getOperand(1).getReg();
4207 MI = MRI.getVRegDef(Src);
4208 }
4209
4210 // TODO: Handle G_FSUB 0 as fneg
4211
4212 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
4213 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
4214
4215 // Packed instructions do not have abs modifiers.
4216 Mods |= SISrcMods::OP_SEL_1;
4217
4218 return std::pair(Src, Mods);
4219}
4220
4222AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
4223 MachineRegisterInfo &MRI
4224   = Root.getParent()->getParent()->getParent()->getRegInfo();
4225
4226 Register Src;
4227 unsigned Mods;
4228 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
4229
4230 return {{
4231 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4232 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4233 }};
4234}
4235
4237AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
4238 MachineRegisterInfo &MRI
4239   = Root.getParent()->getParent()->getParent()->getRegInfo();
4240
4241 Register Src;
4242 unsigned Mods;
4243 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
4244
4245 return {{
4246 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4247 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4248 }};
4249}
4250
4252AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
4253 // A literal i1 value set in the intrinsic represents SrcMods for the next
4254 // operand. The value is in the Imm operand as i1 sign-extended to int64_t.
4255 // 1 (i.e. -1) promotes packed values to signed, 0 treats them as unsigned.
4256 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4257 "expected i1 value");
4258 unsigned Mods = SISrcMods::OP_SEL_1;
4259 if (Root.getImm() == -1)
4260 Mods ^= SISrcMods::NEG;
4261 return {{
4262 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4263 }};
4264}
4265
4267AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4268 MachineOperand &Root) const {
4269 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4270 "expected i1 value");
4271 unsigned Mods = SISrcMods::OP_SEL_1;
4272 if (Root.getImm() != 0)
4273 Mods |= SISrcMods::OP_SEL_0;
4274
4275 return {{
4276 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4277 }};
4278}
4279
4280static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
4281                                 MachineInstr *InsertPt,
4282                                 MachineRegisterInfo &MRI) {
4283 const TargetRegisterClass *DstRegClass;
4284 switch (Elts.size()) {
4285 case 8:
4286 DstRegClass = &AMDGPU::VReg_256RegClass;
4287 break;
4288 case 4:
4289 DstRegClass = &AMDGPU::VReg_128RegClass;
4290 break;
4291 case 2:
4292 DstRegClass = &AMDGPU::VReg_64RegClass;
4293 break;
4294 default:
4295 llvm_unreachable("unhandled Reg sequence size");
4296 }
4297
4298 MachineIRBuilder B(*InsertPt);
4299 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
4300 .addDef(MRI.createVirtualRegister(DstRegClass));
4301 for (unsigned i = 0; i < Elts.size(); ++i) {
4302 MIB.addReg(Elts[i]);
4303 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
4304 }
4305 return MIB->getOperand(0).getReg();
4306}
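// For illustration: four 32-bit elements produce
//   %seq:vreg_128 = REG_SEQUENCE %e0, %subreg.sub0, %e1, %subreg.sub1,
//                                %e2, %subreg.sub2, %e3, %subreg.sub3
// which the WMMA modifier helpers below use as a single wide source operand.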
4307
4308static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
4309                                 SmallVectorImpl<Register> &Elts, Register &Src,
4310                                 MachineInstr *InsertPt,
4311                                 MachineRegisterInfo &MRI) {
4312 if (ModOpcode == TargetOpcode::G_FNEG) {
4313 Mods |= SISrcMods::NEG;
4314 // Check if all elements also have abs modifier
4315 SmallVector<Register, 8> NegAbsElts;
4316 for (auto El : Elts) {
4317 Register FabsSrc;
4318 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
4319 break;
4320 NegAbsElts.push_back(FabsSrc);
4321 }
4322 if (Elts.size() != NegAbsElts.size()) {
4323 // Neg
4324 Src = buildRegSequence(Elts, InsertPt, MRI);
4325 } else {
4326 // Neg and Abs
4327 Mods |= SISrcMods::NEG_HI;
4328 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
4329 }
4330 } else {
4331 assert(ModOpcode == TargetOpcode::G_FABS);
4332 // Abs
4333 Mods |= SISrcMods::NEG_HI;
4334 Src = buildRegSequence(Elts, InsertPt, MRI);
4335 }
4336}
4337
4339AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
4340 Register Src = Root.getReg();
4341 unsigned Mods = SISrcMods::OP_SEL_1;
4342 SmallVector<Register, 8> EltsF32;
4343
4344 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
4345 assert(BV->getNumSources() > 0);
4346 // Based on first element decide which mod we match, neg or abs
4347 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
4348 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
4349 ? AMDGPU::G_FNEG
4350 : AMDGPU::G_FABS;
4351 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
4352 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
4353 if (ElF32->getOpcode() != ModOpcode)
4354 break;
4355 EltsF32.push_back(ElF32->getOperand(1).getReg());
4356 }
4357
4358 // All elements had ModOpcode modifier
4359 if (BV->getNumSources() == EltsF32.size()) {
4360 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
4361 *MRI);
4362 }
4363 }
4364
4365 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4366 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4367}
4368
4370AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
4371 Register Src = Root.getReg();
4372 unsigned Mods = SISrcMods::OP_SEL_1;
4373 SmallVector<Register, 8> EltsV2F16;
4374
4375 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4376 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4377 Register FNegSrc;
4378 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4379 break;
4380 EltsV2F16.push_back(FNegSrc);
4381 }
4382
4383 // All elements had ModOpcode modifier
4384 if (CV->getNumSources() == EltsV2F16.size()) {
4385 Mods |= SISrcMods::NEG;
4386 Mods |= SISrcMods::NEG_HI;
4387 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4388 }
4389 }
4390
4391 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4392 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4393}
4394
4396AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4397 Register Src = Root.getReg();
4398 unsigned Mods = SISrcMods::OP_SEL_1;
4399 SmallVector<Register, 8> EltsV2F16;
4400
4401 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4402 assert(CV->getNumSources() > 0);
4403 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4404 // Based on first element decide which mod we match, neg or abs
4405 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4406 ? AMDGPU::G_FNEG
4407 : AMDGPU::G_FABS;
4408
4409 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4410 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4411 if (ElV2F16->getOpcode() != ModOpcode)
4412 break;
4413 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4414 }
4415
4416 // All elements had ModOpcode modifier
4417 if (CV->getNumSources() == EltsV2F16.size()) {
4418 MachineIRBuilder B(*Root.getParent());
4419 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4420 *MRI);
4421 }
4422 }
4423
4424 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4425 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4426}
4427
4429AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4430 std::optional<FPValueAndVReg> FPValReg;
4431 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4432 if (TII.isInlineConstant(FPValReg->Value)) {
4433 return {{[=](MachineInstrBuilder &MIB) {
4434 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4435 }}};
4436 }
4437 // Non-inlineable splat floats should not fall through to the integer
4438 // immediate checks.
4439 return {};
4440 }
4441
4442 APInt ICst;
4443 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4444 if (TII.isInlineConstant(ICst)) {
4445 return {
4446 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4447 }
4448 }
4449
4450 return {};
4451}
4452
4454AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4455 Register Src =
4456 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4457 unsigned Key = 0;
4458
4459 Register ShiftSrc;
4460 std::optional<ValueAndVReg> ShiftAmt;
4461 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4462 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4463 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4464 Key = ShiftAmt->Value.getZExtValue() / 8;
4465 Src = ShiftSrc;
4466 }
4467
4468 return {{
4469 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4470 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4471 }};
4472}
4473
4475AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4476
4477 Register Src =
4478 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4479 unsigned Key = 0;
4480
4481 Register ShiftSrc;
4482 std::optional<ValueAndVReg> ShiftAmt;
4483 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4484 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4485 ShiftAmt->Value.getZExtValue() == 16) {
4486 Src = ShiftSrc;
4487 Key = 1;
4488 }
4489
4490 return {{
4491 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4492 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4493 }};
4494}
4495
4497AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4498 Register Src;
4499 unsigned Mods;
4500 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4501
4502 // FIXME: Handle op_sel
4503 return {{
4504 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4505 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4506 }};
4507}
4508
4510AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4511 Register Src;
4512 unsigned Mods;
4513 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4514 /*IsCanonicalizing=*/true,
4515 /*AllowAbs=*/false,
4516 /*OpSel=*/false);
4517
4518 return {{
4519 [=](MachineInstrBuilder &MIB) {
4520 MIB.addReg(
4521 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4522 },
4523 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4524 }};
4525}
4526
4528AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4529 Register Src;
4530 unsigned Mods;
4531 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4532 /*IsCanonicalizing=*/true,
4533 /*AllowAbs=*/false,
4534 /*OpSel=*/true);
4535
4536 return {{
4537 [=](MachineInstrBuilder &MIB) {
4538 MIB.addReg(
4539 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4540 },
4541 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4542 }};
4543}
4544
4545bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4546 Register &Base,
4547 Register *SOffset,
4548 int64_t *Offset) const {
4549 MachineInstr *MI = Root.getParent();
4550 MachineBasicBlock *MBB = MI->getParent();
4551
4552 // FIXME: We should shrink the GEP if the offset is known to fit in 32 bits;
4553 // then we can select all ptr + 32-bit offsets.
4554 SmallVector<GEPInfo, 4> AddrInfo;
4555 getAddrModeInfo(*MI, *MRI, AddrInfo);
4556
4557 if (AddrInfo.empty())
4558 return false;
4559
4560 const GEPInfo &GEPI = AddrInfo[0];
4561 std::optional<int64_t> EncodedImm;
4562
4563 if (SOffset && Offset) {
4564 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4565 /*HasSOffset=*/true);
4566 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4567 AddrInfo.size() > 1) {
4568 const GEPInfo &GEPI2 = AddrInfo[1];
4569 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4570 if (Register OffsetReg =
4571 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4572 Base = GEPI2.SgprParts[0];
4573 *SOffset = OffsetReg;
4574 *Offset = *EncodedImm;
4575 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4576 return true;
4577
4578 // For unbuffered smem loads, it is illegal for the Immediate Offset
4579 // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4580 // is negative. Handle the case where the Immediate Offset + SOffset
4581 // is negative.
4582 auto SKnown = KB->getKnownBits(*SOffset);
4583 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4584 return false;
4585
4586 return true;
4587 }
4588 }
4589 }
4590 return false;
4591 }
4592
4593 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4594 /*HasSOffset=*/false);
4595 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4596 Base = GEPI.SgprParts[0];
4597 *Offset = *EncodedImm;
4598 return true;
4599 }
4600
4601 // SGPR offset is unsigned.
4602 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4603 GEPI.Imm != 0) {
4604 // If we make it this far we have a load with a 32-bit immediate offset.
4605 // It is OK to select this using a sgpr offset, because we have already
4606 // failed trying to select this load into one of the _IMM variants since
4607 // the _IMM Patterns are considered before the _SGPR patterns.
4608 Base = GEPI.SgprParts[0];
4609 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4610 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4611 .addImm(GEPI.Imm);
4612 return true;
4613 }
4614
4615 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4616 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4617 Base = GEPI.SgprParts[0];
4618 *SOffset = OffsetReg;
4619 return true;
4620 }
4621 }
4622
4623 return false;
4624}
4625
4627AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4628 Register Base;
4629 int64_t Offset;
4630 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4631 return std::nullopt;
4632
4633 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4634 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4635}
4636
4638AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4639 SmallVector<GEPInfo, 4> AddrInfo;
4640 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4641
4642 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4643 return std::nullopt;
4644
4645 const GEPInfo &GEPInfo = AddrInfo[0];
4646 Register PtrReg = GEPInfo.SgprParts[0];
4647 std::optional<int64_t> EncodedImm =
4648 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4649 if (!EncodedImm)
4650 return std::nullopt;
4651
4652 return {{
4653 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4654 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4655 }};
4656}
4657
4659AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4660 Register Base, SOffset;
4661 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4662 return std::nullopt;
4663
4664 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4665 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4666}
4667
4669AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4670 Register Base, SOffset;
4671 int64_t Offset;
4672 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4673 return std::nullopt;
4674
4675 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4676 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4677 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4678}
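// For illustration, the renderers above correspond to the S_LOAD addressing
// forms (sketch; exact syntax and offset ranges are subtarget dependent):
//   selectSmrdImm     -> s_load_dword %d, %base, 0x20            ; imm only
//   selectSmrdSgpr    -> s_load_dword %d, %base, %soffset        ; SGPR only
//   selectSmrdSgprImm -> s_load_dword %d, %base, %soffset, 0x20  ; SGPR + imm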
4679
4680std::pair<Register, int>
4681AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4682 uint64_t FlatVariant) const {
4683 MachineInstr *MI = Root.getParent();
4684
4685 auto Default = std::pair(Root.getReg(), 0);
4686
4687 if (!STI.hasFlatInstOffsets())
4688 return Default;
4689
4690 Register PtrBase;
4691 int64_t ConstOffset;
4692 std::tie(PtrBase, ConstOffset) =
4693 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4694
4695 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4696 !isFlatScratchBaseLegal(Root.getReg())))
4697 return Default;
4698
4699 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4700 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4701 return Default;
4702
4703 return std::pair(PtrBase, ConstOffset);
4704}
4705
4707AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4708 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4709
4710 return {{
4711 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4712 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4713 }};
4714}
4715
4717AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4718 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4719
4720 return {{
4721 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4722 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4723 }};
4724}
4725
4727AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4728 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4729
4730 return {{
4731 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4732 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4733 }};
4734}
4735
4736// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4738AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4739 Register Addr = Root.getReg();
4740 Register PtrBase;
4741 int64_t ConstOffset;
4742 int64_t ImmOffset = 0;
4743
4744 // Match the immediate offset first, which canonically is moved as low as
4745 // possible.
4746 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4747
4748 if (ConstOffset != 0) {
4749 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4750                           SIInstrFlags::FlatGlobal)) {
4751 Addr = PtrBase;
4752 ImmOffset = ConstOffset;
4753 } else {
4754 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4755 if (isSGPR(PtrBaseDef->Reg)) {
4756 if (ConstOffset > 0) {
4757 // Offset is too large.
4758 //
4759 // saddr + large_offset -> saddr +
4760 // (voffset = large_offset & ~MaxOffset) +
4761 // (large_offset & MaxOffset);
4762 int64_t SplitImmOffset, RemainderOffset;
4763 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4764     ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4765
4766 if (isUInt<32>(RemainderOffset)) {
4767 MachineInstr *MI = Root.getParent();
4768 MachineBasicBlock *MBB = MI->getParent();
4769 Register HighBits =
4770 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4771
4772 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4773 HighBits)
4774 .addImm(RemainderOffset);
4775
4776 return {{
4777 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4778 [=](MachineInstrBuilder &MIB) {
4779 MIB.addReg(HighBits);
4780 }, // voffset
4781 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4782 }};
4783 }
4784 }
4785
4786 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4787 // is 1 we would need to perform 1 or 2 extra moves for each half of
4788 // the constant, and it is better to do a scalar add and then issue a
4789 // single VALU instruction to materialize zero. Otherwise it takes fewer
4790 // instructions to perform VALU adds with immediates or inline literals.
4791 unsigned NumLiterals =
4792 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
4793 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
4794 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4795 return std::nullopt;
4796 }
4797 }
4798 }
4799
4800 // Match the variable offset.
4801 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4802 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4803 // Look through the SGPR->VGPR copy.
4804 Register SAddr =
4805 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4806
4807 if (isSGPR(SAddr)) {
4808 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4809
4810 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4811 // inserted later.
4812 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4813 return {{[=](MachineInstrBuilder &MIB) { // saddr
4814 MIB.addReg(SAddr);
4815 },
4816 [=](MachineInstrBuilder &MIB) { // voffset
4817 MIB.addReg(VOffset);
4818 },
4819 [=](MachineInstrBuilder &MIB) { // offset
4820 MIB.addImm(ImmOffset);
4821 }}};
4822 }
4823 }
4824 }
4825
4826 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4827 // drop this.
4828 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4829 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4830 return std::nullopt;
4831
4832 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4833 // moves required to copy a 64-bit SGPR to VGPR.
4834 MachineInstr *MI = Root.getParent();
4835 MachineBasicBlock *MBB = MI->getParent();
4836 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4837
4838 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4839 .addImm(0);
4840
4841 return {{
4842 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4843 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4844 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4845 }};
4846}
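// Worked example for the large-offset split above (illustrative; the real
// maximum immediate depends on the subtarget). Assuming a 12-bit limit of
// 0xfff, an offset of 0x12345 is selected as
//   voffset = 0x12345 & ~0xfff = 0x12000   (materialized with V_MOV_B32_e32)
//   imm     = 0x12345 &  0xfff = 0x345
// so saddr + 0x12345 becomes saddr + voffset with a 0x345 instruction offset.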
4847
4849AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4850 Register Addr = Root.getReg();
4851 Register PtrBase;
4852 int64_t ConstOffset;
4853 int64_t ImmOffset = 0;
4854
4855 // Match the immediate offset first, which canonically is moved as low as
4856 // possible.
4857 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4858
4859 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4860     TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4861                           SIInstrFlags::FlatScratch)) {
4862 Addr = PtrBase;
4863 ImmOffset = ConstOffset;
4864 }
4865
4866 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4867 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4868 int FI = AddrDef->MI->getOperand(1).getIndex();
4869 return {{
4870 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4871 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4872 }};
4873 }
4874
4875 Register SAddr = AddrDef->Reg;
4876
4877 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4878 Register LHS = AddrDef->MI->getOperand(1).getReg();
4879 Register RHS = AddrDef->MI->getOperand(2).getReg();
4880 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4881 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4882
4883 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4884 isSGPR(RHSDef->Reg)) {
4885 int FI = LHSDef->MI->getOperand(1).getIndex();
4886 MachineInstr &I = *Root.getParent();
4887 MachineBasicBlock *BB = I.getParent();
4888 const DebugLoc &DL = I.getDebugLoc();
4889 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4890
4891 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4892 .addFrameIndex(FI)
4893 .addReg(RHSDef->Reg)
4894 .setOperandDead(3); // Dead scc
4895 }
4896 }
4897
4898 if (!isSGPR(SAddr))
4899 return std::nullopt;
4900
4901 return {{
4902 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4903 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4904 }};
4905}
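// For illustration: an address formed as (frame index %stack.N) + %sgpr_off is
// folded above into
//   %saddr:sreg_32 = S_ADD_I32 %stack.N, %sgpr_off
// so the scratch access can use the SADDR form with only an immediate left.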
4906
4907// Check whether the flat scratch SVS swizzle bug affects this access.
4908bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4909 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4910 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4911 return false;
4912
4913 // The bug affects the swizzling of SVS accesses if there is any carry out
4914 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4915 // voffset to (soffset + inst_offset).
4916 auto VKnown = KB->getKnownBits(VAddr);
4917 auto SKnown = KnownBits::add(KB->getKnownBits(SAddr),
4918 KnownBits::makeConstant(APInt(32, ImmOffset)));
4919 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4920 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4921 return (VMax & 3) + (SMax & 3) >= 4;
4922}
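// Worked example (illustrative): if the known maximum of VAddr can end in
// binary 10 (VMax & 3 == 2) and the maximum of SAddr + ImmOffset can also end
// in binary 10 (SMax & 3 == 2), then 2 + 2 >= 4, a carry out of bit 1 is
// possible, and the SVS form is rejected on subtargets with the swizzle bug.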
4923
4925AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4926 Register Addr = Root.getReg();
4927 Register PtrBase;
4928 int64_t ConstOffset;
4929 int64_t ImmOffset = 0;
4930
4931 // Match the immediate offset first, which canonically is moved as low as
4932 // possible.
4933 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4934
4935 Register OrigAddr = Addr;
4936 if (ConstOffset != 0 &&
4937 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4938 Addr = PtrBase;
4939 ImmOffset = ConstOffset;
4940 }
4941
4942 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4943 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4944 return std::nullopt;
4945
4946 Register RHS = AddrDef->MI->getOperand(2).getReg();
4947 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4948 return std::nullopt;
4949
4950 Register LHS = AddrDef->MI->getOperand(1).getReg();
4951 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4952
4953 if (OrigAddr != Addr) {
4954 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4955 return std::nullopt;
4956 } else {
4957 if (!isFlatScratchBaseLegalSV(OrigAddr))
4958 return std::nullopt;
4959 }
4960
4961 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4962 return std::nullopt;
4963
4964 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4965 int FI = LHSDef->MI->getOperand(1).getIndex();
4966 return {{
4967 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4968 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4969 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4970 }};
4971 }
4972
4973 if (!isSGPR(LHS))
4974 return std::nullopt;
4975
4976 return {{
4977 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4978 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4979 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4980 }};
4981}
4982
4983InstructionSelector::ComplexRendererFns
4984AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4985 MachineInstr *MI = Root.getParent();
4986 MachineBasicBlock *MBB = MI->getParent();
4987 MachineFunction *MF = MBB->getParent();
4988 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4989
4990 int64_t Offset = 0;
4991 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4992 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
4993 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4994
4995 // TODO: Should this be inside the render function? The iterator seems to
4996 // move.
4997 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4998 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4999 HighBits)
5000 .addImm(Offset & ~MaxOffset);
5001
5002 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5003 MIB.addReg(Info->getScratchRSrcReg());
5004 },
5005 [=](MachineInstrBuilder &MIB) { // vaddr
5006 MIB.addReg(HighBits);
5007 },
5008 [=](MachineInstrBuilder &MIB) { // soffset
5009 // Use constant zero for soffset and rely on eliminateFrameIndex
5010 // to choose the appropriate frame register if need be.
5011 MIB.addImm(0);
5012 },
5013 [=](MachineInstrBuilder &MIB) { // offset
5014 MIB.addImm(Offset & MaxOffset);
5015 }}};
5016 }
5017
5018 assert(Offset == 0 || Offset == -1);
5019
5020 // Try to fold a frame index directly into the MUBUF vaddr field, and any
5021 // offsets.
5022 std::optional<int> FI;
5023 Register VAddr = Root.getReg();
5024
5025 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5026 Register PtrBase;
5027 int64_t ConstOffset;
5028 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
5029 if (ConstOffset != 0) {
5030 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
5031 (!STI.privateMemoryResourceIsRangeChecked() ||
5032 KB->signBitIsZero(PtrBase))) {
5033 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
5034 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
5035 FI = PtrBaseDef->getOperand(1).getIndex();
5036 else
5037 VAddr = PtrBase;
5038 Offset = ConstOffset;
5039 }
5040 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5041 FI = RootDef->getOperand(1).getIndex();
5042 }
5043
5044 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5045 MIB.addReg(Info->getScratchRSrcReg());
5046 },
5047 [=](MachineInstrBuilder &MIB) { // vaddr
5048 if (FI)
5049 MIB.addFrameIndex(*FI);
5050 else
5051 MIB.addReg(VAddr);
5052 },
5053 [=](MachineInstrBuilder &MIB) { // soffset
5054 // Use constant zero for soffset and rely on eliminateFrameIndex
5055 // to choose the appropriate frame register if need be.
5056 MIB.addImm(0);
5057 },
5058 [=](MachineInstrBuilder &MIB) { // offset
5059 MIB.addImm(Offset);
5060 }}};
5061}
5062
5063bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
5064 int64_t Offset) const {
5065 if (!isUInt<16>(Offset))
5066 return false;
5067
5068 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5069 return true;
5070
5071 // On Southern Islands, instructions with a negative base value and an offset
5072 // don't seem to work.
5073 return KB->signBitIsZero(Base);
5074}
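// Example (illustrative values): an offset of 65535 passes the isUInt<16>
// check while 65536 does not; on subtargets without a usable DS offset (and
// with unsafe folding disabled) the base must additionally have a known-zero
// sign bit.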
5075
5076bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
5077 int64_t Offset1,
5078 unsigned Size) const {
5079 if (Offset0 % Size != 0 || Offset1 % Size != 0)
5080 return false;
5081 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
5082 return false;
5083
5084 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5085 return true;
5086
5087 // On Southern Islands, instructions with a negative base value and an offset
5088 // don't seem to work.
5089 return KB->signBitIsZero(Base);
5090}
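// Example (illustrative values): with Size = 4, offsets 0 and 1020 are legal
// (1020 / 4 = 255 fits in 8 bits), while offsets 2 or 1024 are rejected
// (2 is not a multiple of 4, and 1024 / 4 = 256 does not fit in 8 bits).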
5091
5092// Return whether the operation has the NoUnsignedWrap property.
5093static bool isNoUnsignedWrap(MachineInstr *Addr) {
5094 return Addr->getOpcode() == TargetOpcode::G_OR ||
5095 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
5096 Addr->getFlag(MachineInstr::NoUWrap));
5097}
5098
5099// Check that the base address of a flat scratch load/store in the form of
5100// `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per the
5101// hardware requirement). We always treat the first operand as the base address.
5102bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
5103 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5104
5105 if (isNoUnsignedWrap(AddrMI))
5106 return true;
5107
5108 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5109 // values.
5110 if (STI.hasSignedScratchOffsets())
5111 return true;
5112
5113 Register LHS = AddrMI->getOperand(1).getReg();
5114 Register RHS = AddrMI->getOperand(2).getReg();
5115
5116 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
5117 std::optional<ValueAndVReg> RhsValReg =
5118 getIConstantVRegValWithLookThrough(RHS, *MRI);
5119 // If the immediate offset is negative and within a certain range, the base
5120 // address cannot also be negative. If the base is also negative, the sum
5121 // would be either negative or much larger than the valid range of scratch
5122 // memory a thread can access.
5123 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
5124 RhsValReg->Value.getSExtValue() > -0x40000000)
5125 return true;
5126 }
5127
5128 return KB->signBitIsZero(LHS);
5129}
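// Illustrative case (hypothetical operands): for (G_PTR_ADD %base, -16) the
// constant is negative but greater than -0x40000000, so a negative %base would
// leave the sum negative or far outside the scratch range; the address is
// accepted without proving the sign bit of %base is zero.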
5130
5131// Check that the address values in the SGPR/VGPR are legal for flat scratch in
5132// the form: SGPR + VGPR.
5133bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
5134 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5135
5136 if (isNoUnsignedWrap(AddrMI))
5137 return true;
5138
5139 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5140 // values.
5141 if (STI.hasSignedScratchOffsets())
5142 return true;
5143
5144 Register LHS = AddrMI->getOperand(1).getReg();
5145 Register RHS = AddrMI->getOperand(2).getReg();
5146 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
5147}
5148
5149// Check that the address values in the SGPR/VGPR are legal for flat scratch in
5150// the form: SGPR + VGPR + Imm.
5151bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
5152 Register Addr) const {
5153 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5154 // values.
5155 if (STI.hasSignedScratchOffsets())
5156 return true;
5157
5158 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5159 Register Base = AddrMI->getOperand(1).getReg();
5160 std::optional<DefinitionAndSourceRegister> BaseDef =
5161 getDefSrcRegIgnoringCopies(Base, *MRI);
5162 std::optional<ValueAndVReg> RHSOffset =
5163 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
5164 assert(RHSOffset);
5165
5166 // If the immediate offset is negative and within a certain range, the base
5167 // address cannot also be negative. If the base is also negative, the sum
5168 // would be either negative or much larger than the valid range of scratch
5169 // memory a thread can access.
5170 if (isNoUnsignedWrap(BaseDef->MI) &&
5171 (isNoUnsignedWrap(AddrMI) ||
5172 (RHSOffset->Value.getSExtValue() < 0 &&
5173 RHSOffset->Value.getSExtValue() > -0x40000000)))
5174 return true;
5175
5176 Register LHS = BaseDef->MI->getOperand(1).getReg();
5177 Register RHS = BaseDef->MI->getOperand(2).getReg();
5178 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
5179}
5180
5181bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
5182 unsigned ShAmtBits) const {
5183 assert(MI.getOpcode() == TargetOpcode::G_AND);
5184
5185 std::optional<APInt> RHS =
5186 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
5187 if (!RHS)
5188 return false;
5189
5190 if (RHS->countr_one() >= ShAmtBits)
5191 return true;
5192
5193 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
5194 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
5195}
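// Example (illustrative): for a 32-bit shift ShAmtBits is 5, so a mask of 31
// (five trailing ones) is always unneeded; a mask of 15 only is if known bits
// show the masked-off amount bits are already zero.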
5196
5198AMDGPUInstructionSelector::selectMUBUFScratchOffset(
5199 MachineOperand &Root) const {
5200 Register Reg = Root.getReg();
5201 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5202
5203 std::optional<DefinitionAndSourceRegister> Def =
5204 getDefSrcRegIgnoringCopies(Reg, *MRI);
5205 assert(Def && "this shouldn't be an optional result");
5206 Reg = Def->Reg;
5207
5208 if (Register WaveBase = getWaveAddress(Def->MI)) {
5209 return {{
5210 [=](MachineInstrBuilder &MIB) { // rsrc
5211 MIB.addReg(Info->getScratchRSrcReg());
5212 },
5213 [=](MachineInstrBuilder &MIB) { // soffset
5214 MIB.addReg(WaveBase);
5215 },
5216 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
5217 }};
5218 }
5219
5220 int64_t Offset = 0;
5221
5222 // FIXME: Copy check is a hack
5223 Register BasePtr;
5224 if (mi_match(Reg, *MRI,
5225 m_GPtrAdd(m_Reg(BasePtr),
5226 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
5227 if (!TII.isLegalMUBUFImmOffset(Offset))
5228 return {};
5229 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
5230 Register WaveBase = getWaveAddress(BasePtrDef);
5231 if (!WaveBase)
5232 return {};
5233
5234 return {{
5235 [=](MachineInstrBuilder &MIB) { // rsrc
5236 MIB.addReg(Info->getScratchRSrcReg());
5237 },
5238 [=](MachineInstrBuilder &MIB) { // soffset
5239 MIB.addReg(WaveBase);
5240 },
5241 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5242 }};
5243 }
5244
5245 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
5246 !TII.isLegalMUBUFImmOffset(Offset))
5247 return {};
5248
5249 return {{
5250 [=](MachineInstrBuilder &MIB) { // rsrc
5251 MIB.addReg(Info->getScratchRSrcReg());
5252 },
5253 [=](MachineInstrBuilder &MIB) { // soffset
5254 MIB.addImm(0);
5255 },
5256 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5257 }};
5258}
5259
5260std::pair<Register, unsigned>
5261AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
5262 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5263 int64_t ConstAddr = 0;
5264
5265 Register PtrBase;
5266 int64_t Offset;
5267 std::tie(PtrBase, Offset) =
5268 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5269
5270 if (Offset) {
5271 if (isDSOffsetLegal(PtrBase, Offset)) {
5272 // (add n0, c0)
5273 return std::pair(PtrBase, Offset);
5274 }
5275 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5276 // TODO
5277
5278
5279 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5280 // TODO
5281
5282 }
5283
5284 return std::pair(Root.getReg(), 0);
5285}
5286
5287InstructionSelector::ComplexRendererFns
5288AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
5289 Register Reg;
5290 unsigned Offset;
5291 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
5292 return {{
5293 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5294 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
5295 }};
5296}
5297
5298InstructionSelector::ComplexRendererFns
5299AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
5300 return selectDSReadWrite2(Root, 4);
5301}
5302
5303InstructionSelector::ComplexRendererFns
5304AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
5305 return selectDSReadWrite2(Root, 8);
5306}
5307
5308InstructionSelector::ComplexRendererFns
5309AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
5310 unsigned Size) const {
5311 Register Reg;
5312 unsigned Offset;
5313 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
5314 return {{
5315 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5316 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5317 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
5318 }};
5319}
5320
5321std::pair<Register, unsigned>
5322AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
5323 unsigned Size) const {
5324 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5325 int64_t ConstAddr = 0;
5326
5327 Register PtrBase;
5328 int64_t Offset;
5329 std::tie(PtrBase, Offset) =
5330 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5331
5332 if (Offset) {
5333 int64_t OffsetValue0 = Offset;
5334 int64_t OffsetValue1 = Offset + Size;
5335 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
5336 // (add n0, c0)
5337 return std::pair(PtrBase, OffsetValue0 / Size);
5338 }
5339 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5340 // TODO
5341
5342 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5343 // TODO
5344
5345 }
5346
5347 return std::pair(Root.getReg(), 0);
5348}
5349
5350/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
5351/// the base value with the constant offset. There may be intervening copies
5352/// between \p Root and the identified constant. Returns \p Root, 0 if this does
5353/// not match the pattern.
5354std::pair<Register, int64_t>
5355AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
5356 Register Root, const MachineRegisterInfo &MRI) const {
5357 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
5358 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
5359 return {Root, 0};
5360
5361 MachineOperand &RHS = RootI->getOperand(2);
5362 std::optional<ValueAndVReg> MaybeOffset =
5363 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5364 if (!MaybeOffset)
5365 return {Root, 0};
5366 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
5367}
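// Example (hypothetical MIR): given
//   %c:_(s32) = G_CONSTANT i32 16
//   %p:_(p5) = G_PTR_ADD %base, %c
// this returns {%base, 16}; if the addend is not constant it returns {Root, 0}.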
5368
5369static void addZeroImm(MachineInstrBuilder &MIB) {
5370 MIB.addImm(0);
5371}
5372
5373/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5374/// BasePtr is not valid, a null base pointer will be used.
5375static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5376 uint32_t FormatLo, uint32_t FormatHi,
5377 Register BasePtr) {
5378 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5379 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5380 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5381 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5382
5383 B.buildInstr(AMDGPU::S_MOV_B32)
5384 .addDef(RSrc2)
5385 .addImm(FormatLo);
5386 B.buildInstr(AMDGPU::S_MOV_B32)
5387 .addDef(RSrc3)
5388 .addImm(FormatHi);
5389
5390 // Build the subregister half holding the constants before building the
5391 // full 128-bit register. If we are building multiple resource descriptors,
5392 // this will allow CSEing of the 2-component register.
5393 B.buildInstr(AMDGPU::REG_SEQUENCE)
5394 .addDef(RSrcHi)
5395 .addReg(RSrc2)
5396 .addImm(AMDGPU::sub0)
5397 .addReg(RSrc3)
5398 .addImm(AMDGPU::sub1);
5399
5400 Register RSrcLo = BasePtr;
5401 if (!BasePtr) {
5402 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5403 B.buildInstr(AMDGPU::S_MOV_B64)
5404 .addDef(RSrcLo)
5405 .addImm(0);
5406 }
5407
5408 B.buildInstr(AMDGPU::REG_SEQUENCE)
5409 .addDef(RSrc)
5410 .addReg(RSrcLo)
5411 .addImm(AMDGPU::sub0_sub1)
5412 .addReg(RSrcHi)
5413 .addImm(AMDGPU::sub2_sub3);
5414
5415 return RSrc;
5416}
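// Resulting layout (sketch): sub0_sub1 of the descriptor holds the 64-bit base
// pointer (or 0 when BasePtr is invalid) and sub2_sub3 holds the
// FormatLo/FormatHi words, forming the 128-bit SGPR resource used by the MUBUF
// selection routines below.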
5417
5418static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5419 const SIInstrInfo &TII, Register BasePtr) {
5420 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5421
5422 // FIXME: Why are half the "default" bits ignored based on the addressing
5423 // mode?
5424 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5425}
5426
5427static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5428 const SIInstrInfo &TII, Register BasePtr) {
5429 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5430
5431 // FIXME: Why are half the "default" bits ignored based on the addressing
5432 // mode?
5433 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5434}
5435
5436AMDGPUInstructionSelector::MUBUFAddressData
5437AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5438 MUBUFAddressData Data;
5439 Data.N0 = Src;
5440
5441 Register PtrBase;
5442 int64_t Offset;
5443
5444 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5445 if (isUInt<32>(Offset)) {
5446 Data.N0 = PtrBase;
5447 Data.Offset = Offset;
5448 }
5449
5450 if (MachineInstr *InputAdd
5451 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5452 Data.N2 = InputAdd->getOperand(1).getReg();
5453 Data.N3 = InputAdd->getOperand(2).getReg();
5454
5455 // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted
5456 // FIXME: Don't know that this was defined by operand 0
5457 //
5458 // TODO: Remove this when we have copy folding optimizations after
5459 // RegBankSelect.
5460 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5461 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5462 }
5463
5464 return Data;
5465}
5466
5467/// Return whether the addr64 MUBUF mode should be used for the given address.
5468bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5469 // (ptr_add N2, N3) -> addr64, or
5470 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5471 if (Addr.N2)
5472 return true;
5473
5474 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5475 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5476}
5477
5478/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5479/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5480/// component.
5481void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5482 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5483 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5484 return;
5485
5486 // Illegal offset, store it in soffset.
5487 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5488 B.buildInstr(AMDGPU::S_MOV_B32)
5489 .addDef(SOffset)
5490 .addImm(ImmOffset);
5491 ImmOffset = 0;
5492}
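// Illustrative example: an ImmOffset such as 0x10000 does not fit the MUBUF
// immediate field, so it is moved into SOffset via S_MOV_B32 and the immediate
// is reset to 0; a small offset like 8 is left untouched.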
5493
5494bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5495 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5496 Register &SOffset, int64_t &Offset) const {
5497 // FIXME: Predicates should stop this from reaching here.
5498 // addr64 bit was removed for volcanic islands.
5499 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5500 return false;
5501
5502 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5503 if (!shouldUseAddr64(AddrData))
5504 return false;
5505
5506 Register N0 = AddrData.N0;
5507 Register N2 = AddrData.N2;
5508 Register N3 = AddrData.N3;
5509 Offset = AddrData.Offset;
5510
5511 // Base pointer for the SRD.
5512 Register SRDPtr;
5513
5514 if (N2) {
5515 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5516 assert(N3);
5517 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5518 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5519 // addr64, and construct the default resource from a 0 address.
5520 VAddr = N0;
5521 } else {
5522 SRDPtr = N3;
5523 VAddr = N2;
5524 }
5525 } else {
5526 // N2 is not divergent.
5527 SRDPtr = N2;
5528 VAddr = N3;
5529 }
5530 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5531 // Use the default null pointer in the resource
5532 VAddr = N0;
5533 } else {
5534 // N0 -> offset, or
5535 // (N0 + C1) -> offset
5536 SRDPtr = N0;
5537 }
5538
5539 MachineIRBuilder B(*Root.getParent());
5540 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5541 splitIllegalMUBUFOffset(B, SOffset, Offset);
5542 return true;
5543}
5544
5545bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5546 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5547 int64_t &Offset) const {
5548
5549 // FIXME: Pattern should not reach here.
5550 if (STI.useFlatForGlobal())
5551 return false;
5552
5553 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5554 if (shouldUseAddr64(AddrData))
5555 return false;
5556
5557 // N0 -> offset, or
5558 // (N0 + C1) -> offset
5559 Register SRDPtr = AddrData.N0;
5560 Offset = AddrData.Offset;
5561
5562 // TODO: Look through extensions for 32-bit soffset.
5563 MachineIRBuilder B(*Root.getParent());
5564
5565 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5566 splitIllegalMUBUFOffset(B, SOffset, Offset);
5567 return true;
5568}
5569
5570InstructionSelector::ComplexRendererFns
5571AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5572 Register VAddr;
5573 Register RSrcReg;
5574 Register SOffset;
5575 int64_t Offset = 0;
5576
5577 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5578 return {};
5579
5580 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5581 // pattern.
5582 return {{
5583 [=](MachineInstrBuilder &MIB) { // rsrc
5584 MIB.addReg(RSrcReg);
5585 },
5586 [=](MachineInstrBuilder &MIB) { // vaddr
5587 MIB.addReg(VAddr);
5588 },
5589 [=](MachineInstrBuilder &MIB) { // soffset
5590 if (SOffset)
5591 MIB.addReg(SOffset);
5592 else if (STI.hasRestrictedSOffset())
5593 MIB.addReg(AMDGPU::SGPR_NULL);
5594 else
5595 MIB.addImm(0);
5596 },
5597 [=](MachineInstrBuilder &MIB) { // offset
5598 MIB.addImm(Offset);
5599 },
5600 addZeroImm, // cpol
5601 addZeroImm, // tfe
5602 addZeroImm // swz
5603 }};
5604}
5605
5606InstructionSelector::ComplexRendererFns
5607AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5608 Register RSrcReg;
5609 Register SOffset;
5610 int64_t Offset = 0;
5611
5612 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5613 return {};
5614
5615 return {{
5616 [=](MachineInstrBuilder &MIB) { // rsrc
5617 MIB.addReg(RSrcReg);
5618 },
5619 [=](MachineInstrBuilder &MIB) { // soffset
5620 if (SOffset)
5621 MIB.addReg(SOffset);
5622 else if (STI.hasRestrictedSOffset())
5623 MIB.addReg(AMDGPU::SGPR_NULL);
5624 else
5625 MIB.addImm(0);
5626 },
5627 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5628 addZeroImm, // cpol
5629 addZeroImm, // tfe
5630 addZeroImm, // swz
5631 }};
5632}
5633
5634InstructionSelector::ComplexRendererFns
5635AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5636
5637 Register SOffset = Root.getReg();
5638
5639 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5640 SOffset = AMDGPU::SGPR_NULL;
5641
5642 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5643}
5644
5645/// Get an immediate that must be 32 bits, and is treated as zero extended.
5646static std::optional<uint64_t>
5647getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5648 // getIConstantVRegVal sexts any values, so see if that matters.
5649 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5650 if (!OffsetVal || !isInt<32>(*OffsetVal))
5651 return std::nullopt;
5652 return Lo_32(*OffsetVal);
5653}
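// Example (illustrative): a G_CONSTANT of -1 sign-extends to -1, passes the
// isInt<32> check, and is returned as Lo_32(-1) = 0xffffffff; a value needing
// more than 32 signed bits, e.g. 0x100000000, yields std::nullopt.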
5654
5655InstructionSelector::ComplexRendererFns
5656AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5657 std::optional<uint64_t> OffsetVal =
5658 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
5659 if (!OffsetVal)
5660 return {};
5661
5662 std::optional<int64_t> EncodedImm =
5663 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5664 if (!EncodedImm)
5665 return {};
5666
5667 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5668}
5669
5670InstructionSelector::ComplexRendererFns
5671AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5672 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
5673
5674 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5675 if (!OffsetVal)
5676 return {};
5677
5678 std::optional<int64_t> EncodedImm =
5679 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5680 if (!EncodedImm)
5681 return {};
5682
5683 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5684}
5685
5686InstructionSelector::ComplexRendererFns
5687AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5688 // Match the (soffset + offset) pair as a 32-bit register base and
5689 // an immediate offset.
5690 Register SOffset;
5691 unsigned Offset;
5692 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5693 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5694 if (!SOffset)
5695 return std::nullopt;
5696
5697 std::optional<int64_t> EncodedOffset =
5698 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5699 if (!EncodedOffset)
5700 return std::nullopt;
5701
5702 assert(MRI->getType(SOffset) == LLT::scalar(32));
5703 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5704 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5705}
5706
5707std::pair<Register, unsigned>
5708AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5709 bool &Matched) const {
5710 Matched = false;
5711
5712 Register Src;
5713 unsigned Mods;
5714 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5715
5716 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
5717 assert(MRI->getType(Src) == LLT::scalar(16));
5718
5719 // Only change Src if a source modifier could be gained. In such cases the new
5720 // Src could be an SGPR, but this does not violate the constant bus restriction
5721 // for the instruction that is being selected.
5722 Src = stripBitCast(Src, *MRI);
5723
5724 const auto CheckAbsNeg = [&]() {
5725 // Be careful about folding modifiers if we already have an abs. fneg is
5726 // applied last, so we don't want to apply an earlier fneg.
5727 if ((Mods & SISrcMods::ABS) == 0) {
5728 unsigned ModsTmp;
5729 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
5730
5731 if ((ModsTmp & SISrcMods::NEG) != 0)
5732 Mods ^= SISrcMods::NEG;
5733
5734 if ((ModsTmp & SISrcMods::ABS) != 0)
5735 Mods |= SISrcMods::ABS;
5736 }
5737 };
5738
5739 CheckAbsNeg();
5740
5741 // op_sel/op_sel_hi decide the source type and source.
5742 // If the source's op_sel_hi is set, it indicates to do a conversion from
5743 // fp16. If the source's op_sel is set, it picks the high half of the
5744 // source register.
5745
5746 Mods |= SISrcMods::OP_SEL_1;
5747
5748 if (isExtractHiElt(*MRI, Src, Src)) {
5749 Mods |= SISrcMods::OP_SEL_0;
5750 CheckAbsNeg();
5751 }
5752
5753 Matched = true;
5754 }
5755
5756 return {Src, Mods};
5757}
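// Illustrative outcome (hypothetical input): for an operand defined by
// (G_FPEXT %hi16) where %hi16 is the high half of a 32-bit register, Mods ends
// up with OP_SEL_1 (treat the source as fp16) plus OP_SEL_0 (take the high
// half), and Matched is set to true.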
5758
5759InstructionSelector::ComplexRendererFns
5760AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5761 MachineOperand &Root) const {
5762 Register Src;
5763 unsigned Mods;
5764 bool Matched;
5765 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5766 if (!Matched)
5767 return {};
5768
5769 return {{
5770 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5771 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5772 }};
5773}
5774
5775InstructionSelector::ComplexRendererFns
5776AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5777 Register Src;
5778 unsigned Mods;
5779 bool Matched;
5780 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5781
5782 return {{
5783 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5784 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5785 }};
5786}
5787
5788bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5789 MachineInstr &I, Intrinsic::ID IntrID) const {
5790 MachineBasicBlock *MBB = I.getParent();
5791 const DebugLoc &DL = I.getDebugLoc();
5792 Register CCReg = I.getOperand(0).getReg();
5793
5794 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5795 .addImm(I.getOperand(2).getImm());
5796
5797 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5798
5799 I.eraseFromParent();
5800 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5801 *MRI);
5802}
5803
5804bool AMDGPUInstructionSelector::selectSGetBarrierState(
5805 MachineInstr &I, Intrinsic::ID IntrID) const {
5806 MachineBasicBlock *MBB = I.getParent();
5807 const DebugLoc &DL = I.getDebugLoc();
5808 MachineOperand BarOp = I.getOperand(2);
5809 std::optional<int64_t> BarValImm =
5810 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5811
5812 if (!BarValImm) {
5813 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5814 .addReg(BarOp.getReg());
5815 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5816 }
5817 MachineInstrBuilder MIB;
5818 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
5819 : AMDGPU::S_GET_BARRIER_STATE_M0;
5820 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5821
5822 auto DstReg = I.getOperand(0).getReg();
5823 const TargetRegisterClass *DstRC =
5824 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
5825 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
5826 return false;
5827 MIB.addDef(DstReg);
5828 if (BarValImm) {
5829 MIB.addImm(*BarValImm);
5830 }
5831 I.eraseFromParent();
5832 return true;
5833}
5834
5835unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5836 if (HasInlineConst) {
5837 switch (IntrID) {
5838 default:
5839 llvm_unreachable("not a named barrier op");
5840 case Intrinsic::amdgcn_s_barrier_join:
5841 return AMDGPU::S_BARRIER_JOIN_IMM;
5842 case Intrinsic::amdgcn_s_wakeup_barrier:
5843 return AMDGPU::S_WAKEUP_BARRIER_IMM;
5844 case Intrinsic::amdgcn_s_get_named_barrier_state:
5845 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5846 };
5847 } else {
5848 switch (IntrID) {
5849 default:
5850 llvm_unreachable("not a named barrier op");
5851 case Intrinsic::amdgcn_s_barrier_join:
5852 return AMDGPU::S_BARRIER_JOIN_M0;
5853 case Intrinsic::amdgcn_s_wakeup_barrier:
5854 return AMDGPU::S_WAKEUP_BARRIER_M0;
5855 case Intrinsic::amdgcn_s_get_named_barrier_state:
5856 return AMDGPU::S_GET_BARRIER_STATE_M0;
5857 };
5858 }
5859}
5860
5861bool AMDGPUInstructionSelector::selectNamedBarrierInit(
5862 MachineInstr &I, Intrinsic::ID IntrID) const {
5863 MachineBasicBlock *MBB = I.getParent();
5864 const DebugLoc &DL = I.getDebugLoc();
5865 MachineOperand BarOp = I.getOperand(1);
5866 MachineOperand CntOp = I.getOperand(2);
5867
5868 // BarID = (BarOp >> 4) & 0x3F
5869 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5870 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
5871 .add(BarOp)
5872 .addImm(4u)
5873 .setOperandDead(3); // Dead scc
5874
5875 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5876 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
5877 .addReg(TmpReg0)
5878 .addImm(0x3F)
5879 .setOperandDead(3); // Dead scc
5880
5881 // MO = ((CntOp & 0x3F) << shAmt) | BarID
5882 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5883 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
5884 .add(CntOp)
5885 .addImm(0x3F)
5886 .setOperandDead(3); // Dead scc
5887
5888 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5889 constexpr unsigned ShAmt = 16;
5890 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
5891 .addReg(TmpReg2)
5892 .addImm(ShAmt)
5893 .setOperandDead(3); // Dead scc
5894
5895 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5896 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
5897 .addReg(TmpReg1)
5898 .addReg(TmpReg3)
5899 .setOperandDead(3); // Dead scc;
5900
5901 auto CopyMIB =
5902 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
5903 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5904
5905 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
5906 ? AMDGPU::S_BARRIER_INIT_M0
5907 : AMDGPU::S_BARRIER_SIGNAL_M0;
5908 MachineInstrBuilder MIB;
5909 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5910
5911 I.eraseFromParent();
5912 return true;
5913}
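// Worked example (hypothetical immediates): BarOp = 0x230 and CntOp = 7 give
// BarID = (0x230 >> 4) & 0x3F = 0x23 and
// M0 = ((7 & 0x3F) << 16) | 0x23 = 0x70023, matching the layout in the
// comments above.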
5914
5915bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5916 MachineInstr &I, Intrinsic::ID IntrID) const {
5917 MachineBasicBlock *MBB = I.getParent();
5918 const DebugLoc &DL = I.getDebugLoc();
5919 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
5920 ? I.getOperand(2)
5921 : I.getOperand(1);
5922 std::optional<int64_t> BarValImm =
5923 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5924
5925 if (!BarValImm) {
5926 // BarID = (BarOp >> 4) & 0x3F
5927 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5928 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
5929 .addReg(BarOp.getReg())
5930 .addImm(4u)
5931 .setOperandDead(3); // Dead scc;
5932
5933 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5934 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
5935 .addReg(TmpReg0)
5936 .addImm(0x3F)
5937 .setOperandDead(3); // Dead scc;
5938
5939 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5940 .addReg(TmpReg1);
5941 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5942 }
5943
5944 MachineInstrBuilder MIB;
5945 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5946 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5947
5948 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
5949 auto DstReg = I.getOperand(0).getReg();
5950 const TargetRegisterClass *DstRC =
5951 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
5952 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
5953 return false;
5954 MIB.addDef(DstReg);
5955 }
5956
5957 if (BarValImm) {
5958 auto BarId = ((*BarValImm) >> 4) & 0x3F;
5959 MIB.addImm(BarId);
5960 }
5961
5962 I.eraseFromParent();
5963 return true;
5964}
5965
5966void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5967 const MachineInstr &MI,
5968 int OpIdx) const {
5969 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5970 "Expected G_CONSTANT");
5971 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5972}
5973
5974void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5975 const MachineInstr &MI,
5976 int OpIdx) const {
5977 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5978 "Expected G_CONSTANT");
5979 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5980}
5981
5982void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
5983 const MachineInstr &MI,
5984 int OpIdx) const {
5985 const MachineOperand &Op = MI.getOperand(1);
5986 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
5987 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5988}
5989
5990void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5991 const MachineInstr &MI,
5992 int OpIdx) const {
5993 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5994 "Expected G_CONSTANT");
5995 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5996}
5997
5998/// This only really exists to satisfy DAG type checking machinery, so is a
5999/// no-op here.
6000void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
6001 const MachineInstr &MI,
6002 int OpIdx) const {
6003 const MachineOperand &Op = MI.getOperand(OpIdx);
6004 int64_t Imm;
6005 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
6006 MIB.addImm(Imm);
6007 else
6008 MIB.addImm(Op.getImm());
6009}
6010
6011void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
6012 const MachineInstr &MI,
6013 int OpIdx) const {
6014 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
6015}
6016
6017void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
6018 const MachineInstr &MI,
6019 int OpIdx) const {
6020 assert(OpIdx >= 0 && "expected to match an immediate operand");
6021 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6022}
6023
6024void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
6025 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6026 assert(OpIdx >= 0 && "expected to match an immediate operand");
6027 MIB.addImm(
6028 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6029}
6030
6031void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
6032 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6033 assert(OpIdx >= 0 && "expected to match an immediate operand");
6034 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
6035 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
6036 : (int64_t)SISrcMods::DST_OP_SEL);
6037}
6038
6039void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
6040 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6041 assert(OpIdx >= 0 && "expected to match an immediate operand");
6042 MIB.addImm(
6043 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6044}
6045
6046void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
6047 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6048 assert(OpIdx >= 0 && "expected to match an immediate operand");
6049 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
6050 ? (int64_t)(SISrcMods::OP_SEL_0)
6051 : 0);
6052}
6053
6054void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
6055 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6056 assert(OpIdx >= 0 && "expected to match an immediate operand");
6057 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
6058 : 0);
6059}
6060
6061void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
6062 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6063 assert(OpIdx >= 0 && "expected to match an immediate operand");
6064 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
6065 : 0);
6066}
6067
6068void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
6069 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6070 assert(OpIdx >= 0 && "expected to match an immediate operand");
6071 MIB.addImm(
6072 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6073}
6074
6075void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
6076 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6077 assert(OpIdx >= 0 && "expected to match an immediate operand");
6078 MIB.addImm(
6079 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0);
6080}
6081
6082void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
6083 const MachineInstr &MI,
6084 int OpIdx) const {
6085 assert(OpIdx >= 0 && "expected to match an immediate operand");
6086 MIB.addImm(MI.getOperand(OpIdx).getImm() &
6087 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
6088 : AMDGPU::CPol::ALL_pregfx12));
6089}
6090
6091void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
6092 const MachineInstr &MI,
6093 int OpIdx) const {
6094 assert(OpIdx >= 0 && "expected to match an immediate operand");
6095 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
6096 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
6097 : AMDGPU::CPol::SWZ_pregfx12);
6098 MIB.addImm(Swizzle);
6099}
6100
6101void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
6102 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6103 assert(OpIdx >= 0 && "expected to match an immediate operand");
6104 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
6105 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
6106 : AMDGPU::CPol::ALL_pregfx12);
6107 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
6108}
6109
6110void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
6111 const MachineInstr &MI,
6112 int OpIdx) const {
6113 MIB.addFrameIndex(MI.getOperand(1).getIndex());
6114}
6115
6116void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
6117 const MachineInstr &MI,
6118 int OpIdx) const {
6119 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
6120 int ExpVal = APF.getExactLog2Abs();
6121 assert(ExpVal != INT_MIN);
6122 MIB.addImm(ExpVal);
6123}
6124
6125void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
6126 const MachineInstr &MI,
6127 int OpIdx) const {
6128 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
6129 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
6130 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
6131 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
6132 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
6133}
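// Example: an operand of 0 ("round.towardzero") renders (0 + 3) % 4 = 3 and an
// operand of 1 ("round.tonearest") renders (1 + 3) % 4 = 0, as in the table
// above.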
6134
6135/// Convert a 2-bit value to the enum values used for op_sel* source modifiers.
6136void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
6137 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6138 unsigned Val = MI.getOperand(OpIdx).getImm();
6139 unsigned New = 0;
6140 if (Val & 0x1)
6141 New |= SISrcMods::OP_SEL_0;
6142 if (Val & 0x2)
6143 New |= SISrcMods::OP_SEL_1;
6144 MIB.addImm(New);
6145}
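// Example (assuming the OP_SEL_0/OP_SEL_1 mapping above): Val = 3 renders
// OP_SEL_0 | OP_SEL_1, Val = 2 renders only OP_SEL_1, and Val = 0 renders 0.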
6146
6147bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
6148 return TII.isInlineConstant(Imm);
6149}
6150
6151bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
6152 return TII.isInlineConstant(Imm);
6153}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
#define GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
static Register getWaveAddress(const MachineInstr *Def)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelKnownBits &KnownBits)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
static void addZeroImm(MachineInstrBuilder &MIB)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static bool isConstant(const MachineInstr &MI)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg)
Match a zero extend from a 32-bit value to 64-bits.
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
This file declares the targeting of the InstructionSelector class for AMDGPU.
static const LLT S1
AMDGPU Register Bank Select
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
#define P(N)
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
Value * RHS
Value * LHS
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM)
static const char * getName()
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
void setupMF(MachineFunction &MF, GISelKnownBits *KB, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
const RegisterBank & getRegBankFromRegClass(const TargetRegisterClass &RC, LLT) const override
Get a register bank that covers RC.
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
unsigned getWavefrontSizeLog2() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1484
Class for arbitrary precision integers.
Definition: APInt.h:78
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1635
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition: InstrTypes.h:690
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition: InstrTypes.h:688
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:678
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition: InstrTypes.h:687
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition: InstrTypes.h:685
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:680
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:682
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:689
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition: InstrTypes.h:686
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition: InstrTypes.h:675
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:271
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:163
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Represents a G_BUILD_VECTOR.
bool useVGPRIndexMode() const
bool hasPermlane32Swap() const
bool hasScalarCompareEq64() const
int getLDSBankCount() const
Definition: GCNSubtarget.h:350
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:478
bool unsafeDSOffsetFoldingEnabled() const
Definition: GCNSubtarget.h:482
bool hasBitOp3Insts() const
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasMADIntraFwdBug() const
bool privateMemoryResourceIsRangeChecked() const
Definition: GCNSubtarget.h:563
bool hasSignedScratchOffsets() const
bool hasRestrictedSOffset() const
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:287
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
Definition: GCNSubtarget.h:716
bool hasSPackHL() const
Return true if the target has the S_PACK_HL_B32_B16 instruction.
bool hasG16() const
bool hasPermlane16Swap() const
bool hasFlatScratchSVSSwizzleBug() const
bool hasGWS() const
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:541
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasSplitBarriers() const
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasGWSSemaReleaseAll() const
Definition: GCNSubtarget.h:730
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasSALUFloatInsts() const
bool hasPartialNSAEncoding() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
Represents a G_CONCAT_VECTORS.
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
virtual void setupMF(MachineFunction &mf, GISelKnownBits *kb, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
APInt getKnownOnes(Register R)
KnownBits getKnownBits(Register R)
bool signBitIsZero(Register Op)
APInt getKnownZeroes(Register R)
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr bool isValid() const
Definition: LowLevelType.h:145
constexpr bool isVector() const
Definition: LowLevelType.h:148
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition: LowLevelType.h:277
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:270
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Metadata node.
Definition: Metadata.h:1069
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
void setReturnAddressIsTaken(bool s)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:347
unsigned getNumOperands() const
Retuns the total number of operands.
Definition: MachineInstr.h:578
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:499
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
A description of a memory reference used in the backend.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Analysis providing profile information.
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
TypeSize getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Get the size in bits of Reg.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
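A sketch of how bank and class information interact during selection; the by-register getRegBank overload and the choice of SReg_32 are assumptions made for illustration, with RBI, TRI, MRI and Reg taken to be in scope:

// If Reg was assigned to the SGPR bank, pin it to a concrete 32-bit scalar
// register class so later passes see a fully constrained operand.
const RegisterBank *RB = RBI.getRegBank(Reg, *MRI, TRI);
if (RB && RB->getID() == AMDGPU::SGPRRegBankID)
  RBI.constrainGenericRegister(Reg, AMDGPU::SReg_32RegClass, *MRI);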
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns true if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
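A sketch of the usual division of labour between the two FLAT-offset helpers above; ConstOffset and TII are assumed to be in scope, and the global address space with the FlatGlobal variant is just one possible combination:

// Keep as much of the constant offset as the encoding allows in the
// instruction's immediate field; the remainder must be added to the base.
int64_t ImmOffset = 0, RemainderOffset = 0;
std::tie(ImmOffset, RemainderOffset) = TII.splitFlatOffset(
    ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
assert(TII.isLegalFLATOffset(ImmOffset, AMDGPUAS::GLOBAL_ADDRESS,
                             SIInstrFlags::FlatGlobal));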
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCRegister getReturnAddressReg(const MachineFunction &MF) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const
const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const
const TargetRegisterClass * getBoolRC() const
MCRegister getExec() const
const TargetRegisterClass * getWaveMaskRegClass() const
static bool isSGPRClass(const TargetRegisterClass *RC)
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:392
static IntegerType * getInt32Ty(LLVMContext &C)
LLVM Value Representation.
Definition: Value.h:74
Value(Type *Ty, unsigned scid)
Definition: Value.cpp:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
Key
PAL metadata keys.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
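A sketch of a typical SMRD/SMEM offset query with the helper above; STI and ByteOffset are assumed to be in scope and the flag values are illustrative:

// Returns std::nullopt when the offset cannot be encoded for this subtarget.
if (std::optional<int64_t> Enc = AMDGPU::getSMRDEncodedOffset(
        STI, ByteOffset, /*IsBuffer=*/false, /*HasSOffset=*/false)) {
  // *Enc is the value to place in the instruction's offset field.
}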
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
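A sketch of a typical call; Root and MRI are assumed to be in scope, and the known-bits argument is left at its default here:

// Decompose an address into a base register plus a constant byte offset,
// looking through an add of a constant where one exists.
auto [Base, ImmOffset] =
    AMDGPU::getBaseWithConstantOffset(*MRI, Root, /*KnownBits=*/nullptr);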
IndexMode
ARM Index Modes.
Definition: ARMBaseInfo.h:177
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
operand_type_match m_Reg()
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
SpecificConstantMatch m_SpecificICst(int64_t RequestedValue)
Matches a constant equal to RequestedValue.
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
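A sketch of how these combinators compose; Reg and MRI are assumed to be in scope, and the value-binding form m_Reg(Src) is used for illustration:

// Recognise (G_AND x, C) in either operand order and capture both pieces.
Register Src;
APInt MaskVal;
if (mi_match(Reg, *MRI, m_GAnd(m_Reg(Src), m_ICst(MaskVal)))) {
  // Src is the non-constant operand, MaskVal the matched constant mask.
}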
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
Reg
All possible values of the reg field in the ModR/M byte.
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:910
@ Offset
Definition: DWP.cpp:480
Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition: Utils.cpp:56
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction with the given Opcode.
Definition: Utils.cpp:645
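A sketch of the single-def lookup above; Reg and MRI are assumed to be in scope:

// Fold a negation by looking at the unique instruction that defines Reg.
if (MachineInstr *FNeg = getOpcodeDef(TargetOpcode::G_FNEG, Reg, *MRI)) {
  Register NegSrc = FNeg->getOperand(1).getReg();
  // ... select using NegSrc with a source negation modifier instead.
}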
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition: Utils.cpp:459
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:294
bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition: Utils.cpp:155
MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition: Utils.cpp:486
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition: Utils.cpp:314
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition: Utils.cpp:439
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
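A worked example of the two helpers above, as used when a 64-bit immediate has to be materialized through two 32-bit moves:

uint64_t Imm = 0x0000100080000000ULL;
uint32_t Lo = Lo_32(Imm); // 0x80000000
uint32_t Hi = Hi_32(Imm); // 0x00001000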
unsigned getUndefRegState(bool B)
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ Add
Sum of integers.
DWARFExpression::Operation Op
@ DS_Error
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition: Utils.cpp:433
std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg and the underlying value register, folding away any copies.
Definition: Utils.cpp:467
Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition: Utils.cpp:493
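A sketch combining several of the utilities above: try to fold a register operand to an immediate, otherwise keep the register form. MIB, Reg and MRI are assumed to be in scope:

// Look through copies to a G_CONSTANT; ValueAndVReg carries the APInt value
// and the register that ultimately defines it.
if (std::optional<ValueAndVReg> C =
        getIConstantVRegValWithLookThrough(Reg, *MRI))
  MIB.addImm(C->Value.getSExtValue());
else
  MIB.addReg(Reg);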
@ Default
The result values are uniform if and only if all operands are uniform.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:293
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute the known bits resulting from the addition of LHS and RHS.
Definition: KnownBits.h:336
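A small worked example of the two KnownBits helpers above; the width and values are arbitrary:

KnownBits A = KnownBits::makeConstant(APInt(32, 16)); // low 4 bits are 0
KnownBits B(32);        // nothing known about B yet
B.Zero.setLowBits(4);   // except that its low 4 bits are zero
KnownBits Sum = KnownBits::add(A, B);
// The low 4 bits of Sum are known zero: both addends are multiples of 16.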
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.