1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
59void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
60 CodeGenCoverage *CoverageInfo,
61 ProfileSummaryInfo *PSI,
62 BlockFrequencyInfo *BFI) {
63 MRI = &MF.getRegInfo();
64 Subtarget = &MF.getSubtarget<GCNSubtarget>();
65 Subtarget->checkSubtargetFeatures(MF.getFunction());
66 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
67}
68
69// Return the wave level SGPR base address if this is a wave address.
70static Register getWaveAddress(const MachineInstr *Def) {
71 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
72 ? Def->getOperand(1).getReg()
73 : Register();
74}
75
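// Return true if Reg should be treated as a wave lane mask (VCC bank, or a
// wave-sized boolean register class holding an s1 value) rather than an
// ordinary scalar boolean.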
76bool AMDGPUInstructionSelector::isVCC(Register Reg,
77 const MachineRegisterInfo &MRI) const {
78 // The verifier is oblivious to s1 being a valid value for wavesize registers.
79 if (Reg.isPhysical())
80 return false;
81
82 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
83 const TargetRegisterClass *RC =
84 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
85 if (RC) {
86 const LLT Ty = MRI.getType(Reg);
87 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
88 return false;
89 // G_TRUNC s1 result is never vcc.
90 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
91 RC->hasSuperClassEq(TRI.getBoolRC());
92 }
93
94 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
95 return RB->getID() == AMDGPU::VCCRegBankID;
96}
97
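// Lower a copy-like intrinsic (wqm, softwqm, strict.wwm, strict.wqm) to the
// given pseudo opcode: drop the intrinsic ID, add an implicit EXEC use, and
// constrain source and destination to a common register class.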
98bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
99 unsigned NewOpc) const {
100 MI.setDesc(TII.get(NewOpc));
101 MI.removeOperand(1); // Remove intrinsic ID.
102 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
103
104 MachineOperand &Dst = MI.getOperand(0);
105 MachineOperand &Src = MI.getOperand(1);
106
107 // TODO: This should be legalized to s32 if needed
108 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
109 return false;
110
111 const TargetRegisterClass *DstRC
112 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
113 const TargetRegisterClass *SrcRC
114 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
115 if (!DstRC || DstRC != SrcRC)
116 return false;
117
118 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
119 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
120}
121
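// Select a generic COPY. The interesting case is a copy into a VCC lane mask:
// constants are materialized with S_MOV, and other sources are masked down to
// bit 0 and compared against zero so that only the low bit defines the mask.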
122bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
123 const DebugLoc &DL = I.getDebugLoc();
124 MachineBasicBlock *BB = I.getParent();
125 I.setDesc(TII.get(TargetOpcode::COPY));
126
127 const MachineOperand &Src = I.getOperand(1);
128 MachineOperand &Dst = I.getOperand(0);
129 Register DstReg = Dst.getReg();
130 Register SrcReg = Src.getReg();
131
132 if (isVCC(DstReg, *MRI)) {
133 if (SrcReg == AMDGPU::SCC) {
134 const TargetRegisterClass *RC
135 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
136 if (!RC)
137 return true;
138 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
139 }
140
141 if (!isVCC(SrcReg, *MRI)) {
142 // TODO: Should probably leave the copy and let copyPhysReg expand it.
143 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
144 return false;
145
146 const TargetRegisterClass *SrcRC
147 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
148
149 std::optional<ValueAndVReg> ConstVal =
150 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
151 if (ConstVal) {
152 unsigned MovOpc =
153 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
154 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
155 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
156 } else {
157 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
158
159 // We can't trust the high bits at this point, so clear them.
160
161 // TODO: Skip masking high bits if def is known boolean.
162
163 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
164 assert(Subtarget->useRealTrue16Insts());
165 const int64_t NoMods = 0;
166 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
167 .addImm(NoMods)
168 .addImm(1)
169 .addImm(NoMods)
170 .addReg(SrcReg)
171 .addImm(NoMods);
172 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
173 .addImm(NoMods)
174 .addImm(0)
175 .addImm(NoMods)
176 .addReg(MaskedReg)
177 .addImm(NoMods);
178 } else {
179 bool IsSGPR = TRI.isSGPRClass(SrcRC);
180 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
181 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
182 .addImm(1)
183 .addReg(SrcReg);
184 if (IsSGPR)
185 And.setOperandDead(3); // Dead scc
186
187 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
188 .addImm(0)
189 .addReg(MaskedReg);
190 }
191 }
192
193 if (!MRI->getRegClassOrNull(SrcReg))
194 MRI->setRegClass(SrcReg, SrcRC);
195 I.eraseFromParent();
196 return true;
197 }
198
199 const TargetRegisterClass *RC =
200 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
201 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
202 return false;
203
204 return true;
205 }
206
207 for (const MachineOperand &MO : I.operands()) {
208 if (MO.getReg().isPhysical())
209 continue;
210
211 const TargetRegisterClass *RC =
212 TRI.getConstrainedRegClassForOperand(MO, *MRI);
213 if (!RC)
214 continue;
215 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
216 }
217 return true;
218}
219
220bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
221 const Register DefReg = I.getOperand(0).getReg();
222 const LLT DefTy = MRI->getType(DefReg);
223
224 // S1 G_PHIs should not be selected in instruction-select, instead:
225 // - divergent S1 G_PHI should go through lane mask merging algorithm
226 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
227 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
228 if (DefTy == LLT::scalar(1))
229 return false;
230
231 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
232
233 const RegClassOrRegBank &RegClassOrBank =
234 MRI->getRegClassOrRegBank(DefReg);
235
236 const TargetRegisterClass *DefRC =
237 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
238 if (!DefRC) {
239 if (!DefTy.isValid()) {
240 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
241 return false;
242 }
243
244 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
245 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
246 if (!DefRC) {
247 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
248 return false;
249 }
250 }
251
252 // TODO: Verify that all registers have the same bank
253 I.setDesc(TII.get(TargetOpcode::PHI));
254 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
255}
256
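// Return a machine operand for the 32-bit half (sub0 or sub1) of a 64-bit
// register or immediate operand, emitting a COPY of the subregister in the
// register case.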
257MachineOperand
258AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
259 const TargetRegisterClass &SubRC,
260 unsigned SubIdx) const {
261
262 MachineInstr *MI = MO.getParent();
263 MachineBasicBlock *BB = MO.getParent()->getParent();
264 Register DstReg = MRI->createVirtualRegister(&SubRC);
265
266 if (MO.isReg()) {
267 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
268 Register Reg = MO.getReg();
269 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
270 .addReg(Reg, 0, ComposedSubIdx);
271
272 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
273 MO.isKill(), MO.isDead(), MO.isUndef(),
274 MO.isEarlyClobber(), 0, MO.isDebug(),
275 MO.isInternalRead());
276 }
277
278 assert(MO.isImm());
279
280 APInt Imm(64, MO.getImm());
281
282 switch (SubIdx) {
283 default:
284 llvm_unreachable("do not know how to split immediate with this sub index.");
285 case AMDGPU::sub0:
286 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
287 case AMDGPU::sub1:
288 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
289 }
290}
291
292static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
293 switch (Opc) {
294 case AMDGPU::G_AND:
295 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
296 case AMDGPU::G_OR:
297 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
298 case AMDGPU::G_XOR:
299 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
300 default:
301 llvm_unreachable("not a bit op");
302 }
303}
304
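// Select scalar and lane-mask G_AND/G_OR/G_XOR. The 64-bit opcodes are used
// both for 64-bit values and for wave64 lane masks; other register banks are
// not handled here.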
305bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
306 Register DstReg = I.getOperand(0).getReg();
307 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
308
309 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
310 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
311 DstRB->getID() != AMDGPU::VCCRegBankID)
312 return false;
313
314 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
315 STI.isWave64());
316 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
317
318 // Dead implicit-def of scc
319 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
320 true, // isImp
321 false, // isKill
322 true)); // isDead
323 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
324}
325
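// Select non-vector G_ADD/G_SUB. 32-bit values map directly onto SALU/VALU
// add/sub; 64-bit adds are split into a low add that defines a carry and a
// high add-with-carry, then recombined with a REG_SEQUENCE.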
326bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
327 MachineBasicBlock *BB = I.getParent();
328 MachineFunction *MF = BB->getParent();
329 Register DstReg = I.getOperand(0).getReg();
330 const DebugLoc &DL = I.getDebugLoc();
331 LLT Ty = MRI->getType(DstReg);
332 if (Ty.isVector())
333 return false;
334
335 unsigned Size = Ty.getSizeInBits();
336 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
337 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
338 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
339
340 if (Size == 32) {
341 if (IsSALU) {
342 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
343 MachineInstr *Add =
344 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
345 .add(I.getOperand(1))
346 .add(I.getOperand(2))
347 .setOperandDead(3); // Dead scc
348 I.eraseFromParent();
349 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
350 }
351
352 if (STI.hasAddNoCarry()) {
353 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
354 I.setDesc(TII.get(Opc));
355 I.addOperand(*MF, MachineOperand::CreateImm(0));
356 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
357 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
358 }
359
360 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
361
362 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
363 MachineInstr *Add
364 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
365 .addDef(UnusedCarry, RegState::Dead)
366 .add(I.getOperand(1))
367 .add(I.getOperand(2))
368 .addImm(0);
369 I.eraseFromParent();
370 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
371 }
372
373 assert(!Sub && "illegal sub should not reach here");
374
375 const TargetRegisterClass &RC
376 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
377 const TargetRegisterClass &HalfRC
378 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
379
380 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
381 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
382 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
383 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
384
385 Register DstLo = MRI->createVirtualRegister(&HalfRC);
386 Register DstHi = MRI->createVirtualRegister(&HalfRC);
387
388 if (IsSALU) {
389 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
390 .add(Lo1)
391 .add(Lo2);
392 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
393 .add(Hi1)
394 .add(Hi2)
395 .setOperandDead(3); // Dead scc
396 } else {
397 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
398 Register CarryReg = MRI->createVirtualRegister(CarryRC);
399 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
400 .addDef(CarryReg)
401 .add(Lo1)
402 .add(Lo2)
403 .addImm(0);
404 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
405 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
406 .add(Hi1)
407 .add(Hi2)
408 .addReg(CarryReg, RegState::Kill)
409 .addImm(0);
410
411 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
412 return false;
413 }
414
415 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
416 .addReg(DstLo)
417 .addImm(AMDGPU::sub0)
418 .addReg(DstHi)
419 .addImm(AMDGPU::sub1);
420
421
422 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
423 return false;
424
425 I.eraseFromParent();
426 return true;
427}
428
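// Select add/sub with carry-out (and optional carry-in). The VALU forms carry
// in VCC; the SALU forms go through SCC, with copies inserted to move the
// carry bits in and out.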
429bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
430 MachineInstr &I) const {
431 MachineBasicBlock *BB = I.getParent();
432 MachineFunction *MF = BB->getParent();
433 const DebugLoc &DL = I.getDebugLoc();
434 Register Dst0Reg = I.getOperand(0).getReg();
435 Register Dst1Reg = I.getOperand(1).getReg();
436 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
437 I.getOpcode() == AMDGPU::G_UADDE;
438 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
439 I.getOpcode() == AMDGPU::G_USUBE;
440
441 if (isVCC(Dst1Reg, *MRI)) {
442 unsigned NoCarryOpc =
443 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
444 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
445 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
446 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
447 I.addOperand(*MF, MachineOperand::CreateImm(0));
448 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
449 }
450
451 Register Src0Reg = I.getOperand(2).getReg();
452 Register Src1Reg = I.getOperand(3).getReg();
453
454 if (HasCarryIn) {
455 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
456 .addReg(I.getOperand(4).getReg());
457 }
458
459 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
460 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
461
462 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
463 .add(I.getOperand(2))
464 .add(I.getOperand(3));
465
466 if (MRI->use_nodbg_empty(Dst1Reg)) {
467 CarryInst.setOperandDead(3); // Dead scc
468 } else {
469 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
470 .addReg(AMDGPU::SCC);
471 if (!MRI->getRegClassOrNull(Dst1Reg))
472 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
473 }
474
475 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
476 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
477 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
478 return false;
479
480 if (HasCarryIn &&
481 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
482 AMDGPU::SReg_32RegClass, *MRI))
483 return false;
484
485 I.eraseFromParent();
486 return true;
487}
488
489bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
490 MachineInstr &I) const {
491 MachineBasicBlock *BB = I.getParent();
492 MachineFunction *MF = BB->getParent();
493 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
494
495 unsigned Opc;
496 if (Subtarget->hasMADIntraFwdBug())
497 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
498 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
499 else
500 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
501 I.setDesc(TII.get(Opc));
502 I.addOperand(*MF, MachineOperand::CreateImm(0));
503 I.addImplicitDefUseOperands(*MF);
504 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
505}
506
507// TODO: We should probably legalize these to only using 32-bit results.
508bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
509 MachineBasicBlock *BB = I.getParent();
510 Register DstReg = I.getOperand(0).getReg();
511 Register SrcReg = I.getOperand(1).getReg();
512 LLT DstTy = MRI->getType(DstReg);
513 LLT SrcTy = MRI->getType(SrcReg);
514 const unsigned SrcSize = SrcTy.getSizeInBits();
515 unsigned DstSize = DstTy.getSizeInBits();
516
517 // TODO: Should handle any multiple of 32 offset.
518 unsigned Offset = I.getOperand(2).getImm();
519 if (Offset % 32 != 0 || DstSize > 128)
520 return false;
521
522 // 16-bit operations really use 32-bit registers.
523 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
524 if (DstSize == 16)
525 DstSize = 32;
526
527 const TargetRegisterClass *DstRC =
528 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
529 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
530 return false;
531
532 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
533 const TargetRegisterClass *SrcRC =
534 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
535 if (!SrcRC)
536 return false;
537 const unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
538 DstSize / 32);
539 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
540 if (!SrcRC)
541 return false;
542
543 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
544 *SrcRC, I.getOperand(1));
545 const DebugLoc &DL = I.getDebugLoc();
546 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
547 .addReg(SrcReg, 0, SubReg);
548
549 I.eraseFromParent();
550 return true;
551}
552
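// Merges of 32-bit-or-wider pieces are selected as a REG_SEQUENCE; narrower
// sources fall back to the imported TableGen patterns.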
553bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
554 MachineBasicBlock *BB = MI.getParent();
555 Register DstReg = MI.getOperand(0).getReg();
556 LLT DstTy = MRI->getType(DstReg);
557 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
558
559 const unsigned SrcSize = SrcTy.getSizeInBits();
560 if (SrcSize < 32)
561 return selectImpl(MI, *CoverageInfo);
562
563 const DebugLoc &DL = MI.getDebugLoc();
564 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
565 const unsigned DstSize = DstTy.getSizeInBits();
566 const TargetRegisterClass *DstRC =
567 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
568 if (!DstRC)
569 return false;
570
571 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
572 MachineInstrBuilder MIB =
573 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
574 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
575 MachineOperand &Src = MI.getOperand(I + 1);
576 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
577 MIB.addImm(SubRegs[I]);
578
579 const TargetRegisterClass *SrcRC
580 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
581 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
582 return false;
583 }
584
585 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
586 return false;
587
588 MI.eraseFromParent();
589 return true;
590}
591
592bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
593 MachineBasicBlock *BB = MI.getParent();
594 const int NumDst = MI.getNumOperands() - 1;
595
596 MachineOperand &Src = MI.getOperand(NumDst);
597
598 Register SrcReg = Src.getReg();
599 Register DstReg0 = MI.getOperand(0).getReg();
600 LLT DstTy = MRI->getType(DstReg0);
601 LLT SrcTy = MRI->getType(SrcReg);
602
603 const unsigned DstSize = DstTy.getSizeInBits();
604 const unsigned SrcSize = SrcTy.getSizeInBits();
605 const DebugLoc &DL = MI.getDebugLoc();
606 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
607
608 const TargetRegisterClass *SrcRC =
609 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
610 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
611 return false;
612
613 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
614 // source, and this relies on the fact that the same subregister indices are
615 // used for both.
616 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
617 for (int I = 0, E = NumDst; I != E; ++I) {
618 MachineOperand &Dst = MI.getOperand(I);
619 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
620 .addReg(SrcReg, 0, SubRegs[I]);
621
622 // Make sure the subregister index is valid for the source register.
623 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
624 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
625 return false;
626
627 const TargetRegisterClass *DstRC =
628 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
629 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
630 return false;
631 }
632
633 MI.eraseFromParent();
634 return true;
635}
636
637bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
638 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
639 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
640
641 Register Src0 = MI.getOperand(1).getReg();
642 Register Src1 = MI.getOperand(2).getReg();
643 LLT SrcTy = MRI->getType(Src0);
644 const unsigned SrcSize = SrcTy.getSizeInBits();
645
646 // A G_BUILD_VECTOR whose sources are 32 bits or wider is handled like
647 // G_MERGE_VALUES.
647 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
648 return selectG_MERGE_VALUES(MI);
649 }
650
651 // Selection logic below is for V2S16 only.
652 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
653 Register Dst = MI.getOperand(0).getReg();
654 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
655 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
656 SrcTy != LLT::scalar(32)))
657 return selectImpl(MI, *CoverageInfo);
658
659 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
660 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
661 return false;
662
663 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
664 DstBank->getID() == AMDGPU::VGPRRegBankID);
665 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
666
667 const DebugLoc &DL = MI.getDebugLoc();
668 MachineBasicBlock *BB = MI.getParent();
669
670 // First, before trying TableGen patterns, check if both sources are
671 // constants. In those cases, we can trivially compute the final constant
672 // and emit a simple move.
673 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
674 if (ConstSrc1) {
675 auto ConstSrc0 =
676 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
677 if (ConstSrc0) {
678 const int64_t K0 = ConstSrc0->Value.getSExtValue();
679 const int64_t K1 = ConstSrc1->Value.getSExtValue();
680 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
681 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
682 uint32_t Imm = Lo16 | (Hi16 << 16);
683
684 // VALU
685 if (IsVector) {
686 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
687 MI.eraseFromParent();
688 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
689 }
690
691 // SALU
692 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
693 MI.eraseFromParent();
694 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
695 }
696 }
697
698 // Now try TableGen patterns.
699 if (selectImpl(MI, *CoverageInfo))
700 return true;
701
702 // TODO: This should probably be a combine somewhere
703 // (build_vector $src0, undef) -> copy $src0
704 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
705 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
706 MI.setDesc(TII.get(AMDGPU::COPY));
707 MI.removeOperand(2);
708 const auto &RC =
709 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
710 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
711 RBI.constrainGenericRegister(Src0, RC, *MRI);
712 }
713
714 // TODO: Can be improved?
715 if (IsVector) {
716 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
717 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
718 .addImm(0xFFFF)
719 .addReg(Src0);
720 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
721 return false;
722
723 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
724 .addReg(Src1)
725 .addImm(16)
726 .addReg(TmpReg);
727 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
728 return false;
729
730 MI.eraseFromParent();
731 return true;
732 }
733
734 Register ShiftSrc0;
735 Register ShiftSrc1;
736
737 // With multiple uses of the shift, this will duplicate the shift and
738 // increase register pressure.
739 //
740 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
741 // => (S_PACK_HH_B32_B16 $src0, $src1)
742 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
743 // => (S_PACK_HL_B32_B16 $src0, $src1)
744 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
745 // => (S_PACK_LH_B32_B16 $src0, $src1)
746 // (build_vector $src0, $src1)
747 // => (S_PACK_LL_B32_B16 $src0, $src1)
748
749 bool Shift0 = mi_match(
750 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
751
752 bool Shift1 = mi_match(
753 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
754
755 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
756 if (Shift0 && Shift1) {
757 Opc = AMDGPU::S_PACK_HH_B32_B16;
758 MI.getOperand(1).setReg(ShiftSrc0);
759 MI.getOperand(2).setReg(ShiftSrc1);
760 } else if (Shift1) {
761 Opc = AMDGPU::S_PACK_LH_B32_B16;
762 MI.getOperand(2).setReg(ShiftSrc1);
763 } else if (Shift0) {
764 auto ConstSrc1 =
765 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
766 if (ConstSrc1 && ConstSrc1->Value == 0) {
767 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
768 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
769 .addReg(ShiftSrc0)
770 .addImm(16)
771 .setOperandDead(3); // Dead scc
772
773 MI.eraseFromParent();
774 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
775 }
776 if (STI.hasSPackHL()) {
777 Opc = AMDGPU::S_PACK_HL_B32_B16;
778 MI.getOperand(1).setReg(ShiftSrc0);
779 }
780 }
781
782 MI.setDesc(TII.get(Opc));
783 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
784}
785
786bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
787 const MachineOperand &MO = I.getOperand(0);
788
789 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
790 // regbank check here is to know why getConstrainedRegClassForOperand failed.
791 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
792 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
793 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
794 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
795 return true;
796 }
797
798 return false;
799}
800
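// Select G_INSERT as INSERT_SUBREG when the offset and inserted size are
// 32-bit aligned and map onto a subregister index.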
801bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
802 MachineBasicBlock *BB = I.getParent();
803
804 Register DstReg = I.getOperand(0).getReg();
805 Register Src0Reg = I.getOperand(1).getReg();
806 Register Src1Reg = I.getOperand(2).getReg();
807 LLT Src1Ty = MRI->getType(Src1Reg);
808
809 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
810 unsigned InsSize = Src1Ty.getSizeInBits();
811
812 int64_t Offset = I.getOperand(3).getImm();
813
814 // FIXME: These cases should have been illegal and unnecessary to check here.
815 if (Offset % 32 != 0 || InsSize % 32 != 0)
816 return false;
817
818 // Currently not handled by getSubRegFromChannel.
819 if (InsSize > 128)
820 return false;
821
822 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
823 if (SubReg == AMDGPU::NoSubRegister)
824 return false;
825
826 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
827 const TargetRegisterClass *DstRC =
828 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
829 if (!DstRC)
830 return false;
831
832 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
833 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
834 const TargetRegisterClass *Src0RC =
835 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
836 const TargetRegisterClass *Src1RC =
837 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
838
839 // Deal with weird cases where the class only partially supports the subreg
840 // index.
841 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
842 if (!Src0RC || !Src1RC)
843 return false;
844
845 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
846 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
847 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
848 return false;
849
850 const DebugLoc &DL = I.getDebugLoc();
851 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
852 .addReg(Src0Reg)
853 .addReg(Src1Reg)
854 .addImm(SubReg);
855
856 I.eraseFromParent();
857 return true;
858}
859
860bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
861 Register DstReg = MI.getOperand(0).getReg();
862 Register SrcReg = MI.getOperand(1).getReg();
863 Register OffsetReg = MI.getOperand(2).getReg();
864 Register WidthReg = MI.getOperand(3).getReg();
865
866 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
867 "scalar BFX instructions are expanded in regbankselect");
868 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
869 "64-bit vector BFX instructions are expanded in regbankselect");
870
871 const DebugLoc &DL = MI.getDebugLoc();
872 MachineBasicBlock *MBB = MI.getParent();
873
874 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
875 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
876 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
877 .addReg(SrcReg)
878 .addReg(OffsetReg)
879 .addReg(WidthReg);
880 MI.eraseFromParent();
881 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
882}
883
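// Select llvm.amdgcn.interp.p1.f16. On subtargets with 16 LDS banks this needs
// a manual two-instruction expansion (V_INTERP_MOV_F32 + V_INTERP_P1LV_F16)
// because the generated matcher mishandles two outputs sharing the M0 input.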
884bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
885 if (STI.getLDSBankCount() != 16)
886 return selectImpl(MI, *CoverageInfo);
887
888 Register Dst = MI.getOperand(0).getReg();
889 Register Src0 = MI.getOperand(2).getReg();
890 Register M0Val = MI.getOperand(6).getReg();
891 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
892 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
893 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
894 return false;
895
896 // This requires 2 instructions. It is possible to write a pattern to support
897 // this, but the generated isel emitter doesn't correctly deal with multiple
898 // output instructions using the same physical register input. The copy to m0
899 // is incorrectly placed before the second instruction.
900 //
901 // TODO: Match source modifiers.
902
903 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
904 const DebugLoc &DL = MI.getDebugLoc();
905 MachineBasicBlock *MBB = MI.getParent();
906
907 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
908 .addReg(M0Val);
909 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
910 .addImm(2)
911 .addImm(MI.getOperand(4).getImm()) // $attr
912 .addImm(MI.getOperand(3).getImm()); // $attrchan
913
914 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
915 .addImm(0) // $src0_modifiers
916 .addReg(Src0) // $src0
917 .addImm(MI.getOperand(4).getImm()) // $attr
918 .addImm(MI.getOperand(3).getImm()) // $attrchan
919 .addImm(0) // $src2_modifiers
920 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
921 .addImm(MI.getOperand(5).getImm()) // $high
922 .addImm(0) // $clamp
923 .addImm(0); // $omod
924
925 MI.eraseFromParent();
926 return true;
927}
928
929// Writelane is special in that it can use SGPR and M0 (which would normally
930// count as using the constant bus twice - but in this case it is allowed since
931// the lane selector doesn't count as a use of the constant bus). However, it is
932// still required to abide by the 1 SGPR rule. Fix this up if we might have
933// multiple SGPRs.
934bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
935 // With a constant bus limit of at least 2, there's no issue.
936 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
937 return selectImpl(MI, *CoverageInfo);
938
939 MachineBasicBlock *MBB = MI.getParent();
940 const DebugLoc &DL = MI.getDebugLoc();
941 Register VDst = MI.getOperand(0).getReg();
942 Register Val = MI.getOperand(2).getReg();
943 Register LaneSelect = MI.getOperand(3).getReg();
944 Register VDstIn = MI.getOperand(4).getReg();
945
946 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
947
948 std::optional<ValueAndVReg> ConstSelect =
949 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
950 if (ConstSelect) {
951 // The selector has to be an inline immediate, so we can use whatever for
952 // the other operands.
953 MIB.addReg(Val);
954 MIB.addImm(ConstSelect->Value.getSExtValue() &
955 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
956 } else {
957 std::optional<ValueAndVReg> ConstVal =
958 getIConstantVRegValWithLookThrough(Val, *MRI);
959
960 // If the value written is an inline immediate, we can get away without a
961 // copy to m0.
962 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
963 STI.hasInv2PiInlineImm())) {
964 MIB.addImm(ConstVal->Value.getSExtValue());
965 MIB.addReg(LaneSelect);
966 } else {
967 MIB.addReg(Val);
968
969 // If the lane selector was originally in a VGPR and copied with
970 // readfirstlane, there's a hazard to read the same SGPR from the
971 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
972 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
973
974 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
975 .addReg(LaneSelect);
976 MIB.addReg(AMDGPU::M0);
977 }
978 }
979
980 MIB.addReg(VDstIn);
981
982 MI.eraseFromParent();
983 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
984}
985
986// We need to handle this here because tablegen doesn't support matching
987// instructions with multiple outputs.
988bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
989 Register Dst0 = MI.getOperand(0).getReg();
990 Register Dst1 = MI.getOperand(1).getReg();
991
992 LLT Ty = MRI->getType(Dst0);
993 unsigned Opc;
994 if (Ty == LLT::scalar(32))
995 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
996 else if (Ty == LLT::scalar(64))
997 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
998 else
999 return false;
1000
1001 // TODO: Match source modifiers.
1002
1003 const DebugLoc &DL = MI.getDebugLoc();
1004 MachineBasicBlock *MBB = MI.getParent();
1005
1006 Register Numer = MI.getOperand(3).getReg();
1007 Register Denom = MI.getOperand(4).getReg();
1008 unsigned ChooseDenom = MI.getOperand(5).getImm();
1009
1010 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1011
1012 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1013 .addDef(Dst1)
1014 .addImm(0) // $src0_modifiers
1015 .addUse(Src0) // $src0
1016 .addImm(0) // $src1_modifiers
1017 .addUse(Denom) // $src1
1018 .addImm(0) // $src2_modifiers
1019 .addUse(Numer) // $src2
1020 .addImm(0) // $clamp
1021 .addImm(0); // $omod
1022
1023 MI.eraseFromParent();
1024 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1025}
1026
1027bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1028 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1029 switch (IntrinsicID) {
1030 case Intrinsic::amdgcn_if_break: {
1031 MachineBasicBlock *BB = I.getParent();
1032
1033 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1034 // SelectionDAG uses for wave32 vs wave64.
1035 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1036 .add(I.getOperand(0))
1037 .add(I.getOperand(2))
1038 .add(I.getOperand(3));
1039
1040 Register DstReg = I.getOperand(0).getReg();
1041 Register Src0Reg = I.getOperand(2).getReg();
1042 Register Src1Reg = I.getOperand(3).getReg();
1043
1044 I.eraseFromParent();
1045
1046 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1047 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1048
1049 return true;
1050 }
1051 case Intrinsic::amdgcn_interp_p1_f16:
1052 return selectInterpP1F16(I);
1053 case Intrinsic::amdgcn_wqm:
1054 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1055 case Intrinsic::amdgcn_softwqm:
1056 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1057 case Intrinsic::amdgcn_strict_wwm:
1058 case Intrinsic::amdgcn_wwm:
1059 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1060 case Intrinsic::amdgcn_strict_wqm:
1061 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1062 case Intrinsic::amdgcn_writelane:
1063 return selectWritelane(I);
1064 case Intrinsic::amdgcn_div_scale:
1065 return selectDivScale(I);
1066 case Intrinsic::amdgcn_icmp:
1067 case Intrinsic::amdgcn_fcmp:
1068 if (selectImpl(I, *CoverageInfo))
1069 return true;
1070 return selectIntrinsicCmp(I);
1071 case Intrinsic::amdgcn_ballot:
1072 return selectBallot(I);
1073 case Intrinsic::amdgcn_reloc_constant:
1074 return selectRelocConstant(I);
1075 case Intrinsic::amdgcn_groupstaticsize:
1076 return selectGroupStaticSize(I);
1077 case Intrinsic::returnaddress:
1078 return selectReturnAddress(I);
1079 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1080 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1081 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1082 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1083 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1084 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1085 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1086 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1087 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1088 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1089 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1090 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1091 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1092 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1093 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1094 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1095 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1096 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1097 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1098 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1099 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1100 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1101 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1102 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1103 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1104 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1105 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1106 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1107 return selectSMFMACIntrin(I);
1108 case Intrinsic::amdgcn_permlane16_swap:
1109 case Intrinsic::amdgcn_permlane32_swap:
1110 return selectPermlaneSwapIntrin(I, IntrinsicID);
1111 default:
1112 return selectImpl(I, *CoverageInfo);
1113 }
1114}
1115
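// Map a CmpInst predicate and operand size (16/32/64 bits) to the
// corresponding VALU V_CMP opcode, or return -1 if the subtarget has none.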
1116static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1117 const GCNSubtarget &ST) {
1118 if (Size != 16 && Size != 32 && Size != 64)
1119 return -1;
1120
1121 if (Size == 16 && !ST.has16BitInsts())
1122 return -1;
1123
1124 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1125 unsigned FakeS16Opc, unsigned S32Opc,
1126 unsigned S64Opc) {
1127 if (Size == 16)
1128 // FIXME-TRUE16 use TrueS16Opc when realtrue16 is supported for CMP code
1129 return ST.hasTrue16BitInsts()
1130 ? ST.useRealTrue16Insts() ? FakeS16Opc : FakeS16Opc
1131 : S16Opc;
1132 if (Size == 32)
1133 return S32Opc;
1134 return S64Opc;
1135 };
1136
1137 switch (P) {
1138 default:
1139 llvm_unreachable("Unknown condition code!");
1140 case CmpInst::ICMP_NE:
1141 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1142 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1143 AMDGPU::V_CMP_NE_U64_e64);
1144 case CmpInst::ICMP_EQ:
1145 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1146 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1147 AMDGPU::V_CMP_EQ_U64_e64);
1148 case CmpInst::ICMP_SGT:
1149 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1150 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1151 AMDGPU::V_CMP_GT_I64_e64);
1152 case CmpInst::ICMP_SGE:
1153 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1154 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1155 AMDGPU::V_CMP_GE_I64_e64);
1156 case CmpInst::ICMP_SLT:
1157 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1158 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1159 AMDGPU::V_CMP_LT_I64_e64);
1160 case CmpInst::ICMP_SLE:
1161 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1162 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1163 AMDGPU::V_CMP_LE_I64_e64);
1164 case CmpInst::ICMP_UGT:
1165 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1166 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1167 AMDGPU::V_CMP_GT_U64_e64);
1168 case CmpInst::ICMP_UGE:
1169 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1170 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1171 AMDGPU::V_CMP_GE_U64_e64);
1172 case CmpInst::ICMP_ULT:
1173 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1174 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1175 AMDGPU::V_CMP_LT_U64_e64);
1176 case CmpInst::ICMP_ULE:
1177 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1178 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1179 AMDGPU::V_CMP_LE_U64_e64);
1180
1181 case CmpInst::FCMP_OEQ:
1182 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1183 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1184 AMDGPU::V_CMP_EQ_F64_e64);
1185 case CmpInst::FCMP_OGT:
1186 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1187 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1188 AMDGPU::V_CMP_GT_F64_e64);
1189 case CmpInst::FCMP_OGE:
1190 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1191 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1192 AMDGPU::V_CMP_GE_F64_e64);
1193 case CmpInst::FCMP_OLT:
1194 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1195 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1196 AMDGPU::V_CMP_LT_F64_e64);
1197 case CmpInst::FCMP_OLE:
1198 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1199 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1200 AMDGPU::V_CMP_LE_F64_e64);
1201 case CmpInst::FCMP_ONE:
1202 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1203 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1204 AMDGPU::V_CMP_NEQ_F64_e64);
1205 case CmpInst::FCMP_ORD:
1206 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1207 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1208 AMDGPU::V_CMP_O_F64_e64);
1209 case CmpInst::FCMP_UNO:
1210 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1211 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1212 AMDGPU::V_CMP_U_F64_e64);
1213 case CmpInst::FCMP_UEQ:
1214 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1215 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1216 AMDGPU::V_CMP_NLG_F64_e64);
1217 case CmpInst::FCMP_UGT:
1218 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1219 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1220 AMDGPU::V_CMP_NLE_F64_e64);
1221 case CmpInst::FCMP_UGE:
1222 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1223 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1224 AMDGPU::V_CMP_NLT_F64_e64);
1225 case CmpInst::FCMP_ULT:
1226 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1227 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1228 AMDGPU::V_CMP_NGE_F64_e64);
1229 case CmpInst::FCMP_ULE:
1230 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1231 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1232 AMDGPU::V_CMP_NGT_F64_e64);
1233 case CmpInst::FCMP_UNE:
1234 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1235 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1236 AMDGPU::V_CMP_NEQ_F64_e64);
1237 case CmpInst::FCMP_TRUE:
1238 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1239 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1240 AMDGPU::V_CMP_TRU_F64_e64);
1241 case CmpInst::FCMP_FALSE:
1242 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1243 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1244 AMDGPU::V_CMP_F_F64_e64);
1245 }
1246}
1247
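// Map a CmpInst predicate and operand size to a scalar S_CMP opcode, or return
// -1 when no SALU form exists (e.g. most 64-bit compares, or f16 compares
// without SALU float instructions).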
1248int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1249 unsigned Size) const {
1250 if (Size == 64) {
1251 if (!STI.hasScalarCompareEq64())
1252 return -1;
1253
1254 switch (P) {
1255 case CmpInst::ICMP_NE:
1256 return AMDGPU::S_CMP_LG_U64;
1257 case CmpInst::ICMP_EQ:
1258 return AMDGPU::S_CMP_EQ_U64;
1259 default:
1260 return -1;
1261 }
1262 }
1263
1264 if (Size == 32) {
1265 switch (P) {
1266 case CmpInst::ICMP_NE:
1267 return AMDGPU::S_CMP_LG_U32;
1268 case CmpInst::ICMP_EQ:
1269 return AMDGPU::S_CMP_EQ_U32;
1270 case CmpInst::ICMP_SGT:
1271 return AMDGPU::S_CMP_GT_I32;
1272 case CmpInst::ICMP_SGE:
1273 return AMDGPU::S_CMP_GE_I32;
1274 case CmpInst::ICMP_SLT:
1275 return AMDGPU::S_CMP_LT_I32;
1276 case CmpInst::ICMP_SLE:
1277 return AMDGPU::S_CMP_LE_I32;
1278 case CmpInst::ICMP_UGT:
1279 return AMDGPU::S_CMP_GT_U32;
1280 case CmpInst::ICMP_UGE:
1281 return AMDGPU::S_CMP_GE_U32;
1282 case CmpInst::ICMP_ULT:
1283 return AMDGPU::S_CMP_LT_U32;
1284 case CmpInst::ICMP_ULE:
1285 return AMDGPU::S_CMP_LE_U32;
1286 case CmpInst::FCMP_OEQ:
1287 return AMDGPU::S_CMP_EQ_F32;
1288 case CmpInst::FCMP_OGT:
1289 return AMDGPU::S_CMP_GT_F32;
1290 case CmpInst::FCMP_OGE:
1291 return AMDGPU::S_CMP_GE_F32;
1292 case CmpInst::FCMP_OLT:
1293 return AMDGPU::S_CMP_LT_F32;
1294 case CmpInst::FCMP_OLE:
1295 return AMDGPU::S_CMP_LE_F32;
1296 case CmpInst::FCMP_ONE:
1297 return AMDGPU::S_CMP_LG_F32;
1298 case CmpInst::FCMP_ORD:
1299 return AMDGPU::S_CMP_O_F32;
1300 case CmpInst::FCMP_UNO:
1301 return AMDGPU::S_CMP_U_F32;
1302 case CmpInst::FCMP_UEQ:
1303 return AMDGPU::S_CMP_NLG_F32;
1304 case CmpInst::FCMP_UGT:
1305 return AMDGPU::S_CMP_NLE_F32;
1306 case CmpInst::FCMP_UGE:
1307 return AMDGPU::S_CMP_NLT_F32;
1308 case CmpInst::FCMP_ULT:
1309 return AMDGPU::S_CMP_NGE_F32;
1310 case CmpInst::FCMP_ULE:
1311 return AMDGPU::S_CMP_NGT_F32;
1312 case CmpInst::FCMP_UNE:
1313 return AMDGPU::S_CMP_NEQ_F32;
1314 default:
1315 llvm_unreachable("Unknown condition code!");
1316 }
1317 }
1318
1319 if (Size == 16) {
1320 if (!STI.hasSALUFloatInsts())
1321 return -1;
1322
1323 switch (P) {
1324 case CmpInst::FCMP_OEQ:
1325 return AMDGPU::S_CMP_EQ_F16;
1326 case CmpInst::FCMP_OGT:
1327 return AMDGPU::S_CMP_GT_F16;
1328 case CmpInst::FCMP_OGE:
1329 return AMDGPU::S_CMP_GE_F16;
1330 case CmpInst::FCMP_OLT:
1331 return AMDGPU::S_CMP_LT_F16;
1332 case CmpInst::FCMP_OLE:
1333 return AMDGPU::S_CMP_LE_F16;
1334 case CmpInst::FCMP_ONE:
1335 return AMDGPU::S_CMP_LG_F16;
1336 case CmpInst::FCMP_ORD:
1337 return AMDGPU::S_CMP_O_F16;
1338 case CmpInst::FCMP_UNO:
1339 return AMDGPU::S_CMP_U_F16;
1340 case CmpInst::FCMP_UEQ:
1341 return AMDGPU::S_CMP_NLG_F16;
1342 case CmpInst::FCMP_UGT:
1343 return AMDGPU::S_CMP_NLE_F16;
1344 case CmpInst::FCMP_UGE:
1345 return AMDGPU::S_CMP_NLT_F16;
1346 case CmpInst::FCMP_ULT:
1347 return AMDGPU::S_CMP_NGE_F16;
1348 case CmpInst::FCMP_ULE:
1349 return AMDGPU::S_CMP_NGT_F16;
1350 case CmpInst::FCMP_UNE:
1351 return AMDGPU::S_CMP_NEQ_F16;
1352 default:
1353 llvm_unreachable("Unknown condition code!");
1354 }
1355 }
1356
1357 return -1;
1358}
1359
1360bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1361
1362 MachineBasicBlock *BB = I.getParent();
1363 const DebugLoc &DL = I.getDebugLoc();
1364
1365 Register SrcReg = I.getOperand(2).getReg();
1366 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1367
1368 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1369
1370 Register CCReg = I.getOperand(0).getReg();
1371 if (!isVCC(CCReg, *MRI)) {
1372 int Opcode = getS_CMPOpcode(Pred, Size);
1373 if (Opcode == -1)
1374 return false;
1375 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1376 .add(I.getOperand(2))
1377 .add(I.getOperand(3));
1378 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1379 .addReg(AMDGPU::SCC);
1380 bool Ret =
1381 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1382 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1383 I.eraseFromParent();
1384 return Ret;
1385 }
1386
1387 if (I.getOpcode() == AMDGPU::G_FCMP)
1388 return false;
1389
1390 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1391 if (Opcode == -1)
1392 return false;
1393
1394 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1395 I.getOperand(0).getReg())
1396 .add(I.getOperand(2))
1397 .add(I.getOperand(3));
1398 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1399 *TRI.getBoolRC(), *MRI);
1400 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1401 I.eraseFromParent();
1402 return Ret;
1403}
1404
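// Select llvm.amdgcn.icmp / llvm.amdgcn.fcmp, which produce a wave-sized lane
// mask in SGPRs rather than a VCC-bank value; invalid predicates simply yield
// an IMPLICIT_DEF.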
1405bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1406 Register Dst = I.getOperand(0).getReg();
1407 if (isVCC(Dst, *MRI))
1408 return false;
1409
1410 LLT DstTy = MRI->getType(Dst);
1411 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1412 return false;
1413
1414 MachineBasicBlock *BB = I.getParent();
1415 const DebugLoc &DL = I.getDebugLoc();
1416 Register SrcReg = I.getOperand(2).getReg();
1417 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1418
1419 // i1 inputs are not supported in GlobalISel.
1420 if (Size == 1)
1421 return false;
1422
1423 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1424 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1425 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1426 I.eraseFromParent();
1427 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1428 }
1429
1430 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1431 if (Opcode == -1)
1432 return false;
1433
1434 MachineInstrBuilder SelectedMI;
1435 MachineOperand &LHS = I.getOperand(2);
1436 MachineOperand &RHS = I.getOperand(3);
1437 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1438 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1439 Register Src0Reg =
1440 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1441 Register Src1Reg =
1442 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1443 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1444 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1445 SelectedMI.addImm(Src0Mods);
1446 SelectedMI.addReg(Src0Reg);
1447 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1448 SelectedMI.addImm(Src1Mods);
1449 SelectedMI.addReg(Src1Reg);
1450 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1451 SelectedMI.addImm(0); // clamp
1452 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1453 SelectedMI.addImm(0); // op_sel
1454
1455 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1456 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1457 return false;
1458
1459 I.eraseFromParent();
1460 return true;
1461}
1462
1463// Ballot has to zero the bits in the input lane mask that are zero in the
1464// current exec; this is done with an AND against exec. For inputs produced by
1465// an instruction that already implicitly uses the same exec (for example a
1466// compare in the same basic block, or an SCC-to-VCC copy), a plain copy is used.
1467static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1468 MachineBasicBlock *MBB) {
1469 MachineInstr *MI = MRI.getVRegDef(Reg);
1470 if (MI->getParent() != MBB)
1471 return false;
1472
1473 // Lane mask generated by SCC to VCC copy.
1474 if (MI->getOpcode() == AMDGPU::COPY) {
1475 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1476 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1477 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1478 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1479 return true;
1480 }
1481
1482 // Lane mask generated using compare with same exec.
1483 if (isa<GAnyCmp>(MI))
1484 return true;
1485
1486 Register LHS, RHS;
1487 // Look through AND.
1488 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1489 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1490 isLaneMaskFromSameBlock(RHS, MRI, MBB);
1491
1492 return false;
1493}
1494
1495bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1496 MachineBasicBlock *BB = I.getParent();
1497 const DebugLoc &DL = I.getDebugLoc();
1498 Register DstReg = I.getOperand(0).getReg();
1499 Register SrcReg = I.getOperand(2).getReg();
1500 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1501 const unsigned WaveSize = STI.getWavefrontSize();
1502
1503 // In the common case, the return type matches the wave size.
1504 // However we also support emitting i64 ballots in wave32 mode.
1505 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1506 return false;
1507
1508 std::optional<ValueAndVReg> Arg =
1509 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1510
1511 Register Dst = DstReg;
1512 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1513 if (BallotSize != WaveSize) {
1514 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1515 }
1516
1517 if (Arg) {
1518 const int64_t Value = Arg->Value.getZExtValue();
1519 if (Value == 0) {
1520 // Dst = S_MOV 0
1521 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1522 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1523 } else {
1524 // Dst = COPY EXEC
1525 assert(Value == 1);
1526 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1527 }
1528 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1529 return false;
1530 } else {
1531 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1532 // Dst = COPY SrcReg
1533 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1534 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1535 return false;
1536 } else {
1537 // Dst = S_AND SrcReg, EXEC
1538 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1539 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1540 .addReg(SrcReg)
1541 .addReg(TRI.getExec())
1542 .setOperandDead(3); // Dead scc
1543 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1544 return false;
1545 }
1546 }
1547
1548 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1549 if (BallotSize != WaveSize) {
1550 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1551 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1552 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1553 .addReg(Dst)
1554 .addImm(AMDGPU::sub0)
1555 .addReg(HiReg)
1556 .addImm(AMDGPU::sub1);
1557 }
1558
1559 I.eraseFromParent();
1560 return true;
1561}
1562
1563bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1564 Register DstReg = I.getOperand(0).getReg();
1565 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1566 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1567 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1568 return false;
1569
1570 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1571
1572 Module *M = MF->getFunction().getParent();
1573 const MDNode *Metadata = I.getOperand(2).getMetadata();
1574 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1575 auto *RelocSymbol = cast<GlobalVariable>(
1576 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1577
1578 MachineBasicBlock *BB = I.getParent();
1579 BuildMI(*BB, &I, I.getDebugLoc(),
1580 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1581 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1582
1583 I.eraseFromParent();
1584 return true;
1585}
1586
1587bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1588 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1589
1590 Register DstReg = I.getOperand(0).getReg();
1591 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1592 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1593 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1594
1595 MachineBasicBlock *MBB = I.getParent();
1596 const DebugLoc &DL = I.getDebugLoc();
1597
1598 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1599
1600 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1601 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1602 MIB.addImm(MFI->getLDSSize());
1603 } else {
1604 Module *M = MF->getFunction().getParent();
1605 const GlobalValue *GV =
1606 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1607 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1608 }
1609
1610 I.eraseFromParent();
1611 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1612}
1613
1614bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1615 MachineBasicBlock *MBB = I.getParent();
1616 MachineFunction &MF = *MBB->getParent();
1617 const DebugLoc &DL = I.getDebugLoc();
1618
1619 MachineOperand &Dst = I.getOperand(0);
1620 Register DstReg = Dst.getReg();
1621 unsigned Depth = I.getOperand(2).getImm();
1622
1623 const TargetRegisterClass *RC
1624 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1625 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1626 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1627 return false;
1628
1629 // Check for kernel and shader functions
1630 if (Depth != 0 ||
1631 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1632 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1633 .addImm(0);
1634 I.eraseFromParent();
1635 return true;
1636 }
1637
1638 MachineFrameInfo &MFI = MF.getFrameInfo();
1639 // There is a call to @llvm.returnaddress in this function
1640 MFI.setReturnAddressIsTaken(true);
1641
1642 // Get the return address reg and mark it as an implicit live-in
1643 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1644 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1645 AMDGPU::SReg_64RegClass, DL);
1646 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1647 .addReg(LiveIn);
1648 I.eraseFromParent();
1649 return true;
1650}
1651
1652bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1653 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1654 // SelectionDAG uses for wave32 vs wave64.
1655 MachineBasicBlock *BB = MI.getParent();
1656 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1657 .add(MI.getOperand(1));
1658
1659 Register Reg = MI.getOperand(1).getReg();
1660 MI.eraseFromParent();
1661
1662 if (!MRI->getRegClassOrNull(Reg))
1663 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1664 return true;
1665}
1666
1667bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1668 MachineInstr &MI, Intrinsic::ID IntrID) const {
1669 MachineBasicBlock *MBB = MI.getParent();
1670 MachineFunction *MF = MBB->getParent();
1671 const DebugLoc &DL = MI.getDebugLoc();
1672
1673 unsigned IndexOperand = MI.getOperand(7).getImm();
1674 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1675 bool WaveDone = MI.getOperand(9).getImm() != 0;
1676
1677 if (WaveDone && !WaveRelease)
1678 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1679
1680 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1681 IndexOperand &= ~0x3f;
1682 unsigned CountDw = 0;
1683
1684 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1685 CountDw = (IndexOperand >> 24) & 0xf;
1686 IndexOperand &= ~(0xf << 24);
1687
1688 if (CountDw < 1 || CountDw > 4) {
1690 "ds_ordered_count: dword count must be between 1 and 4");
1691 }
1692 }
1693
1694 if (IndexOperand)
1695 report_fatal_error("ds_ordered_count: bad index operand");
1696
1697 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1698 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1699
1700 unsigned Offset0 = OrderedCountIndex << 2;
1701 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1702
1703 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1704 Offset1 |= (CountDw - 1) << 6;
1705
1706 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1707 Offset1 |= ShaderType << 2;
1708
1709 unsigned Offset = Offset0 | (Offset1 << 8);
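// Worked example of the packing above (illustrative only; values assumed):
// for amdgcn_ds_ordered_add (Instruction = 0) with wave_release = 1,
// wave_done = 0, an index operand of 0x4000000 (count_dw = 4, ordered-count
// index 0) on a GFX10+ target, and taking ShaderType == 0 for illustration:
//   Offset0 = 0 << 2                              = 0x0
//   Offset1 = 1 | (0 << 1) | (0 << 4) | (3 << 6)  = 0xC1
//   Offset  = 0x0 | (0xC1 << 8)                   = 0xC100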
1710
1711 Register M0Val = MI.getOperand(2).getReg();
1712 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1713 .addReg(M0Val);
1714
1715 Register DstReg = MI.getOperand(0).getReg();
1716 Register ValReg = MI.getOperand(3).getReg();
1717 MachineInstrBuilder DS =
1718 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1719 .addReg(ValReg)
1720 .addImm(Offset)
1721 .cloneMemRefs(MI);
1722
1723 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1724 return false;
1725
1726 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1727 MI.eraseFromParent();
1728 return Ret;
1729}
1730
1731static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1732 switch (IntrID) {
1733 case Intrinsic::amdgcn_ds_gws_init:
1734 return AMDGPU::DS_GWS_INIT;
1735 case Intrinsic::amdgcn_ds_gws_barrier:
1736 return AMDGPU::DS_GWS_BARRIER;
1737 case Intrinsic::amdgcn_ds_gws_sema_v:
1738 return AMDGPU::DS_GWS_SEMA_V;
1739 case Intrinsic::amdgcn_ds_gws_sema_br:
1740 return AMDGPU::DS_GWS_SEMA_BR;
1741 case Intrinsic::amdgcn_ds_gws_sema_p:
1742 return AMDGPU::DS_GWS_SEMA_P;
1743 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1744 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1745 default:
1746 llvm_unreachable("not a gws intrinsic");
1747 }
1748}
1749
1750bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1751 Intrinsic::ID IID) const {
1752 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1753 !STI.hasGWSSemaReleaseAll()))
1754 return false;
1755
1756 // intrinsic ID, vsrc, offset
1757 const bool HasVSrc = MI.getNumOperands() == 3;
1758 assert(HasVSrc || MI.getNumOperands() == 2);
1759
1760 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1761 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1762 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1763 return false;
1764
1765 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1766 unsigned ImmOffset;
1767
1768 MachineBasicBlock *MBB = MI.getParent();
1769 const DebugLoc &DL = MI.getDebugLoc();
1770
1771 MachineInstr *Readfirstlane = nullptr;
1772
1773 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1774 // incoming offset, in case there's an add of a constant. We'll have to put it
1775 // back later.
1776 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1777 Readfirstlane = OffsetDef;
1778 BaseOffset = OffsetDef->getOperand(1).getReg();
1779 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1780 }
1781
1782 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1783 // If we have a constant offset, try to use the 0 in m0 as the base.
1784 // TODO: Look into changing the default m0 initialization value. If the
1785 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1786 // the immediate offset.
1787
1788 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1789 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1790 .addImm(0);
1791 } else {
1792 std::tie(BaseOffset, ImmOffset) =
1793 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1794
1795 if (Readfirstlane) {
1796 // We have the constant offset now, so put the readfirstlane back on the
1797 // variable component.
1798 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1799 return false;
1800
1801 Readfirstlane->getOperand(1).setReg(BaseOffset);
1802 BaseOffset = Readfirstlane->getOperand(0).getReg();
1803 } else {
1804 if (!RBI.constrainGenericRegister(BaseOffset,
1805 AMDGPU::SReg_32RegClass, *MRI))
1806 return false;
1807 }
1808
1809 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1810 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1811 .addReg(BaseOffset)
1812 .addImm(16)
1813 .setOperandDead(3); // Dead scc
1814
1815 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1816 .addReg(M0Base);
1817 }
1818
1819 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1820 // offset field) % 64. Some versions of the programming guide omit the m0
1821 // part, or claim it's from offset 0.
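// For illustration (hypothetical values): if the variable part of the GWS
// offset is an SGPR holding 3 and a constant 5 was folded into ImmOffset,
// the S_LSHL_B32 above places the 3 into m0[21:16] and the 5 is emitted in
// the instruction's offset field, so the hardware computes resource id
// (<isa opaque base> + 3 + 5) % 64. When the whole offset is a constant, m0
// is simply set to 0 and the constant goes entirely into the offset field.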
1822 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1823
1824 if (HasVSrc) {
1825 Register VSrc = MI.getOperand(1).getReg();
1826 MIB.addReg(VSrc);
1827
1828 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1829 return false;
1830 }
1831
1832 MIB.addImm(ImmOffset)
1833 .cloneMemRefs(MI);
1834
1835 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1836
1837 MI.eraseFromParent();
1838 return true;
1839}
1840
1841bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1842 bool IsAppend) const {
1843 Register PtrBase = MI.getOperand(2).getReg();
1844 LLT PtrTy = MRI->getType(PtrBase);
1845 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1846
1847 unsigned Offset;
1848 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1849
1850 // TODO: Should this try to look through readfirstlane like GWS?
1851 if (!isDSOffsetLegal(PtrBase, Offset)) {
1852 PtrBase = MI.getOperand(2).getReg();
1853 Offset = 0;
1854 }
1855
1856 MachineBasicBlock *MBB = MI.getParent();
1857 const DebugLoc &DL = MI.getDebugLoc();
1858 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1859
1860 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1861 .addReg(PtrBase);
1862 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1863 return false;
1864
1865 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1866 .addImm(Offset)
1867 .addImm(IsGDS ? -1 : 0)
1868 .cloneMemRefs(MI);
1869 MI.eraseFromParent();
1870 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1871}
1872
1873bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
1874 MachineFunction *MF = MI.getParent()->getParent();
1875 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1876
1877 MFInfo->setInitWholeWave();
1878 return selectImpl(MI, *CoverageInfo);
1879}
1880
1881bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1882 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1883 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1884 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1885 if (WGSize <= STI.getWavefrontSize()) {
1886 // If the workgroup fits in a wave, remove s_barrier_signal and lower
1887 // s_barrier/s_barrier_wait to wave_barrier.
1888 if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
1889 IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
1890 MachineBasicBlock *MBB = MI.getParent();
1891 const DebugLoc &DL = MI.getDebugLoc();
1892 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1893 }
1894 MI.eraseFromParent();
1895 return true;
1896 }
1897 }
1898
1899 if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
1900 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1901 MachineBasicBlock *MBB = MI.getParent();
1902 const DebugLoc &DL = MI.getDebugLoc();
1903 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1904 .addImm(AMDGPU::Barrier::WORKGROUP);
1905 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1906 .addImm(AMDGPU::Barrier::WORKGROUP);
1907 MI.eraseFromParent();
1908 return true;
1909 }
1910
1911 return selectImpl(MI, *CoverageInfo);
1912}
1913
1914static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1915 bool &IsTexFail) {
1916 if (TexFailCtrl)
1917 IsTexFail = true;
1918
1919 TFE = (TexFailCtrl & 0x1) ? true : false;
1920 TexFailCtrl &= ~(uint64_t)0x1;
1921 LWE = (TexFailCtrl & 0x2) ? true : false;
1922 TexFailCtrl &= ~(uint64_t)0x2;
1923
1924 return TexFailCtrl == 0;
1925}
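// A few example inputs for parseTexFail (illustrative only):
//   TexFailCtrl = 0 -> TFE = 0, LWE = 0, IsTexFail = false, returns true
//   TexFailCtrl = 1 -> TFE = 1, LWE = 0, IsTexFail = true,  returns true
//   TexFailCtrl = 3 -> TFE = 1, LWE = 1, IsTexFail = true,  returns true
//   TexFailCtrl = 4 -> an unknown bit remains set, so it returns false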
1926
1927bool AMDGPUInstructionSelector::selectImageIntrinsic(
1928 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1929 MachineBasicBlock *MBB = MI.getParent();
1930 const DebugLoc &DL = MI.getDebugLoc();
1931
1932 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1933 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1934
1935 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1936 unsigned IntrOpcode = Intr->BaseOpcode;
1937 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1938 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1939 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1940
1941 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1942
1943 Register VDataIn, VDataOut;
1944 LLT VDataTy;
1945 int NumVDataDwords = -1;
1946 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1947 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1948
1949 bool Unorm;
1950 if (!BaseOpcode->Sampler)
1951 Unorm = true;
1952 else
1953 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1954
1955 bool TFE;
1956 bool LWE;
1957 bool IsTexFail = false;
1958 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1959 TFE, LWE, IsTexFail))
1960 return false;
1961
1962 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1963 const bool IsA16 = (Flags & 1) != 0;
1964 const bool IsG16 = (Flags & 2) != 0;
1965
1966 // A16 implies 16 bit gradients if subtarget doesn't support G16
1967 if (IsA16 && !STI.hasG16() && !IsG16)
1968 return false;
1969
1970 unsigned DMask = 0;
1971 unsigned DMaskLanes = 0;
1972
1973 if (BaseOpcode->Atomic) {
1974 VDataOut = MI.getOperand(0).getReg();
1975 VDataIn = MI.getOperand(2).getReg();
1976 LLT Ty = MRI->getType(VDataIn);
1977
1978 // Be careful to allow atomic swap on 16-bit element vectors.
1979 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1980 Ty.getSizeInBits() == 128 :
1981 Ty.getSizeInBits() == 64;
1982
1983 if (BaseOpcode->AtomicX2) {
1984 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1985
1986 DMask = Is64Bit ? 0xf : 0x3;
1987 NumVDataDwords = Is64Bit ? 4 : 2;
1988 } else {
1989 DMask = Is64Bit ? 0x3 : 0x1;
1990 NumVDataDwords = Is64Bit ? 2 : 1;
1991 }
1992 } else {
1993 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1994 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1995
1996 if (BaseOpcode->Store) {
1997 VDataIn = MI.getOperand(1).getReg();
1998 VDataTy = MRI->getType(VDataIn);
1999 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2000 } else if (BaseOpcode->NoReturn) {
2001 NumVDataDwords = 0;
2002 } else {
2003 VDataOut = MI.getOperand(0).getReg();
2004 VDataTy = MRI->getType(VDataOut);
2005 NumVDataDwords = DMaskLanes;
2006
2007 if (IsD16 && !STI.hasUnpackedD16VMem())
2008 NumVDataDwords = (DMaskLanes + 1) / 2;
2009 }
2010 }
2011
2012 // Set G16 opcode
2013 if (Subtarget->hasG16() && IsG16) {
2014 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2015 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2016 assert(G16MappingInfo);
2017 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2018 }
2019
2020 // TODO: Check this in verifier.
2021 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2022
2023 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2024 if (BaseOpcode->Atomic)
2025 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
2026 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2027 AMDGPU::CPol::VOLATILE))
2028 return false;
2029
2030 int NumVAddrRegs = 0;
2031 int NumVAddrDwords = 0;
2032 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2033 // Skip the $noregs and 0s inserted during legalization.
2034 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2035 if (!AddrOp.isReg())
2036 continue; // XXX - Break?
2037
2038 Register Addr = AddrOp.getReg();
2039 if (!Addr)
2040 break;
2041
2042 ++NumVAddrRegs;
2043 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2044 }
2045
2046 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2047 // NSA, these should have been packed into a single value in the first
2048 // address register
2049 const bool UseNSA =
2050 NumVAddrRegs != 1 &&
2051 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2052 : NumVAddrDwords == NumVAddrRegs);
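// Illustrative cases for the UseNSA decision above (operand shapes assumed):
// - three separate 32-bit address registers: NumVAddrRegs = 3,
//   NumVAddrDwords = 3, so the NSA form is used (when the target has it);
// - the legalizer packed everything into one wide register:
//   NumVAddrRegs = 1, so the non-NSA form is used;
// - with partial NSA encoding, NumVAddrRegs = 2 and NumVAddrDwords = 3
//   (one of the registers is 64-bit) still selects NSA.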
2053 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2054 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2055 return false;
2056 }
2057
2058 if (IsTexFail)
2059 ++NumVDataDwords;
2060
2061 int Opcode = -1;
2062 if (IsGFX12Plus) {
2063 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2064 NumVDataDwords, NumVAddrDwords);
2065 } else if (IsGFX11Plus) {
2066 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2067 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2068 : AMDGPU::MIMGEncGfx11Default,
2069 NumVDataDwords, NumVAddrDwords);
2070 } else if (IsGFX10Plus) {
2071 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2072 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2073 : AMDGPU::MIMGEncGfx10Default,
2074 NumVDataDwords, NumVAddrDwords);
2075 } else {
2076 if (Subtarget->hasGFX90AInsts()) {
2077 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2078 NumVDataDwords, NumVAddrDwords);
2079 if (Opcode == -1) {
2080 LLVM_DEBUG(
2081 dbgs()
2082 << "requested image instruction is not supported on this GPU\n");
2083 return false;
2084 }
2085 }
2086 if (Opcode == -1 &&
2087 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2088 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2089 NumVDataDwords, NumVAddrDwords);
2090 if (Opcode == -1)
2091 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2092 NumVDataDwords, NumVAddrDwords);
2093 }
2094 if (Opcode == -1)
2095 return false;
2096
2097 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2098 .cloneMemRefs(MI);
2099
2100 if (VDataOut) {
2101 if (BaseOpcode->AtomicX2) {
2102 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2103
2104 Register TmpReg = MRI->createVirtualRegister(
2105 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2106 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2107
2108 MIB.addDef(TmpReg);
2109 if (!MRI->use_empty(VDataOut)) {
2110 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2111 .addReg(TmpReg, RegState::Kill, SubReg);
2112 }
2113
2114 } else {
2115 MIB.addDef(VDataOut); // vdata output
2116 }
2117 }
2118
2119 if (VDataIn)
2120 MIB.addReg(VDataIn); // vdata input
2121
2122 for (int I = 0; I != NumVAddrRegs; ++I) {
2123 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2124 if (SrcOp.isReg()) {
2125 assert(SrcOp.getReg() != 0);
2126 MIB.addReg(SrcOp.getReg());
2127 }
2128 }
2129
2130 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2131 if (BaseOpcode->Sampler)
2132 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2133
2134 MIB.addImm(DMask); // dmask
2135
2136 if (IsGFX10Plus)
2137 MIB.addImm(DimInfo->Encoding);
2138 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2139 MIB.addImm(Unorm);
2140
2141 MIB.addImm(CPol);
2142 MIB.addImm(IsA16 && // a16 or r128
2143 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2144 if (IsGFX10Plus)
2145 MIB.addImm(IsA16 ? -1 : 0);
2146
2147 if (!Subtarget->hasGFX90AInsts()) {
2148 MIB.addImm(TFE); // tfe
2149 } else if (TFE) {
2150 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2151 return false;
2152 }
2153
2154 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2155 MIB.addImm(LWE); // lwe
2156 if (!IsGFX10Plus)
2157 MIB.addImm(DimInfo->DA ? -1 : 0);
2158 if (BaseOpcode->HasD16)
2159 MIB.addImm(IsD16 ? -1 : 0);
2160
2161 MI.eraseFromParent();
2162 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2163 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2164 return true;
2165}
2166
2167// We need to handle this here because tablegen doesn't support matching
2168// instructions with multiple outputs.
2169bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2170 MachineInstr &MI) const {
2171 Register Dst0 = MI.getOperand(0).getReg();
2172 Register Dst1 = MI.getOperand(1).getReg();
2173
2174 const DebugLoc &DL = MI.getDebugLoc();
2175 MachineBasicBlock *MBB = MI.getParent();
2176
2177 Register Addr = MI.getOperand(3).getReg();
2178 Register Data0 = MI.getOperand(4).getReg();
2179 Register Data1 = MI.getOperand(5).getReg();
2180 unsigned Offset = MI.getOperand(6).getImm();
2181
2182 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2183 .addDef(Dst1)
2184 .addUse(Addr)
2185 .addUse(Data0)
2186 .addUse(Data1)
2187 .addImm(Offset)
2188 .cloneMemRefs(MI);
2189
2190 MI.eraseFromParent();
2191 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2192}
2193
2194bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2195 MachineInstr &I) const {
2196 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2197 switch (IntrinsicID) {
2198 case Intrinsic::amdgcn_end_cf:
2199 return selectEndCfIntrinsic(I);
2200 case Intrinsic::amdgcn_ds_ordered_add:
2201 case Intrinsic::amdgcn_ds_ordered_swap:
2202 return selectDSOrderedIntrinsic(I, IntrinsicID);
2203 case Intrinsic::amdgcn_ds_gws_init:
2204 case Intrinsic::amdgcn_ds_gws_barrier:
2205 case Intrinsic::amdgcn_ds_gws_sema_v:
2206 case Intrinsic::amdgcn_ds_gws_sema_br:
2207 case Intrinsic::amdgcn_ds_gws_sema_p:
2208 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2209 return selectDSGWSIntrinsic(I, IntrinsicID);
2210 case Intrinsic::amdgcn_ds_append:
2211 return selectDSAppendConsume(I, true);
2212 case Intrinsic::amdgcn_ds_consume:
2213 return selectDSAppendConsume(I, false);
2214 case Intrinsic::amdgcn_init_whole_wave:
2215 return selectInitWholeWave(I);
2216 case Intrinsic::amdgcn_s_barrier:
2217 case Intrinsic::amdgcn_s_barrier_signal:
2218 case Intrinsic::amdgcn_s_barrier_wait:
2219 return selectSBarrier(I);
2220 case Intrinsic::amdgcn_raw_buffer_load_lds:
2221 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2222 case Intrinsic::amdgcn_struct_buffer_load_lds:
2223 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2224 return selectBufferLoadLds(I);
2225 case Intrinsic::amdgcn_global_load_lds:
2226 return selectGlobalLoadLds(I);
2227 case Intrinsic::amdgcn_exp_compr:
2228 if (!STI.hasCompressedExport()) {
2229 Function &F = I.getMF()->getFunction();
2231 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2232 F.getContext().diagnose(NoFpRet);
2233 return false;
2234 }
2235 break;
2236 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2237 return selectDSBvhStackIntrinsic(I);
2238 case Intrinsic::amdgcn_s_barrier_init:
2239 case Intrinsic::amdgcn_s_barrier_signal_var:
2240 return selectNamedBarrierInit(I, IntrinsicID);
2241 case Intrinsic::amdgcn_s_barrier_join:
2242 case Intrinsic::amdgcn_s_get_named_barrier_state:
2243 return selectNamedBarrierInst(I, IntrinsicID);
2244 case Intrinsic::amdgcn_s_get_barrier_state:
2245 return selectSGetBarrierState(I, IntrinsicID);
2246 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2247 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2248 }
2249 return selectImpl(I, *CoverageInfo);
2250}
2251
2252bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2253 if (selectImpl(I, *CoverageInfo))
2254 return true;
2255
2256 MachineBasicBlock *BB = I.getParent();
2257 const DebugLoc &DL = I.getDebugLoc();
2258
2259 Register DstReg = I.getOperand(0).getReg();
2260 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2261 assert(Size <= 32 || Size == 64);
2262 const MachineOperand &CCOp = I.getOperand(1);
2263 Register CCReg = CCOp.getReg();
2264 if (!isVCC(CCReg, *MRI)) {
2265 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2266 AMDGPU::S_CSELECT_B32;
2267 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2268 .addReg(CCReg);
2269
2270 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2271 // bank, because it does not cover the register class we use to represent it.
2272 // So we need to set the register class manually here.
2273 if (!MRI->getRegClassOrNull(CCReg))
2274 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2275 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2276 .add(I.getOperand(2))
2277 .add(I.getOperand(3));
2278
2279 bool Ret = false;
2280 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2281 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2282 I.eraseFromParent();
2283 return Ret;
2284 }
2285
2286 // Wide VGPR select should have been split in RegBankSelect.
2287 if (Size > 32)
2288 return false;
2289
2290 MachineInstr *Select =
2291 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2292 .addImm(0)
2293 .add(I.getOperand(3))
2294 .addImm(0)
2295 .add(I.getOperand(2))
2296 .add(I.getOperand(1));
2297
2298 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2299 I.eraseFromParent();
2300 return Ret;
2301}
2302
2303bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2304 Register DstReg = I.getOperand(0).getReg();
2305 Register SrcReg = I.getOperand(1).getReg();
2306 const LLT DstTy = MRI->getType(DstReg);
2307 const LLT SrcTy = MRI->getType(SrcReg);
2308 const LLT S1 = LLT::scalar(1);
2309
2310 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2311 const RegisterBank *DstRB;
2312 if (DstTy == S1) {
2313 // This is a special case. We don't treat s1 for legalization artifacts as
2314 // vcc booleans.
2315 DstRB = SrcRB;
2316 } else {
2317 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2318 if (SrcRB != DstRB)
2319 return false;
2320 }
2321
2322 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2323
2324 unsigned DstSize = DstTy.getSizeInBits();
2325 unsigned SrcSize = SrcTy.getSizeInBits();
2326
2327 const TargetRegisterClass *SrcRC =
2328 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2329 const TargetRegisterClass *DstRC =
2330 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2331 if (!SrcRC || !DstRC)
2332 return false;
2333
2334 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2335 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2336 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2337 return false;
2338 }
2339
2340 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2342 const DebugLoc &DL = I.getDebugLoc();
2343 MachineBasicBlock *MBB = I.getParent();
2344 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2345 .addReg(SrcReg, 0, AMDGPU::lo16);
2346 I.eraseFromParent();
2347 return true;
2348 }
2349
2350 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2351 MachineBasicBlock *MBB = I.getParent();
2352 const DebugLoc &DL = I.getDebugLoc();
2353
2354 Register LoReg = MRI->createVirtualRegister(DstRC);
2355 Register HiReg = MRI->createVirtualRegister(DstRC);
2356 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2357 .addReg(SrcReg, 0, AMDGPU::sub0);
2358 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2359 .addReg(SrcReg, 0, AMDGPU::sub1);
2360
2361 if (IsVALU && STI.hasSDWA()) {
2362 // Write the low 16-bits of the high element into the high 16-bits of the
2363 // low element.
2364 MachineInstr *MovSDWA =
2365 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2366 .addImm(0) // $src0_modifiers
2367 .addReg(HiReg) // $src0
2368 .addImm(0) // $clamp
2369 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2370 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2371 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2372 .addReg(LoReg, RegState::Implicit);
2373 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2374 } else {
2375 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2376 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2377 Register ImmReg = MRI->createVirtualRegister(DstRC);
2378 if (IsVALU) {
2379 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2380 .addImm(16)
2381 .addReg(HiReg);
2382 } else {
2383 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2384 .addReg(HiReg)
2385 .addImm(16)
2386 .setOperandDead(3); // Dead scc
2387 }
2388
2389 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2390 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2391 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2392
2393 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2394 .addImm(0xffff);
2395 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2396 .addReg(LoReg)
2397 .addReg(ImmReg);
2398 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2399 .addReg(TmpReg0)
2400 .addReg(TmpReg1);
2401
2402 if (!IsVALU) {
2403 And.setOperandDead(3); // Dead scc
2404 Or.setOperandDead(3); // Dead scc
2405 }
2406 }
2407
2408 I.eraseFromParent();
2409 return true;
2410 }
2411
2412 if (!DstTy.isScalar())
2413 return false;
2414
2415 if (SrcSize > 32) {
2416 unsigned SubRegIdx =
2417 DstSize < 32 ? AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32);
2418 if (SubRegIdx == AMDGPU::NoSubRegister)
2419 return false;
2420
2421 // Deal with weird cases where the class only partially supports the subreg
2422 // index.
2423 const TargetRegisterClass *SrcWithSubRC
2424 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2425 if (!SrcWithSubRC)
2426 return false;
2427
2428 if (SrcWithSubRC != SrcRC) {
2429 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2430 return false;
2431 }
2432
2433 I.getOperand(1).setSubReg(SubRegIdx);
2434 }
2435
2436 I.setDesc(TII.get(TargetOpcode::COPY));
2437 return true;
2438}
2439
2440/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2441static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2442 Mask = maskTrailingOnes<unsigned>(Size);
2443 int SignedMask = static_cast<int>(Mask);
2444 return SignedMask >= -16 && SignedMask <= 64;
2445}
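// Examples (illustrative): shouldUseAndMask(6, M) sets M = 0x3f (63), which
// fits the inline-immediate range [-16, 64], so the callers prefer an AND;
// shouldUseAndMask(16, M) sets M = 0xffff (65535), which does not fit, so the
// callers fall back to a BFE. Note Size == 32 gives M = 0xffffffff, i.e.
// SignedMask == -1, which is also treated as an inline immediate.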
2446
2447// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2448const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2449 Register Reg, const MachineRegisterInfo &MRI,
2450 const TargetRegisterInfo &TRI) const {
2451 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2452 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2453 return RB;
2454
2455 // Ignore the type, since we don't use vcc in artifacts.
2456 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2457 return &RBI.getRegBankFromRegClass(*RC, LLT());
2458 return nullptr;
2459}
2460
2461bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2462 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2463 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2464 const DebugLoc &DL = I.getDebugLoc();
2465 MachineBasicBlock &MBB = *I.getParent();
2466 const Register DstReg = I.getOperand(0).getReg();
2467 const Register SrcReg = I.getOperand(1).getReg();
2468
2469 const LLT DstTy = MRI->getType(DstReg);
2470 const LLT SrcTy = MRI->getType(SrcReg);
2471 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2472 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2473 const unsigned DstSize = DstTy.getSizeInBits();
2474 if (!DstTy.isScalar())
2475 return false;
2476
2477 // Artifact casts should never use vcc.
2478 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2479
2480 // FIXME: This should probably be illegal and split earlier.
2481 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2482 if (DstSize <= 32)
2483 return selectCOPY(I);
2484
2485 const TargetRegisterClass *SrcRC =
2486 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2487 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2488 const TargetRegisterClass *DstRC =
2489 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2490
2491 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2492 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2493 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2494 .addReg(SrcReg)
2495 .addImm(AMDGPU::sub0)
2496 .addReg(UndefReg)
2497 .addImm(AMDGPU::sub1);
2498 I.eraseFromParent();
2499
2500 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2501 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2502 }
2503
2504 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2505 // 64-bit should have been split up in RegBankSelect
2506
2507 // Try to use an and with a mask if it will save code size.
2508 unsigned Mask;
2509 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2510 MachineInstr *ExtI =
2511 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2512 .addImm(Mask)
2513 .addReg(SrcReg);
2514 I.eraseFromParent();
2515 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2516 }
2517
2518 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2519 MachineInstr *ExtI =
2520 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2521 .addReg(SrcReg)
2522 .addImm(0) // Offset
2523 .addImm(SrcSize); // Width
2524 I.eraseFromParent();
2525 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2526 }
2527
2528 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2529 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2530 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2531 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2532 return false;
2533
2534 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2535 const unsigned SextOpc = SrcSize == 8 ?
2536 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2537 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2538 .addReg(SrcReg);
2539 I.eraseFromParent();
2540 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2541 }
2542
2543 // Using a single 32-bit SALU to calculate the high half is smaller than
2544 // S_BFE with a literal constant operand.
2545 if (DstSize > 32 && SrcSize == 32) {
2546 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2547 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2548 if (Signed) {
2549 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2550 .addReg(SrcReg, 0, SubReg)
2551 .addImm(31)
2552 .setOperandDead(3); // Dead scc
2553 } else {
2554 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2555 .addImm(0);
2556 }
2557 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2558 .addReg(SrcReg, 0, SubReg)
2559 .addImm(AMDGPU::sub0)
2560 .addReg(HiReg)
2561 .addImm(AMDGPU::sub1);
2562 I.eraseFromParent();
2563 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2564 *MRI);
2565 }
2566
2567 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2568 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2569
2570 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
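// For example (illustrative), a sign extension from 8 bits uses
// .addImm(SrcSize << 16) == 0x80000 below: offset field (bits 5:0) = 0 and
// width field (bits 22:16) = 8, i.e. "extract 8 bits starting at bit 0".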
2571 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2572 // We need a 64-bit register source, but the high bits don't matter.
2573 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2574 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2575 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2576
2577 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2578 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2579 .addReg(SrcReg, 0, SubReg)
2580 .addImm(AMDGPU::sub0)
2581 .addReg(UndefReg)
2582 .addImm(AMDGPU::sub1);
2583
2584 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2585 .addReg(ExtReg)
2586 .addImm(SrcSize << 16);
2587
2588 I.eraseFromParent();
2589 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2590 }
2591
2592 unsigned Mask;
2593 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2594 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2595 .addReg(SrcReg)
2596 .addImm(Mask)
2597 .setOperandDead(3); // Dead scc
2598 } else {
2599 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2600 .addReg(SrcReg)
2601 .addImm(SrcSize << 16);
2602 }
2603
2604 I.eraseFromParent();
2605 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2606 }
2607
2608 return false;
2609}
2610
2611static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2612 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2613}
2614
2615static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2616 Register BitcastSrc;
2617 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2618 Reg = BitcastSrc;
2619 return Reg;
2620}
2621
2622static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2623 Register &Out) {
2624 Register Trunc;
2625 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2626 return false;
2627
2628 Register LShlSrc;
2629 Register Cst;
2630 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2631 Cst = stripCopy(Cst, MRI);
2632 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2633 Out = stripBitCast(LShlSrc, MRI);
2634 return true;
2635 }
2636 }
2637
2638 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2639 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2640 return false;
2641
2642 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2643 LLT::fixed_vector(2, 16));
2644
2645 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2646 assert(Mask.size() == 2);
2647
2648 if (Mask[0] == 1 && Mask[1] <= 1) {
2649 Out = Shuffle->getOperand(0).getReg();
2650 return true;
2651 }
2652
2653 return false;
2654}
2655
2656bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2657 if (!Subtarget->hasSALUFloatInsts())
2658 return false;
2659
2660 Register Dst = I.getOperand(0).getReg();
2661 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2662 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2663 return false;
2664
2665 Register Src = I.getOperand(1).getReg();
2666
2667 if (MRI->getType(Dst) == LLT::scalar(32) &&
2668 MRI->getType(Src) == LLT::scalar(16)) {
2669 if (isExtractHiElt(*MRI, Src, Src)) {
2670 MachineBasicBlock *BB = I.getParent();
2671 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2672 .addUse(Src);
2673 I.eraseFromParent();
2674 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2675 }
2676 }
2677
2678 return false;
2679}
2680
2681bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2682 // Only manually handle the f64 SGPR case.
2683 //
2684 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2685 // the bit ops theoretically have a second result due to the implicit def of
2686 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2687 // that is easy by disabling the check. The result works, but uses a
2688 // nonsensical sreg32orlds_and_sreg_1 regclass.
2689 //
2690 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2691 // the variadic REG_SEQUENCE operands.
2692
2693 Register Dst = MI.getOperand(0).getReg();
2694 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2695 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2696 MRI->getType(Dst) != LLT::scalar(64))
2697 return false;
2698
2699 Register Src = MI.getOperand(1).getReg();
2700 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2701 if (Fabs)
2702 Src = Fabs->getOperand(1).getReg();
2703
2704 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2705 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2706 return false;
2707
2708 MachineBasicBlock *BB = MI.getParent();
2709 const DebugLoc &DL = MI.getDebugLoc();
2710 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2711 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2712 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2713 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2714
2715 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2716 .addReg(Src, 0, AMDGPU::sub0);
2717 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2718 .addReg(Src, 0, AMDGPU::sub1);
2719 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2720 .addImm(0x80000000);
2721
2722 // Set or toggle sign bit.
2723 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2724 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2725 .addReg(HiReg)
2726 .addReg(ConstReg)
2727 .setOperandDead(3); // Dead scc
2728 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2729 .addReg(LoReg)
2730 .addImm(AMDGPU::sub0)
2731 .addReg(OpReg)
2732 .addImm(AMDGPU::sub1);
2733 MI.eraseFromParent();
2734 return true;
2735}
2736
2737// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2738bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2739 Register Dst = MI.getOperand(0).getReg();
2740 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2741 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2742 MRI->getType(Dst) != LLT::scalar(64))
2743 return false;
2744
2745 Register Src = MI.getOperand(1).getReg();
2746 MachineBasicBlock *BB = MI.getParent();
2747 const DebugLoc &DL = MI.getDebugLoc();
2748 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2749 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2750 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2751 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2752
2753 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2754 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2755 return false;
2756
2757 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2758 .addReg(Src, 0, AMDGPU::sub0);
2759 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2760 .addReg(Src, 0, AMDGPU::sub1);
2761 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2762 .addImm(0x7fffffff);
2763
2764 // Clear sign bit.
2765 // TODO: Should this use S_BITSET0_*?
2766 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2767 .addReg(HiReg)
2768 .addReg(ConstReg)
2769 .setOperandDead(3); // Dead scc
2770 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2771 .addReg(LoReg)
2772 .addImm(AMDGPU::sub0)
2773 .addReg(OpReg)
2774 .addImm(AMDGPU::sub1);
2775
2776 MI.eraseFromParent();
2777 return true;
2778}
2779
2780static bool isConstant(const MachineInstr &MI) {
2781 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2782}
2783
2784void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2785 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2786
2787 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2788 const MachineInstr *PtrMI =
2789 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2790
2791 assert(PtrMI);
2792
2793 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2794 return;
2795
2796 GEPInfo GEPInfo;
2797
2798 for (unsigned i = 1; i != 3; ++i) {
2799 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2800 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2801 assert(OpDef);
2802 if (i == 2 && isConstant(*OpDef)) {
2803 // TODO: Could handle constant base + variable offset, but a combine
2804 // probably should have commuted it.
2805 assert(GEPInfo.Imm == 0);
2806 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2807 continue;
2808 }
2809 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2810 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2811 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2812 else
2813 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2814 }
2815
2816 AddrInfo.push_back(GEPInfo);
2817 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2818}
2819
2820bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2821 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2822}
2823
2824bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2825 if (!MI.hasOneMemOperand())
2826 return false;
2827
2828 const MachineMemOperand *MMO = *MI.memoperands_begin();
2829 const Value *Ptr = MMO->getValue();
2830
2831 // UndefValue means this is a load of a kernel input. These are uniform.
2832 // Sometimes LDS instructions have constant pointers.
2833 // If Ptr is null, then that means this mem operand contains a
2834 // PseudoSourceValue like GOT.
2835 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2836 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2837 return true;
2838
2839 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2840 return true;
2841
2842 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2843 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2844 AMDGPU::SGPRRegBankID;
2845
2846 const Instruction *I = dyn_cast<Instruction>(Ptr);
2847 return I && I->getMetadata("amdgpu.uniform");
2848}
2849
2850bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2851 for (const GEPInfo &GEPInfo : AddrInfo) {
2852 if (!GEPInfo.VgprParts.empty())
2853 return true;
2854 }
2855 return false;
2856}
2857
2858void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2859 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2860 unsigned AS = PtrTy.getAddressSpace();
2861 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2862 STI.ldsRequiresM0Init()) {
2863 MachineBasicBlock *BB = I.getParent();
2864
2865 // If DS instructions require M0 initialization, insert it before selecting.
2866 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2867 .addImm(-1);
2868 }
2869}
2870
2871bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2872 MachineInstr &I) const {
2873 initM0(I);
2874 return selectImpl(I, *CoverageInfo);
2875}
2876
2877static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2878 if (Reg.isPhysical())
2879 return false;
2880
2881 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2882 const unsigned Opcode = MI.getOpcode();
2883
2884 if (Opcode == AMDGPU::COPY)
2885 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2886
2887 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2888 Opcode == AMDGPU::G_XOR)
2889 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2890 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2891
2892 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2893 return GI->is(Intrinsic::amdgcn_class);
2894
2895 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2896}
2897
2898bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2899 MachineBasicBlock *BB = I.getParent();
2900 MachineOperand &CondOp = I.getOperand(0);
2901 Register CondReg = CondOp.getReg();
2902 const DebugLoc &DL = I.getDebugLoc();
2903
2904 unsigned BrOpcode;
2905 Register CondPhysReg;
2906 const TargetRegisterClass *ConstrainRC;
2907
2908 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2909 // whether the branch is uniform when selecting the instruction. In
2910 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2911 // RegBankSelect knows what it's doing if the branch condition is scc, even
2912 // though it currently does not.
2913 if (!isVCC(CondReg, *MRI)) {
2914 if (MRI->getType(CondReg) != LLT::scalar(32))
2915 return false;
2916
2917 CondPhysReg = AMDGPU::SCC;
2918 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2919 ConstrainRC = &AMDGPU::SReg_32RegClass;
2920 } else {
2921 // FIXME: Should scc->vcc copies and with exec?
2922
2923 // Unless the value of CondReg is a result of a V_CMP* instruction then we
2924 // need to insert an and with exec.
2925 if (!isVCmpResult(CondReg, *MRI)) {
2926 const bool Is64 = STI.isWave64();
2927 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2928 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2929
2930 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2931 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2932 .addReg(CondReg)
2933 .addReg(Exec)
2934 .setOperandDead(3); // Dead scc
2935 CondReg = TmpReg;
2936 }
2937
2938 CondPhysReg = TRI.getVCC();
2939 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2940 ConstrainRC = TRI.getBoolRC();
2941 }
2942
2943 if (!MRI->getRegClassOrNull(CondReg))
2944 MRI->setRegClass(CondReg, ConstrainRC);
2945
2946 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2947 .addReg(CondReg);
2948 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2949 .addMBB(I.getOperand(1).getMBB());
2950
2951 I.eraseFromParent();
2952 return true;
2953}
2954
2955bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2956 MachineInstr &I) const {
2957 Register DstReg = I.getOperand(0).getReg();
2958 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2959 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2960 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2961 if (IsVGPR)
2962 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2963
2964 return RBI.constrainGenericRegister(
2965 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2966}
2967
2968bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2969 Register DstReg = I.getOperand(0).getReg();
2970 Register SrcReg = I.getOperand(1).getReg();
2971 Register MaskReg = I.getOperand(2).getReg();
2972 LLT Ty = MRI->getType(DstReg);
2973 LLT MaskTy = MRI->getType(MaskReg);
2974 MachineBasicBlock *BB = I.getParent();
2975 const DebugLoc &DL = I.getDebugLoc();
2976
2977 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2978 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2979 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2980 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2981 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2982 return false;
2983
2984 // Try to avoid emitting a bit operation when we only need to touch half of
2985 // the 64-bit pointer.
2986 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2987 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2988 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2989
2990 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2991 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
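// Illustrative case (assuming the mask is a known constant): for a mask whose
// high 32 bits are all ones and whose low half is 0xfffff000 (e.g. the
// constant 0xfffffffffffff000), CanCopyHi32 is true and CanCopyLow32 is
// false, so only the low dword below gets an AND; the high dword is copied
// unchanged into the final REG_SEQUENCE.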
2992
2993 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2994 !CanCopyLow32 && !CanCopyHi32) {
2995 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2996 .addReg(SrcReg)
2997 .addReg(MaskReg)
2998 .setOperandDead(3); // Dead scc
2999 I.eraseFromParent();
3000 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3001 }
3002
3003 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3004 const TargetRegisterClass &RegRC
3005 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3006
3007 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3008 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3009 const TargetRegisterClass *MaskRC =
3010 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3011
3012 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3013 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3014 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3015 return false;
3016
3017 if (Ty.getSizeInBits() == 32) {
3018 assert(MaskTy.getSizeInBits() == 32 &&
3019 "ptrmask should have been narrowed during legalize");
3020
3021 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3022 .addReg(SrcReg)
3023 .addReg(MaskReg);
3024
3025 if (!IsVGPR)
3026 NewOp.setOperandDead(3); // Dead scc
3027 I.eraseFromParent();
3028 return true;
3029 }
3030
3031 Register HiReg = MRI->createVirtualRegister(&RegRC);
3032 Register LoReg = MRI->createVirtualRegister(&RegRC);
3033
3034 // Extract the subregisters from the source pointer.
3035 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3036 .addReg(SrcReg, 0, AMDGPU::sub0);
3037 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3038 .addReg(SrcReg, 0, AMDGPU::sub1);
3039
3040 Register MaskedLo, MaskedHi;
3041
3042 if (CanCopyLow32) {
3043 // If all the bits in the low half are 1, we only need a copy for it.
3044 MaskedLo = LoReg;
3045 } else {
3046 // Extract the mask subregister and apply the and.
3047 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3048 MaskedLo = MRI->createVirtualRegister(&RegRC);
3049
3050 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3051 .addReg(MaskReg, 0, AMDGPU::sub0);
3052 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3053 .addReg(LoReg)
3054 .addReg(MaskLo);
3055 }
3056
3057 if (CanCopyHi32) {
3058 // If all the bits in the high half are 1, we only need a copy for it.
3059 MaskedHi = HiReg;
3060 } else {
3061 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3062 MaskedHi = MRI->createVirtualRegister(&RegRC);
3063
3064 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3065 .addReg(MaskReg, 0, AMDGPU::sub1);
3066 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3067 .addReg(HiReg)
3068 .addReg(MaskHi);
3069 }
3070
3071 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3072 .addReg(MaskedLo)
3073 .addImm(AMDGPU::sub0)
3074 .addReg(MaskedHi)
3075 .addImm(AMDGPU::sub1);
3076 I.eraseFromParent();
3077 return true;
3078}
3079
3080/// Return the register to use for the index value, and the subregister to use
3081/// for the indirectly accessed register.
3082static std::pair<Register, unsigned>
3083computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3084 const TargetRegisterClass *SuperRC, Register IdxReg,
3085 unsigned EltSize, GISelKnownBits &KnownBits) {
3086 Register IdxBaseReg;
3087 int Offset;
3088
3089 std::tie(IdxBaseReg, Offset) =
3090 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3091 if (IdxBaseReg == AMDGPU::NoRegister) {
3092 // This will happen if the index is a known constant. This should ordinarily
3093 // be legalized out, but handle it as a register just in case.
3094 assert(Offset == 0);
3095 IdxBaseReg = IdxReg;
3096 }
3097
3098 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3099
3100 // Skip out of bounds offsets, or else we would end up using an undefined
3101 // register.
3102 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3103 return std::pair(IdxReg, SubRegs[0]);
3104 return std::pair(IdxBaseReg, SubRegs[Offset]);
3105}
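// Example (illustrative, register names assumed): indexing 32-bit elements
// gives EltSize = 4, so getRegSplitParts returns sub0, sub1, ... If KnownBits
// proves IdxReg == %base + 2, this returns {%base, sub2}: the constant part
// of the index is folded into the subregister and only the variable part ends
// up in M0 / the index operand. An out-of-range constant offset falls back to
// {IdxReg, sub0}.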
3106
3107bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3108 MachineInstr &MI) const {
3109 Register DstReg = MI.getOperand(0).getReg();
3110 Register SrcReg = MI.getOperand(1).getReg();
3111 Register IdxReg = MI.getOperand(2).getReg();
3112
3113 LLT DstTy = MRI->getType(DstReg);
3114 LLT SrcTy = MRI->getType(SrcReg);
3115
3116 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3117 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3118 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3119
3120 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3121 // into a waterfall loop.
3122 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3123 return false;
3124
3125 const TargetRegisterClass *SrcRC =
3126 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3127 const TargetRegisterClass *DstRC =
3128 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3129 if (!SrcRC || !DstRC)
3130 return false;
3131 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3132 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3133 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3134 return false;
3135
3136 MachineBasicBlock *BB = MI.getParent();
3137 const DebugLoc &DL = MI.getDebugLoc();
3138 const bool Is64 = DstTy.getSizeInBits() == 64;
3139
3140 unsigned SubReg;
3141 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3142 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3143
3144 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3145 if (DstTy.getSizeInBits() != 32 && !Is64)
3146 return false;
3147
3148 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3149 .addReg(IdxReg);
3150
3151 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3152 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3153 .addReg(SrcReg, 0, SubReg)
3154 .addReg(SrcReg, RegState::Implicit);
3155 MI.eraseFromParent();
3156 return true;
3157 }
3158
3159 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3160 return false;
3161
3162 if (!STI.useVGPRIndexMode()) {
3163 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3164 .addReg(IdxReg);
3165 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3166 .addReg(SrcReg, 0, SubReg)
3167 .addReg(SrcReg, RegState::Implicit);
3168 MI.eraseFromParent();
3169 return true;
3170 }
3171
3172 const MCInstrDesc &GPRIDXDesc =
3173 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3174 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3175 .addReg(SrcReg)
3176 .addReg(IdxReg)
3177 .addImm(SubReg);
3178
3179 MI.eraseFromParent();
3180 return true;
3181}
3182
3183// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3184bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3185 MachineInstr &MI) const {
3186 Register DstReg = MI.getOperand(0).getReg();
3187 Register VecReg = MI.getOperand(1).getReg();
3188 Register ValReg = MI.getOperand(2).getReg();
3189 Register IdxReg = MI.getOperand(3).getReg();
3190
3191 LLT VecTy = MRI->getType(DstReg);
3192 LLT ValTy = MRI->getType(ValReg);
3193 unsigned VecSize = VecTy.getSizeInBits();
3194 unsigned ValSize = ValTy.getSizeInBits();
3195
3196 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3197 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3198 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3199
3200 assert(VecTy.getElementType() == ValTy);
3201
3202 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3203 // into a waterfall loop.
3204 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3205 return false;
3206
3207 const TargetRegisterClass *VecRC =
3208 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3209 const TargetRegisterClass *ValRC =
3210 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3211
3212 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3213 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3214 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3215 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3216 return false;
3217
3218 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3219 return false;
3220
3221 unsigned SubReg;
3222 std::tie(IdxReg, SubReg) =
3223 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3224
3225 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3226 STI.useVGPRIndexMode();
3227
3228 MachineBasicBlock *BB = MI.getParent();
3229 const DebugLoc &DL = MI.getDebugLoc();
3230
3231 if (!IndexMode) {
3232 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3233 .addReg(IdxReg);
3234
3235 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3236 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3237 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3238 .addReg(VecReg)
3239 .addReg(ValReg)
3240 .addImm(SubReg);
3241 MI.eraseFromParent();
3242 return true;
3243 }
3244
3245 const MCInstrDesc &GPRIDXDesc =
3246 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3247 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3248 .addReg(VecReg)
3249 .addReg(ValReg)
3250 .addReg(IdxReg)
3251 .addImm(SubReg);
3252
3253 MI.eraseFromParent();
3254 return true;
3255}
3256
3257bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3259 unsigned Opc;
3260 unsigned Size = MI.getOperand(3).getImm();
3261
3262 // The struct intrinsic variants add one additional operand over raw.
3263 const bool HasVIndex = MI.getNumOperands() == 9;
3264 Register VIndex;
3265 int OpOffset = 0;
3266 if (HasVIndex) {
3267 VIndex = MI.getOperand(4).getReg();
3268 OpOffset = 1;
3269 }
3270
3271 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3272  std::optional<ValueAndVReg> MaybeVOffset =
3273      getIConstantVRegValWithLookThrough(VOffset, *MRI);
3274 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3275
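  // The MUBUF-to-LDS opcode below is determined by the load size and by which
  // of vindex/voffset are present: BOTHEN (both), IDXEN (vindex only),
  // OFFEN (voffset only) or OFFSET (neither).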
3276 switch (Size) {
3277 default:
3278 return false;
3279 case 1:
3280 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3281 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3282 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3283 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3284 break;
3285 case 2:
3286 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3287 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3288 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3289 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3290 break;
3291 case 4:
3292 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3293 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3294 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3295 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3296 break;
3297 case 12:
3298 if (!Subtarget->hasLDSLoadB96_B128())
3299 return false;
3300
3301 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3302 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3303 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3304 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3305 break;
3306 case 16:
3307 if (!Subtarget->hasLDSLoadB96_B128())
3308 return false;
3309
3310 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3311 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3312 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3313 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3314 break;
3315 }
3316
3317 MachineBasicBlock *MBB = MI.getParent();
3318 const DebugLoc &DL = MI.getDebugLoc();
3319 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3320 .add(MI.getOperand(2));
3321
3322 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3323
3324 if (HasVIndex && HasVOffset) {
3325 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3326 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3327 .addReg(VIndex)
3328 .addImm(AMDGPU::sub0)
3329 .addReg(VOffset)
3330 .addImm(AMDGPU::sub1);
3331
3332 MIB.addReg(IdxReg);
3333 } else if (HasVIndex) {
3334 MIB.addReg(VIndex);
3335 } else if (HasVOffset) {
3336 MIB.addReg(VOffset);
3337 }
3338
3339 MIB.add(MI.getOperand(1)); // rsrc
3340 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3341 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3342 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3343 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3344 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3345 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3346 MIB.addImm(
3347 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3348 ? 1
3349 : 0); // swz
3350
3351 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3352 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3353 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3354 MachinePointerInfo StorePtrI = LoadPtrI;
3355 StorePtrI.V = nullptr;
3356  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3357
3358  auto F = LoadMMO->getFlags() &
3359           ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3360 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3361 Size, LoadMMO->getBaseAlign());
3362
3363  MachineMemOperand *StoreMMO =
3364      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3365 sizeof(int32_t), LoadMMO->getBaseAlign());
3366
3367 MIB.setMemRefs({LoadMMO, StoreMMO});
3368
3369 MI.eraseFromParent();
3370 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3371}
3372
3373/// Match a zero extend from a 32-bit value to 64-bits.
3374static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3375 Register ZExtSrc;
3376 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3377 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3378
3379 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3380 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3381 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3382 return Register();
3383
3384 assert(Def->getNumOperands() == 3 &&
3385 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3386 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3387 return Def->getOperand(1).getReg();
3388 }
3389
3390 return Register();
3391}
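// For example, both of these pre-selection forms yield %x:
//   %zext:_(s64) = G_ZEXT %x:_(s32)
//   %zext:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32)   ; where %zero == 0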
3392
3393bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3394 unsigned Opc;
3395 unsigned Size = MI.getOperand(3).getImm();
3396
3397 switch (Size) {
3398 default:
3399 return false;
3400 case 1:
3401 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3402 break;
3403 case 2:
3404 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3405 break;
3406 case 4:
3407 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3408 break;
3409 case 12:
3410 if (!Subtarget->hasLDSLoadB96_B128())
3411 return false;
3412 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3413 break;
3414 case 16:
3415 if (!Subtarget->hasLDSLoadB96_B128())
3416 return false;
3417 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3418 break;
3419 }
3420
3421 MachineBasicBlock *MBB = MI.getParent();
3422 const DebugLoc &DL = MI.getDebugLoc();
3423 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3424 .add(MI.getOperand(2));
3425
3426 Register Addr = MI.getOperand(1).getReg();
3427 Register VOffset;
3428 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3429 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3430 if (!isSGPR(Addr)) {
3431 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3432 if (isSGPR(AddrDef->Reg)) {
3433 Addr = AddrDef->Reg;
3434 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3435 Register SAddr =
3436 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3437 if (isSGPR(SAddr)) {
3438 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3439 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3440 Addr = SAddr;
3441 VOffset = Off;
3442 }
3443 }
3444 }
3445 }
3446
3447 if (isSGPR(Addr)) {
3448 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3449 if (!VOffset) {
3450 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3451 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3452 .addImm(0);
3453 }
3454 }
3455
3456 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3457 .addReg(Addr);
3458
3459 if (isSGPR(Addr))
3460 MIB.addReg(VOffset);
3461
3462 MIB.add(MI.getOperand(4)) // offset
3463 .add(MI.getOperand(5)); // cpol
3464
3465 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3466 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3467 LoadPtrI.Offset = MI.getOperand(4).getImm();
3468  MachinePointerInfo StorePtrI = LoadPtrI;
3469  LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3470  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3471  auto F = LoadMMO->getFlags() &
3472           ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3473  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3474 Size, LoadMMO->getBaseAlign());
3475  MachineMemOperand *StoreMMO =
3476      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3477 sizeof(int32_t), Align(4));
3478
3479 MIB.setMemRefs({LoadMMO, StoreMMO});
3480
3481 MI.eraseFromParent();
3482 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3483}
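// If a uniform (SGPR) base was found above, the opcode is switched to its
// SADDR variant; when no zero-extended 32-bit VGPR offset could be split out,
// a zero VGPR is materialized with V_MOV_B32 to fill the voffset operand.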
3484
3485bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3486 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3487 MI.removeOperand(1);
3488 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3489 return true;
3490}
3491
3492// FIXME: This should be removed and let the patterns select. We just need the
3493// AGPR/VGPR combination versions.
3494bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3495 unsigned Opc;
3496 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3497 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3498 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3499 break;
3500 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3501 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3502 break;
3503 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3504 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3505 break;
3506 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3507 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3508 break;
3509 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3510 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3511 break;
3512 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3513 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3514 break;
3515 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3516 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3517 break;
3518 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3519 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3520 break;
3521 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3522 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3523 break;
3524 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3525 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3526 break;
3527 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3528 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3529 break;
3530 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3531 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3532 break;
3533 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3534 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3535 break;
3536 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3537 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3538 break;
3539 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3540 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3541 break;
3542 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3543 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3544 break;
3545 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3546 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3547 break;
3548 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3549 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3550 break;
3551 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3552 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3553 break;
3554 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3555 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3556 break;
3557 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3558 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3559 break;
3560 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3561 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3562 break;
3563 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3564 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3565 break;
3566 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3567 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3568 break;
3569 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3570 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3571 break;
3572 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3573 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3574 break;
3575 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3576 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3577 break;
3578 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3579 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3580 break;
3581 default:
3582 llvm_unreachable("unhandled smfmac intrinsic");
3583 }
3584
3585 auto VDst_In = MI.getOperand(4);
3586
3587 MI.setDesc(TII.get(Opc));
3588 MI.removeOperand(4); // VDst_In
3589 MI.removeOperand(1); // Intrinsic ID
3590 MI.addOperand(VDst_In); // Readd VDst_In to the end
3591 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3592 return true;
3593}
3594
3595bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3596 MachineInstr &MI, Intrinsic::ID IntrID) const {
3597 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3598 !Subtarget->hasPermlane16Swap())
3599 return false;
3600 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3601 !Subtarget->hasPermlane32Swap())
3602 return false;
3603
3604 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3605 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3606 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3607
3608 MI.removeOperand(2);
3609 MI.setDesc(TII.get(Opcode));
3610 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3611
3612  MachineOperand &FI = MI.getOperand(4);
3613  FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
3614
3615 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3616}
3617
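// The wave address is the scratch value scaled down to wave granularity: the
// conversion below is a logical shift right by log2(wavefront size), i.e. by 5
// for wave32 or 6 for wave64, emitted as a VALU or SALU shift depending on the
// destination register bank.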
3618bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3619 Register DstReg = MI.getOperand(0).getReg();
3620 Register SrcReg = MI.getOperand(1).getReg();
3621 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3622 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3623 MachineBasicBlock *MBB = MI.getParent();
3624 const DebugLoc &DL = MI.getDebugLoc();
3625
3626 if (IsVALU) {
3627 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3628 .addImm(Subtarget->getWavefrontSizeLog2())
3629 .addReg(SrcReg);
3630 } else {
3631 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3632 .addReg(SrcReg)
3633 .addImm(Subtarget->getWavefrontSizeLog2())
3634 .setOperandDead(3); // Dead scc
3635 }
3636
3637 const TargetRegisterClass &RC =
3638 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3639 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3640 return false;
3641
3642 MI.eraseFromParent();
3643 return true;
3644}
3645
3646// Match a BITOP3 operation and return the number of matched instructions plus
3647// the truth table.
3648static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3649                                              SmallVectorImpl<Register> &Src,
3650 const MachineRegisterInfo &MRI) {
3651 unsigned NumOpcodes = 0;
3652 uint8_t LHSBits, RHSBits;
3653
3654 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3655 // Define truth table given Src0, Src1, Src2 bits permutations:
3656 // 0 0 0
3657 // 0 0 1
3658 // 0 1 0
3659 // 0 1 1
3660 // 1 0 0
3661 // 1 0 1
3662 // 1 1 0
3663 // 1 1 1
3664 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3665
3666 if (mi_match(Op, MRI, m_AllOnesInt())) {
3667 Bits = 0xff;
3668 return true;
3669 }
3670 if (mi_match(Op, MRI, m_ZeroInt())) {
3671 Bits = 0;
3672 return true;
3673 }
3674
3675 for (unsigned I = 0; I < Src.size(); ++I) {
3676 // Try to find existing reused operand
3677 if (Src[I] == Op) {
3678 Bits = SrcBits[I];
3679 return true;
3680 }
3681 // Try to replace parent operator
3682 if (Src[I] == R) {
3683 Bits = SrcBits[I];
3684 Src[I] = Op;
3685 return true;
3686 }
3687 }
3688
3689 if (Src.size() == 3) {
3690 // No room left for operands. Try one last time, there can be a 'not' of
3691 // one of our source operands. In this case we can compute the bits
3692 // without growing Src vector.
3693 Register LHS;
3694      if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3695        LHS = getSrcRegIgnoringCopies(LHS, MRI);
3696 for (unsigned I = 0; I < Src.size(); ++I) {
3697 if (Src[I] == LHS) {
3698 Bits = ~SrcBits[I];
3699 return true;
3700 }
3701 }
3702 }
3703
3704 return false;
3705 }
3706
3707 Bits = SrcBits[Src.size()];
3708 Src.push_back(Op);
3709 return true;
3710 };
3711
3712 MachineInstr *MI = MRI.getVRegDef(R);
3713 switch (MI->getOpcode()) {
3714 case TargetOpcode::G_AND:
3715 case TargetOpcode::G_OR:
3716 case TargetOpcode::G_XOR: {
3717 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3718 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3719
3720 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3721 if (!getOperandBits(LHS, LHSBits) ||
3722 !getOperandBits(RHS, RHSBits)) {
3723 Src = Backup;
3724 return std::make_pair(0, 0);
3725 }
3726
3727 // Recursion is naturally limited by the size of the operand vector.
3728 auto Op = BitOp3_Op(LHS, Src, MRI);
3729 if (Op.first) {
3730 NumOpcodes += Op.first;
3731 LHSBits = Op.second;
3732 }
3733
3734 Op = BitOp3_Op(RHS, Src, MRI);
3735 if (Op.first) {
3736 NumOpcodes += Op.first;
3737 RHSBits = Op.second;
3738 }
3739 break;
3740 }
3741 default:
3742 return std::make_pair(0, 0);
3743 }
3744
3745 uint8_t TTbl;
3746 switch (MI->getOpcode()) {
3747 case TargetOpcode::G_AND:
3748 TTbl = LHSBits & RHSBits;
3749 break;
3750 case TargetOpcode::G_OR:
3751 TTbl = LHSBits | RHSBits;
3752 break;
3753 case TargetOpcode::G_XOR:
3754 TTbl = LHSBits ^ RHSBits;
3755 break;
3756 default:
3757 break;
3758 }
3759
3760 return std::make_pair(NumOpcodes + 1, TTbl);
3761}
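// The three possible source operands are assigned the canonical bit patterns
// 0xf0, 0xcc and 0xaa in the order they are added to Src, and the truth table
// is obtained by applying the matched AND/OR/XOR operations to those patterns.
// For example, if a, b and c end up in slots 0, 1 and 2, the table for
// ((a & b) | c) is (0xf0 & 0xcc) | 0xaa = 0xea with NumOpcodes == 2.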
3762
3763bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
3764 if (!Subtarget->hasBitOp3Insts())
3765 return false;
3766
3767 Register DstReg = MI.getOperand(0).getReg();
3768 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3769 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3770 if (!IsVALU)
3771 return false;
3772
3773  SmallVector<Register, 3> Src;
3774 uint8_t TTbl;
3775 unsigned NumOpcodes;
3776
3777 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
3778
3779  // The Src.empty() case can happen if all operands are constant zeros or ones.
3780  // Normally this is optimized out before reaching this point.
3781 if (NumOpcodes < 2 || Src.empty())
3782 return false;
3783
3784 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
3785 if (NumOpcodes == 2 && IsB32) {
3786    // Avoid using BITOP3 for OR3, XOR3, AND_OR. It is not any faster, but it
3787    // makes the asm more readable. This cannot be modeled with AddedComplexity
3788    // because the selector does not know how many operations we matched.
3789 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
3790 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
3791 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
3792 return false;
3793 } else if (NumOpcodes < 4) {
3794    // For the uniform case the threshold should be higher to account for moves
3795    // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two can
3796    // be in SGPRs, with a readfirstlane afterwards.
3797 return false;
3798 }
3799
3800 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
3801 unsigned CBL = STI.getConstantBusLimit(Opc);
3802 MachineBasicBlock *MBB = MI.getParent();
3803 const DebugLoc &DL = MI.getDebugLoc();
3804
3805 for (unsigned I = 0; I < Src.size(); ++I) {
3806 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
3807 if (RB->getID() != AMDGPU::SGPRRegBankID)
3808 continue;
3809 if (CBL > 0) {
3810 --CBL;
3811 continue;
3812 }
3813 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3814 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
3815 .addReg(Src[I]);
3816 Src[I] = NewReg;
3817 }
3818
3819  // The last operand can be ignored, turning a ternary operation into a binary
3820  // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3821  // 'c' with 'a' here without changing the answer. In some pathological cases
3822  // it is even possible to end up with a single-operand operation if the
3823  // optimizer does not catch it first.
3824 while (Src.size() < 3)
3825 Src.push_back(Src[0]);
3826
3827 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
3828 if (!IsB32)
3829 MIB.addImm(0); // src_mod0
3830 MIB.addReg(Src[0]);
3831 if (!IsB32)
3832 MIB.addImm(0); // src_mod1
3833 MIB.addReg(Src[1]);
3834 if (!IsB32)
3835 MIB.addImm(0); // src_mod2
3836 MIB.addReg(Src[2])
3837 .addImm(TTbl);
3838 if (!IsB32)
3839 MIB.addImm(0); // op_sel
3840
3841 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3842 MI.eraseFromParent();
3843
3844 return true;
3845}
3846
3847bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3848 Register SrcReg = MI.getOperand(0).getReg();
3849 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3850 return false;
3851
3852 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3853  Register SP =
3854      Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3855 Register WaveAddr = getWaveAddress(DefMI);
3856 MachineBasicBlock *MBB = MI.getParent();
3857 const DebugLoc &DL = MI.getDebugLoc();
3858
3859 if (!WaveAddr) {
3860 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3861 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3862 .addReg(SrcReg)
3863 .addImm(Subtarget->getWavefrontSizeLog2())
3864 .setOperandDead(3); // Dead scc
3865 }
3866
3867 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3868 .addReg(WaveAddr);
3869
3870 MI.eraseFromParent();
3871 return true;
3872}
3873
3874bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3875
3876 if (!I.isPreISelOpcode()) {
3877 if (I.isCopy())
3878 return selectCOPY(I);
3879 return true;
3880 }
3881
3882 switch (I.getOpcode()) {
3883 case TargetOpcode::G_AND:
3884 case TargetOpcode::G_OR:
3885 case TargetOpcode::G_XOR:
3886 if (selectBITOP3(I))
3887 return true;
3888 if (selectImpl(I, *CoverageInfo))
3889 return true;
3890 return selectG_AND_OR_XOR(I);
3891 case TargetOpcode::G_ADD:
3892 case TargetOpcode::G_SUB:
3893 case TargetOpcode::G_PTR_ADD:
3894 if (selectImpl(I, *CoverageInfo))
3895 return true;
3896 return selectG_ADD_SUB(I);
3897 case TargetOpcode::G_UADDO:
3898 case TargetOpcode::G_USUBO:
3899 case TargetOpcode::G_UADDE:
3900 case TargetOpcode::G_USUBE:
3901 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3902 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3903 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3904 return selectG_AMDGPU_MAD_64_32(I);
3905 case TargetOpcode::G_INTTOPTR:
3906 case TargetOpcode::G_BITCAST:
3907 case TargetOpcode::G_PTRTOINT:
3908 case TargetOpcode::G_FREEZE:
3909 return selectCOPY(I);
3910 case TargetOpcode::G_FNEG:
3911 if (selectImpl(I, *CoverageInfo))
3912 return true;
3913 return selectG_FNEG(I);
3914 case TargetOpcode::G_FABS:
3915 if (selectImpl(I, *CoverageInfo))
3916 return true;
3917 return selectG_FABS(I);
3918 case TargetOpcode::G_EXTRACT:
3919 return selectG_EXTRACT(I);
3920 case TargetOpcode::G_MERGE_VALUES:
3921 case TargetOpcode::G_CONCAT_VECTORS:
3922 return selectG_MERGE_VALUES(I);
3923 case TargetOpcode::G_UNMERGE_VALUES:
3924 return selectG_UNMERGE_VALUES(I);
3925 case TargetOpcode::G_BUILD_VECTOR:
3926 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3927 return selectG_BUILD_VECTOR(I);
3928 case TargetOpcode::G_IMPLICIT_DEF:
3929 return selectG_IMPLICIT_DEF(I);
3930 case TargetOpcode::G_INSERT:
3931 return selectG_INSERT(I);
3932 case TargetOpcode::G_INTRINSIC:
3933 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3934 return selectG_INTRINSIC(I);
3935 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3936 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3937 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3938 case TargetOpcode::G_ICMP:
3939 case TargetOpcode::G_FCMP:
3940 if (selectG_ICMP_or_FCMP(I))
3941 return true;
3942 return selectImpl(I, *CoverageInfo);
3943 case TargetOpcode::G_LOAD:
3944 case TargetOpcode::G_ZEXTLOAD:
3945 case TargetOpcode::G_SEXTLOAD:
3946 case TargetOpcode::G_STORE:
3947 case TargetOpcode::G_ATOMIC_CMPXCHG:
3948 case TargetOpcode::G_ATOMICRMW_XCHG:
3949 case TargetOpcode::G_ATOMICRMW_ADD:
3950 case TargetOpcode::G_ATOMICRMW_SUB:
3951 case TargetOpcode::G_ATOMICRMW_AND:
3952 case TargetOpcode::G_ATOMICRMW_OR:
3953 case TargetOpcode::G_ATOMICRMW_XOR:
3954 case TargetOpcode::G_ATOMICRMW_MIN:
3955 case TargetOpcode::G_ATOMICRMW_MAX:
3956 case TargetOpcode::G_ATOMICRMW_UMIN:
3957 case TargetOpcode::G_ATOMICRMW_UMAX:
3958 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3959 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3960 case TargetOpcode::G_ATOMICRMW_FADD:
3961 case TargetOpcode::G_ATOMICRMW_FMIN:
3962 case TargetOpcode::G_ATOMICRMW_FMAX:
3963 return selectG_LOAD_STORE_ATOMICRMW(I);
3964 case TargetOpcode::G_SELECT:
3965 return selectG_SELECT(I);
3966 case TargetOpcode::G_TRUNC:
3967 return selectG_TRUNC(I);
3968 case TargetOpcode::G_SEXT:
3969 case TargetOpcode::G_ZEXT:
3970 case TargetOpcode::G_ANYEXT:
3971 case TargetOpcode::G_SEXT_INREG:
3972    // This is a workaround. For extension from type i1, `selectImpl()` uses
3973    // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY,
3974    // since type i1 can only be held in an SGPR class.
3975 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3976 selectImpl(I, *CoverageInfo))
3977 return true;
3978 return selectG_SZA_EXT(I);
3979 case TargetOpcode::G_FPEXT:
3980 if (selectG_FPEXT(I))
3981 return true;
3982 return selectImpl(I, *CoverageInfo);
3983 case TargetOpcode::G_BRCOND:
3984 return selectG_BRCOND(I);
3985 case TargetOpcode::G_GLOBAL_VALUE:
3986 return selectG_GLOBAL_VALUE(I);
3987 case TargetOpcode::G_PTRMASK:
3988 return selectG_PTRMASK(I);
3989 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3990 return selectG_EXTRACT_VECTOR_ELT(I);
3991 case TargetOpcode::G_INSERT_VECTOR_ELT:
3992 return selectG_INSERT_VECTOR_ELT(I);
3993 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3994 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3995 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3996 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3997  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3998    const AMDGPU::ImageDimIntrinsicInfo *Intr =
3999        AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4000 assert(Intr && "not an image intrinsic with image pseudo");
4001 return selectImageIntrinsic(I, Intr);
4002 }
4003 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
4004 return selectBVHIntrinsic(I);
4005 case AMDGPU::G_SBFX:
4006 case AMDGPU::G_UBFX:
4007 return selectG_SBFX_UBFX(I);
4008 case AMDGPU::G_SI_CALL:
4009 I.setDesc(TII.get(AMDGPU::SI_CALL));
4010 return true;
4011 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4012 return selectWaveAddress(I);
4013 case AMDGPU::G_STACKRESTORE:
4014 return selectStackRestore(I);
4015 case AMDGPU::G_PHI:
4016 return selectPHI(I);
4017 case TargetOpcode::G_CONSTANT:
4018 case TargetOpcode::G_FCONSTANT:
4019 default:
4020 return selectImpl(I, *CoverageInfo);
4021 }
4022 return false;
4023}
4024
4025InstructionSelector::ComplexRendererFns
4026AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4027 return {{
4028 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4029 }};
4030
4031}
4032
4033std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4034 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4035 unsigned Mods = 0;
4036 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4037
4038 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4039 Src = MI->getOperand(1).getReg();
4040 Mods |= SISrcMods::NEG;
4041 MI = getDefIgnoringCopies(Src, *MRI);
4042 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4043 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4044 // denormal mode, but we're implicitly canonicalizing in a source operand.
4045 const ConstantFP *LHS =
4046 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4047 if (LHS && LHS->isZero()) {
4048 Mods |= SISrcMods::NEG;
4049 Src = MI->getOperand(2).getReg();
4050 }
4051 }
4052
4053 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4054 Src = MI->getOperand(1).getReg();
4055 Mods |= SISrcMods::ABS;
4056 }
4057
4058 if (OpSel)
4059 Mods |= SISrcMods::OP_SEL_0;
4060
4061 return std::pair(Src, Mods);
4062}
4063
4064Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4065 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4066 bool ForceVGPR) const {
4067 if ((Mods != 0 || ForceVGPR) &&
4068 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4069
4070 // If we looked through copies to find source modifiers on an SGPR operand,
4071 // we now have an SGPR register source. To avoid potentially violating the
4072 // constant bus restriction, we need to insert a copy to a VGPR.
4073 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4074 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4075 TII.get(AMDGPU::COPY), VGPRSrc)
4076 .addReg(Src);
4077 Src = VGPRSrc;
4078 }
4079
4080 return Src;
4081}
4082
4083///
4084/// This will select either an SGPR or VGPR operand and will save us from
4085/// having to write an extra tablegen pattern.
4086InstructionSelector::ComplexRendererFns
4087AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4088 return {{
4089 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4090 }};
4091}
4092
4093InstructionSelector::ComplexRendererFns
4094AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4095 Register Src;
4096 unsigned Mods;
4097 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4098
4099 return {{
4100 [=](MachineInstrBuilder &MIB) {
4101 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4102 },
4103 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4104 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4105 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4106 }};
4107}
4108
4109InstructionSelector::ComplexRendererFns
4110AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4111 Register Src;
4112 unsigned Mods;
4113 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4114 /*IsCanonicalizing=*/true,
4115 /*AllowAbs=*/false);
4116
4117 return {{
4118 [=](MachineInstrBuilder &MIB) {
4119 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4120 },
4121 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4122 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4123 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4124 }};
4125}
4126
4127InstructionSelector::ComplexRendererFns
4128AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4129 return {{
4130 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4131 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4132 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4133 }};
4134}
4135
4136InstructionSelector::ComplexRendererFns
4137AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4138 Register Src;
4139 unsigned Mods;
4140 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4141
4142 return {{
4143 [=](MachineInstrBuilder &MIB) {
4144 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4145 },
4146 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4147 }};
4148}
4149
4150InstructionSelector::ComplexRendererFns
4151AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4152 MachineOperand &Root) const {
4153 Register Src;
4154 unsigned Mods;
4155 std::tie(Src, Mods) =
4156 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4157
4158 return {{
4159 [=](MachineInstrBuilder &MIB) {
4160 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4161 },
4162 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4163 }};
4164}
4165
4166InstructionSelector::ComplexRendererFns
4167AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4168 Register Src;
4169 unsigned Mods;
4170 std::tie(Src, Mods) =
4171 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4172 /*AllowAbs=*/false);
4173
4174 return {{
4175 [=](MachineInstrBuilder &MIB) {
4176 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4177 },
4178 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4179 }};
4180}
4181
4182InstructionSelector::ComplexRendererFns
4183AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4184 Register Reg = Root.getReg();
4185 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4186 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4187 return {};
4188 return {{
4189 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4190 }};
4191}
4192
4193std::pair<Register, unsigned>
4194AMDGPUInstructionSelector::selectVOP3PModsImpl(
4195 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
4196 unsigned Mods = 0;
4197 MachineInstr *MI = MRI.getVRegDef(Src);
4198
4199 if (MI->getOpcode() == AMDGPU::G_FNEG &&
4200 // It's possible to see an f32 fneg here, but unlikely.
4201 // TODO: Treat f32 fneg as only high bit.
4202 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
4203    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4204    Src = MI->getOperand(1).getReg();
4205 MI = MRI.getVRegDef(Src);
4206 }
4207
4208 // TODO: Handle G_FSUB 0 as fneg
4209
4210 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
4211 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
4212
4213 // Packed instructions do not have abs modifiers.
4214 Mods |= SISrcMods::OP_SEL_1;
4215
4216 return std::pair(Src, Mods);
4217}
4218
4219InstructionSelector::ComplexRendererFns
4220AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
4221  MachineRegisterInfo &MRI
4222      = Root.getParent()->getParent()->getParent()->getRegInfo();
4223
4224 Register Src;
4225 unsigned Mods;
4226 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
4227
4228 return {{
4229 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4230 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4231 }};
4232}
4233
4234InstructionSelector::ComplexRendererFns
4235AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
4236  MachineRegisterInfo &MRI
4237      = Root.getParent()->getParent()->getParent()->getRegInfo();
4238
4239 Register Src;
4240 unsigned Mods;
4241 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
4242
4243 return {{
4244 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4245 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4246 }};
4247}
4248
4249InstructionSelector::ComplexRendererFns
4250AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
4251 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
4252 // Value is in Imm operand as i1 sign extended to int64_t.
4253 // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
4254 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4255 "expected i1 value");
4256 unsigned Mods = SISrcMods::OP_SEL_1;
4257 if (Root.getImm() == -1)
4258 Mods ^= SISrcMods::NEG;
4259 return {{
4260 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4261 }};
4262}
4263
4264InstructionSelector::ComplexRendererFns
4265AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4266 MachineOperand &Root) const {
4267 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4268 "expected i1 value");
4269 unsigned Mods = SISrcMods::OP_SEL_1;
4270 if (Root.getImm() != 0)
4271 Mods |= SISrcMods::OP_SEL_0;
4272
4273 return {{
4274 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4275 }};
4276}
4277
4278static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
4279                                 MachineInstr *InsertPt,
4280                                 MachineRegisterInfo &MRI) {
4281 const TargetRegisterClass *DstRegClass;
4282 switch (Elts.size()) {
4283 case 8:
4284 DstRegClass = &AMDGPU::VReg_256RegClass;
4285 break;
4286 case 4:
4287 DstRegClass = &AMDGPU::VReg_128RegClass;
4288 break;
4289 case 2:
4290 DstRegClass = &AMDGPU::VReg_64RegClass;
4291 break;
4292 default:
4293 llvm_unreachable("unhandled Reg sequence size");
4294 }
4295
4296 MachineIRBuilder B(*InsertPt);
4297 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
4298 .addDef(MRI.createVirtualRegister(DstRegClass));
4299 for (unsigned i = 0; i < Elts.size(); ++i) {
4300    MIB.addReg(Elts[i]);
4301    MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
4302 }
4303 return MIB->getOperand(0).getReg();
4304}
4305
4306static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
4307                                 SmallVectorImpl<Register> &Elts, Register &Src,
4308                                 MachineInstr *InsertPt,
4309                                 MachineRegisterInfo &MRI) {
4310 if (ModOpcode == TargetOpcode::G_FNEG) {
4311 Mods |= SISrcMods::NEG;
4312 // Check if all elements also have abs modifier
4313 SmallVector<Register, 8> NegAbsElts;
4314 for (auto El : Elts) {
4315 Register FabsSrc;
4316 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
4317 break;
4318 NegAbsElts.push_back(FabsSrc);
4319 }
4320 if (Elts.size() != NegAbsElts.size()) {
4321 // Neg
4322 Src = buildRegSequence(Elts, InsertPt, MRI);
4323 } else {
4324 // Neg and Abs
4325 Mods |= SISrcMods::NEG_HI;
4326 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
4327 }
4328 } else {
4329 assert(ModOpcode == TargetOpcode::G_FABS);
4330 // Abs
4331 Mods |= SISrcMods::NEG_HI;
4332 Src = buildRegSequence(Elts, InsertPt, MRI);
4333 }
4334}
4335
4336InstructionSelector::ComplexRendererFns
4337AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
4338 Register Src = Root.getReg();
4339 unsigned Mods = SISrcMods::OP_SEL_1;
4340  SmallVector<Register, 8> EltsF32;
4341
4342 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
4343 assert(BV->getNumSources() > 0);
4344 // Based on first element decide which mod we match, neg or abs
4345 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
4346 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
4347 ? AMDGPU::G_FNEG
4348 : AMDGPU::G_FABS;
4349 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
4350 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
4351 if (ElF32->getOpcode() != ModOpcode)
4352 break;
4353 EltsF32.push_back(ElF32->getOperand(1).getReg());
4354 }
4355
4356 // All elements had ModOpcode modifier
4357 if (BV->getNumSources() == EltsF32.size()) {
4358 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
4359 *MRI);
4360 }
4361 }
4362
4363 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4364 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4365}
4366
4367InstructionSelector::ComplexRendererFns
4368AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
4369 Register Src = Root.getReg();
4370 unsigned Mods = SISrcMods::OP_SEL_1;
4371 SmallVector<Register, 8> EltsV2F16;
4372
4373 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4374 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4375 Register FNegSrc;
4376 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4377 break;
4378 EltsV2F16.push_back(FNegSrc);
4379 }
4380
4381 // All elements had ModOpcode modifier
4382 if (CV->getNumSources() == EltsV2F16.size()) {
4383 Mods |= SISrcMods::NEG;
4384 Mods |= SISrcMods::NEG_HI;
4385 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4386 }
4387 }
4388
4389 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4390 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4391}
4392
4393InstructionSelector::ComplexRendererFns
4394AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4395 Register Src = Root.getReg();
4396 unsigned Mods = SISrcMods::OP_SEL_1;
4397 SmallVector<Register, 8> EltsV2F16;
4398
4399 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4400 assert(CV->getNumSources() > 0);
4401 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4402 // Based on first element decide which mod we match, neg or abs
4403 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4404 ? AMDGPU::G_FNEG
4405 : AMDGPU::G_FABS;
4406
4407 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4408 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4409 if (ElV2F16->getOpcode() != ModOpcode)
4410 break;
4411 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4412 }
4413
4414 // All elements had ModOpcode modifier
4415 if (CV->getNumSources() == EltsV2F16.size()) {
4416 MachineIRBuilder B(*Root.getParent());
4417 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4418 *MRI);
4419 }
4420 }
4421
4422 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4423 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4424}
4425
4426InstructionSelector::ComplexRendererFns
4427AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4428 std::optional<FPValueAndVReg> FPValReg;
4429 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4430 if (TII.isInlineConstant(FPValReg->Value)) {
4431 return {{[=](MachineInstrBuilder &MIB) {
4432 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4433 }}};
4434 }
4435 // Non-inlineable splat floats should not fall-through for integer immediate
4436 // checks.
4437 return {};
4438 }
4439
4440 APInt ICst;
4441 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4442 if (TII.isInlineConstant(ICst)) {
4443 return {
4444 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4445 }
4446 }
4447
4448 return {};
4449}
4450
4451InstructionSelector::ComplexRendererFns
4452AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4453 Register Src =
4454 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4455 unsigned Key = 0;
4456
4457 Register ShiftSrc;
4458 std::optional<ValueAndVReg> ShiftAmt;
4459 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4460 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4461 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4462 Key = ShiftAmt->Value.getZExtValue() / 8;
4463 Src = ShiftSrc;
4464 }
4465
4466 return {{
4467 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4468 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4469 }};
4470}
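// For example, matching %src = G_LSHR %x, 24 with a 32-bit %x selects %x as
// the operand with index_key 3; if the shift amount is not a multiple of 8,
// the original source is kept with index_key 0.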
4471
4472InstructionSelector::ComplexRendererFns
4473AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4474
4475 Register Src =
4476 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4477 unsigned Key = 0;
4478
4479 Register ShiftSrc;
4480 std::optional<ValueAndVReg> ShiftAmt;
4481 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4482 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4483 ShiftAmt->Value.getZExtValue() == 16) {
4484 Src = ShiftSrc;
4485 Key = 1;
4486 }
4487
4488 return {{
4489 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4490 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4491 }};
4492}
4493
4494InstructionSelector::ComplexRendererFns
4495AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4496 Register Src;
4497 unsigned Mods;
4498 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4499
4500 // FIXME: Handle op_sel
4501 return {{
4502 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4503 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4504 }};
4505}
4506
4507InstructionSelector::ComplexRendererFns
4508AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4509 Register Src;
4510 unsigned Mods;
4511 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4512 /*IsCanonicalizing=*/true,
4513 /*AllowAbs=*/false,
4514 /*OpSel=*/false);
4515
4516 return {{
4517 [=](MachineInstrBuilder &MIB) {
4518 MIB.addReg(
4519 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4520 },
4521 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4522 }};
4523}
4524
4525InstructionSelector::ComplexRendererFns
4526AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4527 Register Src;
4528 unsigned Mods;
4529 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4530 /*IsCanonicalizing=*/true,
4531 /*AllowAbs=*/false,
4532 /*OpSel=*/true);
4533
4534 return {{
4535 [=](MachineInstrBuilder &MIB) {
4536 MIB.addReg(
4537 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4538 },
4539 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4540 }};
4541}
4542
4543bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4544 Register &Base,
4545 Register *SOffset,
4546 int64_t *Offset) const {
4547 MachineInstr *MI = Root.getParent();
4548 MachineBasicBlock *MBB = MI->getParent();
4549
4550 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4551 // then we can select all ptr + 32-bit offsets.
4552 SmallVector<GEPInfo, 4> AddrInfo;
4553 getAddrModeInfo(*MI, *MRI, AddrInfo);
4554
4555 if (AddrInfo.empty())
4556 return false;
4557
4558 const GEPInfo &GEPI = AddrInfo[0];
4559 std::optional<int64_t> EncodedImm;
4560
4561 if (SOffset && Offset) {
4562 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4563 /*HasSOffset=*/true);
4564 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4565 AddrInfo.size() > 1) {
4566 const GEPInfo &GEPI2 = AddrInfo[1];
4567 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4568 if (Register OffsetReg =
4569 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4570 Base = GEPI2.SgprParts[0];
4571 *SOffset = OffsetReg;
4572 *Offset = *EncodedImm;
4573 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4574 return true;
4575
4576          // For unbuffered smem loads, it is illegal for the Immediate Offset
4577          // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4578          // is negative. Handle the case where the Immediate Offset + SOffset
4579          // is negative.
4580 auto SKnown = KB->getKnownBits(*SOffset);
4581 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4582 return false;
4583
4584 return true;
4585 }
4586 }
4587 }
4588 return false;
4589 }
4590
4591 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4592 /*HasSOffset=*/false);
4593 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4594 Base = GEPI.SgprParts[0];
4595 *Offset = *EncodedImm;
4596 return true;
4597 }
4598
4599 // SGPR offset is unsigned.
4600 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4601 GEPI.Imm != 0) {
4602    // If we make it this far we have a load with a 32-bit immediate offset.
4603 // It is OK to select this using a sgpr offset, because we have already
4604 // failed trying to select this load into one of the _IMM variants since
4605 // the _IMM Patterns are considered before the _SGPR patterns.
4606 Base = GEPI.SgprParts[0];
4607 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4608 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4609 .addImm(GEPI.Imm);
4610 return true;
4611 }
4612
4613 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4614 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4615 Base = GEPI.SgprParts[0];
4616 *SOffset = OffsetReg;
4617 return true;
4618 }
4619 }
4620
4621 return false;
4622}
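// To summarize the cases above: a single SGPR base with an encodable immediate
// selects the _IMM form; a base with a 32-bit offset that does not encode
// selects the _SGPR form, materializing the offset with S_MOV_B32; and a base
// plus a zero-extended 32-bit SGPR offset (optionally with an immediate)
// selects the _SGPR_IMM form.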
4623
4624InstructionSelector::ComplexRendererFns
4625AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4626 Register Base;
4627 int64_t Offset;
4628 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4629 return std::nullopt;
4630
4631 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4632 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4633}
4634
4635InstructionSelector::ComplexRendererFns
4636AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4637 SmallVector<GEPInfo, 4> AddrInfo;
4638 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4639
4640 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4641 return std::nullopt;
4642
4643 const GEPInfo &GEPInfo = AddrInfo[0];
4644 Register PtrReg = GEPInfo.SgprParts[0];
4645 std::optional<int64_t> EncodedImm =
4646 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4647 if (!EncodedImm)
4648 return std::nullopt;
4649
4650 return {{
4651 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4652 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4653 }};
4654}
4655
4656InstructionSelector::ComplexRendererFns
4657AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4658 Register Base, SOffset;
4659 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4660 return std::nullopt;
4661
4662 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4663 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4664}
4665
4666InstructionSelector::ComplexRendererFns
4667AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4668 Register Base, SOffset;
4669 int64_t Offset;
4670 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4671 return std::nullopt;
4672
4673 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4674 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4675 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4676}
4677
4678std::pair<Register, int>
4679AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4680 uint64_t FlatVariant) const {
4681 MachineInstr *MI = Root.getParent();
4682
4683 auto Default = std::pair(Root.getReg(), 0);
4684
4685 if (!STI.hasFlatInstOffsets())
4686 return Default;
4687
4688 Register PtrBase;
4689 int64_t ConstOffset;
4690 std::tie(PtrBase, ConstOffset) =
4691 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4692
4693 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4694 !isFlatScratchBaseLegal(Root.getReg())))
4695 return Default;
4696
4697 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4698 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4699 return Default;
4700
4701 return std::pair(PtrBase, ConstOffset);
4702}
4703
4704InstructionSelector::ComplexRendererFns
4705AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4706 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4707
4708 return {{
4709 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4710 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4711 }};
4712}
4713
4714InstructionSelector::ComplexRendererFns
4715AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4716 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4717
4718 return {{
4719 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4720 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4721 }};
4722}
4723
4724InstructionSelector::ComplexRendererFns
4725AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4726 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4727
4728 return {{
4729 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4730 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4731 }};
4732}
4733
4734// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4735InstructionSelector::ComplexRendererFns
4736AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4737 Register Addr = Root.getReg();
4738 Register PtrBase;
4739 int64_t ConstOffset;
4740 int64_t ImmOffset = 0;
4741
4742 // Match the immediate offset first, which canonically is moved as low as
4743 // possible.
4744 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4745
4746 if (ConstOffset != 0) {
4747    if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4748                              SIInstrFlags::FlatGlobal)) {
4749 Addr = PtrBase;
4750 ImmOffset = ConstOffset;
4751 } else {
4752 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4753 if (isSGPR(PtrBaseDef->Reg)) {
4754 if (ConstOffset > 0) {
4755 // Offset is too large.
4756 //
4757 // saddr + large_offset -> saddr +
4758 // (voffset = large_offset & ~MaxOffset) +
4759 // (large_offset & MaxOffset);
4760 int64_t SplitImmOffset, RemainderOffset;
4761          std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4762              ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4763
4764 if (isUInt<32>(RemainderOffset)) {
4765 MachineInstr *MI = Root.getParent();
4766 MachineBasicBlock *MBB = MI->getParent();
4767 Register HighBits =
4768 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4769
4770 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4771 HighBits)
4772 .addImm(RemainderOffset);
4773
4774 return {{
4775 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4776 [=](MachineInstrBuilder &MIB) {
4777 MIB.addReg(HighBits);
4778 }, // voffset
4779 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4780 }};
4781 }
4782 }
4783
4784        // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4785        // is 1, we would need 1 or 2 extra moves for each half of the constant,
4786        // so it is better to do a scalar add and then issue a single VALU
4787        // instruction to materialize zero. Otherwise it takes fewer instructions
4788        // to perform VALU adds with immediates or inline literals.
4789 unsigned NumLiterals =
4790 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
4791 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
4792 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4793 return std::nullopt;
4794 }
4795 }
4796 }
4797
4798 // Match the variable offset.
4799 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4800 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4801 // Look through the SGPR->VGPR copy.
4802 Register SAddr =
4803 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4804
4805 if (isSGPR(SAddr)) {
4806 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4807
4808 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4809 // inserted later.
4810 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4811 return {{[=](MachineInstrBuilder &MIB) { // saddr
4812 MIB.addReg(SAddr);
4813 },
4814 [=](MachineInstrBuilder &MIB) { // voffset
4815 MIB.addReg(VOffset);
4816 },
4817 [=](MachineInstrBuilder &MIB) { // offset
4818 MIB.addImm(ImmOffset);
4819 }}};
4820 }
4821 }
4822 }
4823
4824 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4825 // drop this.
4826 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4827 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4828 return std::nullopt;
4829
4830 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4831 // moves required to copy a 64-bit SGPR to VGPR.
4832 MachineInstr *MI = Root.getParent();
4833 MachineBasicBlock *MBB = MI->getParent();
4834 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4835
4836 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4837 .addImm(0);
4838
4839 return {{
4840 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4841 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4842 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4843 }};
4844}
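// Illustration of the large-offset split above (the exact immediate field
// width is subtarget dependent): assuming, purely for illustration, a maximum
// encodable offset of 1023, saddr + 4100 becomes a V_MOV_B32 of
// (4100 & ~1023) = 4096 into voffset plus an immediate of (4100 & 1023) = 4.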
4845
4846InstructionSelector::ComplexRendererFns
4847AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4848 Register Addr = Root.getReg();
4849 Register PtrBase;
4850 int64_t ConstOffset;
4851 int64_t ImmOffset = 0;
4852
4853 // Match the immediate offset first, which canonically is moved as low as
4854 // possible.
4855 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4856
4857  if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4858      TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4859                            SIInstrFlags::FlatScratch)) {
4860 Addr = PtrBase;
4861 ImmOffset = ConstOffset;
4862 }
4863
4864 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4865 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4866 int FI = AddrDef->MI->getOperand(1).getIndex();
4867 return {{
4868 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4869 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4870 }};
4871 }
4872
4873 Register SAddr = AddrDef->Reg;
4874
4875 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4876 Register LHS = AddrDef->MI->getOperand(1).getReg();
4877 Register RHS = AddrDef->MI->getOperand(2).getReg();
4878 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4879 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4880
4881 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4882 isSGPR(RHSDef->Reg)) {
4883 int FI = LHSDef->MI->getOperand(1).getIndex();
4884 MachineInstr &I = *Root.getParent();
4885 MachineBasicBlock *BB = I.getParent();
4886 const DebugLoc &DL = I.getDebugLoc();
4887 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4888
4889 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4890 .addFrameIndex(FI)
4891 .addReg(RHSDef->Reg)
4892 .setOperandDead(3); // Dead scc
4893 }
4894 }
4895
4896 if (!isSGPR(SAddr))
4897 return std::nullopt;
4898
4899 return {{
4900 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4901 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4902 }};
4903}
4904
4905// Check whether the flat scratch SVS swizzle bug affects this access.
4906bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4907 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4908 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4909 return false;
4910
4911 // The bug affects the swizzling of SVS accesses if there is any carry out
4912 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4913 // voffset to (soffset + inst_offset).
4914 auto VKnown = KB->getKnownBits(VAddr);
4915 auto SKnown = KnownBits::add(KB->getKnownBits(SAddr),
4916 KnownBits::makeConstant(APInt(32, ImmOffset)));
4917 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4918 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4919 return (VMax & 3) + (SMax & 3) >= 4;
4920}
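// For example, if the known maximum of VAddr ends in 0b11 (VMax & 3 == 3) and
// the known maximum of SAddr + ImmOffset ends in 0b10 (SMax & 3 == 2), then
// 3 + 2 >= 4, so a carry out of bit 1 into bit 2 is possible and the SVS
// addressing form is rejected.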
4921
4922InstructionSelector::ComplexRendererFns
4923AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4924 Register Addr = Root.getReg();
4925 Register PtrBase;
4926 int64_t ConstOffset;
4927 int64_t ImmOffset = 0;
4928
4929 // Match the immediate offset first, which canonically is moved as low as
4930 // possible.
4931 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4932
4933 Register OrigAddr = Addr;
4934 if (ConstOffset != 0 &&
4935 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4936 Addr = PtrBase;
4937 ImmOffset = ConstOffset;
4938 }
4939
4940 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4941 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4942 return std::nullopt;
4943
4944 Register RHS = AddrDef->MI->getOperand(2).getReg();
4945 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4946 return std::nullopt;
4947
4948 Register LHS = AddrDef->MI->getOperand(1).getReg();
4949 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4950
4951 if (OrigAddr != Addr) {
4952 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4953 return std::nullopt;
4954 } else {
4955 if (!isFlatScratchBaseLegalSV(OrigAddr))
4956 return std::nullopt;
4957 }
4958
4959 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4960 return std::nullopt;
4961
4962 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4963 int FI = LHSDef->MI->getOperand(1).getIndex();
4964 return {{
4965 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4966 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4967 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4968 }};
4969 }
4970
4971 if (!isSGPR(LHS))
4972 return std::nullopt;
4973
4974 return {{
4975 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4976 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4977 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4978 }};
4979}
4980
4981AMDGPUInstructionSelector::ComplexRendererFns
4982AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4983 MachineInstr *MI = Root.getParent();
4984 MachineBasicBlock *MBB = MI->getParent();
4985  MachineFunction *MF = MBB->getParent();
4986  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4987
4988 int64_t Offset = 0;
4989  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4990      Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
4991    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4992
4993 // TODO: Should this be inside the render function? The iterator seems to
4994 // move.
4995 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4996 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4997 HighBits)
4998 .addImm(Offset & ~MaxOffset);
4999
5000 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5001 MIB.addReg(Info->getScratchRSrcReg());
5002 },
5003 [=](MachineInstrBuilder &MIB) { // vaddr
5004 MIB.addReg(HighBits);
5005 },
5006 [=](MachineInstrBuilder &MIB) { // soffset
5007 // Use constant zero for soffset and rely on eliminateFrameIndex
5008 // to choose the appropriate frame register if need be.
5009 MIB.addImm(0);
5010 },
5011 [=](MachineInstrBuilder &MIB) { // offset
5012 MIB.addImm(Offset & MaxOffset);
5013 }}};
5014 }
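  // For example (assuming MaxOffset == 0xFFF): a constant private address of
  // 0x12345 selects as a V_MOV_B32 of 0x12000 into vaddr plus an immediate
  // offset field of 0x345.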
5015
5016 assert(Offset == 0 || Offset == -1);
5017
5018 // Try to fold a frame index directly into the MUBUF vaddr field, and any
5019 // offsets.
5020 std::optional<int> FI;
5021 Register VAddr = Root.getReg();
5022
5023 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5024 Register PtrBase;
5025 int64_t ConstOffset;
5026 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
5027 if (ConstOffset != 0) {
5028 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
5029        (!STI.privateMemoryResourceIsRangeChecked() ||
5030         KB->signBitIsZero(PtrBase))) {
5031 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
5032 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
5033 FI = PtrBaseDef->getOperand(1).getIndex();
5034 else
5035 VAddr = PtrBase;
5036 Offset = ConstOffset;
5037 }
5038 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5039 FI = RootDef->getOperand(1).getIndex();
5040 }
5041
5042 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5043 MIB.addReg(Info->getScratchRSrcReg());
5044 },
5045 [=](MachineInstrBuilder &MIB) { // vaddr
5046 if (FI)
5047 MIB.addFrameIndex(*FI);
5048 else
5049 MIB.addReg(VAddr);
5050 },
5051 [=](MachineInstrBuilder &MIB) { // soffset
5052 // Use constant zero for soffset and rely on eliminateFrameIndex
5053 // to choose the appropriate frame register if need be.
5054 MIB.addImm(0);
5055 },
5056 [=](MachineInstrBuilder &MIB) { // offset
5057 MIB.addImm(Offset);
5058 }}};
5059}
5060
5061bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
5062 int64_t Offset) const {
5063 if (!isUInt<16>(Offset))
5064 return false;
5065
5066  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5067    return true;
5068
5069  // On Southern Islands, instructions with a negative base value and an offset
5070  // don't seem to work.
5071 return KB->signBitIsZero(Base);
5072}
5073
5074bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
5075 int64_t Offset1,
5076 unsigned Size) const {
5077 if (Offset0 % Size != 0 || Offset1 % Size != 0)
5078 return false;
5079 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
5080 return false;
5081
5082  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5083    return true;
5084
5085  // On Southern Islands, instructions with a negative base value and an offset
5086  // don't seem to work.
5087 return KB->signBitIsZero(Base);
5088}
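// For example, with Size == 4 (a ds_read2_b32 style access), byte offsets 0
// and 4 are legal and encode as offset0 == 0 and offset1 == 1; offsets that
// are not multiples of 4 or that exceed 255 * 4 bytes are rejected.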
5089
5090// Return whether the operation has NoUnsignedWrap property.
5091static bool isNoUnsignedWrap(MachineInstr *Addr) {
5092  return Addr->getOpcode() == TargetOpcode::G_OR ||
5093 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
5094 Addr->getFlag(MachineInstr::NoUWrap));
5095}
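// Note: a G_OR is accepted here because, as used for address arithmetic, it
// typically stands for an add of operands with disjoint bits, which cannot
// produce an unsigned wrap.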
5096
5097// Check that the base address of a flat scratch load/store in the form
5098// `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per the
5099// hardware requirement). We always treat the first operand as the base address here.
5100bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
5101 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5102
5103 if (isNoUnsignedWrap(AddrMI))
5104 return true;
5105
5106 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5107 // values.
5108 if (STI.hasSignedScratchOffsets())
5109 return true;
5110
5111 Register LHS = AddrMI->getOperand(1).getReg();
5112 Register RHS = AddrMI->getOperand(2).getReg();
5113
5114 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
5115 std::optional<ValueAndVReg> RhsValReg =
5116        getIConstantVRegValWithLookThrough(RHS, *MRI);
5117    // If the immediate offset is negative and within certain range, the base
5118 // address cannot also be negative. If the base is also negative, the sum
5119 // would be either negative or much larger than the valid range of scratch
5120 // memory a thread can access.
5121 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
5122 RhsValReg->Value.getSExtValue() > -0x40000000)
5123 return true;
5124 }
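  // Illustrative case: with an RHS of -16, a negative 32-bit base such as -8
  // would give a sum of -24, which can never be a valid scratch address, so a
  // well-formed program implies the base is non-negative here.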
5125
5126 return KB->signBitIsZero(LHS);
5127}
5128
5129// Check that the address values in the SGPR/VGPR are legal for a flat scratch
5130// access in the form: SGPR + VGPR.
5131bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
5132 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5133
5134 if (isNoUnsignedWrap(AddrMI))
5135 return true;
5136
5137 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5138 // values.
5139 if (STI.hasSignedScratchOffsets())
5140 return true;
5141
5142 Register LHS = AddrMI->getOperand(1).getReg();
5143 Register RHS = AddrMI->getOperand(2).getReg();
5144 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
5145}
5146
5147// Check that the address values in the SGPR/VGPR are legal for a flat scratch
5148// access in the form: SGPR + VGPR + Imm.
5149bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
5150 Register Addr) const {
5151 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5152 // values.
5153 if (STI.hasSignedScratchOffsets())
5154 return true;
5155
5156 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5157 Register Base = AddrMI->getOperand(1).getReg();
5158 std::optional<DefinitionAndSourceRegister> BaseDef =
5159      getDefSrcRegIgnoringCopies(Base, *MRI);
5160  std::optional<ValueAndVReg> RHSOffset =
5161      getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
5162 assert(RHSOffset);
5163
5164 // If the immediate offset is negative and within certain range, the base
5165 // address cannot also be negative. If the base is also negative, the sum
5166 // would be either negative or much larger than the valid range of scratch
5167 // memory a thread can access.
5168 if (isNoUnsignedWrap(BaseDef->MI) &&
5169 (isNoUnsignedWrap(AddrMI) ||
5170 (RHSOffset->Value.getSExtValue() < 0 &&
5171 RHSOffset->Value.getSExtValue() > -0x40000000)))
5172 return true;
5173
5174 Register LHS = BaseDef->MI->getOperand(1).getReg();
5175 Register RHS = BaseDef->MI->getOperand(2).getReg();
5176 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
5177}
5178
5179bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
5180 unsigned ShAmtBits) const {
5181 assert(MI.getOpcode() == TargetOpcode::G_AND);
5182
5183 std::optional<APInt> RHS =
5184 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
5185 if (!RHS)
5186 return false;
5187
5188 if (RHS->countr_one() >= ShAmtBits)
5189 return true;
5190
5191 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
5192 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
5193}
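// Example: for a 32-bit shift the caller queries ShAmtBits == 5; a mask
// operand of 31 has five trailing ones, so the G_AND cannot change the shift
// amount and the mask is unneeded.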
5194
5195AMDGPUInstructionSelector::ComplexRendererFns
5196AMDGPUInstructionSelector::selectMUBUFScratchOffset(
5197 MachineOperand &Root) const {
5198 Register Reg = Root.getReg();
5199  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5200
5201 std::optional<DefinitionAndSourceRegister> Def =
5202 getDefSrcRegIgnoringCopies(Reg, *MRI);
5203 assert(Def && "this shouldn't be an optional result");
5204 Reg = Def->Reg;
5205
5206 if (Register WaveBase = getWaveAddress(Def->MI)) {
5207 return {{
5208 [=](MachineInstrBuilder &MIB) { // rsrc
5209 MIB.addReg(Info->getScratchRSrcReg());
5210 },
5211 [=](MachineInstrBuilder &MIB) { // soffset
5212 MIB.addReg(WaveBase);
5213 },
5214 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
5215 }};
5216 }
5217
5218 int64_t Offset = 0;
5219
5220 // FIXME: Copy check is a hack
5221  Register BasePtr;
5222  if (mi_match(Reg, *MRI,
5223               m_GPtrAdd(m_Reg(BasePtr),
5224                         m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
5225 if (!TII.isLegalMUBUFImmOffset(Offset))
5226 return {};
5227 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
5228 Register WaveBase = getWaveAddress(BasePtrDef);
5229 if (!WaveBase)
5230 return {};
5231
5232 return {{
5233 [=](MachineInstrBuilder &MIB) { // rsrc
5234 MIB.addReg(Info->getScratchRSrcReg());
5235 },
5236 [=](MachineInstrBuilder &MIB) { // soffset
5237 MIB.addReg(WaveBase);
5238 },
5239 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5240 }};
5241 }
5242
5243 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
5244      !TII.isLegalMUBUFImmOffset(Offset))
5245    return {};
5246
5247 return {{
5248 [=](MachineInstrBuilder &MIB) { // rsrc
5249 MIB.addReg(Info->getScratchRSrcReg());
5250 },
5251 [=](MachineInstrBuilder &MIB) { // soffset
5252 MIB.addImm(0);
5253 },
5254 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5255 }};
5256}
5257
5258std::pair<Register, unsigned>
5259AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
5260 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5261 int64_t ConstAddr = 0;
5262
5263 Register PtrBase;
5264 int64_t Offset;
5265 std::tie(PtrBase, Offset) =
5266 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5267
5268 if (Offset) {
5269 if (isDSOffsetLegal(PtrBase, Offset)) {
5270 // (add n0, c0)
5271 return std::pair(PtrBase, Offset);
5272 }
5273 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5274 // TODO
5275
5276
5277 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5278 // TODO
5279
5280 }
5281
5282 return std::pair(Root.getReg(), 0);
5283}
5284
5285AMDGPUInstructionSelector::ComplexRendererFns
5286AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
5287 Register Reg;
5288 unsigned Offset;
5289 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
5290 return {{
5291 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5292 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
5293 }};
5294}
5295
5296AMDGPUInstructionSelector::ComplexRendererFns
5297AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
5298 return selectDSReadWrite2(Root, 4);
5299}
5300
5301AMDGPUInstructionSelector::ComplexRendererFns
5302AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
5303 return selectDSReadWrite2(Root, 8);
5304}
5305
5306AMDGPUInstructionSelector::ComplexRendererFns
5307AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
5308 unsigned Size) const {
5309 Register Reg;
5310 unsigned Offset;
5311 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
5312 return {{
5313 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5314 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5315 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
5316 }};
5317}
5318
5319std::pair<Register, unsigned>
5320AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
5321 unsigned Size) const {
5322 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5323 int64_t ConstAddr = 0;
5324
5325 Register PtrBase;
5326 int64_t Offset;
5327 std::tie(PtrBase, Offset) =
5328 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5329
5330 if (Offset) {
5331 int64_t OffsetValue0 = Offset;
5332 int64_t OffsetValue1 = Offset + Size;
5333 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
5334 // (add n0, c0)
5335 return std::pair(PtrBase, OffsetValue0 / Size);
5336 }
5337 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5338 // TODO
5339
5340 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5341 // TODO
5342
5343 }
5344
5345 return std::pair(Root.getReg(), 0);
5346}
5347
5348/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
5349/// the base value with the constant offset. There may be intervening copies
5350/// between \p Root and the identified constant. Returns \p Root, 0 if this does
5351/// not match the pattern.
5352std::pair<Register, int64_t>
5353AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
5354 Register Root, const MachineRegisterInfo &MRI) const {
5355 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
5356 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
5357 return {Root, 0};
5358
5359 MachineOperand &RHS = RootI->getOperand(2);
5360 std::optional<ValueAndVReg> MaybeOffset =
5361      getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5362  if (!MaybeOffset)
5363 return {Root, 0};
5364 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
5365}
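// For example, (G_PTR_ADD %base, G_CONSTANT -16), possibly with copies in
// between, yields {%base, -16}; any other root shape yields {Root, 0}.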
5366
5367static void addZeroImm(MachineInstrBuilder &MIB) {
5368  MIB.addImm(0);
5369}
5370
5371/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5372/// BasePtr is not valid, a null base pointer will be used.
5373static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5374                          uint32_t FormatLo, uint32_t FormatHi,
5375 Register BasePtr) {
5376 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5377 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5378 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5379 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5380
5381 B.buildInstr(AMDGPU::S_MOV_B32)
5382 .addDef(RSrc2)
5383 .addImm(FormatLo);
5384 B.buildInstr(AMDGPU::S_MOV_B32)
5385 .addDef(RSrc3)
5386 .addImm(FormatHi);
5387
5388 // Build the half of the subregister with the constants before building the
5389 // full 128-bit register. If we are building multiple resource descriptors,
5390 // this will allow CSEing of the 2-component register.
5391 B.buildInstr(AMDGPU::REG_SEQUENCE)
5392 .addDef(RSrcHi)
5393 .addReg(RSrc2)
5394 .addImm(AMDGPU::sub0)
5395 .addReg(RSrc3)
5396 .addImm(AMDGPU::sub1);
5397
5398 Register RSrcLo = BasePtr;
5399 if (!BasePtr) {
5400 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5401 B.buildInstr(AMDGPU::S_MOV_B64)
5402 .addDef(RSrcLo)
5403 .addImm(0);
5404 }
5405
5406 B.buildInstr(AMDGPU::REG_SEQUENCE)
5407 .addDef(RSrc)
5408 .addReg(RSrcLo)
5409 .addImm(AMDGPU::sub0_sub1)
5410 .addReg(RSrcHi)
5411 .addImm(AMDGPU::sub2_sub3);
5412
5413 return RSrc;
5414}
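// The resulting SGPR_128 descriptor is laid out as { RSrcLo (sub0_sub1),
// FormatLo (sub2), FormatHi (sub3) }, with RSrcLo defaulting to a zero base
// pointer when no BasePtr is supplied.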
5415
5416static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5417                                const SIInstrInfo &TII, Register BasePtr) {
5418 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5419
5420 // FIXME: Why are half the "default" bits ignored based on the addressing
5421 // mode?
5422 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5423}
5424
5425static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5426                               const SIInstrInfo &TII, Register BasePtr) {
5427 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5428
5429 // FIXME: Why are half the "default" bits ignored based on the addressing
5430 // mode?
5431 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5432}
5433
5434AMDGPUInstructionSelector::MUBUFAddressData
5435AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5436 MUBUFAddressData Data;
5437 Data.N0 = Src;
5438
5439 Register PtrBase;
5440 int64_t Offset;
5441
5442 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5443 if (isUInt<32>(Offset)) {
5444 Data.N0 = PtrBase;
5445 Data.Offset = Offset;
5446 }
5447
5448 if (MachineInstr *InputAdd
5449 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5450 Data.N2 = InputAdd->getOperand(1).getReg();
5451 Data.N3 = InputAdd->getOperand(2).getReg();
5452
5453    // FIXME: Need to fix extra SGPR->VGPR copies inserted
5454    // FIXME: Don't know if this was defined by operand 0
5455 //
5456 // TODO: Remove this when we have copy folding optimizations after
5457 // RegBankSelect.
5458 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5459 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5460 }
5461
5462 return Data;
5463}
5464
5465/// Return if the addr64 mubuf mode should be used for the given address.
5466bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5467 // (ptr_add N2, N3) -> addr64, or
5468 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5469 if (Addr.N2)
5470 return true;
5471
5472 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5473 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5474}
5475
5476/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5477/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5478/// component.
5479void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5480 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5481 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5482 return;
5483
5484 // Illegal offset, store it in soffset.
5485 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5486 B.buildInstr(AMDGPU::S_MOV_B32)
5487 .addDef(SOffset)
5488 .addImm(ImmOffset);
5489 ImmOffset = 0;
5490}
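// For example, an immediate of 0x100000 that does not fit the MUBUF offset
// field is materialized into a fresh SGPR with S_MOV_B32 and the remaining
// immediate offset becomes 0.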
5491
5492bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5493 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5494 Register &SOffset, int64_t &Offset) const {
5495 // FIXME: Predicates should stop this from reaching here.
5496 // addr64 bit was removed for volcanic islands.
5497 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5498 return false;
5499
5500 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5501 if (!shouldUseAddr64(AddrData))
5502 return false;
5503
5504 Register N0 = AddrData.N0;
5505 Register N2 = AddrData.N2;
5506 Register N3 = AddrData.N3;
5507 Offset = AddrData.Offset;
5508
5509 // Base pointer for the SRD.
5510 Register SRDPtr;
5511
5512 if (N2) {
5513 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5514 assert(N3);
5515 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5516 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5517 // addr64, and construct the default resource from a 0 address.
5518 VAddr = N0;
5519 } else {
5520 SRDPtr = N3;
5521 VAddr = N2;
5522 }
5523 } else {
5524 // N2 is not divergent.
5525 SRDPtr = N2;
5526 VAddr = N3;
5527 }
5528 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5529 // Use the default null pointer in the resource
5530 VAddr = N0;
5531 } else {
5532 // N0 -> offset, or
5533 // (N0 + C1) -> offset
5534 SRDPtr = N0;
5535 }
5536
5537 MachineIRBuilder B(*Root.getParent());
5538 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5539 splitIllegalMUBUFOffset(B, SOffset, Offset);
5540 return true;
5541}
5542
5543bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5544 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5545 int64_t &Offset) const {
5546
5547 // FIXME: Pattern should not reach here.
5548 if (STI.useFlatForGlobal())
5549 return false;
5550
5551 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5552 if (shouldUseAddr64(AddrData))
5553 return false;
5554
5555 // N0 -> offset, or
5556 // (N0 + C1) -> offset
5557 Register SRDPtr = AddrData.N0;
5558 Offset = AddrData.Offset;
5559
5560 // TODO: Look through extensions for 32-bit soffset.
5561 MachineIRBuilder B(*Root.getParent());
5562
5563 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5564 splitIllegalMUBUFOffset(B, SOffset, Offset);
5565 return true;
5566}
5567
5568AMDGPUInstructionSelector::ComplexRendererFns
5569AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5570 Register VAddr;
5571 Register RSrcReg;
5572 Register SOffset;
5573 int64_t Offset = 0;
5574
5575 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5576 return {};
5577
5578 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5579 // pattern.
5580 return {{
5581 [=](MachineInstrBuilder &MIB) { // rsrc
5582 MIB.addReg(RSrcReg);
5583 },
5584 [=](MachineInstrBuilder &MIB) { // vaddr
5585 MIB.addReg(VAddr);
5586 },
5587 [=](MachineInstrBuilder &MIB) { // soffset
5588 if (SOffset)
5589 MIB.addReg(SOffset);
5590 else if (STI.hasRestrictedSOffset())
5591 MIB.addReg(AMDGPU::SGPR_NULL);
5592 else
5593 MIB.addImm(0);
5594 },
5595 [=](MachineInstrBuilder &MIB) { // offset
5596 MIB.addImm(Offset);
5597 },
5598 addZeroImm, // cpol
5599 addZeroImm, // tfe
5600 addZeroImm // swz
5601 }};
5602}
5603
5604AMDGPUInstructionSelector::ComplexRendererFns
5605AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5606 Register RSrcReg;
5607 Register SOffset;
5608 int64_t Offset = 0;
5609
5610 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5611 return {};
5612
5613 return {{
5614 [=](MachineInstrBuilder &MIB) { // rsrc
5615 MIB.addReg(RSrcReg);
5616 },
5617 [=](MachineInstrBuilder &MIB) { // soffset
5618 if (SOffset)
5619 MIB.addReg(SOffset);
5620 else if (STI.hasRestrictedSOffset())
5621 MIB.addReg(AMDGPU::SGPR_NULL);
5622 else
5623 MIB.addImm(0);
5624 },
5625 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5626 addZeroImm, // cpol
5627 addZeroImm, // tfe
5628 addZeroImm, // swz
5629 }};
5630}
5631
5632AMDGPUInstructionSelector::ComplexRendererFns
5633AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5634
5635 Register SOffset = Root.getReg();
5636
5637 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5638 SOffset = AMDGPU::SGPR_NULL;
5639
5640 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5641}
5642
5643/// Get an immediate that must be 32-bits, and treated as zero extended.
5644static std::optional<uint64_t>
5645getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5646  // getIConstantVRegVal sexts any values, so see if that matters.
5647 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5648 if (!OffsetVal || !isInt<32>(*OffsetVal))
5649 return std::nullopt;
5650 return Lo_32(*OffsetVal);
5651}
5652
5653AMDGPUInstructionSelector::ComplexRendererFns
5654AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5655 std::optional<uint64_t> OffsetVal =
5656 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
5657 if (!OffsetVal)
5658 return {};
5659
5660 std::optional<int64_t> EncodedImm =
5661 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5662 if (!EncodedImm)
5663 return {};
5664
5665 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5666}
5667
5668AMDGPUInstructionSelector::ComplexRendererFns
5669AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5670  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
5671
5672 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5673 if (!OffsetVal)
5674 return {};
5675
5676 std::optional<int64_t> EncodedImm =
5677      AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5678  if (!EncodedImm)
5679 return {};
5680
5681 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5682}
5683
5684AMDGPUInstructionSelector::ComplexRendererFns
5685AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5686 // Match the (soffset + offset) pair as a 32-bit register base and
5687 // an immediate offset.
5688 Register SOffset;
5689 unsigned Offset;
5690 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5691 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5692 if (!SOffset)
5693 return std::nullopt;
5694
5695 std::optional<int64_t> EncodedOffset =
5696 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5697 if (!EncodedOffset)
5698 return std::nullopt;
5699
5700 assert(MRI->getType(SOffset) == LLT::scalar(32));
5701 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5702 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5703}
5704
5705std::pair<Register, unsigned>
5706AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5707 bool &Matched) const {
5708 Matched = false;
5709
5710 Register Src;
5711 unsigned Mods;
5712 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5713
5714 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
5715 assert(MRI->getType(Src) == LLT::scalar(16));
5716
5717    // Only change Src if a source modifier could be gained. In such cases the
5718    // new Src could be an SGPR, but this does not violate the constant bus
5719    // restriction for the instruction that is being selected.
5720 Src = stripBitCast(Src, *MRI);
5721
5722 const auto CheckAbsNeg = [&]() {
5723 // Be careful about folding modifiers if we already have an abs. fneg is
5724 // applied last, so we don't want to apply an earlier fneg.
5725 if ((Mods & SISrcMods::ABS) == 0) {
5726 unsigned ModsTmp;
5727 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
5728
5729 if ((ModsTmp & SISrcMods::NEG) != 0)
5730 Mods ^= SISrcMods::NEG;
5731
5732 if ((ModsTmp & SISrcMods::ABS) != 0)
5733 Mods |= SISrcMods::ABS;
5734 }
5735 };
5736
5737 CheckAbsNeg();
5738
5739 // op_sel/op_sel_hi decide the source type and source.
5740 // If the source's op_sel_hi is set, it indicates to do a conversion from
5741    // fp16. If the source's op_sel is set, it picks the high half of the
5742 // source register.
5743
5744 Mods |= SISrcMods::OP_SEL_1;
5745
5746 if (isExtractHiElt(*MRI, Src, Src)) {
5747 Mods |= SISrcMods::OP_SEL_0;
5748 CheckAbsNeg();
5749 }
5750
5751 Matched = true;
5752 }
5753
5754 return {Src, Mods};
5755}
5756
5757AMDGPUInstructionSelector::ComplexRendererFns
5758AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5759 MachineOperand &Root) const {
5760 Register Src;
5761 unsigned Mods;
5762 bool Matched;
5763 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5764 if (!Matched)
5765 return {};
5766
5767 return {{
5768 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5769 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5770 }};
5771}
5772
5773AMDGPUInstructionSelector::ComplexRendererFns
5774AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5775 Register Src;
5776 unsigned Mods;
5777 bool Matched;
5778 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5779
5780 return {{
5781 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5782 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5783 }};
5784}
5785
5786bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5787 MachineInstr &I, Intrinsic::ID IntrID) const {
5788 MachineBasicBlock *MBB = I.getParent();
5789 const DebugLoc &DL = I.getDebugLoc();
5790 Register CCReg = I.getOperand(0).getReg();
5791
5792 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5793 .addImm(I.getOperand(2).getImm());
5794
5795 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5796
5797 I.eraseFromParent();
5798 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5799 *MRI);
5800}
5801
5802bool AMDGPUInstructionSelector::selectSGetBarrierState(
5803 MachineInstr &I, Intrinsic::ID IntrID) const {
5804 MachineBasicBlock *MBB = I.getParent();
5805 const DebugLoc &DL = I.getDebugLoc();
5806 MachineOperand BarOp = I.getOperand(2);
5807 std::optional<int64_t> BarValImm =
5808 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5809
5810 if (!BarValImm) {
5811 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5812 .addReg(BarOp.getReg());
5813 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5814 }
5815  MachineInstrBuilder MIB;
5816  unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
5817 : AMDGPU::S_GET_BARRIER_STATE_M0;
5818 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5819
5820 auto DstReg = I.getOperand(0).getReg();
5821 const TargetRegisterClass *DstRC =
5822 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
5823 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
5824 return false;
5825 MIB.addDef(DstReg);
5826 if (BarValImm) {
5827 MIB.addImm(*BarValImm);
5828 }
5829 I.eraseFromParent();
5830 return true;
5831}
5832
5833unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5834 if (HasInlineConst) {
5835 switch (IntrID) {
5836 default:
5837 llvm_unreachable("not a named barrier op");
5838 case Intrinsic::amdgcn_s_barrier_join:
5839 return AMDGPU::S_BARRIER_JOIN_IMM;
5840 case Intrinsic::amdgcn_s_get_named_barrier_state:
5841 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5842 };
5843 } else {
5844 switch (IntrID) {
5845 default:
5846 llvm_unreachable("not a named barrier op");
5847 case Intrinsic::amdgcn_s_barrier_join:
5848 return AMDGPU::S_BARRIER_JOIN_M0;
5849 case Intrinsic::amdgcn_s_get_named_barrier_state:
5850 return AMDGPU::S_GET_BARRIER_STATE_M0;
5851 };
5852 }
5853}
5854
5855bool AMDGPUInstructionSelector::selectNamedBarrierInit(
5856 MachineInstr &I, Intrinsic::ID IntrID) const {
5857 MachineBasicBlock *MBB = I.getParent();
5858 const DebugLoc &DL = I.getDebugLoc();
5859 MachineOperand BarOp = I.getOperand(1);
5860 MachineOperand CntOp = I.getOperand(2);
5861
5862 // BarID = (BarOp >> 4) & 0x3F
5863 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5864 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
5865 .add(BarOp)
5866 .addImm(4u)
5867 .setOperandDead(3); // Dead scc
5868
5869 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5870 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
5871 .addReg(TmpReg0)
5872 .addImm(0x3F)
5873 .setOperandDead(3); // Dead scc
5874
5875 // MO = ((CntOp & 0x3F) << shAmt) | BarID
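  // That is, M0 is packed as: bits [5:0] = barrier id, bits [21:16] = the
  // count operand (ShAmt == 16); all other bits are left as zero.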
5876 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5877 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
5878 .add(CntOp)
5879 .addImm(0x3F)
5880 .setOperandDead(3); // Dead scc
5881
5882 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5883 constexpr unsigned ShAmt = 16;
5884 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
5885 .addReg(TmpReg2)
5886 .addImm(ShAmt)
5887 .setOperandDead(3); // Dead scc
5888
5889 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5890 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
5891 .addReg(TmpReg1)
5892 .addReg(TmpReg3)
5893 .setOperandDead(3); // Dead scc;
5894
5895 auto CopyMIB =
5896 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
5897 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5898
5899 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
5900 ? AMDGPU::S_BARRIER_INIT_M0
5901 : AMDGPU::S_BARRIER_SIGNAL_M0;
5902  MachineInstrBuilder MIB;
5903  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5904
5905 I.eraseFromParent();
5906 return true;
5907}
5908
5909bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5910 MachineInstr &I, Intrinsic::ID IntrID) const {
5911 MachineBasicBlock *MBB = I.getParent();
5912 const DebugLoc &DL = I.getDebugLoc();
5913 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
5914 ? I.getOperand(2)
5915 : I.getOperand(1);
5916 std::optional<int64_t> BarValImm =
5917 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5918
5919 if (!BarValImm) {
5920 // BarID = (BarOp >> 4) & 0x3F
5921 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5922 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
5923 .addReg(BarOp.getReg())
5924 .addImm(4u)
5925 .setOperandDead(3); // Dead scc;
5926
5927 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5928 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
5929 .addReg(TmpReg0)
5930 .addImm(0x3F)
5931 .setOperandDead(3); // Dead scc;
5932
5933 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5934 .addReg(TmpReg1);
5935 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5936 }
5937
5938  MachineInstrBuilder MIB;
5939  unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5940 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5941
5942 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
5943 auto DstReg = I.getOperand(0).getReg();
5944 const TargetRegisterClass *DstRC =
5945 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
5946 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
5947 return false;
5948 MIB.addDef(DstReg);
5949 }
5950
5951 if (BarValImm) {
5952 auto BarId = ((*BarValImm) >> 4) & 0x3F;
5953 MIB.addImm(BarId);
5954 }
5955
5956 I.eraseFromParent();
5957 return true;
5958}
5959
5960void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5961 const MachineInstr &MI,
5962 int OpIdx) const {
5963 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5964 "Expected G_CONSTANT");
5965 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5966}
5967
5968void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5969 const MachineInstr &MI,
5970 int OpIdx) const {
5971 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5972 "Expected G_CONSTANT");
5973 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5974}
5975
5976void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
5977 const MachineInstr &MI,
5978 int OpIdx) const {
5979 const MachineOperand &Op = MI.getOperand(1);
5980 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
5981 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5982}
5983
5984void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5985 const MachineInstr &MI,
5986 int OpIdx) const {
5987 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5988 "Expected G_CONSTANT");
5989 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5990}
5991
5992/// This only really exists to satisfy DAG type checking machinery, so is a
5993/// no-op here.
5994void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
5995 const MachineInstr &MI,
5996 int OpIdx) const {
5997 const MachineOperand &Op = MI.getOperand(OpIdx);
5998 int64_t Imm;
5999 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
6000 MIB.addImm(Imm);
6001 else
6002 MIB.addImm(Op.getImm());
6003}
6004
6005void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
6006 const MachineInstr &MI,
6007 int OpIdx) const {
6008 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
6009}
6010
6011void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
6012 const MachineInstr &MI,
6013 int OpIdx) const {
6014 assert(OpIdx >= 0 && "expected to match an immediate operand");
6015 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6016}
6017
6018void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
6019 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6020 assert(OpIdx >= 0 && "expected to match an immediate operand");
6021 MIB.addImm(
6022 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6023}
6024
6025void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
6026 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6027 assert(OpIdx >= 0 && "expected to match an immediate operand");
6028 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
6029                 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
6030                 : (int64_t)SISrcMods::DST_OP_SEL);
6031}
6032
6033void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
6034 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6035 assert(OpIdx >= 0 && "expected to match an immediate operand");
6036 MIB.addImm(
6037 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6038}
6039
6040void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
6041 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6042 assert(OpIdx >= 0 && "expected to match an immediate operand");
6043 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
6044 ? (int64_t)(SISrcMods::OP_SEL_0)
6045 : 0);
6046}
6047
6048void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
6049 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6050 assert(OpIdx >= 0 && "expected to match an immediate operand");
6051 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
6052 : 0);
6053}
6054
6055void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
6056 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6057 assert(OpIdx >= 0 && "expected to match an immediate operand");
6058 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
6059 : 0);
6060}
6061
6062void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
6063 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6064 assert(OpIdx >= 0 && "expected to match an immediate operand");
6065 MIB.addImm(
6066 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6067}
6068
6069void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
6070 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6071 assert(OpIdx >= 0 && "expected to match an immediate operand");
6072 MIB.addImm(
6073 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0);
6074}
6075
6076void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
6077 const MachineInstr &MI,
6078 int OpIdx) const {
6079 assert(OpIdx >= 0 && "expected to match an immediate operand");
6080 MIB.addImm(MI.getOperand(OpIdx).getImm() &
6081             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
6082                                       : AMDGPU::CPol::ALL_pregfx12));
6083}
6084
6085void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
6086 const MachineInstr &MI,
6087 int OpIdx) const {
6088 assert(OpIdx >= 0 && "expected to match an immediate operand");
6089 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
6090                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
6091                                                 : AMDGPU::CPol::SWZ_pregfx12);
6092  MIB.addImm(Swizzle);
6093}
6094
6095void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
6096 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6097 assert(OpIdx >= 0 && "expected to match an immediate operand");
6098 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
6099                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
6100                                                  : AMDGPU::CPol::ALL_pregfx12);
6101  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
6102}
6103
6104void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
6105 const MachineInstr &MI,
6106 int OpIdx) const {
6107 MIB.addFrameIndex(MI.getOperand(1).getIndex());
6108}
6109
6110void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
6111 const MachineInstr &MI,
6112 int OpIdx) const {
6113 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
6114 int ExpVal = APF.getExactLog2Abs();
6115 assert(ExpVal != INT_MIN);
6116 MIB.addImm(ExpVal);
6117}
6118
6119void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
6120 const MachineInstr &MI,
6121 int OpIdx) const {
6122 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
6123 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
6124 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
6125 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
6126 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
6127}
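// Quick check of the mapping above: (0 + 3) % 4 == 3, (1 + 3) % 4 == 0,
// (2 + 3) % 4 == 1 and (3 + 3) % 4 == 2, matching the table in the comment.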
6128
6129/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
6130void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
6131 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6132 unsigned Val = MI.getOperand(OpIdx).getImm();
6133 unsigned New = 0;
6134  if (Val & 0x1)
6135    New |= SISrcMods::OP_SEL_0;
6136  if (Val & 0x2)
6137    New |= SISrcMods::OP_SEL_1;
6138  MIB.addImm(New);
6139}
6140
6141bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
6142 return TII.isInlineConstant(Imm);
6143}
6144
6145bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
6146 return TII.isInlineConstant(Imm);
6147}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
#define GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
static Register getWaveAddress(const MachineInstr *Def)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelKnownBits &KnownBits)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
static void addZeroImm(MachineInstrBuilder &MIB)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static bool isConstant(const MachineInstr &MI)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg)
Match a zero extend from a 32-bit value to 64-bits.
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
This file declares the targeting of the InstructionSelector class for AMDGPU.
static const LLT S1
AMDGPU Register Bank Select
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
#define P(N)
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
Value * RHS
Value * LHS
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM)
static const char * getName()
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
void setupMF(MachineFunction &MF, GISelKnownBits *KB, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
const RegisterBank & getRegBankFromRegClass(const TargetRegisterClass &RC, LLT) const override
Get a register bank that covers RC.
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
unsigned getWavefrontSizeLog2() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1489
Class for arbitrary precision integers.
Definition: APInt.h:78
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1635
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition: InstrTypes.h:690
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition: InstrTypes.h:688
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:678
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition: InstrTypes.h:687
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition: InstrTypes.h:685
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:680
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:682
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:689
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition: InstrTypes.h:686
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition: InstrTypes.h:675
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:271
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:163
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Represents a G_BUILD_VECTOR.
bool useVGPRIndexMode() const
bool hasPermlane32Swap() const
bool hasScalarCompareEq64() const
int getLDSBankCount() const
Definition: GCNSubtarget.h:350
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:478
bool unsafeDSOffsetFoldingEnabled() const
Definition: GCNSubtarget.h:482
bool hasBitOp3Insts() const
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasMADIntraFwdBug() const
bool privateMemoryResourceIsRangeChecked() const
Definition: GCNSubtarget.h:563
bool hasSignedScratchOffsets() const
bool hasRestrictedSOffset() const
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:287
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
Definition: GCNSubtarget.h:716
bool hasSPackHL() const
Return true if the target has the S_PACK_HL_B32_B16 instruction.
bool hasG16() const
bool hasPermlane16Swap() const
bool hasFlatScratchSVSSwizzleBug() const
bool hasGWS() const
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:541
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasSplitBarriers() const
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasGWSSemaReleaseAll() const
Definition: GCNSubtarget.h:730
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasSALUFloatInsts() const
bool hasPartialNSAEncoding() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
Represents a G_CONCAT_VECTORS.
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
virtual void setupMF(MachineFunction &mf, GISelKnownBits *kb, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
APInt getKnownOnes(Register R)
KnownBits getKnownBits(Register R)
bool signBitIsZero(Register Op)
APInt getKnownZeroes(Register R)
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr bool isValid() const
Definition: LowLevelType.h:145
constexpr bool isVector() const
Definition: LowLevelType.h:148
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition: LowLevelType.h:277
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:270
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Metadata node.
Definition: Metadata.h:1069
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
void setReturnAddressIsTaken(bool s)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:347
unsigned getNumOperands() const
Retuns the total number of operands.
Definition: MachineInstr.h:578
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:499
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
A description of a memory reference used in the backend.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Analysis providing profile information.
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
TypeSize getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Get the size in bits of Reg.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns true if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
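A hedged sketch of how the two FLAT-offset helpers combine, assuming TII is the SIInstrInfo and COffsetVal a byte offset folded out of a pointer add:
int64_t ImmOffset = COffsetVal, RemainderOffset = 0;
if (!TII.isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
                           SIInstrFlags::FlatGlobal))
  // Keep only what the immediate field can encode; the remainder must be
  // added back into the address computation.
  std::tie(ImmOffset, RemainderOffset) =
      TII.splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
                          SIInstrFlags::FlatGlobal);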
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCRegister getReturnAddressReg(const MachineFunction &MF) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const
const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const
const TargetRegisterClass * getBoolRC() const
MCRegister getExec() const
const TargetRegisterClass * getWaveMaskRegClass() const
static bool isSGPRClass(const TargetRegisterClass *RC)
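A sketch (the helper name is an assumption) combining getRegClassForTypeOnBank and isSGPRClass from the entries above to test whether a vreg will live in scalar registers:
static bool isScalarVReg(Register Reg, const SIRegisterInfo &TRI,
                         const MachineRegisterInfo &MRI) {
  const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
  if (!RB)
    return false; // Already constrained to a class, or not yet assigned.
  const TargetRegisterClass *RC =
      TRI.getRegClassForTypeOnBank(MRI.getType(Reg), *RB);
  return RC && SIRegisterInfo::isSGPRClass(RC);
}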
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:392
static IntegerType * getInt32Ty(LLVMContext &C)
LLVM Value Representation.
Definition: Value.h:74
Value(Type *Ty, unsigned scid)
Definition: Value.cpp:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
Key
PAL metadata keys.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
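A sketch of the usual SMRD offset check, assuming STI is the current MCSubtargetInfo and ByteOffset a folded constant:
if (std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
        STI, ByteOffset, /*IsBuffer=*/false, /*HasSOffset=*/false)) {
  // *EncodedOffset is the value to place in the scalar load's offset field;
  // otherwise the offset has to be materialized in a register.
}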
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
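A sketch (Root is an illustrative pointer vreg, MRI a MachineRegisterInfo reference) of peeling a constant offset off an address before forming an addressing mode:
auto [Base, ImmOffset] = AMDGPU::getBaseWithConstantOffset(MRI, Root);
// ImmOffset is 0 if no constant component could be folded out of Root.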
IndexMode
ARM Index Modes.
Definition: ARMBaseInfo.h:177
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
operand_type_match m_Reg()
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
SpecificConstantMatch m_SpecificICst(int64_t RequestedValue)
Matches a constant equal to RequestedValue.
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
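A sketch of the MIR pattern combinators above, assuming Reg is a vreg and MRI a MachineRegisterInfo reference, recognizing a boolean negation written as xor(x, -1):
Register Src;
bool IsNot = mi_match(Reg, MRI, m_GXor(m_Reg(Src), m_AllOnesInt()));
// On success Src is bound to the negated value; m_GXor also matches the
// commuted form.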
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
Reg
All possible values of the reg field in the ModR/M byte.
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:910
@ Offset
Definition: DWP.cpp:480
Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition: Utils.cpp:56
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition: Utils.cpp:645
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition: Utils.cpp:459
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:294
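A sketch (assuming SrcReg is a vreg and MRI a MachineRegisterInfo reference) of the common constant-folding use of this helper during selection:
if (std::optional<APInt> Cst = getIConstantVRegVal(SrcReg, MRI)) {
  int64_t Imm = Cst->getSExtValue();
  // ... select the inline-immediate form of the instruction using Imm ...
}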
bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition: Utils.cpp:155
MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition: Utils.cpp:486
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition: Utils.cpp:314
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition: Utils.cpp:439
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
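A worked example of the two 32-bit splitting helpers above, as used when materializing a 64-bit immediate in two halves (the value is illustrative):
uint64_t Imm = 0x123456789ABCDEF0ULL;
uint32_t Lo = Lo_32(Imm); // 0x9ABCDEF0
uint32_t Hi = Hi_32(Imm); // 0x12345678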
unsigned getUndefRegState(bool B)
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ Add
Sum of integers.
DWARFExpression::Operation Op
@ DS_Error
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition: Utils.cpp:433
std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg and the underlying value Register, folding away any copies.
Definition: Utils.cpp:467
Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition: Utils.cpp:493
@ Default
The result values are uniform if and only if all operands are uniform.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:293
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
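A small example of the two KnownBits helpers above: adding two fully known values yields a fully known sum.
KnownBits A = KnownBits::makeConstant(APInt(32, 12));
KnownBits B = KnownBits::makeConstant(APInt(32, 4));
KnownBits Sum = KnownBits::add(A, B); // Sum is the constant 16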
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.