AMDGPUInstructionSelector.cpp
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
59void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
60 CodeGenCoverage *CoverageInfo,
61 ProfileSummaryInfo *PSI,
62 BlockFrequencyInfo *BFI) {
63 MRI = &MF.getRegInfo();
64 Subtarget = &MF.getSubtarget<GCNSubtarget>();
66 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
67}
68
69// Return the wave level SGPR base address if this is a wave address.
70static Register getWaveAddress(const MachineInstr *Def) {
71 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
72 ? Def->getOperand(1).getReg()
73 : Register();
74}
75
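// Check whether \p Reg is a lane mask: a 1-bit virtual register that lives in
// the wave-mask (VCC) register bank or boolean register class rather than in a
// plain SGPR/VGPR.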
76bool AMDGPUInstructionSelector::isVCC(Register Reg,
77 const MachineRegisterInfo &MRI) const {
78 // The verifier is oblivious to s1 being a valid value for wavesize registers.
79 if (Reg.isPhysical())
80 return false;
81
82 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
83 const TargetRegisterClass *RC =
84 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
85 if (RC) {
86 const LLT Ty = MRI.getType(Reg);
87 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
88 return false;
89 // G_TRUNC s1 result is never vcc.
90 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
91 RC->hasSuperClassEq(TRI.getBoolRC());
92 }
93
94 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
95 return RB->getID() == AMDGPU::VCCRegBankID;
96}
97
98bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
99 unsigned NewOpc) const {
100 MI.setDesc(TII.get(NewOpc));
101 MI.removeOperand(1); // Remove intrinsic ID.
102 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
103
104 MachineOperand &Dst = MI.getOperand(0);
105 MachineOperand &Src = MI.getOperand(1);
106
107 // TODO: This should be legalized to s32 if needed
108 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
109 return false;
110
111 const TargetRegisterClass *DstRC
112 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
113 const TargetRegisterClass *SrcRC
114 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
115 if (!DstRC || DstRC != SrcRC)
116 return false;
117
118 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
119 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
120}
121
122bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
123 const DebugLoc &DL = I.getDebugLoc();
124 MachineBasicBlock *BB = I.getParent();
125 I.setDesc(TII.get(TargetOpcode::COPY));
126
127 const MachineOperand &Src = I.getOperand(1);
128 MachineOperand &Dst = I.getOperand(0);
129 Register DstReg = Dst.getReg();
130 Register SrcReg = Src.getReg();
131
132 if (isVCC(DstReg, *MRI)) {
133 if (SrcReg == AMDGPU::SCC) {
134 const TargetRegisterClass *RC
135 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
136 if (!RC)
137 return true;
138 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
139 }
140
141 if (!isVCC(SrcReg, *MRI)) {
142 // TODO: Should probably leave the copy and let copyPhysReg expand it.
143 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
144 return false;
145
146 const TargetRegisterClass *SrcRC
147 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
148
149 std::optional<ValueAndVReg> ConstVal =
150 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
151 if (ConstVal) {
152 unsigned MovOpc =
153 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
154 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
155 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
156 } else {
157 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
158
159 // We can't trust the high bits at this point, so clear them.
160
161 // TODO: Skip masking high bits if def is known boolean.
162
163 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
164 assert(Subtarget->useRealTrue16Insts());
165 const int64_t NoMods = 0;
166 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
167 .addImm(NoMods)
168 .addImm(1)
169 .addImm(NoMods)
170 .addReg(SrcReg)
171 .addImm(NoMods);
172 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
173 .addImm(NoMods)
174 .addImm(0)
175 .addImm(NoMods)
176 .addReg(MaskedReg)
177 .addImm(NoMods);
178 } else {
179 bool IsSGPR = TRI.isSGPRClass(SrcRC);
180 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
181 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
182 .addImm(1)
183 .addReg(SrcReg);
184 if (IsSGPR)
185 And.setOperandDead(3); // Dead scc
186
187 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
188 .addImm(0)
189 .addReg(MaskedReg);
190 }
191 }
192
193 if (!MRI->getRegClassOrNull(SrcReg))
194 MRI->setRegClass(SrcReg, SrcRC);
195 I.eraseFromParent();
196 return true;
197 }
198
199 const TargetRegisterClass *RC =
200 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
201 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
202 return false;
203
204 return true;
205 }
206
207 for (const MachineOperand &MO : I.operands()) {
208 if (MO.getReg().isPhysical())
209 continue;
210
211 const TargetRegisterClass *RC =
212 TRI.getConstrainedRegClassForOperand(MO, *MRI);
213 if (!RC)
214 continue;
215 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
216 }
217 return true;
218}
219
220bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
221 const Register DefReg = I.getOperand(0).getReg();
222 const LLT DefTy = MRI->getType(DefReg);
223
224 // S1 G_PHIs should not be selected in instruction-select, instead:
225 // - divergent S1 G_PHI should go through lane mask merging algorithm
226 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
227 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
228 if (DefTy == LLT::scalar(1))
229 return false;
230
231 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
232
233 const RegClassOrRegBank &RegClassOrBank =
234 MRI->getRegClassOrRegBank(DefReg);
235
236 const TargetRegisterClass *DefRC =
237 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
238 if (!DefRC) {
239 if (!DefTy.isValid()) {
240 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
241 return false;
242 }
243
244 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
245 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
246 if (!DefRC) {
247 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
248 return false;
249 }
250 }
251
252 // TODO: Verify that all registers have the same bank
253 I.setDesc(TII.get(TargetOpcode::PHI));
254 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
255}
256
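// Produce the 32-bit half of a 64-bit operand selected by \p SubIdx: register
// operands are copied out of the composed subregister into a fresh register of
// \p SubRC, immediates are split into their low or high 32 bits.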
257MachineOperand
258AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
259 const TargetRegisterClass &SubRC,
260 unsigned SubIdx) const {
261
262 MachineInstr *MI = MO.getParent();
263 MachineBasicBlock *BB = MO.getParent()->getParent();
264 Register DstReg = MRI->createVirtualRegister(&SubRC);
265
266 if (MO.isReg()) {
267 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
268 Register Reg = MO.getReg();
269 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
270 .addReg(Reg, 0, ComposedSubIdx);
271
272 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
273 MO.isKill(), MO.isDead(), MO.isUndef(),
274 MO.isEarlyClobber(), 0, MO.isDebug(),
275 MO.isInternalRead());
276 }
277
278 assert(MO.isImm());
279
280 APInt Imm(64, MO.getImm());
281
282 switch (SubIdx) {
283 default:
284 llvm_unreachable("do not know to split immediate with this sub index.");
285 case AMDGPU::sub0:
286 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
287 case AMDGPU::sub1:
288 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
289 }
290}
291
292static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
293 switch (Opc) {
294 case AMDGPU::G_AND:
295 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
296 case AMDGPU::G_OR:
297 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
298 case AMDGPU::G_XOR:
299 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
300 default:
301 llvm_unreachable("not a bit op");
302 }
303}
304
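// Select scalar G_AND/G_OR/G_XOR to S_AND/S_OR/S_XOR, using the 64-bit forms
// for 64-bit values and for wave64 lane masks; SCC is added as a dead
// implicit def.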
305bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
306 Register DstReg = I.getOperand(0).getReg();
307 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
308
309 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
310 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
311 DstRB->getID() != AMDGPU::VCCRegBankID)
312 return false;
313
314 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
315 STI.isWave64());
316 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
317
318 // Dead implicit-def of scc
319 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
320 true, // isImp
321 false, // isKill
322 true)); // isDead
323 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
324}
325
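// Select 32- and 64-bit G_ADD/G_SUB. 32-bit cases map directly onto the SALU
// or VALU add/sub; 64-bit adds are split into lo/hi halves chained through the
// carry and recombined with a REG_SEQUENCE.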
326bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
327 MachineBasicBlock *BB = I.getParent();
328 MachineFunction *MF = BB->getParent();
329 Register DstReg = I.getOperand(0).getReg();
330 const DebugLoc &DL = I.getDebugLoc();
331 LLT Ty = MRI->getType(DstReg);
332 if (Ty.isVector())
333 return false;
334
335 unsigned Size = Ty.getSizeInBits();
336 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
337 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
338 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
339
340 if (Size == 32) {
341 if (IsSALU) {
342 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
343 MachineInstr *Add =
344 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
345 .add(I.getOperand(1))
346 .add(I.getOperand(2))
347 .setOperandDead(3); // Dead scc
348 I.eraseFromParent();
349 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
350 }
351
352 if (STI.hasAddNoCarry()) {
353 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
354 I.setDesc(TII.get(Opc));
355 I.addOperand(*MF, MachineOperand::CreateImm(0));
356 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
357 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
358 }
359
360 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
361
362 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
363 MachineInstr *Add
364 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
365 .addDef(UnusedCarry, RegState::Dead)
366 .add(I.getOperand(1))
367 .add(I.getOperand(2))
368 .addImm(0);
369 I.eraseFromParent();
370 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
371 }
372
373 assert(!Sub && "illegal sub should not reach here");
374
375 const TargetRegisterClass &RC
376 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
377 const TargetRegisterClass &HalfRC
378 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
379
380 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
381 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
382 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
383 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
384
385 Register DstLo = MRI->createVirtualRegister(&HalfRC);
386 Register DstHi = MRI->createVirtualRegister(&HalfRC);
387
388 if (IsSALU) {
389 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
390 .add(Lo1)
391 .add(Lo2);
392 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
393 .add(Hi1)
394 .add(Hi2)
395 .setOperandDead(3); // Dead scc
396 } else {
397 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
398 Register CarryReg = MRI->createVirtualRegister(CarryRC);
399 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
400 .addDef(CarryReg)
401 .add(Lo1)
402 .add(Lo2)
403 .addImm(0);
404 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
405 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
406 .add(Hi1)
407 .add(Hi2)
408 .addReg(CarryReg, RegState::Kill)
409 .addImm(0);
410
411 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
412 return false;
413 }
414
415 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
416 .addReg(DstLo)
417 .addImm(AMDGPU::sub0)
418 .addReg(DstHi)
419 .addImm(AMDGPU::sub1);
420
421
422 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
423 return false;
424
425 I.eraseFromParent();
426 return true;
427}
428
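// Select the carry/borrow variants of add and sub. If the carry-out is a lane
// mask the VALU carry opcodes are used directly; otherwise the SALU forms are
// emitted with SCC copied in and out as needed.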
429bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
430 MachineInstr &I) const {
431 MachineBasicBlock *BB = I.getParent();
432 MachineFunction *MF = BB->getParent();
433 const DebugLoc &DL = I.getDebugLoc();
434 Register Dst0Reg = I.getOperand(0).getReg();
435 Register Dst1Reg = I.getOperand(1).getReg();
436 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
437 I.getOpcode() == AMDGPU::G_UADDE;
438 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
439 I.getOpcode() == AMDGPU::G_USUBE;
440
441 if (isVCC(Dst1Reg, *MRI)) {
442 unsigned NoCarryOpc =
443 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
444 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
445 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
446 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
447 I.addOperand(*MF, MachineOperand::CreateImm(0));
448 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
449 }
450
451 Register Src0Reg = I.getOperand(2).getReg();
452 Register Src1Reg = I.getOperand(3).getReg();
453
454 if (HasCarryIn) {
455 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
456 .addReg(I.getOperand(4).getReg());
457 }
458
459 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
460 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
461
462 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
463 .add(I.getOperand(2))
464 .add(I.getOperand(3));
465
466 if (MRI->use_nodbg_empty(Dst1Reg)) {
467 CarryInst.setOperandDead(3); // Dead scc
468 } else {
469 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
470 .addReg(AMDGPU::SCC);
471 if (!MRI->getRegClassOrNull(Dst1Reg))
472 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
473 }
474
475 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
476 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
477 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
478 return false;
479
480 if (HasCarryIn &&
481 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
482 AMDGPU::SReg_32RegClass, *MRI))
483 return false;
484
485 I.eraseFromParent();
486 return true;
487}
488
489bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
490 MachineInstr &I) const {
491 MachineBasicBlock *BB = I.getParent();
492 MachineFunction *MF = BB->getParent();
493 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
494
495 unsigned Opc;
496 if (Subtarget->hasMADIntraFwdBug())
497 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
498 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
499 else
500 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
501 I.setDesc(TII.get(Opc));
502 I.addOperand(*MF, MachineOperand::CreateImm(0));
503 I.addImplicitDefUseOperands(*MF);
504 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
505}
506
507// TODO: We should probably legalize these to only using 32-bit results.
508bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
509 MachineBasicBlock *BB = I.getParent();
510 Register DstReg = I.getOperand(0).getReg();
511 Register SrcReg = I.getOperand(1).getReg();
512 LLT DstTy = MRI->getType(DstReg);
513 LLT SrcTy = MRI->getType(SrcReg);
514 const unsigned SrcSize = SrcTy.getSizeInBits();
515 unsigned DstSize = DstTy.getSizeInBits();
516
517 // TODO: Should handle any multiple of 32 offset.
518 unsigned Offset = I.getOperand(2).getImm();
519 if (Offset % 32 != 0 || DstSize > 128)
520 return false;
521
522 // 16-bit operations really use 32-bit registers.
523 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
524 if (DstSize == 16)
525 DstSize = 32;
526
527 const TargetRegisterClass *DstRC =
528 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
529 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
530 return false;
531
532 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
533 const TargetRegisterClass *SrcRC =
534 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
535 if (!SrcRC)
536 return false;
537 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
538 DstSize / 32);
539 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
540 if (!SrcRC)
541 return false;
542
543 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
544 *SrcRC, I.getOperand(1));
545 const DebugLoc &DL = I.getDebugLoc();
546 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
547 .addReg(SrcReg, 0, SubReg);
548
549 I.eraseFromParent();
550 return true;
551}
552
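// Select G_MERGE_VALUES of 32-bit or wider pieces by assembling them into the
// destination with a REG_SEQUENCE; narrower sources fall back to the imported
// TableGen patterns.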
553bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
554 MachineBasicBlock *BB = MI.getParent();
555 Register DstReg = MI.getOperand(0).getReg();
556 LLT DstTy = MRI->getType(DstReg);
557 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
558
559 const unsigned SrcSize = SrcTy.getSizeInBits();
560 if (SrcSize < 32)
561 return selectImpl(MI, *CoverageInfo);
562
563 const DebugLoc &DL = MI.getDebugLoc();
564 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
565 const unsigned DstSize = DstTy.getSizeInBits();
566 const TargetRegisterClass *DstRC =
567 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
568 if (!DstRC)
569 return false;
570
571 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
572 MachineInstrBuilder MIB =
573 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
574 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
575 MachineOperand &Src = MI.getOperand(I + 1);
576 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
577 MIB.addImm(SubRegs[I]);
578
579 const TargetRegisterClass *SrcRC
580 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
581 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
582 return false;
583 }
584
585 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
586 return false;
587
588 MI.eraseFromParent();
589 return true;
590}
591
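// Select G_UNMERGE_VALUES by copying each destination out of the corresponding
// subregister of the source register.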
592bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
593 MachineBasicBlock *BB = MI.getParent();
594 const int NumDst = MI.getNumOperands() - 1;
595
596 MachineOperand &Src = MI.getOperand(NumDst);
597
598 Register SrcReg = Src.getReg();
599 Register DstReg0 = MI.getOperand(0).getReg();
600 LLT DstTy = MRI->getType(DstReg0);
601 LLT SrcTy = MRI->getType(SrcReg);
602
603 const unsigned DstSize = DstTy.getSizeInBits();
604 const unsigned SrcSize = SrcTy.getSizeInBits();
605 const DebugLoc &DL = MI.getDebugLoc();
606 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
607
608 const TargetRegisterClass *SrcRC =
609 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
610 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
611 return false;
612
613 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
614 // source, and this relies on the fact that the same subregister indices are
615 // used for both.
616 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
617 for (int I = 0, E = NumDst; I != E; ++I) {
618 MachineOperand &Dst = MI.getOperand(I);
619 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
620 .addReg(SrcReg, 0, SubRegs[I]);
621
622 // Make sure the subregister index is valid for the source register.
623 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
624 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
625 return false;
626
627 const TargetRegisterClass *DstRC =
628 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
629 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
630 return false;
631 }
632
633 MI.eraseFromParent();
634 return true;
635}
636
637bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
638 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
639 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
640
641 Register Src0 = MI.getOperand(1).getReg();
642 Register Src1 = MI.getOperand(2).getReg();
643 LLT SrcTy = MRI->getType(Src0);
644 const unsigned SrcSize = SrcTy.getSizeInBits();
645
646 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
647 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
648 return selectG_MERGE_VALUES(MI);
649 }
650
651 // Selection logic below is for V2S16 only.
652 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
653 Register Dst = MI.getOperand(0).getReg();
654 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
655 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
656 SrcTy != LLT::scalar(32)))
657 return selectImpl(MI, *CoverageInfo);
658
659 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
660 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
661 return false;
662
663 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
664 DstBank->getID() == AMDGPU::VGPRRegBankID);
665 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
666
667 const DebugLoc &DL = MI.getDebugLoc();
668 MachineBasicBlock *BB = MI.getParent();
669
670 // First, before trying TableGen patterns, check if both sources are
671 // constants. In those cases, we can trivially compute the final constant
672 // and emit a simple move.
673 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
674 if (ConstSrc1) {
675 auto ConstSrc0 =
676 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
677 if (ConstSrc0) {
678 const int64_t K0 = ConstSrc0->Value.getSExtValue();
679 const int64_t K1 = ConstSrc1->Value.getSExtValue();
680 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
681 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
682 uint32_t Imm = Lo16 | (Hi16 << 16);
683
684 // VALU
685 if (IsVector) {
686 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
687 MI.eraseFromParent();
688 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
689 }
690
691 // SALU
692 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
693 MI.eraseFromParent();
694 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
695 }
696 }
697
698 // Now try TableGen patterns.
699 if (selectImpl(MI, *CoverageInfo))
700 return true;
701
702 // TODO: This should probably be a combine somewhere
703 // (build_vector $src0, undef) -> copy $src0
704 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
705 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
706 MI.setDesc(TII.get(AMDGPU::COPY));
707 MI.removeOperand(2);
708 const auto &RC =
709 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
710 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
711 RBI.constrainGenericRegister(Src0, RC, *MRI);
712 }
713
714 // TODO: Can be improved?
715 if (IsVector) {
716 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
717 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
718 .addImm(0xFFFF)
719 .addReg(Src0);
720 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
721 return false;
722
723 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
724 .addReg(Src1)
725 .addImm(16)
726 .addReg(TmpReg);
727 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
728 return false;
729
730 MI.eraseFromParent();
731 return true;
732 }
733
734 Register ShiftSrc0;
735 Register ShiftSrc1;
736
737 // With multiple uses of the shift, this will duplicate the shift and
738 // increase register pressure.
739 //
740 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
741 // => (S_PACK_HH_B32_B16 $src0, $src1)
742 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
743 // => (S_PACK_HL_B32_B16 $src0, $src1)
744 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
745 // => (S_PACK_LH_B32_B16 $src0, $src1)
746 // (build_vector $src0, $src1)
747 // => (S_PACK_LL_B32_B16 $src0, $src1)
748
749 bool Shift0 = mi_match(
750 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
751
752 bool Shift1 = mi_match(
753 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
754
755 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
756 if (Shift0 && Shift1) {
757 Opc = AMDGPU::S_PACK_HH_B32_B16;
758 MI.getOperand(1).setReg(ShiftSrc0);
759 MI.getOperand(2).setReg(ShiftSrc1);
760 } else if (Shift1) {
761 Opc = AMDGPU::S_PACK_LH_B32_B16;
762 MI.getOperand(2).setReg(ShiftSrc1);
763 } else if (Shift0) {
764 auto ConstSrc1 =
765 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
766 if (ConstSrc1 && ConstSrc1->Value == 0) {
767 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
768 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
769 .addReg(ShiftSrc0)
770 .addImm(16)
771 .setOperandDead(3); // Dead scc
772
773 MI.eraseFromParent();
774 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
775 }
776 if (STI.hasSPackHL()) {
777 Opc = AMDGPU::S_PACK_HL_B32_B16;
778 MI.getOperand(1).setReg(ShiftSrc0);
779 }
780 }
781
782 MI.setDesc(TII.get(Opc));
783 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
784}
785
786bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
787 const MachineOperand &MO = I.getOperand(0);
788
789 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
790 // regbank check here is to know why getConstrainedRegClassForOperand failed.
791 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
792 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
793 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
794 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
795 return true;
796 }
797
798 return false;
799}
800
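// Select G_INSERT as an INSERT_SUBREG. Only 32-bit aligned offsets and insert
// sizes that map onto a valid subregister index are handled here.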
801bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
802 MachineBasicBlock *BB = I.getParent();
803
804 Register DstReg = I.getOperand(0).getReg();
805 Register Src0Reg = I.getOperand(1).getReg();
806 Register Src1Reg = I.getOperand(2).getReg();
807 LLT Src1Ty = MRI->getType(Src1Reg);
808
809 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
810 unsigned InsSize = Src1Ty.getSizeInBits();
811
812 int64_t Offset = I.getOperand(3).getImm();
813
814 // FIXME: These cases should have been illegal and unnecessary to check here.
815 if (Offset % 32 != 0 || InsSize % 32 != 0)
816 return false;
817
818 // Currently not handled by getSubRegFromChannel.
819 if (InsSize > 128)
820 return false;
821
822 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
823 if (SubReg == AMDGPU::NoSubRegister)
824 return false;
825
826 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
827 const TargetRegisterClass *DstRC =
828 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
829 if (!DstRC)
830 return false;
831
832 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
833 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
834 const TargetRegisterClass *Src0RC =
835 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
836 const TargetRegisterClass *Src1RC =
837 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
838
839 // Deal with weird cases where the class only partially supports the subreg
840 // index.
841 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
842 if (!Src0RC || !Src1RC)
843 return false;
844
845 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
846 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
847 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
848 return false;
849
850 const DebugLoc &DL = I.getDebugLoc();
851 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
852 .addReg(Src0Reg)
853 .addReg(Src1Reg)
854 .addImm(SubReg);
855
856 I.eraseFromParent();
857 return true;
858}
859
860bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
861 Register DstReg = MI.getOperand(0).getReg();
862 Register SrcReg = MI.getOperand(1).getReg();
863 Register OffsetReg = MI.getOperand(2).getReg();
864 Register WidthReg = MI.getOperand(3).getReg();
865
866 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
867 "scalar BFX instructions are expanded in regbankselect");
868 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
869 "64-bit vector BFX instructions are expanded in regbankselect");
870
871 const DebugLoc &DL = MI.getDebugLoc();
872 MachineBasicBlock *MBB = MI.getParent();
873
874 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
875 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
876 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
877 .addReg(SrcReg)
878 .addReg(OffsetReg)
879 .addReg(WidthReg);
880 MI.eraseFromParent();
881 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
882}
883
884bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
885 if (STI.getLDSBankCount() != 16)
886 return selectImpl(MI, *CoverageInfo);
887
888 Register Dst = MI.getOperand(0).getReg();
889 Register Src0 = MI.getOperand(2).getReg();
890 Register M0Val = MI.getOperand(6).getReg();
891 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
892 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
893 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
894 return false;
895
896 // This requires 2 instructions. It is possible to write a pattern to support
897 // this, but the generated isel emitter doesn't correctly deal with multiple
898 // output instructions using the same physical register input. The copy to m0
899 // is incorrectly placed before the second instruction.
900 //
901 // TODO: Match source modifiers.
902
903 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
904 const DebugLoc &DL = MI.getDebugLoc();
905 MachineBasicBlock *MBB = MI.getParent();
906
907 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
908 .addReg(M0Val);
909 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
910 .addImm(2)
911 .addImm(MI.getOperand(4).getImm()) // $attr
912 .addImm(MI.getOperand(3).getImm()); // $attrchan
913
914 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
915 .addImm(0) // $src0_modifiers
916 .addReg(Src0) // $src0
917 .addImm(MI.getOperand(4).getImm()) // $attr
918 .addImm(MI.getOperand(3).getImm()) // $attrchan
919 .addImm(0) // $src2_modifiers
920 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
921 .addImm(MI.getOperand(5).getImm()) // $high
922 .addImm(0) // $clamp
923 .addImm(0); // $omod
924
925 MI.eraseFromParent();
926 return true;
927}
928
929// Writelane is special in that it can use SGPR and M0 (which would normally
930// count as using the constant bus twice - but in this case it is allowed since
931// the lane selector doesn't count as a use of the constant bus). However, it is
932// still required to abide by the 1 SGPR rule. Fix this up if we might have
933// multiple SGPRs.
934bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
935 // With a constant bus limit of at least 2, there's no issue.
936 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
937 return selectImpl(MI, *CoverageInfo);
938
939 MachineBasicBlock *MBB = MI.getParent();
940 const DebugLoc &DL = MI.getDebugLoc();
941 Register VDst = MI.getOperand(0).getReg();
942 Register Val = MI.getOperand(2).getReg();
943 Register LaneSelect = MI.getOperand(3).getReg();
944 Register VDstIn = MI.getOperand(4).getReg();
945
946 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
947
948 std::optional<ValueAndVReg> ConstSelect =
949 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
950 if (ConstSelect) {
951 // The selector has to be an inline immediate, so we can use whatever for
952 // the other operands.
953 MIB.addReg(Val);
954 MIB.addImm(ConstSelect->Value.getSExtValue() &
955 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
956 } else {
957 std::optional<ValueAndVReg> ConstVal =
958 getIConstantVRegValWithLookThrough(Val, *MRI);
959
960 // If the value written is an inline immediate, we can get away without a
961 // copy to m0.
962 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
963 STI.hasInv2PiInlineImm())) {
964 MIB.addImm(ConstVal->Value.getSExtValue());
965 MIB.addReg(LaneSelect);
966 } else {
967 MIB.addReg(Val);
968
969 // If the lane selector was originally in a VGPR and copied with
970 // readfirstlane, there's a hazard to read the same SGPR from the
971 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
972 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
973
974 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
975 .addReg(LaneSelect);
976 MIB.addReg(AMDGPU::M0);
977 }
978 }
979
980 MIB.addReg(VDstIn);
981
982 MI.eraseFromParent();
983 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
984}
985
986// We need to handle this here because tablegen doesn't support matching
987// instructions with multiple outputs.
988bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
989 Register Dst0 = MI.getOperand(0).getReg();
990 Register Dst1 = MI.getOperand(1).getReg();
991
992 LLT Ty = MRI->getType(Dst0);
993 unsigned Opc;
994 if (Ty == LLT::scalar(32))
995 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
996 else if (Ty == LLT::scalar(64))
997 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
998 else
999 return false;
1000
1001 // TODO: Match source modifiers.
1002
1003 const DebugLoc &DL = MI.getDebugLoc();
1004 MachineBasicBlock *MBB = MI.getParent();
1005
1006 Register Numer = MI.getOperand(3).getReg();
1007 Register Denom = MI.getOperand(4).getReg();
1008 unsigned ChooseDenom = MI.getOperand(5).getImm();
1009
1010 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1011
1012 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1013 .addDef(Dst1)
1014 .addImm(0) // $src0_modifiers
1015 .addUse(Src0) // $src0
1016 .addImm(0) // $src1_modifiers
1017 .addUse(Denom) // $src1
1018 .addImm(0) // $src2_modifiers
1019 .addUse(Numer) // $src2
1020 .addImm(0) // $clamp
1021 .addImm(0); // $omod
1022
1023 MI.eraseFromParent();
1024 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1025}
1026
1027bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1028 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1029 switch (IntrinsicID) {
1030 case Intrinsic::amdgcn_if_break: {
1031 MachineBasicBlock *BB = I.getParent();
1032
1033 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1034 // SelectionDAG uses for wave32 vs wave64.
1035 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1036 .add(I.getOperand(0))
1037 .add(I.getOperand(2))
1038 .add(I.getOperand(3));
1039
1040 Register DstReg = I.getOperand(0).getReg();
1041 Register Src0Reg = I.getOperand(2).getReg();
1042 Register Src1Reg = I.getOperand(3).getReg();
1043
1044 I.eraseFromParent();
1045
1046 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1047 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1048
1049 return true;
1050 }
1051 case Intrinsic::amdgcn_interp_p1_f16:
1052 return selectInterpP1F16(I);
1053 case Intrinsic::amdgcn_wqm:
1054 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1055 case Intrinsic::amdgcn_softwqm:
1056 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1057 case Intrinsic::amdgcn_strict_wwm:
1058 case Intrinsic::amdgcn_wwm:
1059 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1060 case Intrinsic::amdgcn_strict_wqm:
1061 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1062 case Intrinsic::amdgcn_writelane:
1063 return selectWritelane(I);
1064 case Intrinsic::amdgcn_div_scale:
1065 return selectDivScale(I);
1066 case Intrinsic::amdgcn_icmp:
1067 case Intrinsic::amdgcn_fcmp:
1068 if (selectImpl(I, *CoverageInfo))
1069 return true;
1070 return selectIntrinsicCmp(I);
1071 case Intrinsic::amdgcn_ballot:
1072 return selectBallot(I);
1073 case Intrinsic::amdgcn_reloc_constant:
1074 return selectRelocConstant(I);
1075 case Intrinsic::amdgcn_groupstaticsize:
1076 return selectGroupStaticSize(I);
1077 case Intrinsic::returnaddress:
1078 return selectReturnAddress(I);
1079 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1080 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1081 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1082 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1083 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1084 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1085 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1086 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1087 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1088 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1089 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1090 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1091 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1092 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1093 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1094 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1095 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1096 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1097 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1098 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1099 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1100 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1101 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1102 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1103 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1104 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1105 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1106 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1107 return selectSMFMACIntrin(I);
1108 case Intrinsic::amdgcn_permlane16_swap:
1109 case Intrinsic::amdgcn_permlane32_swap:
1110 return selectPermlaneSwapIntrin(I, IntrinsicID);
1111 default:
1112 return selectImpl(I, *CoverageInfo);
1113 }
1114}
1115
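// Return the VALU compare opcode for predicate \p P at the given bit width,
// preferring the true16/fake16 encodings when the subtarget provides them, or
// -1 if no suitable compare exists.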
1116static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1117 const GCNSubtarget &ST) {
1118 if (Size != 16 && Size != 32 && Size != 64)
1119 return -1;
1120
1121 if (Size == 16 && !ST.has16BitInsts())
1122 return -1;
1123
1124 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1125 unsigned FakeS16Opc, unsigned S32Opc,
1126 unsigned S64Opc) {
1127 if (Size == 16)
1128 return ST.hasTrue16BitInsts()
1129 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1130 : S16Opc;
1131 if (Size == 32)
1132 return S32Opc;
1133 return S64Opc;
1134 };
1135
1136 switch (P) {
1137 default:
1138 llvm_unreachable("Unknown condition code!");
1139 case CmpInst::ICMP_NE:
1140 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1141 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1142 AMDGPU::V_CMP_NE_U64_e64);
1143 case CmpInst::ICMP_EQ:
1144 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1145 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1146 AMDGPU::V_CMP_EQ_U64_e64);
1147 case CmpInst::ICMP_SGT:
1148 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1149 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1150 AMDGPU::V_CMP_GT_I64_e64);
1151 case CmpInst::ICMP_SGE:
1152 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1153 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1154 AMDGPU::V_CMP_GE_I64_e64);
1155 case CmpInst::ICMP_SLT:
1156 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1157 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1158 AMDGPU::V_CMP_LT_I64_e64);
1159 case CmpInst::ICMP_SLE:
1160 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1161 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1162 AMDGPU::V_CMP_LE_I64_e64);
1163 case CmpInst::ICMP_UGT:
1164 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1165 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1166 AMDGPU::V_CMP_GT_U64_e64);
1167 case CmpInst::ICMP_UGE:
1168 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1169 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1170 AMDGPU::V_CMP_GE_U64_e64);
1171 case CmpInst::ICMP_ULT:
1172 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1173 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1174 AMDGPU::V_CMP_LT_U64_e64);
1175 case CmpInst::ICMP_ULE:
1176 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1177 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1178 AMDGPU::V_CMP_LE_U64_e64);
1179
1180 case CmpInst::FCMP_OEQ:
1181 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1182 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1183 AMDGPU::V_CMP_EQ_F64_e64);
1184 case CmpInst::FCMP_OGT:
1185 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1186 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1187 AMDGPU::V_CMP_GT_F64_e64);
1188 case CmpInst::FCMP_OGE:
1189 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1190 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1191 AMDGPU::V_CMP_GE_F64_e64);
1192 case CmpInst::FCMP_OLT:
1193 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1194 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1195 AMDGPU::V_CMP_LT_F64_e64);
1196 case CmpInst::FCMP_OLE:
1197 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1198 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1199 AMDGPU::V_CMP_LE_F64_e64);
1200 case CmpInst::FCMP_ONE:
1201 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1202 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1203 AMDGPU::V_CMP_NEQ_F64_e64);
1204 case CmpInst::FCMP_ORD:
1205 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1206 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1207 AMDGPU::V_CMP_O_F64_e64);
1208 case CmpInst::FCMP_UNO:
1209 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1210 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1211 AMDGPU::V_CMP_U_F64_e64);
1212 case CmpInst::FCMP_UEQ:
1213 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1214 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1215 AMDGPU::V_CMP_NLG_F64_e64);
1216 case CmpInst::FCMP_UGT:
1217 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1218 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1219 AMDGPU::V_CMP_NLE_F64_e64);
1220 case CmpInst::FCMP_UGE:
1221 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1222 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1223 AMDGPU::V_CMP_NLT_F64_e64);
1224 case CmpInst::FCMP_ULT:
1225 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1226 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1227 AMDGPU::V_CMP_NGE_F64_e64);
1228 case CmpInst::FCMP_ULE:
1229 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1230 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1231 AMDGPU::V_CMP_NGT_F64_e64);
1232 case CmpInst::FCMP_UNE:
1233 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1234 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1235 AMDGPU::V_CMP_NEQ_F64_e64);
1236 case CmpInst::FCMP_TRUE:
1237 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1238 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1239 AMDGPU::V_CMP_TRU_F64_e64);
1240 case CmpInst::FCMP_FALSE:
1241 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1242 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1243 AMDGPU::V_CMP_F_F64_e64);
1244 }
1245}
1246
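// Return the scalar (S_CMP) opcode for predicate \p P at the given bit width,
// or -1 when the subtarget has no matching scalar compare (e.g. most 64-bit
// predicates, or 16-bit float compares without SALU float support).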
1247int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1248 unsigned Size) const {
1249 if (Size == 64) {
1250 if (!STI.hasScalarCompareEq64())
1251 return -1;
1252
1253 switch (P) {
1254 case CmpInst::ICMP_NE:
1255 return AMDGPU::S_CMP_LG_U64;
1256 case CmpInst::ICMP_EQ:
1257 return AMDGPU::S_CMP_EQ_U64;
1258 default:
1259 return -1;
1260 }
1261 }
1262
1263 if (Size == 32) {
1264 switch (P) {
1265 case CmpInst::ICMP_NE:
1266 return AMDGPU::S_CMP_LG_U32;
1267 case CmpInst::ICMP_EQ:
1268 return AMDGPU::S_CMP_EQ_U32;
1269 case CmpInst::ICMP_SGT:
1270 return AMDGPU::S_CMP_GT_I32;
1271 case CmpInst::ICMP_SGE:
1272 return AMDGPU::S_CMP_GE_I32;
1273 case CmpInst::ICMP_SLT:
1274 return AMDGPU::S_CMP_LT_I32;
1275 case CmpInst::ICMP_SLE:
1276 return AMDGPU::S_CMP_LE_I32;
1277 case CmpInst::ICMP_UGT:
1278 return AMDGPU::S_CMP_GT_U32;
1279 case CmpInst::ICMP_UGE:
1280 return AMDGPU::S_CMP_GE_U32;
1281 case CmpInst::ICMP_ULT:
1282 return AMDGPU::S_CMP_LT_U32;
1283 case CmpInst::ICMP_ULE:
1284 return AMDGPU::S_CMP_LE_U32;
1285 case CmpInst::FCMP_OEQ:
1286 return AMDGPU::S_CMP_EQ_F32;
1287 case CmpInst::FCMP_OGT:
1288 return AMDGPU::S_CMP_GT_F32;
1289 case CmpInst::FCMP_OGE:
1290 return AMDGPU::S_CMP_GE_F32;
1291 case CmpInst::FCMP_OLT:
1292 return AMDGPU::S_CMP_LT_F32;
1293 case CmpInst::FCMP_OLE:
1294 return AMDGPU::S_CMP_LE_F32;
1295 case CmpInst::FCMP_ONE:
1296 return AMDGPU::S_CMP_LG_F32;
1297 case CmpInst::FCMP_ORD:
1298 return AMDGPU::S_CMP_O_F32;
1299 case CmpInst::FCMP_UNO:
1300 return AMDGPU::S_CMP_U_F32;
1301 case CmpInst::FCMP_UEQ:
1302 return AMDGPU::S_CMP_NLG_F32;
1303 case CmpInst::FCMP_UGT:
1304 return AMDGPU::S_CMP_NLE_F32;
1305 case CmpInst::FCMP_UGE:
1306 return AMDGPU::S_CMP_NLT_F32;
1307 case CmpInst::FCMP_ULT:
1308 return AMDGPU::S_CMP_NGE_F32;
1309 case CmpInst::FCMP_ULE:
1310 return AMDGPU::S_CMP_NGT_F32;
1311 case CmpInst::FCMP_UNE:
1312 return AMDGPU::S_CMP_NEQ_F32;
1313 default:
1314 llvm_unreachable("Unknown condition code!");
1315 }
1316 }
1317
1318 if (Size == 16) {
1319 if (!STI.hasSALUFloatInsts())
1320 return -1;
1321
1322 switch (P) {
1323 case CmpInst::FCMP_OEQ:
1324 return AMDGPU::S_CMP_EQ_F16;
1325 case CmpInst::FCMP_OGT:
1326 return AMDGPU::S_CMP_GT_F16;
1327 case CmpInst::FCMP_OGE:
1328 return AMDGPU::S_CMP_GE_F16;
1329 case CmpInst::FCMP_OLT:
1330 return AMDGPU::S_CMP_LT_F16;
1331 case CmpInst::FCMP_OLE:
1332 return AMDGPU::S_CMP_LE_F16;
1333 case CmpInst::FCMP_ONE:
1334 return AMDGPU::S_CMP_LG_F16;
1335 case CmpInst::FCMP_ORD:
1336 return AMDGPU::S_CMP_O_F16;
1337 case CmpInst::FCMP_UNO:
1338 return AMDGPU::S_CMP_U_F16;
1339 case CmpInst::FCMP_UEQ:
1340 return AMDGPU::S_CMP_NLG_F16;
1341 case CmpInst::FCMP_UGT:
1342 return AMDGPU::S_CMP_NLE_F16;
1343 case CmpInst::FCMP_UGE:
1344 return AMDGPU::S_CMP_NLT_F16;
1345 case CmpInst::FCMP_ULT:
1346 return AMDGPU::S_CMP_NGE_F16;
1347 case CmpInst::FCMP_ULE:
1348 return AMDGPU::S_CMP_NGT_F16;
1349 case CmpInst::FCMP_UNE:
1350 return AMDGPU::S_CMP_NEQ_F16;
1351 default:
1352 llvm_unreachable("Unknown condition code!");
1353 }
1354 }
1355
1356 return -1;
1357}
1358
1359bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1360
1361 MachineBasicBlock *BB = I.getParent();
1362 const DebugLoc &DL = I.getDebugLoc();
1363
1364 Register SrcReg = I.getOperand(2).getReg();
1365 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1366
1367 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1368
1369 Register CCReg = I.getOperand(0).getReg();
1370 if (!isVCC(CCReg, *MRI)) {
1371 int Opcode = getS_CMPOpcode(Pred, Size);
1372 if (Opcode == -1)
1373 return false;
1374 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1375 .add(I.getOperand(2))
1376 .add(I.getOperand(3));
1377 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1378 .addReg(AMDGPU::SCC);
1379 bool Ret =
1380 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1381 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1382 I.eraseFromParent();
1383 return Ret;
1384 }
1385
1386 if (I.getOpcode() == AMDGPU::G_FCMP)
1387 return false;
1388
1389 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1390 if (Opcode == -1)
1391 return false;
1392
1393 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1394 I.getOperand(0).getReg())
1395 .add(I.getOperand(2))
1396 .add(I.getOperand(3));
1397 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1398 *TRI.getBoolRC(), *MRI);
1399 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1400 I.eraseFromParent();
1401 return Ret;
1402}
1403
1404bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1405 Register Dst = I.getOperand(0).getReg();
1406 if (isVCC(Dst, *MRI))
1407 return false;
1408
1409 LLT DstTy = MRI->getType(Dst);
1410 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1411 return false;
1412
1413 MachineBasicBlock *BB = I.getParent();
1414 const DebugLoc &DL = I.getDebugLoc();
1415 Register SrcReg = I.getOperand(2).getReg();
1416 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1417
1418 // i1 inputs are not supported in GlobalISel.
1419 if (Size == 1)
1420 return false;
1421
1422 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1423 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1424 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1425 I.eraseFromParent();
1426 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1427 }
1428
1429 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1430 if (Opcode == -1)
1431 return false;
1432
1433 MachineInstrBuilder SelectedMI;
1434 MachineOperand &LHS = I.getOperand(2);
1435 MachineOperand &RHS = I.getOperand(3);
1436 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1437 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1438 Register Src0Reg =
1439 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1440 Register Src1Reg =
1441 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1442 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1443 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1444 SelectedMI.addImm(Src0Mods);
1445 SelectedMI.addReg(Src0Reg);
1446 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1447 SelectedMI.addImm(Src1Mods);
1448 SelectedMI.addReg(Src1Reg);
1449 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1450 SelectedMI.addImm(0); // clamp
1451 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1452 SelectedMI.addImm(0); // op_sel
1453
1454 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1455 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1456 return false;
1457
1458 I.eraseFromParent();
1459 return true;
1460}
1461
1462// Ballot has to zero bits in the input lane mask that are zero in the current
1463// exec; this is done as an AND with exec. For inputs produced by an instruction
1464// that implicitly uses the same exec (for example a compare in the same basic
1465// block, or an SCC to VCC copy), a plain copy is used instead.
1466static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1467 MachineBasicBlock *MBB) {
1468 MachineInstr *MI = MRI.getVRegDef(Reg);
1469 if (MI->getParent() != MBB)
1470 return false;
1471
1472 // Lane mask generated by SCC to VCC copy.
1473 if (MI->getOpcode() == AMDGPU::COPY) {
1474 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1475 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1476 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1477 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1478 return true;
1479 }
1480
1481 // Lane mask generated using compare with same exec.
1482 if (isa<GAnyCmp>(MI))
1483 return true;
1484
1485 Register LHS, RHS;
1486 // Look through AND.
1487 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1488 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1489 isLaneMaskFromSameBlock(RHS, MRI, MBB);
1490
1491 return false;
1492}
1493
1494bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1495 MachineBasicBlock *BB = I.getParent();
1496 const DebugLoc &DL = I.getDebugLoc();
1497 Register DstReg = I.getOperand(0).getReg();
1498 Register SrcReg = I.getOperand(2).getReg();
1499 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1500 const unsigned WaveSize = STI.getWavefrontSize();
1501
1502 // In the common case, the return type matches the wave size.
1503 // However we also support emitting i64 ballots in wave32 mode.
1504 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1505 return false;
1506
1507 std::optional<ValueAndVReg> Arg =
1508 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1509
1510 Register Dst = DstReg;
1511 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1512 if (BallotSize != WaveSize) {
1513 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1514 }
1515
1516 if (Arg) {
1517 const int64_t Value = Arg->Value.getZExtValue();
1518 if (Value == 0) {
1519 // Dst = S_MOV 0
1520 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1521 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1522 } else {
1523 // Dst = COPY EXEC
1524 assert(Value == 1);
1525 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1526 }
1527 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1528 return false;
1529 } else {
1530 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1531 // Dst = COPY SrcReg
1532 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1533 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1534 return false;
1535 } else {
1536 // Dst = S_AND SrcReg, EXEC
1537 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1538 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1539 .addReg(SrcReg)
1540 .addReg(TRI.getExec())
1541 .setOperandDead(3); // Dead scc
1542 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1543 return false;
1544 }
1545 }
1546
1547 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1548 if (BallotSize != WaveSize) {
1549 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1550 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1551 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1552 .addReg(Dst)
1553 .addImm(AMDGPU::sub0)
1554 .addReg(HiReg)
1555 .addImm(AMDGPU::sub1);
1556 }
1557
1558 I.eraseFromParent();
1559 return true;
1560}
1561
1562bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1563 Register DstReg = I.getOperand(0).getReg();
1564 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1565 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1566 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1567 return false;
1568
1569 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1570
1571 Module *M = MF->getFunction().getParent();
1572 const MDNode *Metadata = I.getOperand(2).getMetadata();
1573 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1574 auto *RelocSymbol = cast<GlobalVariable>(
1575 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1576
1577 MachineBasicBlock *BB = I.getParent();
1578 BuildMI(*BB, &I, I.getDebugLoc(),
1579 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1580 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1581
1582 I.eraseFromParent();
1583 return true;
1584}
1585
1586bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1587 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1588
1589 Register DstReg = I.getOperand(0).getReg();
1590 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1591 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1592 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1593
1594 MachineBasicBlock *MBB = I.getParent();
1595 const DebugLoc &DL = I.getDebugLoc();
1596
1597 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1598
1599 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1600 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1601 MIB.addImm(MFI->getLDSSize());
1602 } else {
1603 Module *M = MF->getFunction().getParent();
1604 const GlobalValue *GV =
1605 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1606 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1607 }
1608
1609 I.eraseFromParent();
1610 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1611}
1612
1613bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1614 MachineBasicBlock *MBB = I.getParent();
1615  MachineFunction &MF = *MBB->getParent();
1616  const DebugLoc &DL = I.getDebugLoc();
1617
1618 MachineOperand &Dst = I.getOperand(0);
1619 Register DstReg = Dst.getReg();
1620 unsigned Depth = I.getOperand(2).getImm();
1621
1622 const TargetRegisterClass *RC
1623 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1624 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1625 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1626 return false;
1627
1628 // Check for kernel and shader functions
1629  if (Depth != 0 ||
1630      MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1631 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1632 .addImm(0);
1633 I.eraseFromParent();
1634 return true;
1635 }
1636
1637  MachineFrameInfo &MFI = MF.getFrameInfo();
1638  // There is a call to @llvm.returnaddress in this function
1639 MFI.setReturnAddressIsTaken(true);
1640
1641 // Get the return address reg and mark it as an implicit live-in
1642 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1643 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1644 AMDGPU::SReg_64RegClass, DL);
1645 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1646 .addReg(LiveIn);
1647 I.eraseFromParent();
1648 return true;
1649}
1650
1651bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1652 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1653 // SelectionDAG uses for wave32 vs wave64.
1654 MachineBasicBlock *BB = MI.getParent();
1655 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1656 .add(MI.getOperand(1));
1657
1658 Register Reg = MI.getOperand(1).getReg();
1659 MI.eraseFromParent();
1660
1661 if (!MRI->getRegClassOrNull(Reg))
1662 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1663 return true;
1664}
1665
1666bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1667 MachineInstr &MI, Intrinsic::ID IntrID) const {
1668 MachineBasicBlock *MBB = MI.getParent();
1669  MachineFunction *MF = MBB->getParent();
1670  const DebugLoc &DL = MI.getDebugLoc();
1671
1672 unsigned IndexOperand = MI.getOperand(7).getImm();
1673 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1674 bool WaveDone = MI.getOperand(9).getImm() != 0;
1675
1676 if (WaveDone && !WaveRelease)
1677 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1678
1679 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1680 IndexOperand &= ~0x3f;
1681 unsigned CountDw = 0;
1682
1683  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1684    CountDw = (IndexOperand >> 24) & 0xf;
1685 IndexOperand &= ~(0xf << 24);
1686
1687 if (CountDw < 1 || CountDw > 4) {
1688      report_fatal_error(
1689          "ds_ordered_count: dword count must be between 1 and 4");
1690 }
1691 }
1692
1693 if (IndexOperand)
1694 report_fatal_error("ds_ordered_count: bad index operand");
1695
1696 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1697 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1698
1699 unsigned Offset0 = OrderedCountIndex << 2;
1700 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1701
1702  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1703    Offset1 |= (CountDw - 1) << 6;
1704
1705  if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1706    Offset1 |= ShaderType << 2;
1707
1708 unsigned Offset = Offset0 | (Offset1 << 8);
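  // Illustrative packing (GFX10, pre-GFX11, assuming ShaderType == 0): for
  // amdgcn_ds_ordered_add with index operand 0x01000000 (ordered-count index
  // 0, dword count 1), wave_release = 1 and wave_done = 0:
  //   Offset0 = 0 << 2                                                 = 0
  //   Offset1 = 1 | (0 << 1) | (0 << 4) | ((1 - 1) << 6) | (0 << 2)    = 1
  //   Offset  = 0 | (1 << 8)                                           = 0x100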
1709
1710 Register M0Val = MI.getOperand(2).getReg();
1711 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1712 .addReg(M0Val);
1713
1714 Register DstReg = MI.getOperand(0).getReg();
1715 Register ValReg = MI.getOperand(3).getReg();
1716  MachineInstrBuilder DS =
1717    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1718 .addReg(ValReg)
1719 .addImm(Offset)
1720 .cloneMemRefs(MI);
1721
1722 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1723 return false;
1724
1725 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1726 MI.eraseFromParent();
1727 return Ret;
1728}
1729
1730static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1731 switch (IntrID) {
1732 case Intrinsic::amdgcn_ds_gws_init:
1733 return AMDGPU::DS_GWS_INIT;
1734 case Intrinsic::amdgcn_ds_gws_barrier:
1735 return AMDGPU::DS_GWS_BARRIER;
1736 case Intrinsic::amdgcn_ds_gws_sema_v:
1737 return AMDGPU::DS_GWS_SEMA_V;
1738 case Intrinsic::amdgcn_ds_gws_sema_br:
1739 return AMDGPU::DS_GWS_SEMA_BR;
1740 case Intrinsic::amdgcn_ds_gws_sema_p:
1741 return AMDGPU::DS_GWS_SEMA_P;
1742 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1743 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1744 default:
1745 llvm_unreachable("not a gws intrinsic");
1746 }
1747}
1748
1749bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1750 Intrinsic::ID IID) const {
1751 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1752 !STI.hasGWSSemaReleaseAll()))
1753 return false;
1754
1755 // intrinsic ID, vsrc, offset
1756 const bool HasVSrc = MI.getNumOperands() == 3;
1757 assert(HasVSrc || MI.getNumOperands() == 2);
1758
1759 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1760 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1761 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1762 return false;
1763
1764 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1765 unsigned ImmOffset;
1766
1767 MachineBasicBlock *MBB = MI.getParent();
1768 const DebugLoc &DL = MI.getDebugLoc();
1769
1770 MachineInstr *Readfirstlane = nullptr;
1771
1772 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1773 // incoming offset, in case there's an add of a constant. We'll have to put it
1774 // back later.
1775 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1776 Readfirstlane = OffsetDef;
1777 BaseOffset = OffsetDef->getOperand(1).getReg();
1778 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1779 }
1780
1781 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1782 // If we have a constant offset, try to use the 0 in m0 as the base.
1783 // TODO: Look into changing the default m0 initialization value. If the
1784 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1785 // the immediate offset.
1786
1787 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1788 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1789 .addImm(0);
1790 } else {
1791 std::tie(BaseOffset, ImmOffset) =
1792 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1793
1794 if (Readfirstlane) {
1795 // We have the constant offset now, so put the readfirstlane back on the
1796 // variable component.
1797 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1798 return false;
1799
1800 Readfirstlane->getOperand(1).setReg(BaseOffset);
1801 BaseOffset = Readfirstlane->getOperand(0).getReg();
1802 } else {
1803 if (!RBI.constrainGenericRegister(BaseOffset,
1804 AMDGPU::SReg_32RegClass, *MRI))
1805 return false;
1806 }
1807
1808 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1809 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1810 .addReg(BaseOffset)
1811 .addImm(16)
1812 .setOperandDead(3); // Dead scc
1813
1814 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1815 .addReg(M0Base);
1816 }
1817
1818 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1819 // offset field) % 64. Some versions of the programming guide omit the m0
1820 // part, or claim it's from offset 0.
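  // For example (illustrative): a constant offset of 10 takes the branch
  // above that leaves M0 = 0 and passes ImmOffset = 10 directly; an offset of
  // the form (%v + 10), with %v a hypothetical variable register, shifts %v
  // into M0[21:16] via S_LSHL_B32 %v, 16 and still passes ImmOffset = 10, so
  // both forms contribute the same (base + M0[21:16] + offset) sum.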
1821 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1822
1823 if (HasVSrc) {
1824 Register VSrc = MI.getOperand(1).getReg();
1825 MIB.addReg(VSrc);
1826
1827 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1828 return false;
1829 }
1830
1831 MIB.addImm(ImmOffset)
1832 .cloneMemRefs(MI);
1833
1834 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1835
1836 MI.eraseFromParent();
1837 return true;
1838}
1839
1840bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1841 bool IsAppend) const {
1842 Register PtrBase = MI.getOperand(2).getReg();
1843 LLT PtrTy = MRI->getType(PtrBase);
1844 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1845
1846 unsigned Offset;
1847 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1848
1849 // TODO: Should this try to look through readfirstlane like GWS?
1850 if (!isDSOffsetLegal(PtrBase, Offset)) {
1851 PtrBase = MI.getOperand(2).getReg();
1852 Offset = 0;
1853 }
1854
1855 MachineBasicBlock *MBB = MI.getParent();
1856 const DebugLoc &DL = MI.getDebugLoc();
1857 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1858
1859 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1860 .addReg(PtrBase);
1861 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1862 return false;
1863
1864 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1865 .addImm(Offset)
1866 .addImm(IsGDS ? -1 : 0)
1867 .cloneMemRefs(MI);
1868 MI.eraseFromParent();
1869 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1870}
1871
1872bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
1873 MachineFunction *MF = MI.getParent()->getParent();
1874  SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1875
1876 MFInfo->setInitWholeWave();
1877 return selectImpl(MI, *CoverageInfo);
1878}
1879
1880bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1881 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1882  if (TM.getOptLevel() > CodeGenOptLevel::None) {
1883    unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1884 if (WGSize <= STI.getWavefrontSize()) {
1885 // If the workgroup fits in a wave, remove s_barrier_signal and lower
1886 // s_barrier/s_barrier_wait to wave_barrier.
1887 if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
1888 IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
1889 MachineBasicBlock *MBB = MI.getParent();
1890 const DebugLoc &DL = MI.getDebugLoc();
1891 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1892 }
1893 MI.eraseFromParent();
1894 return true;
1895 }
1896 }
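  // Illustrative case for the block above (assuming a flat workgroup size of
  // 32 on a wave32 target): the whole workgroup runs as a single wave, so
  // amdgcn_s_barrier and amdgcn_s_barrier_wait degrade to WAVE_BARRIER (a
  // scheduling barrier only) and amdgcn_s_barrier_signal is simply erased.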
1897
1898 if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
1899 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1900 MachineBasicBlock *MBB = MI.getParent();
1901 const DebugLoc &DL = MI.getDebugLoc();
1902    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1903        .addImm(AMDGPU::Barrier::WORKGROUP);
1904    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1905        .addImm(AMDGPU::Barrier::WORKGROUP);
1906 MI.eraseFromParent();
1907 return true;
1908 }
1909
1910 return selectImpl(MI, *CoverageInfo);
1911}
1912
1913static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1914 bool &IsTexFail) {
1915 if (TexFailCtrl)
1916 IsTexFail = true;
1917
1918 TFE = (TexFailCtrl & 0x1) ? true : false;
1919 TexFailCtrl &= ~(uint64_t)0x1;
1920 LWE = (TexFailCtrl & 0x2) ? true : false;
1921 TexFailCtrl &= ~(uint64_t)0x2;
1922
1923 return TexFailCtrl == 0;
1924}
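// Illustrative decode: TexFailCtrl == 0x3 yields TFE = true, LWE = true and
// IsTexFail = true, and the function returns true because no unknown bits
// remain; TexFailCtrl == 0x4 would return false since an unsupported bit is
// set.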
1925
1926bool AMDGPUInstructionSelector::selectImageIntrinsic(
1927    MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1928  MachineBasicBlock *MBB = MI.getParent();
1929 const DebugLoc &DL = MI.getDebugLoc();
1930
1931 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1932      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1933
1934 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1935 unsigned IntrOpcode = Intr->BaseOpcode;
1936 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1937 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1938 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1939
1940 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1941
1942 Register VDataIn, VDataOut;
1943 LLT VDataTy;
1944 int NumVDataDwords = -1;
1945 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1946 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1947
1948 bool Unorm;
1949 if (!BaseOpcode->Sampler)
1950 Unorm = true;
1951 else
1952 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1953
1954 bool TFE;
1955 bool LWE;
1956 bool IsTexFail = false;
1957 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1958 TFE, LWE, IsTexFail))
1959 return false;
1960
1961 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1962 const bool IsA16 = (Flags & 1) != 0;
1963 const bool IsG16 = (Flags & 2) != 0;
1964
1965 // A16 implies 16 bit gradients if subtarget doesn't support G16
1966 if (IsA16 && !STI.hasG16() && !IsG16)
1967 return false;
1968
1969 unsigned DMask = 0;
1970 unsigned DMaskLanes = 0;
1971
1972 if (BaseOpcode->Atomic) {
1973 VDataOut = MI.getOperand(0).getReg();
1974 VDataIn = MI.getOperand(2).getReg();
1975 LLT Ty = MRI->getType(VDataIn);
1976
1977 // Be careful to allow atomic swap on 16-bit element vectors.
1978 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1979 Ty.getSizeInBits() == 128 :
1980 Ty.getSizeInBits() == 64;
1981
1982 if (BaseOpcode->AtomicX2) {
1983 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1984
1985 DMask = Is64Bit ? 0xf : 0x3;
1986 NumVDataDwords = Is64Bit ? 4 : 2;
1987 } else {
1988 DMask = Is64Bit ? 0x3 : 0x1;
1989 NumVDataDwords = Is64Bit ? 2 : 1;
1990 }
1991 } else {
1992 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1993 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1994
1995 if (BaseOpcode->Store) {
1996 VDataIn = MI.getOperand(1).getReg();
1997 VDataTy = MRI->getType(VDataIn);
1998 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1999 } else if (BaseOpcode->NoReturn) {
2000 NumVDataDwords = 0;
2001 } else {
2002 VDataOut = MI.getOperand(0).getReg();
2003 VDataTy = MRI->getType(VDataOut);
2004 NumVDataDwords = DMaskLanes;
2005
2006 if (IsD16 && !STI.hasUnpackedD16VMem())
2007 NumVDataDwords = (DMaskLanes + 1) / 2;
2008 }
2009 }
2010
2011 // Set G16 opcode
2012 if (Subtarget->hasG16() && IsG16) {
2013 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2014        AMDGPU::getMIMGG16MappingInfo(IntrOpcode);
2015    assert(G16MappingInfo);
2016 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2017 }
2018
2019 // TODO: Check this in verifier.
2020 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2021
2022 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2023 if (BaseOpcode->Atomic)
2024 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
2025 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2026               AMDGPU::CPol::VOLATILE))
2027    return false;
2028
2029 int NumVAddrRegs = 0;
2030 int NumVAddrDwords = 0;
2031 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2032 // Skip the $noregs and 0s inserted during legalization.
2033 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2034 if (!AddrOp.isReg())
2035 continue; // XXX - Break?
2036
2037 Register Addr = AddrOp.getReg();
2038 if (!Addr)
2039 break;
2040
2041 ++NumVAddrRegs;
2042 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2043 }
2044
2045 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2046 // NSA, these should have been packed into a single value in the first
2047 // address register
2048 const bool UseNSA =
2049 NumVAddrRegs != 1 &&
2050 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2051 : NumVAddrDwords == NumVAddrRegs);
2052 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2053 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2054 return false;
2055 }
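  // Example of the NSA decision above (illustrative): three separate 32-bit
  // address registers give NumVAddrRegs == NumVAddrDwords == 3, so UseNSA is
  // true on targets with (partial) NSA encoding; a single packed 96-bit
  // address register gives NumVAddrRegs == 1 and the non-NSA encoding is
  // used.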
2056
2057 if (IsTexFail)
2058 ++NumVDataDwords;
2059
2060 int Opcode = -1;
2061 if (IsGFX12Plus) {
2062 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2063 NumVDataDwords, NumVAddrDwords);
2064 } else if (IsGFX11Plus) {
2065 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2066 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2067 : AMDGPU::MIMGEncGfx11Default,
2068 NumVDataDwords, NumVAddrDwords);
2069 } else if (IsGFX10Plus) {
2070 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2071 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2072 : AMDGPU::MIMGEncGfx10Default,
2073 NumVDataDwords, NumVAddrDwords);
2074 } else {
2075 if (Subtarget->hasGFX90AInsts()) {
2076 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2077 NumVDataDwords, NumVAddrDwords);
2078 if (Opcode == -1) {
2079 LLVM_DEBUG(
2080 dbgs()
2081 << "requested image instruction is not supported on this GPU\n");
2082 return false;
2083 }
2084 }
2085 if (Opcode == -1 &&
2086        STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2087      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2088 NumVDataDwords, NumVAddrDwords);
2089 if (Opcode == -1)
2090 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2091 NumVDataDwords, NumVAddrDwords);
2092 }
2093 if (Opcode == -1)
2094 return false;
2095
2096 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2097 .cloneMemRefs(MI);
2098
2099 if (VDataOut) {
2100 if (BaseOpcode->AtomicX2) {
2101 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2102
2103 Register TmpReg = MRI->createVirtualRegister(
2104 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2105 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2106
2107 MIB.addDef(TmpReg);
2108 if (!MRI->use_empty(VDataOut)) {
2109 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2110 .addReg(TmpReg, RegState::Kill, SubReg);
2111 }
2112
2113 } else {
2114 MIB.addDef(VDataOut); // vdata output
2115 }
2116 }
2117
2118 if (VDataIn)
2119 MIB.addReg(VDataIn); // vdata input
2120
2121 for (int I = 0; I != NumVAddrRegs; ++I) {
2122 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2123 if (SrcOp.isReg()) {
2124 assert(SrcOp.getReg() != 0);
2125 MIB.addReg(SrcOp.getReg());
2126 }
2127 }
2128
2129 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2130 if (BaseOpcode->Sampler)
2131 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2132
2133 MIB.addImm(DMask); // dmask
2134
2135 if (IsGFX10Plus)
2136 MIB.addImm(DimInfo->Encoding);
2137 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2138 MIB.addImm(Unorm);
2139
2140 MIB.addImm(CPol);
2141 MIB.addImm(IsA16 && // a16 or r128
2142 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2143 if (IsGFX10Plus)
2144 MIB.addImm(IsA16 ? -1 : 0);
2145
2146 if (!Subtarget->hasGFX90AInsts()) {
2147 MIB.addImm(TFE); // tfe
2148 } else if (TFE) {
2149 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2150 return false;
2151 }
2152
2153 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2154 MIB.addImm(LWE); // lwe
2155 if (!IsGFX10Plus)
2156 MIB.addImm(DimInfo->DA ? -1 : 0);
2157 if (BaseOpcode->HasD16)
2158 MIB.addImm(IsD16 ? -1 : 0);
2159
2160 MI.eraseFromParent();
2161 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2162 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2163 return true;
2164}
2165
2166// We need to handle this here because tablegen doesn't support matching
2167// instructions with multiple outputs.
2168bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2169 MachineInstr &MI) const {
2170 Register Dst0 = MI.getOperand(0).getReg();
2171 Register Dst1 = MI.getOperand(1).getReg();
2172
2173 const DebugLoc &DL = MI.getDebugLoc();
2174 MachineBasicBlock *MBB = MI.getParent();
2175
2176 Register Addr = MI.getOperand(3).getReg();
2177 Register Data0 = MI.getOperand(4).getReg();
2178 Register Data1 = MI.getOperand(5).getReg();
2179 unsigned Offset = MI.getOperand(6).getImm();
2180
2181 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2182 .addDef(Dst1)
2183 .addUse(Addr)
2184 .addUse(Data0)
2185 .addUse(Data1)
2186 .addImm(Offset)
2187 .cloneMemRefs(MI);
2188
2189 MI.eraseFromParent();
2190 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2191}
2192
2193bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2194 MachineInstr &I) const {
2195 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2196 switch (IntrinsicID) {
2197 case Intrinsic::amdgcn_end_cf:
2198 return selectEndCfIntrinsic(I);
2199 case Intrinsic::amdgcn_ds_ordered_add:
2200 case Intrinsic::amdgcn_ds_ordered_swap:
2201 return selectDSOrderedIntrinsic(I, IntrinsicID);
2202 case Intrinsic::amdgcn_ds_gws_init:
2203 case Intrinsic::amdgcn_ds_gws_barrier:
2204 case Intrinsic::amdgcn_ds_gws_sema_v:
2205 case Intrinsic::amdgcn_ds_gws_sema_br:
2206 case Intrinsic::amdgcn_ds_gws_sema_p:
2207 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2208 return selectDSGWSIntrinsic(I, IntrinsicID);
2209 case Intrinsic::amdgcn_ds_append:
2210 return selectDSAppendConsume(I, true);
2211 case Intrinsic::amdgcn_ds_consume:
2212 return selectDSAppendConsume(I, false);
2213 case Intrinsic::amdgcn_init_whole_wave:
2214 return selectInitWholeWave(I);
2215 case Intrinsic::amdgcn_s_barrier:
2216 case Intrinsic::amdgcn_s_barrier_signal:
2217 case Intrinsic::amdgcn_s_barrier_wait:
2218 return selectSBarrier(I);
2219 case Intrinsic::amdgcn_raw_buffer_load_lds:
2220 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2221 case Intrinsic::amdgcn_struct_buffer_load_lds:
2222 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2223 return selectBufferLoadLds(I);
2224 case Intrinsic::amdgcn_global_load_lds:
2225 return selectGlobalLoadLds(I);
2226 case Intrinsic::amdgcn_exp_compr:
2227 if (!STI.hasCompressedExport()) {
2228 Function &F = I.getMF()->getFunction();
2229      DiagnosticInfoUnsupported NoFpRet(
2230          F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2231 F.getContext().diagnose(NoFpRet);
2232 return false;
2233 }
2234 break;
2235 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2236 return selectDSBvhStackIntrinsic(I);
2237 case Intrinsic::amdgcn_s_barrier_init:
2238 case Intrinsic::amdgcn_s_barrier_signal_var:
2239 return selectNamedBarrierInit(I, IntrinsicID);
2240 case Intrinsic::amdgcn_s_barrier_join:
2241 case Intrinsic::amdgcn_s_wakeup_barrier:
2242 case Intrinsic::amdgcn_s_get_named_barrier_state:
2243 return selectNamedBarrierInst(I, IntrinsicID);
2244 case Intrinsic::amdgcn_s_get_barrier_state:
2245 return selectSGetBarrierState(I, IntrinsicID);
2246 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2247 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2248 }
2249 return selectImpl(I, *CoverageInfo);
2250}
2251
2252bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2253 if (selectImpl(I, *CoverageInfo))
2254 return true;
2255
2256 MachineBasicBlock *BB = I.getParent();
2257 const DebugLoc &DL = I.getDebugLoc();
2258
2259 Register DstReg = I.getOperand(0).getReg();
2260 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2261 assert(Size <= 32 || Size == 64);
2262 const MachineOperand &CCOp = I.getOperand(1);
2263 Register CCReg = CCOp.getReg();
2264 if (!isVCC(CCReg, *MRI)) {
2265 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2266 AMDGPU::S_CSELECT_B32;
2267 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2268 .addReg(CCReg);
2269
2270      // The generic constrainSelectedInstRegOperands doesn't work for the scc
2271      // register bank, because it does not cover the register class we use to
2272      // represent scc. So we need to manually set the register class here.
2273 if (!MRI->getRegClassOrNull(CCReg))
2274 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2275 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2276 .add(I.getOperand(2))
2277 .add(I.getOperand(3));
2278
2279 bool Ret = false;
2280 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2281 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2282 I.eraseFromParent();
2283 return Ret;
2284 }
2285
2286 // Wide VGPR select should have been split in RegBankSelect.
2287 if (Size > 32)
2288 return false;
2289
2290  MachineInstr *Select =
2291      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2292 .addImm(0)
2293 .add(I.getOperand(3))
2294 .addImm(0)
2295 .add(I.getOperand(2))
2296 .add(I.getOperand(1));
2297
2298 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2299 I.eraseFromParent();
2300 return Ret;
2301}
2302
2303bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2304 Register DstReg = I.getOperand(0).getReg();
2305 Register SrcReg = I.getOperand(1).getReg();
2306 const LLT DstTy = MRI->getType(DstReg);
2307 const LLT SrcTy = MRI->getType(SrcReg);
2308 const LLT S1 = LLT::scalar(1);
2309
2310 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2311 const RegisterBank *DstRB;
2312 if (DstTy == S1) {
2313 // This is a special case. We don't treat s1 for legalization artifacts as
2314 // vcc booleans.
2315 DstRB = SrcRB;
2316 } else {
2317 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2318 if (SrcRB != DstRB)
2319 return false;
2320 }
2321
2322 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2323
2324 unsigned DstSize = DstTy.getSizeInBits();
2325 unsigned SrcSize = SrcTy.getSizeInBits();
2326
2327 const TargetRegisterClass *SrcRC =
2328 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2329 const TargetRegisterClass *DstRC =
2330 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2331 if (!SrcRC || !DstRC)
2332 return false;
2333
2334 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2335 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2336 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2337 return false;
2338 }
2339
2340 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2342 const DebugLoc &DL = I.getDebugLoc();
2343 MachineBasicBlock *MBB = I.getParent();
2344 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2345 .addReg(SrcReg, 0, AMDGPU::lo16);
2346 I.eraseFromParent();
2347 return true;
2348 }
2349
2350 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2351 MachineBasicBlock *MBB = I.getParent();
2352 const DebugLoc &DL = I.getDebugLoc();
2353
2354 Register LoReg = MRI->createVirtualRegister(DstRC);
2355 Register HiReg = MRI->createVirtualRegister(DstRC);
2356 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2357 .addReg(SrcReg, 0, AMDGPU::sub0);
2358 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2359 .addReg(SrcReg, 0, AMDGPU::sub1);
2360
2361 if (IsVALU && STI.hasSDWA()) {
2362 // Write the low 16-bits of the high element into the high 16-bits of the
2363 // low element.
2364 MachineInstr *MovSDWA =
2365 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2366 .addImm(0) // $src0_modifiers
2367 .addReg(HiReg) // $src0
2368 .addImm(0) // $clamp
2369 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2370 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2371 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2372 .addReg(LoReg, RegState::Implicit);
2373 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2374 } else {
2375 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2376 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2377 Register ImmReg = MRI->createVirtualRegister(DstRC);
2378 if (IsVALU) {
2379 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2380 .addImm(16)
2381 .addReg(HiReg);
2382 } else {
2383 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2384 .addReg(HiReg)
2385 .addImm(16)
2386 .setOperandDead(3); // Dead scc
2387 }
2388
2389 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2390 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2391 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2392
2393 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2394 .addImm(0xffff);
2395 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2396 .addReg(LoReg)
2397 .addReg(ImmReg);
2398 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2399 .addReg(TmpReg0)
2400 .addReg(TmpReg1);
2401
2402 if (!IsVALU) {
2403 And.setOperandDead(3); // Dead scc
2404 Or.setOperandDead(3); // Dead scc
2405 }
2406 }
2407
2408 I.eraseFromParent();
2409 return true;
2410 }
2411
2412 if (!DstTy.isScalar())
2413 return false;
2414
2415 if (SrcSize > 32) {
2416 unsigned SubRegIdx =
2417 DstSize < 32 ? AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32);
2418 if (SubRegIdx == AMDGPU::NoSubRegister)
2419 return false;
2420
2421 // Deal with weird cases where the class only partially supports the subreg
2422 // index.
2423 const TargetRegisterClass *SrcWithSubRC
2424 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2425 if (!SrcWithSubRC)
2426 return false;
2427
2428 if (SrcWithSubRC != SrcRC) {
2429 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2430 return false;
2431 }
2432
2433 I.getOperand(1).setSubReg(SubRegIdx);
2434 }
2435
2436 I.setDesc(TII.get(TargetOpcode::COPY));
2437 return true;
2438}
2439
2440/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2441static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2442 Mask = maskTrailingOnes<unsigned>(Size);
2443 int SignedMask = static_cast<int>(Mask);
2444 return SignedMask >= -16 && SignedMask <= 64;
2445}
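// Illustrative values: Size == 4 gives Mask == 0xf (15), which fits the
// [-16, 64] inline-constant range, so the AND form is preferred; Size == 16
// gives Mask == 0xffff (65535), which does not, so the callers fall back to
// a BFE.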
2446
2447// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2448const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2449 Register Reg, const MachineRegisterInfo &MRI,
2450 const TargetRegisterInfo &TRI) const {
2451 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2452 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2453 return RB;
2454
2455 // Ignore the type, since we don't use vcc in artifacts.
2456 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2457 return &RBI.getRegBankFromRegClass(*RC, LLT());
2458 return nullptr;
2459}
2460
2461bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2462 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2463 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2464 const DebugLoc &DL = I.getDebugLoc();
2465 MachineBasicBlock &MBB = *I.getParent();
2466 const Register DstReg = I.getOperand(0).getReg();
2467 const Register SrcReg = I.getOperand(1).getReg();
2468
2469 const LLT DstTy = MRI->getType(DstReg);
2470 const LLT SrcTy = MRI->getType(SrcReg);
2471 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2472 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2473 const unsigned DstSize = DstTy.getSizeInBits();
2474 if (!DstTy.isScalar())
2475 return false;
2476
2477 // Artifact casts should never use vcc.
2478 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2479
2480 // FIXME: This should probably be illegal and split earlier.
2481 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2482 if (DstSize <= 32)
2483 return selectCOPY(I);
2484
2485 const TargetRegisterClass *SrcRC =
2486 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2487 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2488 const TargetRegisterClass *DstRC =
2489 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2490
2491 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2492 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2493 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2494 .addReg(SrcReg)
2495 .addImm(AMDGPU::sub0)
2496 .addReg(UndefReg)
2497 .addImm(AMDGPU::sub1);
2498 I.eraseFromParent();
2499
2500 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2501 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2502 }
2503
2504 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2505 // 64-bit should have been split up in RegBankSelect
2506
2507 // Try to use an and with a mask if it will save code size.
2508 unsigned Mask;
2509 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2510 MachineInstr *ExtI =
2511 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2512 .addImm(Mask)
2513 .addReg(SrcReg);
2514 I.eraseFromParent();
2515 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2516 }
2517
2518 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2519 MachineInstr *ExtI =
2520 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2521 .addReg(SrcReg)
2522 .addImm(0) // Offset
2523 .addImm(SrcSize); // Width
2524 I.eraseFromParent();
2525 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2526 }
2527
2528 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2529 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2530 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2531 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2532 return false;
2533
2534 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2535 const unsigned SextOpc = SrcSize == 8 ?
2536 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2537 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2538 .addReg(SrcReg);
2539 I.eraseFromParent();
2540 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2541 }
2542
2543 // Using a single 32-bit SALU to calculate the high half is smaller than
2544 // S_BFE with a literal constant operand.
2545 if (DstSize > 32 && SrcSize == 32) {
2546 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2547 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2548 if (Signed) {
2549 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2550 .addReg(SrcReg, 0, SubReg)
2551 .addImm(31)
2552 .setOperandDead(3); // Dead scc
2553 } else {
2554 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2555 .addImm(0);
2556 }
2557 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2558 .addReg(SrcReg, 0, SubReg)
2559 .addImm(AMDGPU::sub0)
2560 .addReg(HiReg)
2561 .addImm(AMDGPU::sub1);
2562 I.eraseFromParent();
2563 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2564 *MRI);
2565 }
2566
2567 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2568 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2569
2570 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2571 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2572 // We need a 64-bit register source, but the high bits don't matter.
2573 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2574 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2575 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2576
2577 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2578 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2579 .addReg(SrcReg, 0, SubReg)
2580 .addImm(AMDGPU::sub0)
2581 .addReg(UndefReg)
2582 .addImm(AMDGPU::sub1);
2583
2584 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2585 .addReg(ExtReg)
2586 .addImm(SrcSize << 16);
2587
2588 I.eraseFromParent();
2589 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2590 }
2591
2592 unsigned Mask;
2593 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2594 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2595 .addReg(SrcReg)
2596 .addImm(Mask)
2597 .setOperandDead(3); // Dead scc
2598 } else {
2599 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2600 .addReg(SrcReg)
2601 .addImm(SrcSize << 16);
2602 }
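    // Illustrative immediate for the scalar BFE forms above: a source width
    // of 16 with offset 0 is encoded as (16 << 16) == 0x100000 in the single
    // S1 operand, matching S1[5:0] = offset and S1[22:16] = width.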
2603
2604 I.eraseFromParent();
2605 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2606 }
2607
2608 return false;
2609}
2610
2611static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2612  return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2613}
2614
2615static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2616  Register BitcastSrc;
2617 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2618 Reg = BitcastSrc;
2619 return Reg;
2620}
2621
2622static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2623                           Register &Out) {
2624 Register Trunc;
2625 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2626 return false;
2627
2628 Register LShlSrc;
2629 Register Cst;
2630 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2631 Cst = stripCopy(Cst, MRI);
2632 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2633 Out = stripBitCast(LShlSrc, MRI);
2634 return true;
2635 }
2636 }
2637
2638 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2639 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2640 return false;
2641
2642 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2643 LLT::fixed_vector(2, 16));
2644
2645 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2646 assert(Mask.size() == 2);
2647
2648 if (Mask[0] == 1 && Mask[1] <= 1) {
2649 Out = Shuffle->getOperand(0).getReg();
2650 return true;
2651 }
2652
2653 return false;
2654}
2655
2656bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2657 if (!Subtarget->hasSALUFloatInsts())
2658 return false;
2659
2660 Register Dst = I.getOperand(0).getReg();
2661 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2662 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2663 return false;
2664
2665 Register Src = I.getOperand(1).getReg();
2666
2667 if (MRI->getType(Dst) == LLT::scalar(32) &&
2668 MRI->getType(Src) == LLT::scalar(16)) {
2669 if (isExtractHiElt(*MRI, Src, Src)) {
2670 MachineBasicBlock *BB = I.getParent();
2671 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2672 .addUse(Src);
2673 I.eraseFromParent();
2674 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2675 }
2676 }
2677
2678 return false;
2679}
2680
2681bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2682 // Only manually handle the f64 SGPR case.
2683 //
2684 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2685 // the bit ops theoretically have a second result due to the implicit def of
2686 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2687 // that is easy by disabling the check. The result works, but uses a
2688 // nonsensical sreg32orlds_and_sreg_1 regclass.
2689 //
2690  // The DAG emitter is more problematic, and incorrectly adds both results of
2691  // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2692
2693 Register Dst = MI.getOperand(0).getReg();
2694 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2695 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2696 MRI->getType(Dst) != LLT::scalar(64))
2697 return false;
2698
2699 Register Src = MI.getOperand(1).getReg();
2700 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2701 if (Fabs)
2702 Src = Fabs->getOperand(1).getReg();
2703
2704 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2705 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2706 return false;
2707
2708 MachineBasicBlock *BB = MI.getParent();
2709 const DebugLoc &DL = MI.getDebugLoc();
2710 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2711 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2712 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2713 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2714
2715 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2716 .addReg(Src, 0, AMDGPU::sub0);
2717 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2718 .addReg(Src, 0, AMDGPU::sub1);
2719 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2720 .addImm(0x80000000);
2721
2722 // Set or toggle sign bit.
2723 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2724 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2725 .addReg(HiReg)
2726 .addReg(ConstReg)
2727 .setOperandDead(3); // Dead scc
2728 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2729 .addReg(LoReg)
2730 .addImm(AMDGPU::sub0)
2731 .addReg(OpReg)
2732 .addImm(AMDGPU::sub1);
2733 MI.eraseFromParent();
2734 return true;
2735}
2736
2737// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2738bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2739 Register Dst = MI.getOperand(0).getReg();
2740 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2741 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2742 MRI->getType(Dst) != LLT::scalar(64))
2743 return false;
2744
2745 Register Src = MI.getOperand(1).getReg();
2746 MachineBasicBlock *BB = MI.getParent();
2747 const DebugLoc &DL = MI.getDebugLoc();
2748 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2749 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2750 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2751 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2752
2753 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2754 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2755 return false;
2756
2757 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2758 .addReg(Src, 0, AMDGPU::sub0);
2759 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2760 .addReg(Src, 0, AMDGPU::sub1);
2761 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2762 .addImm(0x7fffffff);
2763
2764 // Clear sign bit.
2765  // TODO: Should this use S_BITSET0_*?
2766 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2767 .addReg(HiReg)
2768 .addReg(ConstReg)
2769 .setOperandDead(3); // Dead scc
2770 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2771 .addReg(LoReg)
2772 .addImm(AMDGPU::sub0)
2773 .addReg(OpReg)
2774 .addImm(AMDGPU::sub1);
2775
2776 MI.eraseFromParent();
2777 return true;
2778}
2779
2780static bool isConstant(const MachineInstr &MI) {
2781 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2782}
2783
2784void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2785 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2786
2787 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2788 const MachineInstr *PtrMI =
2789 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2790
2791 assert(PtrMI);
2792
2793 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2794 return;
2795
2796 GEPInfo GEPInfo;
2797
2798 for (unsigned i = 1; i != 3; ++i) {
2799 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2800 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2801 assert(OpDef);
2802 if (i == 2 && isConstant(*OpDef)) {
2803 // TODO: Could handle constant base + variable offset, but a combine
2804 // probably should have commuted it.
2805 assert(GEPInfo.Imm == 0);
2806 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2807 continue;
2808 }
2809 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2810 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2811 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2812 else
2813 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2814 }
2815
2816 AddrInfo.push_back(GEPInfo);
2817 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2818}
2819
2820bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2821 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2822}
2823
2824bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2825 if (!MI.hasOneMemOperand())
2826 return false;
2827
2828 const MachineMemOperand *MMO = *MI.memoperands_begin();
2829 const Value *Ptr = MMO->getValue();
2830
2831 // UndefValue means this is a load of a kernel input. These are uniform.
2832 // Sometimes LDS instructions have constant pointers.
2833 // If Ptr is null, then that means this mem operand contains a
2834 // PseudoSourceValue like GOT.
2835 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2836 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2837 return true;
2838
2839  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2840    return true;
2841
2842 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2843 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2844 AMDGPU::SGPRRegBankID;
2845
2846 const Instruction *I = dyn_cast<Instruction>(Ptr);
2847 return I && I->getMetadata("amdgpu.uniform");
2848}
2849
2850bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2851 for (const GEPInfo &GEPInfo : AddrInfo) {
2852 if (!GEPInfo.VgprParts.empty())
2853 return true;
2854 }
2855 return false;
2856}
2857
2858void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2859 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2860 unsigned AS = PtrTy.getAddressSpace();
2861  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2862      STI.ldsRequiresM0Init()) {
2863 MachineBasicBlock *BB = I.getParent();
2864
2865 // If DS instructions require M0 initialization, insert it before selecting.
2866 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2867 .addImm(-1);
2868 }
2869}
2870
2871bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2872 MachineInstr &I) const {
2873 initM0(I);
2874 return selectImpl(I, *CoverageInfo);
2875}
2876
2877static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2878  if (Reg.isPhysical())
2879 return false;
2880
2881 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2882 const unsigned Opcode = MI.getOpcode();
2883
2884 if (Opcode == AMDGPU::COPY)
2885 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2886
2887 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2888 Opcode == AMDGPU::G_XOR)
2889 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2890 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2891
2892 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2893 return GI->is(Intrinsic::amdgcn_class);
2894
2895 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2896}
2897
2898bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2899 MachineBasicBlock *BB = I.getParent();
2900 MachineOperand &CondOp = I.getOperand(0);
2901 Register CondReg = CondOp.getReg();
2902 const DebugLoc &DL = I.getDebugLoc();
2903
2904 unsigned BrOpcode;
2905 Register CondPhysReg;
2906 const TargetRegisterClass *ConstrainRC;
2907
2908 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2909 // whether the branch is uniform when selecting the instruction. In
2910 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2911 // RegBankSelect knows what it's doing if the branch condition is scc, even
2912 // though it currently does not.
2913 if (!isVCC(CondReg, *MRI)) {
2914 if (MRI->getType(CondReg) != LLT::scalar(32))
2915 return false;
2916
2917 CondPhysReg = AMDGPU::SCC;
2918 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2919 ConstrainRC = &AMDGPU::SReg_32RegClass;
2920 } else {
2921    // FIXME: Should scc->vcc copies be ANDed with exec?
2922
2923 // Unless the value of CondReg is a result of a V_CMP* instruction then we
2924 // need to insert an and with exec.
2925 if (!isVCmpResult(CondReg, *MRI)) {
2926 const bool Is64 = STI.isWave64();
2927 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2928 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2929
2930 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2931 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2932 .addReg(CondReg)
2933 .addReg(Exec)
2934 .setOperandDead(3); // Dead scc
2935 CondReg = TmpReg;
2936 }
2937
2938 CondPhysReg = TRI.getVCC();
2939 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2940 ConstrainRC = TRI.getBoolRC();
2941 }
2942
2943 if (!MRI->getRegClassOrNull(CondReg))
2944 MRI->setRegClass(CondReg, ConstrainRC);
2945
2946 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2947 .addReg(CondReg);
2948 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2949 .addMBB(I.getOperand(1).getMBB());
2950
2951 I.eraseFromParent();
2952 return true;
2953}
2954
2955bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2956 MachineInstr &I) const {
2957 Register DstReg = I.getOperand(0).getReg();
2958 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2959 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2960 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2961 if (IsVGPR)
2962 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2963
2964 return RBI.constrainGenericRegister(
2965 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2966}
2967
2968bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2969 Register DstReg = I.getOperand(0).getReg();
2970 Register SrcReg = I.getOperand(1).getReg();
2971 Register MaskReg = I.getOperand(2).getReg();
2972 LLT Ty = MRI->getType(DstReg);
2973 LLT MaskTy = MRI->getType(MaskReg);
2974 MachineBasicBlock *BB = I.getParent();
2975 const DebugLoc &DL = I.getDebugLoc();
2976
2977 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2978 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2979 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2980 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2981 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2982 return false;
2983
2984 // Try to avoid emitting a bit operation when we only need to touch half of
2985 // the 64-bit pointer.
2986 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2987 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2988 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2989
2990 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2991 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
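  // Illustrative mask (assuming known-bits proves the value): clearing only
  // the low 12 bits of a 64-bit pointer, i.e. MaskOnes == 0xFFFFFFFFFFFFF000,
  // gives CanCopyHi32 == true and CanCopyLow32 == false, so only the low half
  // needs a real AND below while the high half is just copied.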
2992
2993 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2994 !CanCopyLow32 && !CanCopyHi32) {
2995 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2996 .addReg(SrcReg)
2997 .addReg(MaskReg)
2998 .setOperandDead(3); // Dead scc
2999 I.eraseFromParent();
3000 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3001 }
3002
3003 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3004 const TargetRegisterClass &RegRC
3005 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3006
3007 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3008 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3009 const TargetRegisterClass *MaskRC =
3010 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3011
3012 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3013 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3014 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3015 return false;
3016
3017 if (Ty.getSizeInBits() == 32) {
3018 assert(MaskTy.getSizeInBits() == 32 &&
3019 "ptrmask should have been narrowed during legalize");
3020
3021 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3022 .addReg(SrcReg)
3023 .addReg(MaskReg);
3024
3025 if (!IsVGPR)
3026 NewOp.setOperandDead(3); // Dead scc
3027 I.eraseFromParent();
3028 return true;
3029 }
3030
3031 Register HiReg = MRI->createVirtualRegister(&RegRC);
3032 Register LoReg = MRI->createVirtualRegister(&RegRC);
3033
3034 // Extract the subregisters from the source pointer.
3035 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3036 .addReg(SrcReg, 0, AMDGPU::sub0);
3037 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3038 .addReg(SrcReg, 0, AMDGPU::sub1);
3039
3040 Register MaskedLo, MaskedHi;
3041
3042 if (CanCopyLow32) {
3043 // If all the bits in the low half are 1, we only need a copy for it.
3044 MaskedLo = LoReg;
3045 } else {
3046 // Extract the mask subregister and apply the and.
3047 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3048 MaskedLo = MRI->createVirtualRegister(&RegRC);
3049
3050 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3051 .addReg(MaskReg, 0, AMDGPU::sub0);
3052 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3053 .addReg(LoReg)
3054 .addReg(MaskLo);
3055 }
3056
3057 if (CanCopyHi32) {
3058 // If all the bits in the high half are 1, we only need a copy for it.
3059 MaskedHi = HiReg;
3060 } else {
3061 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3062 MaskedHi = MRI->createVirtualRegister(&RegRC);
3063
3064 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3065 .addReg(MaskReg, 0, AMDGPU::sub1);
3066 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3067 .addReg(HiReg)
3068 .addReg(MaskHi);
3069 }
3070
3071 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3072 .addReg(MaskedLo)
3073 .addImm(AMDGPU::sub0)
3074 .addReg(MaskedHi)
3075 .addImm(AMDGPU::sub1);
3076 I.eraseFromParent();
3077 return true;
3078}
3079
3080/// Return the register to use for the index value, and the subregister to use
3081/// for the indirectly accessed register.
3082static std::pair<Register, unsigned>
3083computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3084                        const TargetRegisterClass *SuperRC, Register IdxReg,
3085 unsigned EltSize, GISelKnownBits &KnownBits) {
3086 Register IdxBaseReg;
3087 int Offset;
3088
3089 std::tie(IdxBaseReg, Offset) =
3090      AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3091  if (IdxBaseReg == AMDGPU::NoRegister) {
3092 // This will happen if the index is a known constant. This should ordinarily
3093 // be legalized out, but handle it as a register just in case.
3094 assert(Offset == 0);
3095 IdxBaseReg = IdxReg;
3096 }
3097
3098 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3099
3100 // Skip out of bounds offsets, or else we would end up using an undefined
3101 // register.
3102 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3103 return std::pair(IdxReg, SubRegs[0]);
3104 return std::pair(IdxBaseReg, SubRegs[Offset]);
3105}
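// Illustrative lookup: for 32-bit elements (EltSize == 4) of a 128-bit
// super-register, an IdxReg known to be %base + 2 (with %base a hypothetical
// variable register) returns {%base, sub2}: the constant part of the index is
// folded into the subregister and only %base stays dynamic. An out-of-range
// constant offset falls back to {IdxReg, sub0}.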
3106
3107bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3108 MachineInstr &MI) const {
3109 Register DstReg = MI.getOperand(0).getReg();
3110 Register SrcReg = MI.getOperand(1).getReg();
3111 Register IdxReg = MI.getOperand(2).getReg();
3112
3113 LLT DstTy = MRI->getType(DstReg);
3114 LLT SrcTy = MRI->getType(SrcReg);
3115
3116 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3117 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3118 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3119
3120  // The index must be scalar. If it wasn't, RegBankSelect should have moved
3121  // this into a waterfall loop.
3122 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3123 return false;
3124
3125 const TargetRegisterClass *SrcRC =
3126 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3127 const TargetRegisterClass *DstRC =
3128 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3129 if (!SrcRC || !DstRC)
3130 return false;
3131 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3132 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3133 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3134 return false;
3135
3136 MachineBasicBlock *BB = MI.getParent();
3137 const DebugLoc &DL = MI.getDebugLoc();
3138 const bool Is64 = DstTy.getSizeInBits() == 64;
3139
3140 unsigned SubReg;
3141 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3142 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3143
3144 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3145 if (DstTy.getSizeInBits() != 32 && !Is64)
3146 return false;
3147
3148 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3149 .addReg(IdxReg);
3150
3151 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3152 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3153 .addReg(SrcReg, 0, SubReg)
3154 .addReg(SrcReg, RegState::Implicit);
3155 MI.eraseFromParent();
3156 return true;
3157 }
3158
3159 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3160 return false;
3161
3162 if (!STI.useVGPRIndexMode()) {
3163 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3164 .addReg(IdxReg);
3165 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3166 .addReg(SrcReg, 0, SubReg)
3167 .addReg(SrcReg, RegState::Implicit);
3168 MI.eraseFromParent();
3169 return true;
3170 }
3171
3172 const MCInstrDesc &GPRIDXDesc =
3173 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3174 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3175 .addReg(SrcReg)
3176 .addReg(IdxReg)
3177 .addImm(SubReg);
3178
3179 MI.eraseFromParent();
3180 return true;
3181}
3182
3183// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3184bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3185 MachineInstr &MI) const {
3186 Register DstReg = MI.getOperand(0).getReg();
3187 Register VecReg = MI.getOperand(1).getReg();
3188 Register ValReg = MI.getOperand(2).getReg();
3189 Register IdxReg = MI.getOperand(3).getReg();
3190
3191 LLT VecTy = MRI->getType(DstReg);
3192 LLT ValTy = MRI->getType(ValReg);
3193 unsigned VecSize = VecTy.getSizeInBits();
3194 unsigned ValSize = ValTy.getSizeInBits();
3195
3196 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3197 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3198 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3199
3200 assert(VecTy.getElementType() == ValTy);
3201
3202  // The index must be scalar. If it wasn't, RegBankSelect should have moved
3203  // this into a waterfall loop.
3204 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3205 return false;
3206
3207 const TargetRegisterClass *VecRC =
3208 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3209 const TargetRegisterClass *ValRC =
3210 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3211
3212 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3213 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3214 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3215 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3216 return false;
3217
3218 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3219 return false;
3220
3221 unsigned SubReg;
3222 std::tie(IdxReg, SubReg) =
3223 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3224
3225 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3226 STI.useVGPRIndexMode();
3227
3228 MachineBasicBlock *BB = MI.getParent();
3229 const DebugLoc &DL = MI.getDebugLoc();
3230
3231 if (!IndexMode) {
3232 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3233 .addReg(IdxReg);
3234
3235 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3236 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3237 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3238 .addReg(VecReg)
3239 .addReg(ValReg)
3240 .addImm(SubReg);
3241 MI.eraseFromParent();
3242 return true;
3243 }
3244
3245 const MCInstrDesc &GPRIDXDesc =
3246 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3247 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3248 .addReg(VecReg)
3249 .addReg(ValReg)
3250 .addReg(IdxReg)
3251 .addImm(SubReg);
3252
3253 MI.eraseFromParent();
3254 return true;
3255}
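// The dynamic-insert path above mirrors the extract path: the index is either
// placed in M0 for the MOVREL-style register-write pseudo returned by
// getIndirectRegWriteMovRelPseudo(), or passed directly to the GPR-index-mode
// pseudo from getIndirectGPRIDXPseudo(); in both cases SubReg selects which
// element of the copied vector is overwritten with ValReg.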
3256
3257bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3259 unsigned Opc;
3260 unsigned Size = MI.getOperand(3).getImm();
3261
3262 // The struct intrinsic variants add one additional operand over raw.
3263 const bool HasVIndex = MI.getNumOperands() == 9;
3264 Register VIndex;
3265 int OpOffset = 0;
3266 if (HasVIndex) {
3267 VIndex = MI.getOperand(4).getReg();
3268 OpOffset = 1;
3269 }
3270
3271 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3272 std::optional<ValueAndVReg> MaybeVOffset =
3273 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3274 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3275
3276 switch (Size) {
3277 default:
3278 return false;
3279 case 1:
3280 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3281 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3282 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3283 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3284 break;
3285 case 2:
3286 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3287 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3288 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3289 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3290 break;
3291 case 4:
3292 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3293 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3294 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3295 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3296 break;
3297 case 12:
3298 if (!Subtarget->hasLDSLoadB96_B128())
3299 return false;
3300
3301 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3302 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3303 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3304 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3305 break;
3306 case 16:
3307 if (!Subtarget->hasLDSLoadB96_B128())
3308 return false;
3309
3310 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3311 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3312 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3313 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3314 break;
3315 }
3316
3317 MachineBasicBlock *MBB = MI.getParent();
3318 const DebugLoc &DL = MI.getDebugLoc();
3319 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3320 .add(MI.getOperand(2));
3321
3322 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3323
3324 if (HasVIndex && HasVOffset) {
3325 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3326 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3327 .addReg(VIndex)
3328 .addImm(AMDGPU::sub0)
3329 .addReg(VOffset)
3330 .addImm(AMDGPU::sub1);
3331
3332 MIB.addReg(IdxReg);
3333 } else if (HasVIndex) {
3334 MIB.addReg(VIndex);
3335 } else if (HasVOffset) {
3336 MIB.addReg(VOffset);
3337 }
3338
3339 MIB.add(MI.getOperand(1)); // rsrc
3340 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3341 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3342 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3343 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3344 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3345 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3346 MIB.addImm(
3347 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3348 ? 1
3349 : 0); // swz
3350
3351 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3352 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3353 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3354 MachinePointerInfo StorePtrI = LoadPtrI;
3355 StorePtrI.V = nullptr;
3356 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3357
3358 auto F = LoadMMO->getFlags() &
3359 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3360 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3361 Size, LoadMMO->getBaseAlign());
3362
3363 MachineMemOperand *StoreMMO =
3364 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3365 sizeof(int32_t), LoadMMO->getBaseAlign());
3366
3367 MIB.setMemRefs({LoadMMO, StoreMMO});
3368
3369 MI.eraseFromParent();
3370 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3371}
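// Example of the addressing-mode choice above (informal): a 4-byte
// struct-buffer load-to-LDS with a vindex and a non-constant voffset selects
// BUFFER_LOAD_DWORD_LDS_BOTHEN and packs the two VGPRs into a 64-bit
// REG_SEQUENCE {vindex:sub0, voffset:sub1}; with neither operand present the
// _OFFSET form is used instead. The LDS destination address (operand 2) is
// carried in M0 via the COPY emitted above.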
3372
3373/// Match a zero extend from a 32-bit value to 64-bits.
3374static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3375 Register ZExtSrc;
3376 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3377 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3378
3379 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3380 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3381 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3382 return Register();
3383
3384 assert(Def->getNumOperands() == 3 &&
3385 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3386 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3387 return Def->getOperand(1).getReg();
3388 }
3389
3390 return Register();
3391}
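// The two MIR shapes accepted above, for reference:
//   %z:_(s64) = G_ZEXT %x:_(s32)
//   %z:_(s64) = G_MERGE_VALUES %x:_(s32), %c:_(s32)   ; with %c known zero
// Both return %x as the 32-bit source; anything else yields a null Register.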
3392
3393bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3394 unsigned Opc;
3395 unsigned Size = MI.getOperand(3).getImm();
3396
3397 switch (Size) {
3398 default:
3399 return false;
3400 case 1:
3401 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3402 break;
3403 case 2:
3404 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3405 break;
3406 case 4:
3407 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3408 break;
3409 case 12:
3410 if (!Subtarget->hasLDSLoadB96_B128())
3411 return false;
3412 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3413 break;
3414 case 16:
3415 if (!Subtarget->hasLDSLoadB96_B128())
3416 return false;
3417 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3418 break;
3419 }
3420
3421 MachineBasicBlock *MBB = MI.getParent();
3422 const DebugLoc &DL = MI.getDebugLoc();
3423 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3424 .add(MI.getOperand(2));
3425
3426 Register Addr = MI.getOperand(1).getReg();
3427 Register VOffset;
3428 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3429 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3430 if (!isSGPR(Addr)) {
3431 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3432 if (isSGPR(AddrDef->Reg)) {
3433 Addr = AddrDef->Reg;
3434 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3435 Register SAddr =
3436 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3437 if (isSGPR(SAddr)) {
3438 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3439 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3440 Addr = SAddr;
3441 VOffset = Off;
3442 }
3443 }
3444 }
3445 }
3446
3447 if (isSGPR(Addr)) {
3448 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3449 if (!VOffset) {
3450 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3451 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3452 .addImm(0);
3453 }
3454 }
3455
3456 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3457 .addReg(Addr);
3458
3459 if (isSGPR(Addr))
3460 MIB.addReg(VOffset);
3461
3462 MIB.add(MI.getOperand(4)) // offset
3463 .add(MI.getOperand(5)); // cpol
3464
3465 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3466 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3467 LoadPtrI.Offset = MI.getOperand(4).getImm();
3468 MachinePointerInfo StorePtrI = LoadPtrI;
3469 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3470 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3471 auto F = LoadMMO->getFlags() &
3472 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3473 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3474 Size, LoadMMO->getBaseAlign());
3475 MachineMemOperand *StoreMMO =
3476 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3477 sizeof(int32_t), Align(4));
3478
3479 MIB.setMemRefs({LoadMMO, StoreMMO});
3480
3481 MI.eraseFromParent();
3482 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3483}
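// Sketch of the address split above: for an address of the form
//   %off64:_(s64) = G_ZEXT %voff:vgpr(s32)
//   %addr:_(p1)   = G_PTR_ADD %sbase:sgpr(p1), %off64
// the SADDR form of the opcode is chosen (via getGlobalSaddrOp) with %sbase as
// the scalar base and %voff as the 32-bit VGPR offset; if no such offset can
// be matched but the base is an SGPR, a zero VOffset is materialized with
// V_MOV_B32 so the SADDR form can still be used.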
3484
3485bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3486 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3487 MI.removeOperand(1);
3488 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3489 return true;
3490}
3491
3492// FIXME: This should be removed and let the patterns select. We just need the
3493// AGPR/VGPR combination versions.
3494bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3495 unsigned Opc;
3496 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3497 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3498 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3499 break;
3500 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3501 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3502 break;
3503 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3504 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3505 break;
3506 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3507 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3508 break;
3509 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3510 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3511 break;
3512 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3513 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3514 break;
3515 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3516 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3517 break;
3518 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3519 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3520 break;
3521 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3522 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3523 break;
3524 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3525 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3526 break;
3527 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3528 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3529 break;
3530 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3531 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3532 break;
3533 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3534 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3535 break;
3536 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3537 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3538 break;
3539 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3540 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3541 break;
3542 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3543 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3544 break;
3545 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3546 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3547 break;
3548 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3549 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3550 break;
3551 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3552 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3553 break;
3554 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3555 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3556 break;
3557 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3558 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3559 break;
3560 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3561 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3562 break;
3563 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3564 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3565 break;
3566 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3567 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3568 break;
3569 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3570 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3571 break;
3572 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3573 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3574 break;
3575 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3576 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3577 break;
3578 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3579 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3580 break;
3581 default:
3582 llvm_unreachable("unhandled smfmac intrinsic");
3583 }
3584
3585 auto VDst_In = MI.getOperand(4);
3586
3587 MI.setDesc(TII.get(Opc));
3588 MI.removeOperand(4); // VDst_In
3589 MI.removeOperand(1); // Intrinsic ID
3590 MI.addOperand(VDst_In); // Readd VDst_In to the end
3591 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3592 return true;
3593}
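// The rewrite above keeps the intrinsic operands in place except that the
// intrinsic ID (operand 1) is dropped and the accumulator input VDst_In
// (operand 4) is re-appended at the end of the operand list, which matches the
// operand order the V_SMFMAC_*_e64 pseudos expect.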
3594
3595bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3596 MachineInstr &MI, Intrinsic::ID IntrID) const {
3597 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3598 !Subtarget->hasPermlane16Swap())
3599 return false;
3600 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3601 !Subtarget->hasPermlane32Swap())
3602 return false;
3603
3604 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3605 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3606 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3607
3608 MI.removeOperand(2);
3609 MI.setDesc(TII.get(Opcode));
3610 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3611
3612 MachineOperand &FI = MI.getOperand(4);
3614
3615 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3616}
3617
3618bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3619 Register DstReg = MI.getOperand(0).getReg();
3620 Register SrcReg = MI.getOperand(1).getReg();
3621 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3622 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3623 MachineBasicBlock *MBB = MI.getParent();
3624 const DebugLoc &DL = MI.getDebugLoc();
3625
3626 if (IsVALU) {
3627 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3628 .addImm(Subtarget->getWavefrontSizeLog2())
3629 .addReg(SrcReg);
3630 } else {
3631 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3632 .addReg(SrcReg)
3633 .addImm(Subtarget->getWavefrontSizeLog2())
3634 .setOperandDead(3); // Dead scc
3635 }
3636
3637 const TargetRegisterClass &RC =
3638 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3639 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3640 return false;
3641
3642 MI.eraseFromParent();
3643 return true;
3644}
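// The wave-address conversion above is just a right shift by the wavefront
// size log2: e.g. on a wave64 target a per-lane scratch byte offset of 0x1000
// corresponds to a wave-level SGPR address of 0x1000 >> 6 = 0x40. The VALU and
// SALU paths differ only in which shift instruction is used and which register
// class the result is constrained to.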
3645
3646// Match BITOP3 operation and return a number of matched instructions plus
3647// truth table.
3648static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3649 SmallVectorImpl<Register> &Src,
3650 const MachineRegisterInfo &MRI) {
3651 unsigned NumOpcodes = 0;
3652 uint8_t LHSBits, RHSBits;
3653
3654 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3655 // Define truth table given Src0, Src1, Src2 bits permutations:
3656 // 0 0 0
3657 // 0 0 1
3658 // 0 1 0
3659 // 0 1 1
3660 // 1 0 0
3661 // 1 0 1
3662 // 1 1 0
3663 // 1 1 1
3664 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3665
3666 if (mi_match(Op, MRI, m_AllOnesInt())) {
3667 Bits = 0xff;
3668 return true;
3669 }
3670 if (mi_match(Op, MRI, m_ZeroInt())) {
3671 Bits = 0;
3672 return true;
3673 }
3674
3675 for (unsigned I = 0; I < Src.size(); ++I) {
3676 // Try to find existing reused operand
3677 if (Src[I] == Op) {
3678 Bits = SrcBits[I];
3679 return true;
3680 }
3681 // Try to replace parent operator
3682 if (Src[I] == R) {
3683 Bits = SrcBits[I];
3684 Src[I] = Op;
3685 return true;
3686 }
3687 }
3688
3689 if (Src.size() == 3) {
3690 // No room left for operands. Try one last time; there can be a 'not' of
3691 // one of our source operands. In this case we can compute the bits
3692 // without growing the Src vector.
3693 Register LHS;
3694 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3695 LHS = getSrcRegIgnoringCopies(LHS, MRI);
3696 for (unsigned I = 0; I < Src.size(); ++I) {
3697 if (Src[I] == LHS) {
3698 Bits = ~SrcBits[I];
3699 return true;
3700 }
3701 }
3702 }
3703
3704 return false;
3705 }
3706
3707 Bits = SrcBits[Src.size()];
3708 Src.push_back(Op);
3709 return true;
3710 };
3711
3712 MachineInstr *MI = MRI.getVRegDef(R);
3713 switch (MI->getOpcode()) {
3714 case TargetOpcode::G_AND:
3715 case TargetOpcode::G_OR:
3716 case TargetOpcode::G_XOR: {
3717 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3718 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3719
3720 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3721 if (!getOperandBits(LHS, LHSBits) ||
3722 !getOperandBits(RHS, RHSBits)) {
3723 Src = Backup;
3724 return std::make_pair(0, 0);
3725 }
3726
3727 // Recursion is naturally limited by the size of the operand vector.
3728 auto Op = BitOp3_Op(LHS, Src, MRI);
3729 if (Op.first) {
3730 NumOpcodes += Op.first;
3731 LHSBits = Op.second;
3732 }
3733
3734 Op = BitOp3_Op(RHS, Src, MRI);
3735 if (Op.first) {
3736 NumOpcodes += Op.first;
3737 RHSBits = Op.second;
3738 }
3739 break;
3740 }
3741 default:
3742 return std::make_pair(0, 0);
3743 }
3744
3745 uint8_t TTbl;
3746 switch (MI->getOpcode()) {
3747 case TargetOpcode::G_AND:
3748 TTbl = LHSBits & RHSBits;
3749 break;
3750 case TargetOpcode::G_OR:
3751 TTbl = LHSBits | RHSBits;
3752 break;
3753 case TargetOpcode::G_XOR:
3754 TTbl = LHSBits ^ RHSBits;
3755 break;
3756 default:
3757 break;
3758 }
3759
3760 return std::make_pair(NumOpcodes + 1, TTbl);
3761}
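// Worked example for BitOp3_Op (a sketch; the source-to-column assignment
// follows the traversal order): each collected source register is mapped to a
// column constant Src0=0xf0, Src1=0xcc, Src2=0xaa, and the truth table is the
// expression evaluated over those constants. For %r = (a & b) | ~c, with a
// mapped to 0xf0, b to 0xaa and c to 0xcc, this gives
//   TTbl = (0xf0 & 0xaa) | ~0xcc = 0xa0 | 0x33 = 0xb3
// and NumOpcodes = 3 (the AND, the OR, and the 'not', i.e. an xor with -1).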
3762
3763bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
3764 if (!Subtarget->hasBitOp3Insts())
3765 return false;
3766
3767 Register DstReg = MI.getOperand(0).getReg();
3768 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3769 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3770 if (!IsVALU)
3771 return false;
3772
3773 SmallVector<Register, 3> Src;
3774 uint8_t TTbl;
3775 unsigned NumOpcodes;
3776
3777 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
3778
3779 // The Src.empty() case can happen if all operands are all zeros or all ones.
3780 // Normally it should have been optimized out before reaching this point.
3781 if (NumOpcodes < 2 || Src.empty())
3782 return false;
3783
3784 // For the uniform case the threshold should be higher to account for moves
3785 // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two can
3786 // be in SGPRs, with a readfirstlane afterwards.
3787 if (NumOpcodes < 4)
3788 return false;
3789
3790 bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
3791 if (NumOpcodes == 2 && IsB32) {
3792 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3793 // the asm more readable. This cannot be modeled with AddedComplexity because
3794 // the selector does not know how many operations we matched.
3795 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
3796 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
3797 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
3798 return false;
3799 }
3800
3801 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
3802 unsigned CBL = STI.getConstantBusLimit(Opc);
3803 MachineBasicBlock *MBB = MI.getParent();
3804 const DebugLoc &DL = MI.getDebugLoc();
3805
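// The loop below enforces the VALU constant-bus restriction: only the first
// getConstantBusLimit(Opc) SGPR sources may be read directly by V_BITOP3, so
// any additional SGPR inputs are first copied into fresh VGPRs.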
3806 for (unsigned I = 0; I < Src.size(); ++I) {
3807 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
3808 if (RB->getID() != AMDGPU::SGPRRegBankID)
3809 continue;
3810 if (CBL > 0) {
3811 --CBL;
3812 continue;
3813 }
3814 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3815 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
3816 .addReg(Src[I]);
3817 Src[I] = NewReg;
3818 }
3819
3820 // The last operand can be ignored, turning a ternary operation into a binary
3821 // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3822 // 'c' with 'a' here without changing the answer. In some pathological
3823 // cases it should even be possible to get an operation with a single operand
3824 // if the optimizer did not catch it.
3825 while (Src.size() < 3)
3826 Src.push_back(Src[0]);
3827
3828 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
3829 if (!IsB32)
3830 MIB.addImm(0); // src_mod0
3831 MIB.addReg(Src[0]);
3832 if (!IsB32)
3833 MIB.addImm(0); // src_mod1
3834 MIB.addReg(Src[1]);
3835 if (!IsB32)
3836 MIB.addImm(0); // src_mod2
3837 MIB.addReg(Src[2])
3838 .addImm(TTbl);
3839 if (!IsB32)
3840 MIB.addImm(0); // op_sel
3841
3842 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3843 MI.eraseFromParent();
3844
3845 return true;
3846}
3847
3848bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3849 Register SrcReg = MI.getOperand(0).getReg();
3850 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3851 return false;
3852
3853 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3854 Register SP =
3855 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3856 Register WaveAddr = getWaveAddress(DefMI);
3857 MachineBasicBlock *MBB = MI.getParent();
3858 const DebugLoc &DL = MI.getDebugLoc();
3859
3860 if (!WaveAddr) {
3861 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3862 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3863 .addReg(SrcReg)
3864 .addImm(Subtarget->getWavefrontSizeLog2())
3865 .setOperandDead(3); // Dead scc
3866 }
3867
3868 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3869 .addReg(WaveAddr);
3870
3871 MI.eraseFromParent();
3872 return true;
3873}
3874
3876bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3877 if (!I.isPreISelOpcode()) {
3878 if (I.isCopy())
3879 return selectCOPY(I);
3880 return true;
3881 }
3882
3883 switch (I.getOpcode()) {
3884 case TargetOpcode::G_AND:
3885 case TargetOpcode::G_OR:
3886 case TargetOpcode::G_XOR:
3887 if (selectBITOP3(I))
3888 return true;
3889 if (selectImpl(I, *CoverageInfo))
3890 return true;
3891 return selectG_AND_OR_XOR(I);
3892 case TargetOpcode::G_ADD:
3893 case TargetOpcode::G_SUB:
3894 case TargetOpcode::G_PTR_ADD:
3895 if (selectImpl(I, *CoverageInfo))
3896 return true;
3897 return selectG_ADD_SUB(I);
3898 case TargetOpcode::G_UADDO:
3899 case TargetOpcode::G_USUBO:
3900 case TargetOpcode::G_UADDE:
3901 case TargetOpcode::G_USUBE:
3902 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3903 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3904 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3905 return selectG_AMDGPU_MAD_64_32(I);
3906 case TargetOpcode::G_INTTOPTR:
3907 case TargetOpcode::G_BITCAST:
3908 case TargetOpcode::G_PTRTOINT:
3909 case TargetOpcode::G_FREEZE:
3910 return selectCOPY(I);
3911 case TargetOpcode::G_FNEG:
3912 if (selectImpl(I, *CoverageInfo))
3913 return true;
3914 return selectG_FNEG(I);
3915 case TargetOpcode::G_FABS:
3916 if (selectImpl(I, *CoverageInfo))
3917 return true;
3918 return selectG_FABS(I);
3919 case TargetOpcode::G_EXTRACT:
3920 return selectG_EXTRACT(I);
3921 case TargetOpcode::G_MERGE_VALUES:
3922 case TargetOpcode::G_CONCAT_VECTORS:
3923 return selectG_MERGE_VALUES(I);
3924 case TargetOpcode::G_UNMERGE_VALUES:
3925 return selectG_UNMERGE_VALUES(I);
3926 case TargetOpcode::G_BUILD_VECTOR:
3927 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3928 return selectG_BUILD_VECTOR(I);
3929 case TargetOpcode::G_IMPLICIT_DEF:
3930 return selectG_IMPLICIT_DEF(I);
3931 case TargetOpcode::G_INSERT:
3932 return selectG_INSERT(I);
3933 case TargetOpcode::G_INTRINSIC:
3934 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3935 return selectG_INTRINSIC(I);
3936 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3937 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3938 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3939 case TargetOpcode::G_ICMP:
3940 case TargetOpcode::G_FCMP:
3941 if (selectG_ICMP_or_FCMP(I))
3942 return true;
3943 return selectImpl(I, *CoverageInfo);
3944 case TargetOpcode::G_LOAD:
3945 case TargetOpcode::G_ZEXTLOAD:
3946 case TargetOpcode::G_SEXTLOAD:
3947 case TargetOpcode::G_STORE:
3948 case TargetOpcode::G_ATOMIC_CMPXCHG:
3949 case TargetOpcode::G_ATOMICRMW_XCHG:
3950 case TargetOpcode::G_ATOMICRMW_ADD:
3951 case TargetOpcode::G_ATOMICRMW_SUB:
3952 case TargetOpcode::G_ATOMICRMW_AND:
3953 case TargetOpcode::G_ATOMICRMW_OR:
3954 case TargetOpcode::G_ATOMICRMW_XOR:
3955 case TargetOpcode::G_ATOMICRMW_MIN:
3956 case TargetOpcode::G_ATOMICRMW_MAX:
3957 case TargetOpcode::G_ATOMICRMW_UMIN:
3958 case TargetOpcode::G_ATOMICRMW_UMAX:
3959 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3960 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3961 case TargetOpcode::G_ATOMICRMW_FADD:
3962 case TargetOpcode::G_ATOMICRMW_FMIN:
3963 case TargetOpcode::G_ATOMICRMW_FMAX:
3964 return selectG_LOAD_STORE_ATOMICRMW(I);
3965 case TargetOpcode::G_SELECT:
3966 return selectG_SELECT(I);
3967 case TargetOpcode::G_TRUNC:
3968 return selectG_TRUNC(I);
3969 case TargetOpcode::G_SEXT:
3970 case TargetOpcode::G_ZEXT:
3971 case TargetOpcode::G_ANYEXT:
3972 case TargetOpcode::G_SEXT_INREG:
3973 // This is a workaround. For extension from type i1, `selectImpl()` uses
3974 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
3975 // type i1 can only be held in an SGPR class.
3976 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3977 selectImpl(I, *CoverageInfo))
3978 return true;
3979 return selectG_SZA_EXT(I);
3980 case TargetOpcode::G_FPEXT:
3981 if (selectG_FPEXT(I))
3982 return true;
3983 return selectImpl(I, *CoverageInfo);
3984 case TargetOpcode::G_BRCOND:
3985 return selectG_BRCOND(I);
3986 case TargetOpcode::G_GLOBAL_VALUE:
3987 return selectG_GLOBAL_VALUE(I);
3988 case TargetOpcode::G_PTRMASK:
3989 return selectG_PTRMASK(I);
3990 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3991 return selectG_EXTRACT_VECTOR_ELT(I);
3992 case TargetOpcode::G_INSERT_VECTOR_ELT:
3993 return selectG_INSERT_VECTOR_ELT(I);
3994 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3995 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3996 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3997 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3998 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3999 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4000 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4001 assert(Intr && "not an image intrinsic with image pseudo");
4002 return selectImageIntrinsic(I, Intr);
4003 }
4004 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
4005 return selectBVHIntrinsic(I);
4006 case AMDGPU::G_SBFX:
4007 case AMDGPU::G_UBFX:
4008 return selectG_SBFX_UBFX(I);
4009 case AMDGPU::G_SI_CALL:
4010 I.setDesc(TII.get(AMDGPU::SI_CALL));
4011 return true;
4012 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4013 return selectWaveAddress(I);
4014 case AMDGPU::G_STACKRESTORE:
4015 return selectStackRestore(I);
4016 case AMDGPU::G_PHI:
4017 return selectPHI(I);
4018 case TargetOpcode::G_CONSTANT:
4019 case TargetOpcode::G_FCONSTANT:
4020 default:
4021 return selectImpl(I, *CoverageInfo);
4022 }
4023 return false;
4024}
4025
4026InstructionSelector::ComplexRendererFns
4027AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4028 return {{
4029 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4030 }};
4031
4032}
4033
4034std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4035 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4036 unsigned Mods = 0;
4037 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4038
4039 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4040 Src = MI->getOperand(1).getReg();
4041 Mods |= SISrcMods::NEG;
4042 MI = getDefIgnoringCopies(Src, *MRI);
4043 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4044 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4045 // denormal mode, but we're implicitly canonicalizing in a source operand.
4046 const ConstantFP *LHS =
4047 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4048 if (LHS && LHS->isZero()) {
4049 Mods |= SISrcMods::NEG;
4050 Src = MI->getOperand(2).getReg();
4051 }
4052 }
4053
4054 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4055 Src = MI->getOperand(1).getReg();
4056 Mods |= SISrcMods::ABS;
4057 }
4058
4059 if (OpSel)
4060 Mods |= SISrcMods::OP_SEL_0;
4061
4062 return std::pair(Src, Mods);
4063}
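// Examples of the folding above: a bare G_FABS %x selects as %x with ABS set
// (when AllowAbs), G_FNEG (G_FABS %x) selects as %x with NEG | ABS, and, when
// IsCanonicalizing, G_FSUB +-0.0, %x is folded to %x with NEG, mirroring an
// fneg.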
4064
4065Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4066 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4067 bool ForceVGPR) const {
4068 if ((Mods != 0 || ForceVGPR) &&
4069 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4070
4071 // If we looked through copies to find source modifiers on an SGPR operand,
4072 // we now have an SGPR register source. To avoid potentially violating the
4073 // constant bus restriction, we need to insert a copy to a VGPR.
4074 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4075 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4076 TII.get(AMDGPU::COPY), VGPRSrc)
4077 .addReg(Src);
4078 Src = VGPRSrc;
4079 }
4080
4081 return Src;
4082}
4083
4084///
4085/// This will select either an SGPR or VGPR operand and will save us from
4086/// having to write an extra tablegen pattern.
4087InstructionSelector::ComplexRendererFns
4088AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4089 return {{
4090 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4091 }};
4092}
4093
4094InstructionSelector::ComplexRendererFns
4095AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4096 Register Src;
4097 unsigned Mods;
4098 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4099
4100 return {{
4101 [=](MachineInstrBuilder &MIB) {
4102 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4103 },
4104 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4105 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4106 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4107 }};
4108}
4109
4110InstructionSelector::ComplexRendererFns
4111AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4112 Register Src;
4113 unsigned Mods;
4114 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4115 /*IsCanonicalizing=*/true,
4116 /*AllowAbs=*/false);
4117
4118 return {{
4119 [=](MachineInstrBuilder &MIB) {
4120 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4121 },
4122 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4123 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4124 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4125 }};
4126}
4127
4128InstructionSelector::ComplexRendererFns
4129AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4130 return {{
4131 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4132 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4133 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4134 }};
4135}
4136
4137InstructionSelector::ComplexRendererFns
4138AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4139 Register Src;
4140 unsigned Mods;
4141 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4142
4143 return {{
4144 [=](MachineInstrBuilder &MIB) {
4145 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4146 },
4147 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4148 }};
4149}
4150
4151InstructionSelector::ComplexRendererFns
4152AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4153 MachineOperand &Root) const {
4154 Register Src;
4155 unsigned Mods;
4156 std::tie(Src, Mods) =
4157 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4158
4159 return {{
4160 [=](MachineInstrBuilder &MIB) {
4161 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4162 },
4163 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4164 }};
4165}
4166
4167InstructionSelector::ComplexRendererFns
4168AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4169 Register Src;
4170 unsigned Mods;
4171 std::tie(Src, Mods) =
4172 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4173 /*AllowAbs=*/false);
4174
4175 return {{
4176 [=](MachineInstrBuilder &MIB) {
4177 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4178 },
4179 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4180 }};
4181}
4182
4183InstructionSelector::ComplexRendererFns
4184AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4185 Register Reg = Root.getReg();
4186 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4187 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4188 return {};
4189 return {{
4190 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4191 }};
4192}
4193
4194std::pair<Register, unsigned>
4195AMDGPUInstructionSelector::selectVOP3PModsImpl(
4196 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
4197 unsigned Mods = 0;
4198 MachineInstr *MI = MRI.getVRegDef(Src);
4199
4200 if (MI->getOpcode() == AMDGPU::G_FNEG &&
4201 // It's possible to see an f32 fneg here, but unlikely.
4202 // TODO: Treat f32 fneg as only high bit.
4203 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
4204 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4205 Src = MI->getOperand(1).getReg();
4206 MI = MRI.getVRegDef(Src);
4207 }
4208
4209 // TODO: Handle G_FSUB 0 as fneg
4210
4211 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
4212 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
4213
4214 // Packed instructions do not have abs modifiers.
4215 Mods |= SISrcMods::OP_SEL_1;
4216
4217 return std::pair(Src, Mods);
4218}
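// For packed v2f16 operands a whole-vector G_FNEG flips both NEG and NEG_HI,
// so fneg of a <2 x half> value typically selects as the un-negated register
// with Mods = OP_SEL_1 | NEG | NEG_HI; OP_SEL_1 by itself is the default
// op_sel_hi encoding for packed operands.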
4219
4220InstructionSelector::ComplexRendererFns
4221AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
4222 MachineRegisterInfo &MRI
4223 = Root.getParent()->getParent()->getParent()->getRegInfo();
4224
4225 Register Src;
4226 unsigned Mods;
4227 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
4228
4229 return {{
4230 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4231 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4232 }};
4233}
4234
4235InstructionSelector::ComplexRendererFns
4236AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
4237 MachineRegisterInfo &MRI
4238 = Root.getParent()->getParent()->getParent()->getRegInfo();
4239
4240 Register Src;
4241 unsigned Mods;
4242 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
4243
4244 return {{
4245 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4246 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4247 }};
4248}
4249
4250InstructionSelector::ComplexRendererFns
4251AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
4252 // A literal i1 value set in the intrinsic represents SrcMods for the next
4253 // operand. The value is in the Imm operand as an i1 sign-extended to int64_t.
4254 // 1 (-1) promotes packed values to signed, 0 treats them as unsigned.
4255 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4256 "expected i1 value");
4257 unsigned Mods = SISrcMods::OP_SEL_1;
4258 if (Root.getImm() == -1)
4259 Mods ^= SISrcMods::NEG;
4260 return {{
4261 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4262 }};
4263}
4264
4265InstructionSelector::ComplexRendererFns
4266AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4267 MachineOperand &Root) const {
4268 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4269 "expected i1 value");
4270 unsigned Mods = SISrcMods::OP_SEL_1;
4271 if (Root.getImm() != 0)
4272 Mods |= SISrcMods::OP_SEL_0;
4273
4274 return {{
4275 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4276 }};
4277}
4278
4279static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
4280 MachineInstr *InsertPt,
4281 MachineRegisterInfo &MRI) {
4282 const TargetRegisterClass *DstRegClass;
4283 switch (Elts.size()) {
4284 case 8:
4285 DstRegClass = &AMDGPU::VReg_256RegClass;
4286 break;
4287 case 4:
4288 DstRegClass = &AMDGPU::VReg_128RegClass;
4289 break;
4290 case 2:
4291 DstRegClass = &AMDGPU::VReg_64RegClass;
4292 break;
4293 default:
4294 llvm_unreachable("unhandled Reg sequence size");
4295 }
4296
4297 MachineIRBuilder B(*InsertPt);
4298 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
4299 .addDef(MRI.createVirtualRegister(DstRegClass));
4300 for (unsigned i = 0; i < Elts.size(); ++i) {
4301 MIB.addReg(Elts[i]);
4302 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
4303 }
4304 return MIB->getOperand(0).getReg();
4305}
4306
4307static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
4308 SmallVectorImpl<Register> &Elts, Register &Src,
4309 MachineInstr *InsertPt,
4310 MachineRegisterInfo &MRI) {
4311 if (ModOpcode == TargetOpcode::G_FNEG) {
4312 Mods |= SISrcMods::NEG;
4313 // Check if all elements also have abs modifier
4314 SmallVector<Register, 8> NegAbsElts;
4315 for (auto El : Elts) {
4316 Register FabsSrc;
4317 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
4318 break;
4319 NegAbsElts.push_back(FabsSrc);
4320 }
4321 if (Elts.size() != NegAbsElts.size()) {
4322 // Neg
4323 Src = buildRegSequence(Elts, InsertPt, MRI);
4324 } else {
4325 // Neg and Abs
4326 Mods |= SISrcMods::NEG_HI;
4327 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
4328 }
4329 } else {
4330 assert(ModOpcode == TargetOpcode::G_FABS);
4331 // Abs
4332 Mods |= SISrcMods::NEG_HI;
4333 Src = buildRegSequence(Elts, InsertPt, MRI);
4334 }
4335}
4336
4337InstructionSelector::ComplexRendererFns
4338AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
4339 Register Src = Root.getReg();
4340 unsigned Mods = SISrcMods::OP_SEL_1;
4341 SmallVector<Register, 8> EltsF32;
4342
4343 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
4344 assert(BV->getNumSources() > 0);
4345 // Based on first element decide which mod we match, neg or abs
4346 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
4347 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
4348 ? AMDGPU::G_FNEG
4349 : AMDGPU::G_FABS;
4350 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
4351 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
4352 if (ElF32->getOpcode() != ModOpcode)
4353 break;
4354 EltsF32.push_back(ElF32->getOperand(1).getReg());
4355 }
4356
4357 // All elements had ModOpcode modifier
4358 if (BV->getNumSources() == EltsF32.size()) {
4359 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
4360 *MRI);
4361 }
4362 }
4363
4364 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4365 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4366}
4367
4368InstructionSelector::ComplexRendererFns
4369AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
4370 Register Src = Root.getReg();
4371 unsigned Mods = SISrcMods::OP_SEL_1;
4372 SmallVector<Register, 8> EltsV2F16;
4373
4374 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4375 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4376 Register FNegSrc;
4377 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4378 break;
4379 EltsV2F16.push_back(FNegSrc);
4380 }
4381
4382 // All elements had ModOpcode modifier
4383 if (CV->getNumSources() == EltsV2F16.size()) {
4384 Mods |= SISrcMods::NEG;
4385 Mods |= SISrcMods::NEG_HI;
4386 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4387 }
4388 }
4389
4390 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4391 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4392}
4393
4394InstructionSelector::ComplexRendererFns
4395AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4396 Register Src = Root.getReg();
4397 unsigned Mods = SISrcMods::OP_SEL_1;
4398 SmallVector<Register, 8> EltsV2F16;
4399
4400 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4401 assert(CV->getNumSources() > 0);
4402 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4403 // Based on first element decide which mod we match, neg or abs
4404 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4405 ? AMDGPU::G_FNEG
4406 : AMDGPU::G_FABS;
4407
4408 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4409 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4410 if (ElV2F16->getOpcode() != ModOpcode)
4411 break;
4412 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4413 }
4414
4415 // All elements had ModOpcode modifier
4416 if (CV->getNumSources() == EltsV2F16.size()) {
4417 MachineIRBuilder B(*Root.getParent());
4418 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4419 *MRI);
4420 }
4421 }
4422
4423 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4424 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4425}
4426
4427InstructionSelector::ComplexRendererFns
4428AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4429 std::optional<FPValueAndVReg> FPValReg;
4430 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4431 if (TII.isInlineConstant(FPValReg->Value)) {
4432 return {{[=](MachineInstrBuilder &MIB) {
4433 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4434 }}};
4435 }
4436 // Non-inlineable splat floats should not fall through to the integer
4437 // immediate checks.
4438 return {};
4439 }
4440
4441 APInt ICst;
4442 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4443 if (TII.isInlineConstant(ICst)) {
4444 return {
4445 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4446 }
4447 }
4448
4449 return {};
4450}
4451
4452InstructionSelector::ComplexRendererFns
4453AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4454 Register Src =
4455 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4456 unsigned Key = 0;
4457
4458 Register ShiftSrc;
4459 std::optional<ValueAndVReg> ShiftAmt;
4460 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4461 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4462 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4463 Key = ShiftAmt->Value.getZExtValue() / 8;
4464 Src = ShiftSrc;
4465 }
4466
4467 return {{
4468 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4469 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4470 }};
4471}
4472
4473InstructionSelector::ComplexRendererFns
4474AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4475
4476 Register Src =
4477 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4478 unsigned Key = 0;
4479
4480 Register ShiftSrc;
4481 std::optional<ValueAndVReg> ShiftAmt;
4482 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4483 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4484 ShiftAmt->Value.getZExtValue() == 16) {
4485 Src = ShiftSrc;
4486 Key = 1;
4487 }
4488
4489 return {{
4490 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4491 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4492 }};
4493}
4494
4495InstructionSelector::ComplexRendererFns
4496AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4497 Register Src;
4498 unsigned Mods;
4499 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4500
4501 // FIXME: Handle op_sel
4502 return {{
4503 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4504 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4505 }};
4506}
4507
4508InstructionSelector::ComplexRendererFns
4509AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4510 Register Src;
4511 unsigned Mods;
4512 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4513 /*IsCanonicalizing=*/true,
4514 /*AllowAbs=*/false,
4515 /*OpSel=*/false);
4516
4517 return {{
4518 [=](MachineInstrBuilder &MIB) {
4519 MIB.addReg(
4520 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4521 },
4522 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4523 }};
4524}
4525
4526InstructionSelector::ComplexRendererFns
4527AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4528 Register Src;
4529 unsigned Mods;
4530 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4531 /*IsCanonicalizing=*/true,
4532 /*AllowAbs=*/false,
4533 /*OpSel=*/true);
4534
4535 return {{
4536 [=](MachineInstrBuilder &MIB) {
4537 MIB.addReg(
4538 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4539 },
4540 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4541 }};
4542}
4543
4544bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4545 Register &Base,
4546 Register *SOffset,
4547 int64_t *Offset) const {
4548 MachineInstr *MI = Root.getParent();
4549 MachineBasicBlock *MBB = MI->getParent();
4550
4551 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4552 // then we can select all ptr + 32-bit offsets.
4553 SmallVector<GEPInfo, 4> AddrInfo;
4554 getAddrModeInfo(*MI, *MRI, AddrInfo);
4555
4556 if (AddrInfo.empty())
4557 return false;
4558
4559 const GEPInfo &GEPI = AddrInfo[0];
4560 std::optional<int64_t> EncodedImm;
4561
4562 if (SOffset && Offset) {
4563 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4564 /*HasSOffset=*/true);
4565 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4566 AddrInfo.size() > 1) {
4567 const GEPInfo &GEPI2 = AddrInfo[1];
4568 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4569 if (Register OffsetReg =
4570 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4571 Base = GEPI2.SgprParts[0];
4572 *SOffset = OffsetReg;
4573 *Offset = *EncodedImm;
4574 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4575 return true;
4576
4577 // For unbuffered smem loads, it is illegal for the Immediate Offset
4578 // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4579 // is negative. Handle the case where the Immediate Offset + SOffset
4580 // is negative.
4581 auto SKnown = KB->getKnownBits(*SOffset);
4582 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4583 return false;
4584
4585 return true;
4586 }
4587 }
4588 }
4589 return false;
4590 }
4591
4592 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4593 /*HasSOffset=*/false);
4594 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4595 Base = GEPI.SgprParts[0];
4596 *Offset = *EncodedImm;
4597 return true;
4598 }
4599
4600 // SGPR offset is unsigned.
4601 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4602 GEPI.Imm != 0) {
4603 // If we make it this far we have a load with a 32-bit immediate offset.
4604 // It is OK to select this using a sgpr offset, because we have already
4605 // failed trying to select this load into one of the _IMM variants since
4606 // the _IMM Patterns are considered before the _SGPR patterns.
4607 Base = GEPI.SgprParts[0];
4608 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4609 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4610 .addImm(GEPI.Imm);
4611 return true;
4612 }
4613
4614 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4615 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4616 Base = GEPI.SgprParts[0];
4617 *SOffset = OffsetReg;
4618 return true;
4619 }
4620 }
4621
4622 return false;
4623}
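// Rough summary of the SMRD forms tried above: an encodable immediate alone
// (the _IMM variants), a 32-bit SGPR offset when the immediate does not encode
// (the _SGPR variants, materialized via S_MOV_B32 if needed), or base + SGPR
// offset + immediate (the _SGPR_IMM variants) when both pieces are present,
// with an extra non-negativity check on subtargets using signed SMRD
// immediates.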
4624
4625InstructionSelector::ComplexRendererFns
4626AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4627 Register Base;
4628 int64_t Offset;
4629 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4630 return std::nullopt;
4631
4632 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4633 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4634}
4635
4636InstructionSelector::ComplexRendererFns
4637AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4638 SmallVector<GEPInfo, 4> AddrInfo;
4639 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4640
4641 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4642 return std::nullopt;
4643
4644 const GEPInfo &GEPInfo = AddrInfo[0];
4645 Register PtrReg = GEPInfo.SgprParts[0];
4646 std::optional<int64_t> EncodedImm =
4647 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4648 if (!EncodedImm)
4649 return std::nullopt;
4650
4651 return {{
4652 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4653 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4654 }};
4655}
4656
4657InstructionSelector::ComplexRendererFns
4658AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4659 Register Base, SOffset;
4660 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4661 return std::nullopt;
4662
4663 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4664 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4665}
4666
4667InstructionSelector::ComplexRendererFns
4668AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4669 Register Base, SOffset;
4670 int64_t Offset;
4671 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4672 return std::nullopt;
4673
4674 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4675 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4676 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4677}
4678
4679std::pair<Register, int>
4680AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4681 uint64_t FlatVariant) const {
4682 MachineInstr *MI = Root.getParent();
4683
4684 auto Default = std::pair(Root.getReg(), 0);
4685
4686 if (!STI.hasFlatInstOffsets())
4687 return Default;
4688
4689 Register PtrBase;
4690 int64_t ConstOffset;
4691 std::tie(PtrBase, ConstOffset) =
4692 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4693
4694 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4695 !isFlatScratchBaseLegal(Root.getReg())))
4696 return Default;
4697
4698 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4699 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4700 return Default;
4701
4702 return std::pair(PtrBase, ConstOffset);
4703}
4704
4705InstructionSelector::ComplexRendererFns
4706AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4707 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4708
4709 return {{
4710 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4711 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4712 }};
4713}
4714
4715InstructionSelector::ComplexRendererFns
4716AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4717 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4718
4719 return {{
4720 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4721 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4722 }};
4723}
4724
4725InstructionSelector::ComplexRendererFns
4726AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4727 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4728
4729 return {{
4730 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4731 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4732 }};
4733}
4734
4735// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4736InstructionSelector::ComplexRendererFns
4737AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4738 Register Addr = Root.getReg();
4739 Register PtrBase;
4740 int64_t ConstOffset;
4741 int64_t ImmOffset = 0;
4742
4743 // Match the immediate offset first, which canonically is moved as low as
4744 // possible.
4745 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4746
4747 if (ConstOffset != 0) {
4748 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4749 SIInstrFlags::FlatGlobal)) {
4750 Addr = PtrBase;
4751 ImmOffset = ConstOffset;
4752 } else {
4753 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4754 if (isSGPR(PtrBaseDef->Reg)) {
4755 if (ConstOffset > 0) {
4756 // Offset is too large.
4757 //
4758 // saddr + large_offset -> saddr +
4759 // (voffset = large_offset & ~MaxOffset) +
4760 // (large_offset & MaxOffset);
4761 int64_t SplitImmOffset, RemainderOffset;
4762 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4763 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4764
4765 if (isUInt<32>(RemainderOffset)) {
4766 MachineInstr *MI = Root.getParent();
4767 MachineBasicBlock *MBB = MI->getParent();
4768 Register HighBits =
4769 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4770
4771 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4772 HighBits)
4773 .addImm(RemainderOffset);
4774
4775 return {{
4776 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4777 [=](MachineInstrBuilder &MIB) {
4778 MIB.addReg(HighBits);
4779 }, // voffset
4780 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4781 }};
4782 }
4783 }
4784
4785 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4786 // is 1, we would need to perform 1 or 2 extra moves for each half of
4787 // the constant, so it is better to do a scalar add and then issue a
4788 // single VALU instruction to materialize zero. Otherwise it takes fewer
4789 // instructions to perform VALU adds with immediates or inline literals.
4790 unsigned NumLiterals =
4791 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
4792 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
4793 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4794 return std::nullopt;
4795 }
4796 }
4797 }
4798
4799 // Match the variable offset.
4800 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4801 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4802 // Look through the SGPR->VGPR copy.
4803 Register SAddr =
4804 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4805
4806 if (isSGPR(SAddr)) {
4807 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4808
4809 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4810 // inserted later.
4811 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4812 return {{[=](MachineInstrBuilder &MIB) { // saddr
4813 MIB.addReg(SAddr);
4814 },
4815 [=](MachineInstrBuilder &MIB) { // voffset
4816 MIB.addReg(VOffset);
4817 },
4818 [=](MachineInstrBuilder &MIB) { // offset
4819 MIB.addImm(ImmOffset);
4820 }}};
4821 }
4822 }
4823 }
4824
4825 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4826 // drop this.
4827 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4828 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4829 return std::nullopt;
4830
4831 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4832 // moves required to copy a 64-bit SGPR to VGPR.
4833 MachineInstr *MI = Root.getParent();
4834 MachineBasicBlock *MBB = MI->getParent();
4835 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4836
4837 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4838 .addImm(0);
4839
4840 return {{
4841 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4842 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4843 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4844 }};
4845}
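// All successful matches above produce the same (saddr, voffset, offset)
// operand triple: the 32-bit VGPR offset is either matched from a
// zero-extended G_PTR_ADD operand, materialized from the remainder of an
// over-large constant offset, or a V_MOV_B32 of zero when only an SGPR base
// is available.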
4846
4847InstructionSelector::ComplexRendererFns
4848AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4849 Register Addr = Root.getReg();
4850 Register PtrBase;
4851 int64_t ConstOffset;
4852 int64_t ImmOffset = 0;
4853
4854 // Match the immediate offset first, which canonically is moved as low as
4855 // possible.
4856 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4857
4858 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4859 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4860 SIInstrFlags::FlatScratch)) {
4861 Addr = PtrBase;
4862 ImmOffset = ConstOffset;
4863 }
4864
4865 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4866 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4867 int FI = AddrDef->MI->getOperand(1).getIndex();
4868 return {{
4869 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4870 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4871 }};
4872 }
4873
4874 Register SAddr = AddrDef->Reg;
4875
4876 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4877 Register LHS = AddrDef->MI->getOperand(1).getReg();
4878 Register RHS = AddrDef->MI->getOperand(2).getReg();
4879 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4880 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4881
4882 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4883 isSGPR(RHSDef->Reg)) {
4884 int FI = LHSDef->MI->getOperand(1).getIndex();
4885 MachineInstr &I = *Root.getParent();
4886 MachineBasicBlock *BB = I.getParent();
4887 const DebugLoc &DL = I.getDebugLoc();
4888 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4889
4890 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4891 .addFrameIndex(FI)
4892 .addReg(RHSDef->Reg)
4893 .setOperandDead(3); // Dead scc
4894 }
4895 }
4896
4897 if (!isSGPR(SAddr))
4898 return std::nullopt;
4899
4900 return {{
4901 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4902 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4903 }};
4904}
4905
4906// Check whether the flat scratch SVS swizzle bug affects this access.
4907bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4908 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4909 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4910 return false;
4911
4912 // The bug affects the swizzling of SVS accesses if there is any carry out
4913 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4914 // voffset to (soffset + inst_offset).
4915 auto VKnown = KB->getKnownBits(VAddr);
4916 auto SKnown = KnownBits::add(KB->getKnownBits(SAddr),
4917 KnownBits::makeConstant(APInt(32, ImmOffset)));
4918 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4919 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4920 return (VMax & 3) + (SMax & 3) >= 4;
4921}
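// Example of the carry-out condition above: if the low two bits of the VGPR
// address may be 3 and the low two bits of (soffset + inst_offset) may also
// be 3, then 3 + 3 = 6 carries out of bit 1 into bit 2, so the access is
// conservatively rejected on affected subtargets; that is exactly what
// (VMax & 3) + (SMax & 3) >= 4 tests.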
4922
4923InstructionSelector::ComplexRendererFns
4924AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4925 Register Addr = Root.getReg();
4926 Register PtrBase;
4927 int64_t ConstOffset;
4928 int64_t ImmOffset = 0;
4929
4930 // Match the immediate offset first, which canonically is moved as low as
4931 // possible.
4932 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4933
4934 Register OrigAddr = Addr;
4935 if (ConstOffset != 0 &&
4936 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4937 Addr = PtrBase;
4938 ImmOffset = ConstOffset;
4939 }
4940
4941 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4942 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4943 return std::nullopt;
4944
4945 Register RHS = AddrDef->MI->getOperand(2).getReg();
4946 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4947 return std::nullopt;
4948
4949 Register LHS = AddrDef->MI->getOperand(1).getReg();
4950 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4951
4952 if (OrigAddr != Addr) {
4953 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4954 return std::nullopt;
4955 } else {
4956 if (!isFlatScratchBaseLegalSV(OrigAddr))
4957 return std::nullopt;
4958 }
4959
4960 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4961 return std::nullopt;
4962
4963 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4964 int FI = LHSDef->MI->getOperand(1).getIndex();
4965 return {{
4966 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4967 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4968 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4969 }};
4970 }
4971
4972 if (!isSGPR(LHS))
4973 return std::nullopt;
4974
4975 return {{
4976 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4977 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4978 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4979 }};
4980}
4981
4982InstructionSelector::ComplexRendererFns
4983AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4984 MachineInstr *MI = Root.getParent();
4985 MachineBasicBlock *MBB = MI->getParent();
4986 MachineFunction *MF = MBB->getParent();
4987 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4988
4989 int64_t Offset = 0;
4990 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4991 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
4992 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4993
4994 // TODO: Should this be inside the render function? The iterator seems to
4995 // move.
4996 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4997 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4998 HighBits)
4999 .addImm(Offset & ~MaxOffset);
5000
5001 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5002 MIB.addReg(Info->getScratchRSrcReg());
5003 },
5004 [=](MachineInstrBuilder &MIB) { // vaddr
5005 MIB.addReg(HighBits);
5006 },
5007 [=](MachineInstrBuilder &MIB) { // soffset
5008 // Use constant zero for soffset and rely on eliminateFrameIndex
5009 // to choose the appropriate frame register if need be.
5010 MIB.addImm(0);
5011 },
5012 [=](MachineInstrBuilder &MIB) { // offset
5013 MIB.addImm(Offset & MaxOffset);
5014 }}};
5015 }
5016
5017 assert(Offset == 0 || Offset == -1);
5018
5019 // Try to fold a frame index directly into the MUBUF vaddr field, and any
5020 // offsets.
5021 std::optional<int> FI;
5022 Register VAddr = Root.getReg();
5023
5024 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5025 Register PtrBase;
5026 int64_t ConstOffset;
5027 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
5028 if (ConstOffset != 0) {
5029 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
5030 (!STI.privateMemoryResourceIsRangeChecked() ||
5031 KB->signBitIsZero(PtrBase))) {
5032 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
5033 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
5034 FI = PtrBaseDef->getOperand(1).getIndex();
5035 else
5036 VAddr = PtrBase;
5037 Offset = ConstOffset;
5038 }
5039 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5040 FI = RootDef->getOperand(1).getIndex();
5041 }
5042
5043 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5044 MIB.addReg(Info->getScratchRSrcReg());
5045 },
5046 [=](MachineInstrBuilder &MIB) { // vaddr
5047 if (FI)
5048 MIB.addFrameIndex(*FI);
5049 else
5050 MIB.addReg(VAddr);
5051 },
5052 [=](MachineInstrBuilder &MIB) { // soffset
5053 // Use constant zero for soffset and rely on eliminateFrameIndex
5054 // to choose the appropriate frame register if need be.
5055 MIB.addImm(0);
5056 },
5057 [=](MachineInstrBuilder &MIB) { // offset
5058 MIB.addImm(Offset);
5059 }}};
5060}
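// Illustrative sketch of the constant-address split above, assuming the
// pre-GFX12 limit MaxOffset == 0xFFF: a constant scratch address of 0x1234
// becomes V_MOV_B32 0x1000 in vaddr plus an immediate offset of 0x234, so
// vaddr + offset still addresses byte 0x1234.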
5061
5062bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
5063 int64_t Offset) const {
5064 if (!isUInt<16>(Offset))
5065 return false;
5066
5067 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5068 return true;
5069
5070 // On Southern Islands, instructions with a negative base value and an
5071 // offset don't seem to work.
5072 return KB->signBitIsZero(Base);
5073}
5074
5075bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
5076 int64_t Offset1,
5077 unsigned Size) const {
5078 if (Offset0 % Size != 0 || Offset1 % Size != 0)
5079 return false;
5080 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
5081 return false;
5082
5083 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5084 return true;
5085
5086 // On Southern Islands, instructions with a negative base value and an
5087 // offset don't seem to work.
5088 return KB->signBitIsZero(Base);
5089}
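// Illustrative example, assuming Size == 4 (e.g. ds_read2_b32): byte offsets
// 40 and 44 are multiples of 4 and encode as offset0 == 10, offset1 == 11,
// both of which fit in 8 bits. A pair such as 1024/1028 would encode as
// 256/257, which no longer fits in 8 bits, so the add is left in the address
// instead of being folded into the offsets.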
5090
5091// Return whether the operation has NoUnsignedWrap property.
5092static bool isNoUnsignedWrap(MachineInstr *Addr) {
5093 return Addr->getOpcode() == TargetOpcode::G_OR ||
5094 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
5095 Addr->getFlag(MachineInstr::NoUWrap));
5096}
5097
5098// Check that the base address of flat scratch load/store in the form of `base +
5099// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
5100// requirement). We always treat the first operand as the base address here.
5101bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
5102 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5103
5104 if (isNoUnsignedWrap(AddrMI))
5105 return true;
5106
5107 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5108 // values.
5109 if (STI.hasSignedScratchOffsets())
5110 return true;
5111
5112 Register LHS = AddrMI->getOperand(1).getReg();
5113 Register RHS = AddrMI->getOperand(2).getReg();
5114
5115 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
5116 std::optional<ValueAndVReg> RhsValReg =
5117 getIConstantVRegValWithLookThrough(RHS, *MRI);
5118 // If the immediate offset is negative and within certain range, the base
5119 // address cannot also be negative. If the base is also negative, the sum
5120 // would be either negative or much larger than the valid range of scratch
5121 // memory a thread can access.
5122 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
5123 RhsValReg->Value.getSExtValue() > -0x40000000)
5124 return true;
5125 }
5126
5127 return KB->signBitIsZero(LHS);
5128}
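// Illustrative example of the constant-offset reasoning above (hypothetical
// registers): for (ptr_add %base, -16) the offset is negative but greater
// than -0x40000000, so a negative %base would make the sum either negative
// or far beyond the scratch range a thread can access; the base is therefore
// treated as non-negative without a KnownBits query. With a non-constant RHS
// the code instead has to prove that the sign bit of the first operand is
// zero.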
5129
5130// Check that the address values in SGPR/VGPR are legal for flat scratch in
5131// the form of: SGPR + VGPR.
5132bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
5133 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5134
5135 if (isNoUnsignedWrap(AddrMI))
5136 return true;
5137
5138 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5139 // values.
5140 if (STI.hasSignedScratchOffsets())
5141 return true;
5142
5143 Register LHS = AddrMI->getOperand(1).getReg();
5144 Register RHS = AddrMI->getOperand(2).getReg();
5145 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
5146}
5147
5148// Check that the address values in SGPR/VGPR are legal for flat scratch in
5149// the form of: SGPR + VGPR + Imm.
5150bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
5151 Register Addr) const {
5152 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5153 // values.
5154 if (STI.hasSignedScratchOffsets())
5155 return true;
5156
5157 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
5158 Register Base = AddrMI->getOperand(1).getReg();
5159 std::optional<DefinitionAndSourceRegister> BaseDef =
5160 getDefSrcRegIgnoringCopies(Base, *MRI);
5161 std::optional<ValueAndVReg> RHSOffset =
5162 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
5163 assert(RHSOffset);
5164
5165 // If the immediate offset is negative and within certain range, the base
5166 // address cannot also be negative. If the base is also negative, the sum
5167 // would be either negative or much larger than the valid range of scratch
5168 // memory a thread can access.
5169 if (isNoUnsignedWrap(BaseDef->MI) &&
5170 (isNoUnsignedWrap(AddrMI) ||
5171 (RHSOffset->Value.getSExtValue() < 0 &&
5172 RHSOffset->Value.getSExtValue() > -0x40000000)))
5173 return true;
5174
5175 Register LHS = BaseDef->MI->getOperand(1).getReg();
5176 Register RHS = BaseDef->MI->getOperand(2).getReg();
5177 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
5178}
5179
5180bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
5181 unsigned ShAmtBits) const {
5182 assert(MI.getOpcode() == TargetOpcode::G_AND);
5183
5184 std::optional<APInt> RHS =
5185 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
5186 if (!RHS)
5187 return false;
5188
5189 if (RHS->countr_one() >= ShAmtBits)
5190 return true;
5191
5192 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
5193 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
5194}
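// Illustrative example: for a 32-bit shift the caller passes ShAmtBits == 5.
// An AND mask of 31 (0b11111) has five trailing ones, so it cannot change
// the bits the hardware shift actually consumes and is redundant. With a
// smaller mask, the KnownBits query can still prove redundancy when the
// masked-off low bits of the LHS are already known to be zero.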
5195
5196InstructionSelector::ComplexRendererFns
5197AMDGPUInstructionSelector::selectMUBUFScratchOffset(
5198 MachineOperand &Root) const {
5199 Register Reg = Root.getReg();
5200 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5201
5202 std::optional<DefinitionAndSourceRegister> Def =
5203 getDefSrcRegIgnoringCopies(Reg, *MRI);
5204 assert(Def && "this shouldn't be an optional result");
5205 Reg = Def->Reg;
5206
5207 if (Register WaveBase = getWaveAddress(Def->MI)) {
5208 return {{
5209 [=](MachineInstrBuilder &MIB) { // rsrc
5210 MIB.addReg(Info->getScratchRSrcReg());
5211 },
5212 [=](MachineInstrBuilder &MIB) { // soffset
5213 MIB.addReg(WaveBase);
5214 },
5215 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
5216 }};
5217 }
5218
5219 int64_t Offset = 0;
5220
5221 // FIXME: Copy check is a hack
5222 Register BasePtr;
5223 if (mi_match(Reg, *MRI,
5224 m_GPtrAdd(m_Reg(BasePtr),
5225 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
5226 if (!TII.isLegalMUBUFImmOffset(Offset))
5227 return {};
5228 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
5229 Register WaveBase = getWaveAddress(BasePtrDef);
5230 if (!WaveBase)
5231 return {};
5232
5233 return {{
5234 [=](MachineInstrBuilder &MIB) { // rsrc
5235 MIB.addReg(Info->getScratchRSrcReg());
5236 },
5237 [=](MachineInstrBuilder &MIB) { // soffset
5238 MIB.addReg(WaveBase);
5239 },
5240 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5241 }};
5242 }
5243
5244 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
5245 !TII.isLegalMUBUFImmOffset(Offset))
5246 return {};
5247
5248 return {{
5249 [=](MachineInstrBuilder &MIB) { // rsrc
5250 MIB.addReg(Info->getScratchRSrcReg());
5251 },
5252 [=](MachineInstrBuilder &MIB) { // soffset
5253 MIB.addImm(0);
5254 },
5255 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
5256 }};
5257}
5258
5259std::pair<Register, unsigned>
5260AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
5261 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5262 int64_t ConstAddr = 0;
5263
5264 Register PtrBase;
5265 int64_t Offset;
5266 std::tie(PtrBase, Offset) =
5267 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5268
5269 if (Offset) {
5270 if (isDSOffsetLegal(PtrBase, Offset)) {
5271 // (add n0, c0)
5272 return std::pair(PtrBase, Offset);
5273 }
5274 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5275 // TODO
5276
5277
5278 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5279 // TODO
5280
5281 }
5282
5283 return std::pair(Root.getReg(), 0);
5284}
5285
5286InstructionSelector::ComplexRendererFns
5287AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
5288 Register Reg;
5289 unsigned Offset;
5290 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
5291 return {{
5292 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5293 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
5294 }};
5295}
5296
5297InstructionSelector::ComplexRendererFns
5298AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
5299 return selectDSReadWrite2(Root, 4);
5300}
5301
5302InstructionSelector::ComplexRendererFns
5303AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
5304 return selectDSReadWrite2(Root, 8);
5305}
5306
5307InstructionSelector::ComplexRendererFns
5308AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
5309 unsigned Size) const {
5310 Register Reg;
5311 unsigned Offset;
5312 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
5313 return {{
5314 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5315 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5316 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
5317 }};
5318}
5319
5320std::pair<Register, unsigned>
5321AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
5322 unsigned Size) const {
5323 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5324 int64_t ConstAddr = 0;
5325
5326 Register PtrBase;
5327 int64_t Offset;
5328 std::tie(PtrBase, Offset) =
5329 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5330
5331 if (Offset) {
5332 int64_t OffsetValue0 = Offset;
5333 int64_t OffsetValue1 = Offset + Size;
5334 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
5335 // (add n0, c0)
5336 return std::pair(PtrBase, OffsetValue0 / Size);
5337 }
5338 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5339 // TODO
5340
5341 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5342 // TODO
5343
5344 }
5345
5346 return std::pair(Root.getReg(), 0);
5347}
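// Illustrative example (hypothetical registers), with Size == 4: an address
// of (ptr_add %base, 24) yields OffsetValue0 == 24 and OffsetValue1 == 28,
// both multiples of 4, and 24/4 == 6 fits in 8 bits, so this returns
// {%base, 6} and selectDSReadWrite2 renders offset0 == 6, offset1 == 7.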
5348
5349/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
5350/// the base value with the constant offset. There may be intervening copies
5351/// between \p Root and the identified constant. Returns \p Root, 0 if this does
5352/// not match the pattern.
5353std::pair<Register, int64_t>
5354AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
5355 Register Root, const MachineRegisterInfo &MRI) const {
5356 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
5357 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
5358 return {Root, 0};
5359
5360 MachineOperand &RHS = RootI->getOperand(2);
5361 std::optional<ValueAndVReg> MaybeOffset =
5362 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5363 if (!MaybeOffset)
5364 return {Root, 0};
5365 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
5366}
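// Illustrative MIR shape this helper matches (register names hypothetical):
//   %c:_(s32) = G_CONSTANT i32 16
//   %sum:_(p5) = G_PTR_ADD %base, %c
// getPtrBaseWithConstantOffset(%sum, MRI) returns {%base, 16}; any other
// shape, including a non-constant right-hand side, returns {%sum, 0}.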
5367
5368static void addZeroImm(MachineInstrBuilder &MIB) {
5369 MIB.addImm(0);
5370}
5371
5372/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5373/// BasePtr is not valid, a null base pointer will be used.
5374static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5375 uint32_t FormatLo, uint32_t FormatHi,
5376 Register BasePtr) {
5377 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5378 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5379 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5380 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5381
5382 B.buildInstr(AMDGPU::S_MOV_B32)
5383 .addDef(RSrc2)
5384 .addImm(FormatLo);
5385 B.buildInstr(AMDGPU::S_MOV_B32)
5386 .addDef(RSrc3)
5387 .addImm(FormatHi);
5388
5389 // Build the half of the subregister with the constants before building the
5390 // full 128-bit register. If we are building multiple resource descriptors,
5391 // this will allow CSEing of the 2-component register.
5392 B.buildInstr(AMDGPU::REG_SEQUENCE)
5393 .addDef(RSrcHi)
5394 .addReg(RSrc2)
5395 .addImm(AMDGPU::sub0)
5396 .addReg(RSrc3)
5397 .addImm(AMDGPU::sub1);
5398
5399 Register RSrcLo = BasePtr;
5400 if (!BasePtr) {
5401 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5402 B.buildInstr(AMDGPU::S_MOV_B64)
5403 .addDef(RSrcLo)
5404 .addImm(0);
5405 }
5406
5407 B.buildInstr(AMDGPU::REG_SEQUENCE)
5408 .addDef(RSrc)
5409 .addReg(RSrcLo)
5410 .addImm(AMDGPU::sub0_sub1)
5411 .addReg(RSrcHi)
5412 .addImm(AMDGPU::sub2_sub3);
5413
5414 return RSrc;
5415}
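// Rough sketch of the resulting 128-bit SRD, as used by the two helpers
// below: sub0_sub1 holds the 64-bit base pointer (or 0 when BasePtr is
// invalid), sub2 holds FormatLo (used as num_records by the callers) and
// sub3 holds FormatHi (the default data-format bits). Building the constant
// sub2/sub3 half first lets CSE share it between descriptors that differ
// only in their base pointer.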
5416
5417static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5418 const SIInstrInfo &TII, Register BasePtr) {
5419 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5420
5421 // FIXME: Why are half the "default" bits ignored based on the addressing
5422 // mode?
5423 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5424}
5425
5426static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5427 const SIInstrInfo &TII, Register BasePtr) {
5428 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5429
5430 // FIXME: Why are half the "default" bits ignored based on the addressing
5431 // mode?
5432 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5433}
5434
5435AMDGPUInstructionSelector::MUBUFAddressData
5436AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5437 MUBUFAddressData Data;
5438 Data.N0 = Src;
5439
5440 Register PtrBase;
5441 int64_t Offset;
5442
5443 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5444 if (isUInt<32>(Offset)) {
5445 Data.N0 = PtrBase;
5446 Data.Offset = Offset;
5447 }
5448
5449 if (MachineInstr *InputAdd
5450 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5451 Data.N2 = InputAdd->getOperand(1).getReg();
5452 Data.N3 = InputAdd->getOperand(2).getReg();
5453
5454 // FIXME: Need to fix extra SGPR->VGPR copies inserted
5455 // FIXME: Don't know this was defined by operand 0
5456 //
5457 // TODO: Remove this when we have copy folding optimizations after
5458 // RegBankSelect.
5459 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5460 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5461 }
5462
5463 return Data;
5464}
5465
5466/// Return if the addr64 mubuf mode should be used for the given address.
5467bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5468 // (ptr_add N2, N3) -> addr64, or
5469 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5470 if (Addr.N2)
5471 return true;
5472
5473 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5474 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5475}
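// Example of the decision above: an address that decomposes into
// (ptr_add N2, N3) always takes the addr64 form, and so does a lone N0 that
// lives in a VGPR; a uniform SGPR base with at most a constant offset keeps
// the address entirely in the resource descriptor and offset fields.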
5476
5477/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5478/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5479/// component.
5480void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5481 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5482 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5483 return;
5484
5485 // Illegal offset, store it in soffset.
5486 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5487 B.buildInstr(AMDGPU::S_MOV_B32)
5488 .addDef(SOffset)
5489 .addImm(ImmOffset);
5490 ImmOffset = 0;
5491}
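// Illustrative example, assuming a subtarget whose MUBUF immediate field
// tops out at 4095: an offset of 8192 is not encodable, so it is
// materialized with S_MOV_B32 into soffset and ImmOffset is reset to 0;
// since the hardware adds soffset and the immediate, the effective address
// is unchanged.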
5492
5493bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5494 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5495 Register &SOffset, int64_t &Offset) const {
5496 // FIXME: Predicates should stop this from reaching here.
5497 // addr64 bit was removed for volcanic islands.
5498 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5499 return false;
5500
5501 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5502 if (!shouldUseAddr64(AddrData))
5503 return false;
5504
5505 Register N0 = AddrData.N0;
5506 Register N2 = AddrData.N2;
5507 Register N3 = AddrData.N3;
5508 Offset = AddrData.Offset;
5509
5510 // Base pointer for the SRD.
5511 Register SRDPtr;
5512
5513 if (N2) {
5514 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5515 assert(N3);
5516 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5517 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5518 // addr64, and construct the default resource from a 0 address.
5519 VAddr = N0;
5520 } else {
5521 SRDPtr = N3;
5522 VAddr = N2;
5523 }
5524 } else {
5525 // N2 is not divergent.
5526 SRDPtr = N2;
5527 VAddr = N3;
5528 }
5529 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5530 // Use the default null pointer in the resource
5531 VAddr = N0;
5532 } else {
5533 // N0 -> offset, or
5534 // (N0 + C1) -> offset
5535 SRDPtr = N0;
5536 }
5537
5538 MachineIRBuilder B(*Root.getParent());
5539 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5540 splitIllegalMUBUFOffset(B, SOffset, Offset);
5541 return true;
5542}
5543
5544bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5545 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5546 int64_t &Offset) const {
5547
5548 // FIXME: Pattern should not reach here.
5549 if (STI.useFlatForGlobal())
5550 return false;
5551
5552 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5553 if (shouldUseAddr64(AddrData))
5554 return false;
5555
5556 // N0 -> offset, or
5557 // (N0 + C1) -> offset
5558 Register SRDPtr = AddrData.N0;
5559 Offset = AddrData.Offset;
5560
5561 // TODO: Look through extensions for 32-bit soffset.
5562 MachineIRBuilder B(*Root.getParent());
5563
5564 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5565 splitIllegalMUBUFOffset(B, SOffset, Offset);
5566 return true;
5567}
5568
5569InstructionSelector::ComplexRendererFns
5570AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5571 Register VAddr;
5572 Register RSrcReg;
5573 Register SOffset;
5574 int64_t Offset = 0;
5575
5576 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5577 return {};
5578
5579 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5580 // pattern.
5581 return {{
5582 [=](MachineInstrBuilder &MIB) { // rsrc
5583 MIB.addReg(RSrcReg);
5584 },
5585 [=](MachineInstrBuilder &MIB) { // vaddr
5586 MIB.addReg(VAddr);
5587 },
5588 [=](MachineInstrBuilder &MIB) { // soffset
5589 if (SOffset)
5590 MIB.addReg(SOffset);
5591 else if (STI.hasRestrictedSOffset())
5592 MIB.addReg(AMDGPU::SGPR_NULL);
5593 else
5594 MIB.addImm(0);
5595 },
5596 [=](MachineInstrBuilder &MIB) { // offset
5597 MIB.addImm(Offset);
5598 },
5599 addZeroImm, // cpol
5600 addZeroImm, // tfe
5601 addZeroImm // swz
5602 }};
5603}
5604
5605InstructionSelector::ComplexRendererFns
5606AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5607 Register RSrcReg;
5608 Register SOffset;
5609 int64_t Offset = 0;
5610
5611 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5612 return {};
5613
5614 return {{
5615 [=](MachineInstrBuilder &MIB) { // rsrc
5616 MIB.addReg(RSrcReg);
5617 },
5618 [=](MachineInstrBuilder &MIB) { // soffset
5619 if (SOffset)
5620 MIB.addReg(SOffset);
5621 else if (STI.hasRestrictedSOffset())
5622 MIB.addReg(AMDGPU::SGPR_NULL);
5623 else
5624 MIB.addImm(0);
5625 },
5626 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5627 addZeroImm, // cpol
5628 addZeroImm, // tfe
5629 addZeroImm, // swz
5630 }};
5631}
5632
5633InstructionSelector::ComplexRendererFns
5634AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5635
5636 Register SOffset = Root.getReg();
5637
5638 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5639 SOffset = AMDGPU::SGPR_NULL;
5640
5641 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5642}
5643
5644/// Get an immediate that must be 32-bits, and treated as zero extended.
5645static std::optional<uint64_t>
5646getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5647 // getIConstantVRegVal sexts any values, so see if that matters.
5648 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5649 if (!OffsetVal || !isInt<32>(*OffsetVal))
5650 return std::nullopt;
5651 return Lo_32(*OffsetVal);
5652}
5653
5654InstructionSelector::ComplexRendererFns
5655AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5656 std::optional<uint64_t> OffsetVal =
5657 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
5658 if (!OffsetVal)
5659 return {};
5660
5661 std::optional<int64_t> EncodedImm =
5662 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5663 if (!EncodedImm)
5664 return {};
5665
5666 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5667}
5668
5669InstructionSelector::ComplexRendererFns
5670AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5671 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
5672
5673 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5674 if (!OffsetVal)
5675 return {};
5676
5677 std::optional<int64_t> EncodedImm =
5678 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5679 if (!EncodedImm)
5680 return {};
5681
5682 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5683}
5684
5685InstructionSelector::ComplexRendererFns
5686AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5687 // Match the (soffset + offset) pair as a 32-bit register base and
5688 // an immediate offset.
5689 Register SOffset;
5690 unsigned Offset;
5691 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5692 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5693 if (!SOffset)
5694 return std::nullopt;
5695
5696 std::optional<int64_t> EncodedOffset =
5697 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5698 if (!EncodedOffset)
5699 return std::nullopt;
5700
5701 assert(MRI->getType(SOffset) == LLT::scalar(32));
5702 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5703 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5704}
5705
5706std::pair<Register, unsigned>
5707AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5708 bool &Matched) const {
5709 Matched = false;
5710
5711 Register Src;
5712 unsigned Mods;
5713 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5714
5715 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
5716 assert(MRI->getType(Src) == LLT::scalar(16));
5717
5718 // Only change Src if src modifier could be gained. In such cases new Src
5719 // could be sgpr but this does not violate the constant bus restriction
5720 // for the instruction that is being selected.
5721 Src = stripBitCast(Src, *MRI);
5722
5723 const auto CheckAbsNeg = [&]() {
5724 // Be careful about folding modifiers if we already have an abs. fneg is
5725 // applied last, so we don't want to apply an earlier fneg.
5726 if ((Mods & SISrcMods::ABS) == 0) {
5727 unsigned ModsTmp;
5728 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
5729
5730 if ((ModsTmp & SISrcMods::NEG) != 0)
5731 Mods ^= SISrcMods::NEG;
5732
5733 if ((ModsTmp & SISrcMods::ABS) != 0)
5734 Mods |= SISrcMods::ABS;
5735 }
5736 };
5737
5738 CheckAbsNeg();
5739
5740 // op_sel/op_sel_hi decide the source type and source.
5741 // If the source's op_sel_hi is set, it indicates to do a conversion from
5742 // fp16. If the source's op_sel is set, it picks the high half of the
5743 // source register.
5744
5745 Mods |= SISrcMods::OP_SEL_1;
5746
5747 if (isExtractHiElt(*MRI, Src, Src)) {
5748 Mods |= SISrcMods::OP_SEL_0;
5749 CheckAbsNeg();
5750 }
5751
5752 Matched = true;
5753 }
5754
5755 return {Src, Mods};
5756}
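// Illustrative example (hypothetical values): for a mad-mix operand of the
// form fpext(f16 %hi) where %hi was extracted from the high half of a
// 32-bit register, the fpext is stripped, OP_SEL_1 requests the fp16->fp32
// conversion, OP_SEL_0 selects the high half, and any fneg/fabs folded on
// the way toggle the NEG/ABS bits, with Matched set to true.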
5757
5758InstructionSelector::ComplexRendererFns
5759AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5760 MachineOperand &Root) const {
5761 Register Src;
5762 unsigned Mods;
5763 bool Matched;
5764 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5765 if (!Matched)
5766 return {};
5767
5768 return {{
5769 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5770 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5771 }};
5772}
5773
5774InstructionSelector::ComplexRendererFns
5775AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5776 Register Src;
5777 unsigned Mods;
5778 bool Matched;
5779 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5780
5781 return {{
5782 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5783 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5784 }};
5785}
5786
5787bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5788 MachineInstr &I, Intrinsic::ID IntrID) const {
5789 MachineBasicBlock *MBB = I.getParent();
5790 const DebugLoc &DL = I.getDebugLoc();
5791 Register CCReg = I.getOperand(0).getReg();
5792
5793 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5794 .addImm(I.getOperand(2).getImm());
5795
5796 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5797
5798 I.eraseFromParent();
5799 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5800 *MRI);
5801}
5802
5803bool AMDGPUInstructionSelector::selectSGetBarrierState(
5804 MachineInstr &I, Intrinsic::ID IntrID) const {
5805 MachineBasicBlock *MBB = I.getParent();
5806 const DebugLoc &DL = I.getDebugLoc();
5807 MachineOperand BarOp = I.getOperand(2);
5808 std::optional<int64_t> BarValImm =
5809 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5810
5811 if (!BarValImm) {
5812 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5813 .addReg(BarOp.getReg());
5814 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5815 }
5816 MachineInstrBuilder MIB;
5817 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
5818 : AMDGPU::S_GET_BARRIER_STATE_M0;
5819 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5820
5821 auto DstReg = I.getOperand(0).getReg();
5822 const TargetRegisterClass *DstRC =
5823 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
5824 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
5825 return false;
5826 MIB.addDef(DstReg);
5827 if (BarValImm) {
5828 MIB.addImm(*BarValImm);
5829 }
5830 I.eraseFromParent();
5831 return true;
5832}
5833
5834unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5835 if (HasInlineConst) {
5836 switch (IntrID) {
5837 default:
5838 llvm_unreachable("not a named barrier op");
5839 case Intrinsic::amdgcn_s_barrier_join:
5840 return AMDGPU::S_BARRIER_JOIN_IMM;
5841 case Intrinsic::amdgcn_s_wakeup_barrier:
5842 return AMDGPU::S_WAKEUP_BARRIER_IMM;
5843 case Intrinsic::amdgcn_s_get_named_barrier_state:
5844 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5845 };
5846 } else {
5847 switch (IntrID) {
5848 default:
5849 llvm_unreachable("not a named barrier op");
5850 case Intrinsic::amdgcn_s_barrier_join:
5851 return AMDGPU::S_BARRIER_JOIN_M0;
5852 case Intrinsic::amdgcn_s_wakeup_barrier:
5853 return AMDGPU::S_WAKEUP_BARRIER_M0;
5854 case Intrinsic::amdgcn_s_get_named_barrier_state:
5855 return AMDGPU::S_GET_BARRIER_STATE_M0;
5856 };
5857 }
5858}
5859
5860bool AMDGPUInstructionSelector::selectNamedBarrierInit(
5861 MachineInstr &I, Intrinsic::ID IntrID) const {
5862 MachineBasicBlock *MBB = I.getParent();
5863 const DebugLoc &DL = I.getDebugLoc();
5864 MachineOperand BarOp = I.getOperand(1);
5865 MachineOperand CntOp = I.getOperand(2);
5866
5867 // BarID = (BarOp >> 4) & 0x3F
5868 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5869 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
5870 .add(BarOp)
5871 .addImm(4u)
5872 .setOperandDead(3); // Dead scc
5873
5874 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5875 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
5876 .addReg(TmpReg0)
5877 .addImm(0x3F)
5878 .setOperandDead(3); // Dead scc
5879
5880 // MO = ((CntOp & 0x3F) << shAmt) | BarID
5881 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5882 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
5883 .add(CntOp)
5884 .addImm(0x3F)
5885 .setOperandDead(3); // Dead scc
5886
5887 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5888 constexpr unsigned ShAmt = 16;
5889 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
5890 .addReg(TmpReg2)
5891 .addImm(ShAmt)
5892 .setOperandDead(3); // Dead scc
5893
5894 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5895 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
5896 .addReg(TmpReg1)
5897 .addReg(TmpReg3)
5898 .setOperandDead(3); // Dead scc;
5899
5900 auto CopyMIB =
5901 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
5902 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5903
5904 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
5905 ? AMDGPU::S_BARRIER_INIT_M0
5906 : AMDGPU::S_BARRIER_SIGNAL_M0;
5907 MachineInstrBuilder MIB;
5908 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5909
5910 I.eraseFromParent();
5911 return true;
5912}
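// Worked example of the M0 encoding above (hypothetical operand values):
// with BarOp == 0x150 and CntOp == 5, BarID == (0x150 >> 4) & 0x3F == 0x15,
// the member count field is (5 & 0x3F) << 16 == 0x50000, and M0 is set to
// 0x50015 before the barrier instruction is issued.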
5913
5914bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5915 MachineInstr &I, Intrinsic::ID IntrID) const {
5916 MachineBasicBlock *MBB = I.getParent();
5917 const DebugLoc &DL = I.getDebugLoc();
5918 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
5919 ? I.getOperand(2)
5920 : I.getOperand(1);
5921 std::optional<int64_t> BarValImm =
5922 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5923
5924 if (!BarValImm) {
5925 // BarID = (BarOp >> 4) & 0x3F
5926 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5927 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
5928 .addReg(BarOp.getReg())
5929 .addImm(4u)
5930 .setOperandDead(3); // Dead scc;
5931
5932 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5933 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
5934 .addReg(TmpReg0)
5935 .addImm(0x3F)
5936 .setOperandDead(3); // Dead scc;
5937
5938 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5939 .addReg(TmpReg1);
5940 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5941 }
5942
5943 MachineInstrBuilder MIB;
5944 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5945 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5946
5947 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
5948 auto DstReg = I.getOperand(0).getReg();
5949 const TargetRegisterClass *DstRC =
5950 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
5951 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
5952 return false;
5953 MIB.addDef(DstReg);
5954 }
5955
5956 if (BarValImm) {
5957 auto BarId = ((*BarValImm) >> 4) & 0x3F;
5958 MIB.addImm(BarId);
5959 }
5960
5961 I.eraseFromParent();
5962 return true;
5963}
5964
5965void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5966 const MachineInstr &MI,
5967 int OpIdx) const {
5968 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5969 "Expected G_CONSTANT");
5970 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5971}
5972
5973void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5974 const MachineInstr &MI,
5975 int OpIdx) const {
5976 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5977 "Expected G_CONSTANT");
5978 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5979}
5980
5981void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
5982 const MachineInstr &MI,
5983 int OpIdx) const {
5984 const MachineOperand &Op = MI.getOperand(1);
5985 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
5986 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5987}
5988
5989void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5990 const MachineInstr &MI,
5991 int OpIdx) const {
5992 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5993 "Expected G_CONSTANT");
5994 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5995}
5996
5997/// This only really exists to satisfy DAG type checking machinery, so is a
5998/// no-op here.
5999void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
6000 const MachineInstr &MI,
6001 int OpIdx) const {
6002 const MachineOperand &Op = MI.getOperand(OpIdx);
6003 int64_t Imm;
6004 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
6005 MIB.addImm(Imm);
6006 else
6007 MIB.addImm(Op.getImm());
6008}
6009
6010void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
6011 const MachineInstr &MI,
6012 int OpIdx) const {
6013 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
6014}
6015
6016void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
6017 const MachineInstr &MI,
6018 int OpIdx) const {
6019 assert(OpIdx >= 0 && "expected to match an immediate operand");
6020 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6021}
6022
6023void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
6024 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6025 assert(OpIdx >= 0 && "expected to match an immediate operand");
6026 MIB.addImm(
6027 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6028}
6029
6030void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
6031 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6032 assert(OpIdx >= 0 && "expected to match an immediate operand");
6033 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
6034 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
6035 : (int64_t)SISrcMods::DST_OP_SEL);
6036}
6037
6038void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
6039 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6040 assert(OpIdx >= 0 && "expected to match an immediate operand");
6041 MIB.addImm(
6042 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6043}
6044
6045void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
6046 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6047 assert(OpIdx >= 0 && "expected to match an immediate operand");
6048 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
6049 ? (int64_t)(SISrcMods::OP_SEL_0)
6050 : 0);
6051}
6052
6053void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
6054 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6055 assert(OpIdx >= 0 && "expected to match an immediate operand");
6056 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
6057 : 0);
6058}
6059
6060void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
6061 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6062 assert(OpIdx >= 0 && "expected to match an immediate operand");
6063 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
6064 : 0);
6065}
6066
6067void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
6068 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6069 assert(OpIdx >= 0 && "expected to match an immediate operand");
6070 MIB.addImm(
6071 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6072}
6073
6074void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
6075 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6076 assert(OpIdx >= 0 && "expected to match an immediate operand");
6077 MIB.addImm(
6078 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0);
6079}
6080
6081void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
6082 const MachineInstr &MI,
6083 int OpIdx) const {
6084 assert(OpIdx >= 0 && "expected to match an immediate operand");
6085 MIB.addImm(MI.getOperand(OpIdx).getImm() &
6086 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
6087 : AMDGPU::CPol::ALL_pregfx12));
6088}
6089
6090void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
6091 const MachineInstr &MI,
6092 int OpIdx) const {
6093 assert(OpIdx >= 0 && "expected to match an immediate operand");
6094 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
6095 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
6096 : AMDGPU::CPol::SWZ_pregfx12);
6097 MIB.addImm(Swizzle);
6098}
6099
6100void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
6101 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6102 assert(OpIdx >= 0 && "expected to match an immediate operand");
6103 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
6104 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
6105 : AMDGPU::CPol::ALL_pregfx12);
6106 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
6107}
6108
6109void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
6110 const MachineInstr &MI,
6111 int OpIdx) const {
6112 MIB.addFrameIndex(MI.getOperand(1).getIndex());
6113}
6114
6115void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
6116 const MachineInstr &MI,
6117 int OpIdx) const {
6118 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
6119 int ExpVal = APF.getExactLog2Abs();
6120 assert(ExpVal != INT_MIN);
6121 MIB.addImm(ExpVal);
6122}
6123
6124void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
6125 const MachineInstr &MI,
6126 int OpIdx) const {
6127 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
6128 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
6129 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
6130 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
6131 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
6132}
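// Worked mapping of the (Imm + 3) % 4 transform described above:
//   TowardZero (0) -> 3, NearestTiesToEven (1) -> 0,
//   TowardPositive (2) -> 1, TowardNegative (3) -> 2.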
6133
6134/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
6135void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
6136 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6137 unsigned Val = MI.getOperand(OpIdx).getImm();
6138 unsigned New = 0;
6139 if (Val & 0x1)
6140 New |= SISrcMods::OP_SEL_0;
6141 if (Val & 0x2)
6142 New |= SISrcMods::OP_SEL_1;
6143 MIB.addImm(New);
6144}
6145
6146bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
6147 return TII.isInlineConstant(Imm);
6148}
6149
6150bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
6151 return TII.isInlineConstant(Imm);
6152}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
#define GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
static Register getWaveAddress(const MachineInstr *Def)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelKnownBits &KnownBits)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
static void addZeroImm(MachineInstrBuilder &MIB)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static bool isConstant(const MachineInstr &MI)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg)
Match a zero extend from a 32-bit value to 64-bits.
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
This file declares the targeting of the InstructionSelector class for AMDGPU.
static const LLT S1
AMDGPU Register Bank Select
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
#define P(N)
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
Value * RHS
Value * LHS
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM)
static const char * getName()
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
void setupMF(MachineFunction &MF, GISelKnownBits *KB, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
const RegisterBank & getRegBankFromRegClass(const TargetRegisterClass &RC, LLT) const override
Get a register bank that covers RC.
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
unsigned getWavefrontSizeLog2() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
LLVM_READONLY int getExactLog2Abs() const
Definition: APFloat.h:1484
Class for arbitrary precision integers.
Definition: APInt.h:78
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1635
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition: InstrTypes.h:690
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition: InstrTypes.h:688
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:678
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition: InstrTypes.h:687
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition: InstrTypes.h:685
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:680
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:682
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:689
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition: InstrTypes.h:686
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition: InstrTypes.h:675
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:271
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:163
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Represents a G_BUILD_VECTOR.
bool useVGPRIndexMode() const
bool hasPermlane32Swap() const
bool hasScalarCompareEq64() const
int getLDSBankCount() const
Definition: GCNSubtarget.h:350
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:478
bool unsafeDSOffsetFoldingEnabled() const
Definition: GCNSubtarget.h:482
bool hasBitOp3Insts() const
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasMADIntraFwdBug() const
bool privateMemoryResourceIsRangeChecked() const
Definition: GCNSubtarget.h:563
bool hasSignedScratchOffsets() const
bool hasRestrictedSOffset() const
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:287
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
Definition: GCNSubtarget.h:716
bool hasSPackHL() const
Return true if the target has the S_PACK_HL_B32_B16 instruction.
bool hasG16() const
bool hasPermlane16Swap() const
bool hasFlatScratchSVSSwizzleBug() const
bool hasGWS() const
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:541
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasSplitBarriers() const
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasGWSSemaReleaseAll() const
Definition: GCNSubtarget.h:730
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasSALUFloatInsts() const
bool hasPartialNSAEncoding() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
Represents a G_CONCAT_VECTORS.
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
virtual void setupMF(MachineFunction &mf, GISelKnownBits *kb, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
APInt getKnownOnes(Register R)
KnownBits getKnownBits(Register R)
bool signBitIsZero(Register Op)
APInt getKnownZeroes(Register R)
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr bool isValid() const
Definition: LowLevelType.h:145
constexpr bool isVector() const
Definition: LowLevelType.h:148
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition: LowLevelType.h:277
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:270
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Metadata node.
Definition: Metadata.h:1069
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
void setReturnAddressIsTaken(bool s)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:347
unsigned getNumOperands() const
Retuns the total number of operands.
Definition: MachineInstr.h:578
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:499
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
A description of a memory reference used in the backend.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Analysis providing profile information.
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
TypeSize getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Get the size in bits of Reg.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
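A sketch of the register-bank queries above, assuming MRI/TRI/RBI as held by the selector; the getRegBank overload used here inspects a vreg's current assignment rather than looking a bank up by ID:

static bool isOnVGPRBank(Register Reg, const MachineRegisterInfo &MRI,
                         const TargetRegisterInfo &TRI,
                         const RegisterBankInfo &RBI) {
  // May return null before register-bank selection has run.
  const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
  return RB && RB->getID() == AMDGPU::VGPRRegBankID;
}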
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
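A sketch of the offset-legality helpers above, assuming TII comes from the subtarget and using SIInstrFlags::FlatGlobal to select the global flavour of the FLAT encoding:

int64_t COffset = 4096; // example byte offset
if (!TII.isLegalFLATOffset(COffset, AMDGPUAS::GLOBAL_ADDRESS,
                           SIInstrFlags::FlatGlobal)) {
  // Split into an encodable immediate field and a remainder that must be
  // added back into the register address.
  auto [ImmField, Remainder] = TII.splitFlatOffset(
      COffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
  (void)ImmField;
  (void)Remainder;
}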
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCRegister getReturnAddressReg(const MachineFunction &MF) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const
const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const
const TargetRegisterClass * getBoolRC() const
MCRegister getExec() const
const TargetRegisterClass * getWaveMaskRegClass() const
static bool isSGPRClass(const TargetRegisterClass *RC)
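A sketch of mapping a type and bank to a concrete register class with the SIRegisterInfo helpers above (TRI and RBI are assumed to be the selector's members):

const RegisterBank &VGPRBank = RBI.getRegBank(AMDGPU::VGPRRegBankID);
const TargetRegisterClass *RC =
    TRI.getRegClassForTypeOnBank(LLT::scalar(64), VGPRBank);
const TargetRegisterClass *MaskRC = TRI.getWaveMaskRegClass(); // vcc-sized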
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:392
static IntegerType * getInt32Ty(LLVMContext &C)
LLVM Value Representation.
Definition: Value.h:74
Value(Type *Ty, unsigned scid)
Definition: Value.cpp:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
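The address-space enumerators above drive which memory-instruction family gets selected. A sketch, assuming MMO is the access's MachineMemOperand:

switch (MMO->getAddrSpace()) {
case AMDGPUAS::GLOBAL_ADDRESS:
  // global_* / flat instructions
  break;
case AMDGPUAS::LOCAL_ADDRESS:
case AMDGPUAS::REGION_ADDRESS:
  // ds_* (LDS/GDS) instructions
  break;
case AMDGPUAS::PRIVATE_ADDRESS:
  // scratch / buffer instructions
  break;
default:
  break;
}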
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
Key
PAL metadata keys.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
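A sketch of peeling a constant displacement off an address vreg with the helper above (MRI and the pointer vreg Addr are assumed):

auto [Base, ImmOffset] = AMDGPU::getBaseWithConstantOffset(*MRI, Addr);
// Base is the remaining pointer vreg; ImmOffset can then be checked against
// isLegalFLATOffset or the SMRD encoding helpers listed above.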
IndexMode
ARM Index Modes.
Definition: ARMBaseInfo.h:177
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
operand_type_match m_Reg()
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
SpecificConstantMatch m_SpecificICst(int64_t RequestedValue)
Matches a constant equal to RequestedValue.
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
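A sketch of the MIPatternMatch combinators above, assuming MRI and a source vreg Src; m_Reg(Register &) is the binding counterpart of the nullary matcher listed:

Register X;
APInt MaskVal;
if (mi_match(Src, *MRI, m_GAnd(m_Reg(X), m_ICst(MaskVal)))) {
  // Src is defined by (G_AND X, MaskVal); X and MaskVal are now bound.
}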
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
Reg
All possible values of the reg field in the ModR/M byte.
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:895
@ Offset
Definition: DWP.cpp:480
Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition: Utils.cpp:56
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition: Utils.cpp:630
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition: Utils.cpp:444
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:279
bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition: Utils.cpp:155
MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition: Utils.cpp:471
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition: Utils.cpp:299
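A sketch of the constant-lookup utilities above (MRI and the vreg OffsetReg are assumed):

if (std::optional<int64_t> Imm = getIConstantVRegSExtVal(OffsetReg, *MRI)) {
  // OffsetReg is a G_CONSTANT; *Imm is its sign-extended value and can be
  // folded as an immediate operand.
}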
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition: Utils.cpp:424
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
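Sketch: 64-bit immediates are typically materialized as two 32-bit halves, which is what Lo_32 and Hi_32 provide.

uint64_t Imm = 0x123456789ABCDEF0ULL; // example value
uint32_t Lo = Lo_32(Imm);             // 0x9ABCDEF0
uint32_t Hi = Hi_32(Imm);             // 0x12345678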
unsigned getUndefRegState(bool B)
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ Add
Sum of integers.
DWARFExpression::Operation Op
@ DS_Error
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition: Utils.cpp:418
std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg and the underlying value Register, folding away any copies.
Definition: Utils.cpp:452
Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition: Utils.cpp:478
@ Default
The result values are uniform if and only if all operands are uniform.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:293
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute known bits resulting from the addition of LHS and RHS.
Definition: KnownBits.h:336
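A sketch combining the two KnownBits helpers above:

KnownBits L = KnownBits::makeConstant(APInt(32, 8));
KnownBits R = KnownBits::makeConstant(APInt(32, 4));
KnownBits Sum = KnownBits::add(L, R); // all bits known: the constant 12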
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.