AMDGPUInstructionSelector.cpp
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
49#include "AMDGPUGenGlobalISel.inc"
52#include "AMDGPUGenGlobalISel.inc"
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 dyn_cast_if_present<const TargetRegisterClass *>(RegClassOrBank);
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
120 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
121}
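// Illustrative sketch (annotation, not from the source): for the wqm
// intrinsic this rewrites, roughly,
//   %dst = G_INTRINSIC intrinsic(@llvm.amdgcn.wqm), %src
// into
//   %dst = WQM %src, implicit $exec
// and then constrains %dst and %src to a common register class.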
122
123bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
124 const DebugLoc &DL = I.getDebugLoc();
125 MachineBasicBlock *BB = I.getParent();
126 I.setDesc(TII.get(TargetOpcode::COPY));
127
128 const MachineOperand &Src = I.getOperand(1);
129 MachineOperand &Dst = I.getOperand(0);
130 Register DstReg = Dst.getReg();
131 Register SrcReg = Src.getReg();
132
133 if (isVCC(DstReg, *MRI)) {
134 if (SrcReg == AMDGPU::SCC) {
135 const TargetRegisterClass *RC
136 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
137 if (!RC)
138 return true;
139 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
140 }
141
142 if (!isVCC(SrcReg, *MRI)) {
143 // TODO: Should probably leave the copy and let copyPhysReg expand it.
144 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
145 return false;
146
147 const TargetRegisterClass *SrcRC
148 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
149
150 std::optional<ValueAndVReg> ConstVal =
151 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
152 if (ConstVal) {
153 unsigned MovOpc =
154 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
155 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
156 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
157 } else {
158 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
159
160 // We can't trust the high bits at this point, so clear them.
161
162 // TODO: Skip masking high bits if def is known boolean.
163
164 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
165 assert(Subtarget->useRealTrue16Insts());
166 const int64_t NoMods = 0;
167 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
168 .addImm(NoMods)
169 .addImm(1)
170 .addImm(NoMods)
171 .addReg(SrcReg)
172 .addImm(NoMods);
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
174 .addImm(NoMods)
175 .addImm(0)
176 .addImm(NoMods)
177 .addReg(MaskedReg)
178 .addImm(NoMods);
179 } else {
180 bool IsSGPR = TRI.isSGPRClass(SrcRC);
181 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
182 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
183 .addImm(1)
184 .addReg(SrcReg);
185 if (IsSGPR)
186 And.setOperandDead(3); // Dead scc
187
188 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
189 .addImm(0)
190 .addReg(MaskedReg);
191 }
192 }
193
194 if (!MRI->getRegClassOrNull(SrcReg))
195 MRI->setRegClass(SrcReg, SrcRC);
196 I.eraseFromParent();
197 return true;
198 }
199
200 const TargetRegisterClass *RC =
201 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
202 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
203 return false;
204
205 return true;
206 }
207
208 for (const MachineOperand &MO : I.operands()) {
209 if (MO.getReg().isPhysical())
210 continue;
211
212 const TargetRegisterClass *RC =
213 TRI.getConstrainedRegClassForOperand(MO, *MRI);
214 if (!RC)
215 continue;
216 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
217 }
218 return true;
219}
220
221bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
222 const DebugLoc &DL = I.getDebugLoc();
223 MachineBasicBlock *BB = I.getParent();
224 Register VCCReg = I.getOperand(1).getReg();
225 MachineInstr *Cmp;
226
227 // Set SCC as a side effect with S_CMP or S_OR.
228 if (STI.hasScalarCompareEq64()) {
229 unsigned CmpOpc =
230 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
231 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
232 } else {
233 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
234 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
235 .addReg(VCCReg)
236 .addReg(VCCReg);
237 }
238
239 if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
240 return false;
241
242 Register DstReg = I.getOperand(0).getReg();
243 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
244
245 I.eraseFromParent();
246 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
247}
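// Illustrative sketch (annotation, assuming wave64): the lane-mask to SCC
// copy above becomes, roughly,
//   S_CMP_LG_U64 %vcc_mask, 0       ; SCC = (%vcc_mask != 0)
//   %dst:sreg_32 = COPY $scc
// On subtargets without 64-bit scalar compares, an S_OR of the mask with
// itself is used instead, since it also sets SCC.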
248
249bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
250 const DebugLoc &DL = I.getDebugLoc();
251 MachineBasicBlock *BB = I.getParent();
252
253 Register DstReg = I.getOperand(0).getReg();
254 Register SrcReg = I.getOperand(1).getReg();
255 std::optional<ValueAndVReg> Arg =
256 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
257
258 if (Arg) {
259 const int64_t Value = Arg->Value.getZExtValue();
260 if (Value == 0) {
261 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
262 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
263 } else {
264 assert(Value == 1);
265 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
266 }
267 I.eraseFromParent();
268 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
269 }
270
271 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
272 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
273
274 unsigned SelectOpcode =
275 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
276 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
277 .addReg(TRI.getExec())
278 .addImm(0);
279
280 I.eraseFromParent();
281 return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
282}
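// Illustrative sketch (annotation, assuming wave64): for a non-constant
// source, the SCC to lane-mask copy above becomes, roughly,
//   $scc = COPY %src                    ; %src is already a 0/1 value
//   %dst:sreg_64 = S_CSELECT_B64 $exec, 0
// i.e. all active lanes are set when SCC is 1, and no lanes otherwise.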
283
284bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
285 Register DstReg = I.getOperand(0).getReg();
286 Register SrcReg = I.getOperand(1).getReg();
287
288 const DebugLoc &DL = I.getDebugLoc();
289 MachineBasicBlock *BB = I.getParent();
290
291 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
292 .addReg(SrcReg);
293
294 I.eraseFromParent();
295 return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
296}
297
298bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
299 const Register DefReg = I.getOperand(0).getReg();
300 const LLT DefTy = MRI->getType(DefReg);
301
302 // S1 G_PHIs should not be selected by instruction-select; instead:
303 // - divergent S1 G_PHI should go through lane mask merging algorithm
304 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
305 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
306 if (DefTy == LLT::scalar(1))
307 return false;
308
309 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
310
311 const RegClassOrRegBank &RegClassOrBank =
312 MRI->getRegClassOrRegBank(DefReg);
313
314 const TargetRegisterClass *DefRC =
315 dyn_cast_if_present<const TargetRegisterClass *>(RegClassOrBank);
316 if (!DefRC) {
317 if (!DefTy.isValid()) {
318 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
319 return false;
320 }
321
322 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
323 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
324 if (!DefRC) {
325 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
326 return false;
327 }
328 }
329
330 // If inputs have register bank, assign corresponding reg class.
331 // Note: registers don't need to have the same reg bank.
332 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
333 const Register SrcReg = I.getOperand(i).getReg();
334
335 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
336 if (RB) {
337 const LLT SrcTy = MRI->getType(SrcReg);
338 const TargetRegisterClass *SrcRC =
339 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
340 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
341 return false;
342 }
343 }
344
345 I.setDesc(TII.get(TargetOpcode::PHI));
346 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
347}
348
349MachineOperand
350AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
351 const TargetRegisterClass &SubRC,
352 unsigned SubIdx) const {
353
354 MachineInstr *MI = MO.getParent();
355 MachineBasicBlock *BB = MO.getParent()->getParent();
356 Register DstReg = MRI->createVirtualRegister(&SubRC);
357
358 if (MO.isReg()) {
359 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
360 Register Reg = MO.getReg();
361 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
362 .addReg(Reg, 0, ComposedSubIdx);
363
364 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
365 MO.isKill(), MO.isDead(), MO.isUndef(),
366 MO.isEarlyClobber(), 0, MO.isDebug(),
367 MO.isInternalRead());
368 }
369
370 assert(MO.isImm());
371
372 APInt Imm(64, MO.getImm());
373
374 switch (SubIdx) {
375 default:
376 llvm_unreachable("do not know to split immediate with this sub index.");
377 case AMDGPU::sub0:
378 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
379 case AMDGPU::sub1:
380 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
381 }
382}
383
384static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
385 switch (Opc) {
386 case AMDGPU::G_AND:
387 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
388 case AMDGPU::G_OR:
389 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
390 case AMDGPU::G_XOR:
391 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
392 default:
393 llvm_unreachable("not a bit op");
394 }
395}
396
397bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
398 Register DstReg = I.getOperand(0).getReg();
399 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
400
401 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
402 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
403 DstRB->getID() != AMDGPU::VCCRegBankID)
404 return false;
405
406 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
407 STI.isWave64());
408 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
409
410 // Dead implicit-def of scc
411 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
412 true, // isImp
413 false, // isKill
414 true)); // isDead
415 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
416}
417
418bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
419 MachineBasicBlock *BB = I.getParent();
420 MachineFunction *MF = BB->getParent();
421 Register DstReg = I.getOperand(0).getReg();
422 const DebugLoc &DL = I.getDebugLoc();
423 LLT Ty = MRI->getType(DstReg);
424 if (Ty.isVector())
425 return false;
426
427 unsigned Size = Ty.getSizeInBits();
428 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
429 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
430 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
431
432 if (Size == 32) {
433 if (IsSALU) {
434 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
435 MachineInstr *Add =
436 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
437 .add(I.getOperand(1))
438 .add(I.getOperand(2))
439 .setOperandDead(3); // Dead scc
440 I.eraseFromParent();
441 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
442 }
443
444 if (STI.hasAddNoCarry()) {
445 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
446 I.setDesc(TII.get(Opc));
447 I.addOperand(*MF, MachineOperand::CreateImm(0));
448 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
449 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
450 }
451
452 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
453
454 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
455 MachineInstr *Add
456 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
457 .addDef(UnusedCarry, RegState::Dead)
458 .add(I.getOperand(1))
459 .add(I.getOperand(2))
460 .addImm(0);
461 I.eraseFromParent();
462 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
463 }
464
465 assert(!Sub && "illegal sub should not reach here");
466
467 const TargetRegisterClass &RC
468 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
469 const TargetRegisterClass &HalfRC
470 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
471
472 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
473 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
474 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
475 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
476
477 Register DstLo = MRI->createVirtualRegister(&HalfRC);
478 Register DstHi = MRI->createVirtualRegister(&HalfRC);
479
480 if (IsSALU) {
481 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
482 .add(Lo1)
483 .add(Lo2);
484 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
485 .add(Hi1)
486 .add(Hi2)
487 .setOperandDead(3); // Dead scc
488 } else {
489 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
490 Register CarryReg = MRI->createVirtualRegister(CarryRC);
491 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
492 .addDef(CarryReg)
493 .add(Lo1)
494 .add(Lo2)
495 .addImm(0);
496 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
497 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
498 .add(Hi1)
499 .add(Hi2)
500 .addReg(CarryReg, RegState::Kill)
501 .addImm(0);
502
503 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
504 return false;
505 }
506
507 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
508 .addReg(DstLo)
509 .addImm(AMDGPU::sub0)
510 .addReg(DstHi)
511 .addImm(AMDGPU::sub1);
512
513
514 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
515 return false;
516
517 I.eraseFromParent();
518 return true;
519}
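// Illustrative sketch (annotation, not from the source): the 64-bit SALU
// path above expands, roughly, as
//   %lo  = S_ADD_U32  %a.sub0, %b.sub0   ; carry written to SCC
//   %hi  = S_ADDC_U32 %a.sub1, %b.sub1   ; carry consumed from SCC
//   %dst = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// The VALU path has the same shape, using V_ADD_CO_U32/V_ADDC_U32 with a
// VCC carry register instead of SCC.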
520
521bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
522 MachineInstr &I) const {
523 MachineBasicBlock *BB = I.getParent();
524 MachineFunction *MF = BB->getParent();
525 const DebugLoc &DL = I.getDebugLoc();
526 Register Dst0Reg = I.getOperand(0).getReg();
527 Register Dst1Reg = I.getOperand(1).getReg();
528 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
529 I.getOpcode() == AMDGPU::G_UADDE;
530 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
531 I.getOpcode() == AMDGPU::G_USUBE;
532
533 if (isVCC(Dst1Reg, *MRI)) {
534 unsigned NoCarryOpc =
535 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
536 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
537 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
538 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
539 I.addOperand(*MF, MachineOperand::CreateImm(0));
540 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
541 }
542
543 Register Src0Reg = I.getOperand(2).getReg();
544 Register Src1Reg = I.getOperand(3).getReg();
545
546 if (HasCarryIn) {
547 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
548 .addReg(I.getOperand(4).getReg());
549 }
550
551 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
552 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
553
554 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
555 .add(I.getOperand(2))
556 .add(I.getOperand(3));
557
558 if (MRI->use_nodbg_empty(Dst1Reg)) {
559 CarryInst.setOperandDead(3); // Dead scc
560 } else {
561 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
562 .addReg(AMDGPU::SCC);
563 if (!MRI->getRegClassOrNull(Dst1Reg))
564 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
565 }
566
567 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
568 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
569 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
570 return false;
571
572 if (HasCarryIn &&
573 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
574 AMDGPU::SReg_32RegClass, *MRI))
575 return false;
576
577 I.eraseFromParent();
578 return true;
579}
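// Illustrative sketch (annotation, not from the source): on the scalar path
// a G_UADDE is selected, roughly, as
//   $scc = COPY %carry_in
//   %sum = S_ADDC_U32 %a, %b            ; reads and writes SCC
//   %carry_out:sreg_32 = COPY $scc      ; only if the carry-out has uses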
580
581bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
582 MachineInstr &I) const {
583 MachineBasicBlock *BB = I.getParent();
584 MachineFunction *MF = BB->getParent();
585 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
586 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
587 MRI->use_nodbg_empty(I.getOperand(1).getReg());
588
589 unsigned Opc;
590 if (Subtarget->hasMADIntraFwdBug())
591 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
592 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
593 else if (UseNoCarry)
594 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
595 : AMDGPU::V_MAD_NC_I64_I32_e64;
596 else
597 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
598
599 if (UseNoCarry)
600 I.removeOperand(1);
601
602 I.setDesc(TII.get(Opc));
603 I.addOperand(*MF, MachineOperand::CreateImm(0));
604 I.addImplicitDefUseOperands(*MF);
605 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
606}
607
608// TODO: We should probably legalize these to only using 32-bit results.
609bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
610 MachineBasicBlock *BB = I.getParent();
611 Register DstReg = I.getOperand(0).getReg();
612 Register SrcReg = I.getOperand(1).getReg();
613 LLT DstTy = MRI->getType(DstReg);
614 LLT SrcTy = MRI->getType(SrcReg);
615 const unsigned SrcSize = SrcTy.getSizeInBits();
616 unsigned DstSize = DstTy.getSizeInBits();
617
618 // TODO: Should handle any multiple of 32 offset.
619 unsigned Offset = I.getOperand(2).getImm();
620 if (Offset % 32 != 0 || DstSize > 128)
621 return false;
622
623 // 16-bit operations really use 32-bit registers.
624 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
625 if (DstSize == 16)
626 DstSize = 32;
627
628 const TargetRegisterClass *DstRC =
629 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
630 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
631 return false;
632
633 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
634 const TargetRegisterClass *SrcRC =
635 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
636 if (!SrcRC)
637 return false;
638 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
639 DstSize / 32);
640 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
641 if (!SrcRC)
642 return false;
643
644 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
645 *SrcRC, I.getOperand(1));
646 const DebugLoc &DL = I.getDebugLoc();
647 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
648 .addReg(SrcReg, 0, SubReg);
649
650 I.eraseFromParent();
651 return true;
652}
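// Illustrative sketch (annotation, not from the source): a 32-bit-aligned
// extract such as
//   %dst(s32) = G_EXTRACT %src(s128), 64
// is selected, roughly, as a subregister copy:
//   %dst = COPY %src.sub2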
653
654bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
655 MachineBasicBlock *BB = MI.getParent();
656 Register DstReg = MI.getOperand(0).getReg();
657 LLT DstTy = MRI->getType(DstReg);
658 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
659
660 const unsigned SrcSize = SrcTy.getSizeInBits();
661 if (SrcSize < 32)
662 return selectImpl(MI, *CoverageInfo);
663
664 const DebugLoc &DL = MI.getDebugLoc();
665 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
666 const unsigned DstSize = DstTy.getSizeInBits();
667 const TargetRegisterClass *DstRC =
668 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
669 if (!DstRC)
670 return false;
671
672 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
673 MachineInstrBuilder MIB =
674 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
675 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
676 MachineOperand &Src = MI.getOperand(I + 1);
677 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
678 MIB.addImm(SubRegs[I]);
679
680 const TargetRegisterClass *SrcRC
681 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
682 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
683 return false;
684 }
685
686 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
687 return false;
688
689 MI.eraseFromParent();
690 return true;
691}
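// Illustrative sketch (annotation, not from the source): a merge of 32-bit
// pieces such as
//   %dst(s64) = G_MERGE_VALUES %lo(s32), %hi(s32)
// is selected, roughly, as
//   %dst = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1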
692
693bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
694 MachineBasicBlock *BB = MI.getParent();
695 const int NumDst = MI.getNumOperands() - 1;
696
697 MachineOperand &Src = MI.getOperand(NumDst);
698
699 Register SrcReg = Src.getReg();
700 Register DstReg0 = MI.getOperand(0).getReg();
701 LLT DstTy = MRI->getType(DstReg0);
702 LLT SrcTy = MRI->getType(SrcReg);
703
704 const unsigned DstSize = DstTy.getSizeInBits();
705 const unsigned SrcSize = SrcTy.getSizeInBits();
706 const DebugLoc &DL = MI.getDebugLoc();
707 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
708
709 const TargetRegisterClass *SrcRC =
710 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
711 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
712 return false;
713
714 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
715 // source, and this relies on the fact that the same subregister indices are
716 // used for both.
717 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
718 for (int I = 0, E = NumDst; I != E; ++I) {
719 MachineOperand &Dst = MI.getOperand(I);
720 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
721 .addReg(SrcReg, 0, SubRegs[I]);
722
723 // Make sure the subregister index is valid for the source register.
724 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
725 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
726 return false;
727
728 const TargetRegisterClass *DstRC =
729 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
730 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
731 return false;
732 }
733
734 MI.eraseFromParent();
735 return true;
736}
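// Illustrative sketch (annotation, not from the source): an unmerge such as
//   %lo(s32), %hi(s32) = G_UNMERGE_VALUES %src(s64)
// is selected, roughly, as one subregister copy per result:
//   %lo = COPY %src.sub0
//   %hi = COPY %src.sub1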
737
738bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
739 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
740 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
741
742 Register Src0 = MI.getOperand(1).getReg();
743 Register Src1 = MI.getOperand(2).getReg();
744 LLT SrcTy = MRI->getType(Src0);
745 const unsigned SrcSize = SrcTy.getSizeInBits();
746
747 // BUILD_VECTOR with >=32-bit sources is handled by MERGE_VALUES.
748 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
749 return selectG_MERGE_VALUES(MI);
750 }
751
752 // Selection logic below is for V2S16 only.
753 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
754 Register Dst = MI.getOperand(0).getReg();
755 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
756 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
757 SrcTy != LLT::scalar(32)))
758 return selectImpl(MI, *CoverageInfo);
759
760 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
761 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
762 return false;
763
764 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
765 DstBank->getID() == AMDGPU::VGPRRegBankID);
766 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
767
768 const DebugLoc &DL = MI.getDebugLoc();
769 MachineBasicBlock *BB = MI.getParent();
770
771 // First, before trying TableGen patterns, check if both sources are
772 // constants. In those cases, we can trivially compute the final constant
773 // and emit a simple move.
774 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
775 if (ConstSrc1) {
776 auto ConstSrc0 =
777 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
778 if (ConstSrc0) {
779 const int64_t K0 = ConstSrc0->Value.getSExtValue();
780 const int64_t K1 = ConstSrc1->Value.getSExtValue();
781 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
782 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
783 uint32_t Imm = Lo16 | (Hi16 << 16);
784
785 // VALU
786 if (IsVector) {
787 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
788 MI.eraseFromParent();
789 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
790 }
791
792 // SALU
793 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
794 MI.eraseFromParent();
795 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
796 }
797 }
798
799 // Now try TableGen patterns.
800 if (selectImpl(MI, *CoverageInfo))
801 return true;
802
803 // TODO: This should probably be a combine somewhere
804 // (build_vector $src0, undef) -> copy $src0
805 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
806 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
807 MI.setDesc(TII.get(AMDGPU::COPY));
808 MI.removeOperand(2);
809 const auto &RC =
810 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
811 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
812 RBI.constrainGenericRegister(Src0, RC, *MRI);
813 }
814
815 // TODO: Can be improved?
816 if (IsVector) {
817 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
818 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
819 .addImm(0xFFFF)
820 .addReg(Src0);
821 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
822 return false;
823
824 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
825 .addReg(Src1)
826 .addImm(16)
827 .addReg(TmpReg);
828 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
829 return false;
830
831 MI.eraseFromParent();
832 return true;
833 }
834
835 Register ShiftSrc0;
836 Register ShiftSrc1;
837
838 // With multiple uses of the shift, this will duplicate the shift and
839 // increase register pressure.
840 //
841 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
842 // => (S_PACK_HH_B32_B16 $src0, $src1)
843 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
844 // => (S_PACK_HL_B32_B16 $src0, $src1)
845 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
846 // => (S_PACK_LH_B32_B16 $src0, $src1)
847 // (build_vector $src0, $src1)
848 // => (S_PACK_LL_B32_B16 $src0, $src1)
849
850 bool Shift0 = mi_match(
851 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
852
853 bool Shift1 = mi_match(
854 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
855
856 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
857 if (Shift0 && Shift1) {
858 Opc = AMDGPU::S_PACK_HH_B32_B16;
859 MI.getOperand(1).setReg(ShiftSrc0);
860 MI.getOperand(2).setReg(ShiftSrc1);
861 } else if (Shift1) {
862 Opc = AMDGPU::S_PACK_LH_B32_B16;
863 MI.getOperand(2).setReg(ShiftSrc1);
864 } else if (Shift0) {
865 auto ConstSrc1 =
866 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
867 if (ConstSrc1 && ConstSrc1->Value == 0) {
868 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
869 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
870 .addReg(ShiftSrc0)
871 .addImm(16)
872 .setOperandDead(3); // Dead scc
873
874 MI.eraseFromParent();
875 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
876 }
877 if (STI.hasSPackHL()) {
878 Opc = AMDGPU::S_PACK_HL_B32_B16;
879 MI.getOperand(1).setReg(ShiftSrc0);
880 }
881 }
882
883 MI.setDesc(TII.get(Opc));
884 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
885}
886
887bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
888 const MachineOperand &MO = I.getOperand(0);
889
890 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
891 // regbank check here is to know why getConstrainedRegClassForOperand failed.
892 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
893 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
894 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
895 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
896 return true;
897 }
898
899 return false;
900}
901
902bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
903 MachineBasicBlock *BB = I.getParent();
904
905 Register DstReg = I.getOperand(0).getReg();
906 Register Src0Reg = I.getOperand(1).getReg();
907 Register Src1Reg = I.getOperand(2).getReg();
908 LLT Src1Ty = MRI->getType(Src1Reg);
909
910 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
911 unsigned InsSize = Src1Ty.getSizeInBits();
912
913 int64_t Offset = I.getOperand(3).getImm();
914
915 // FIXME: These cases should have been illegal and unnecessary to check here.
916 if (Offset % 32 != 0 || InsSize % 32 != 0)
917 return false;
918
919 // Currently not handled by getSubRegFromChannel.
920 if (InsSize > 128)
921 return false;
922
923 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
924 if (SubReg == AMDGPU::NoSubRegister)
925 return false;
926
927 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
928 const TargetRegisterClass *DstRC =
929 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
930 if (!DstRC)
931 return false;
932
933 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
934 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
935 const TargetRegisterClass *Src0RC =
936 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
937 const TargetRegisterClass *Src1RC =
938 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
939
940 // Deal with weird cases where the class only partially supports the subreg
941 // index.
942 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
943 if (!Src0RC || !Src1RC)
944 return false;
945
946 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
947 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
948 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
949 return false;
950
951 const DebugLoc &DL = I.getDebugLoc();
952 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
953 .addReg(Src0Reg)
954 .addReg(Src1Reg)
955 .addImm(SubReg);
956
957 I.eraseFromParent();
958 return true;
959}
960
961bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
962 Register DstReg = MI.getOperand(0).getReg();
963 Register SrcReg = MI.getOperand(1).getReg();
964 Register OffsetReg = MI.getOperand(2).getReg();
965 Register WidthReg = MI.getOperand(3).getReg();
966
967 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
968 "scalar BFX instructions are expanded in regbankselect");
969 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
970 "64-bit vector BFX instructions are expanded in regbankselect");
971
972 const DebugLoc &DL = MI.getDebugLoc();
973 MachineBasicBlock *MBB = MI.getParent();
974
975 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
976 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
977 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
978 .addReg(SrcReg)
979 .addReg(OffsetReg)
980 .addReg(WidthReg);
981 MI.eraseFromParent();
982 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
983}
984
985bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
986 if (STI.getLDSBankCount() != 16)
987 return selectImpl(MI, *CoverageInfo);
988
989 Register Dst = MI.getOperand(0).getReg();
990 Register Src0 = MI.getOperand(2).getReg();
991 Register M0Val = MI.getOperand(6).getReg();
992 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
993 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
994 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
995 return false;
996
997 // This requires 2 instructions. It is possible to write a pattern to support
998 // this, but the generated isel emitter doesn't correctly deal with multiple
999 // output instructions using the same physical register input. The copy to m0
1000 // is incorrectly placed before the second instruction.
1001 //
1002 // TODO: Match source modifiers.
1003
1004 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1005 const DebugLoc &DL = MI.getDebugLoc();
1006 MachineBasicBlock *MBB = MI.getParent();
1007
1008 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1009 .addReg(M0Val);
1010 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1011 .addImm(2)
1012 .addImm(MI.getOperand(4).getImm()) // $attr
1013 .addImm(MI.getOperand(3).getImm()); // $attrchan
1014
1015 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1016 .addImm(0) // $src0_modifiers
1017 .addReg(Src0) // $src0
1018 .addImm(MI.getOperand(4).getImm()) // $attr
1019 .addImm(MI.getOperand(3).getImm()) // $attrchan
1020 .addImm(0) // $src2_modifiers
1021 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1022 .addImm(MI.getOperand(5).getImm()) // $high
1023 .addImm(0) // $clamp
1024 .addImm(0); // $omod
1025
1026 MI.eraseFromParent();
1027 return true;
1028}
1029
1030// Writelane is special in that it can use SGPR and M0 (which would normally
1031// count as using the constant bus twice - but in this case it is allowed since
1032// the lane selector doesn't count as a use of the constant bus). However, it is
1033// still required to abide by the 1 SGPR rule. Fix this up if we might have
1034// multiple SGPRs.
1035bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1036 // With a constant bus limit of at least 2, there's no issue.
1037 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1038 return selectImpl(MI, *CoverageInfo);
1039
1040 MachineBasicBlock *MBB = MI.getParent();
1041 const DebugLoc &DL = MI.getDebugLoc();
1042 Register VDst = MI.getOperand(0).getReg();
1043 Register Val = MI.getOperand(2).getReg();
1044 Register LaneSelect = MI.getOperand(3).getReg();
1045 Register VDstIn = MI.getOperand(4).getReg();
1046
1047 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1048
1049 std::optional<ValueAndVReg> ConstSelect =
1050 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1051 if (ConstSelect) {
1052 // The selector has to be an inline immediate, so we can use whatever for
1053 // the other operands.
1054 MIB.addReg(Val);
1055 MIB.addImm(ConstSelect->Value.getSExtValue() &
1056 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1057 } else {
1058 std::optional<ValueAndVReg> ConstVal =
1059 getIConstantVRegValWithLookThrough(Val, *MRI);
1060
1061 // If the value written is an inline immediate, we can get away without a
1062 // copy to m0.
1063 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1064 STI.hasInv2PiInlineImm())) {
1065 MIB.addImm(ConstVal->Value.getSExtValue());
1066 MIB.addReg(LaneSelect);
1067 } else {
1068 MIB.addReg(Val);
1069
1070 // If the lane selector was originally in a VGPR and copied with
1071 // readfirstlane, there's a hazard to read the same SGPR from the
1072 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1073 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1074
1075 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1076 .addReg(LaneSelect);
1077 MIB.addReg(AMDGPU::M0);
1078 }
1079 }
1080
1081 MIB.addReg(VDstIn);
1082
1083 MI.eraseFromParent();
1084 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1085}
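// Illustrative sketch (annotation, not from the source): with a constant-bus
// limit of 1, a writelane whose value and lane select are both non-inline
// SGPRs is emitted, roughly, as
//   $m0 = COPY %lane_select
//   %vdst = V_WRITELANE_B32 %val, $m0, %vdst_in
// so only one SGPR (plus M0) is read through the constant bus.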
1086
1087// We need to handle this here because tablegen doesn't support matching
1088// instructions with multiple outputs.
1089bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1090 Register Dst0 = MI.getOperand(0).getReg();
1091 Register Dst1 = MI.getOperand(1).getReg();
1092
1093 LLT Ty = MRI->getType(Dst0);
1094 unsigned Opc;
1095 if (Ty == LLT::scalar(32))
1096 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1097 else if (Ty == LLT::scalar(64))
1098 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1099 else
1100 return false;
1101
1102 // TODO: Match source modifiers.
1103
1104 const DebugLoc &DL = MI.getDebugLoc();
1105 MachineBasicBlock *MBB = MI.getParent();
1106
1107 Register Numer = MI.getOperand(3).getReg();
1108 Register Denom = MI.getOperand(4).getReg();
1109 unsigned ChooseDenom = MI.getOperand(5).getImm();
1110
1111 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1112
1113 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1114 .addDef(Dst1)
1115 .addImm(0) // $src0_modifiers
1116 .addUse(Src0) // $src0
1117 .addImm(0) // $src1_modifiers
1118 .addUse(Denom) // $src1
1119 .addImm(0) // $src2_modifiers
1120 .addUse(Numer) // $src2
1121 .addImm(0) // $clamp
1122 .addImm(0); // $omod
1123
1124 MI.eraseFromParent();
1125 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1126}
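// Illustrative sketch (annotation, not from the source): div_scale has two
// results, so it is built by hand, roughly, as
//   %dst0, %dst1 = V_DIV_SCALE_F32_e64 0, %src0, 0, %denom, 0, %numer, 0, 0
// where %src0 is the numerator or the denominator depending on the last
// operand of the intrinsic, and %dst1 is the VCC flag result.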
1127
1128bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1129 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1130 switch (IntrinsicID) {
1131 case Intrinsic::amdgcn_if_break: {
1132 MachineBasicBlock *BB = I.getParent();
1133
1134 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1135 // SelectionDAG uses for wave32 vs wave64.
1136 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1137 .add(I.getOperand(0))
1138 .add(I.getOperand(2))
1139 .add(I.getOperand(3));
1140
1141 Register DstReg = I.getOperand(0).getReg();
1142 Register Src0Reg = I.getOperand(2).getReg();
1143 Register Src1Reg = I.getOperand(3).getReg();
1144
1145 I.eraseFromParent();
1146
1147 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1148 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1149
1150 return true;
1151 }
1152 case Intrinsic::amdgcn_interp_p1_f16:
1153 return selectInterpP1F16(I);
1154 case Intrinsic::amdgcn_wqm:
1155 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1156 case Intrinsic::amdgcn_softwqm:
1157 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1158 case Intrinsic::amdgcn_strict_wwm:
1159 case Intrinsic::amdgcn_wwm:
1160 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1161 case Intrinsic::amdgcn_strict_wqm:
1162 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1163 case Intrinsic::amdgcn_writelane:
1164 return selectWritelane(I);
1165 case Intrinsic::amdgcn_div_scale:
1166 return selectDivScale(I);
1167 case Intrinsic::amdgcn_icmp:
1168 case Intrinsic::amdgcn_fcmp:
1169 if (selectImpl(I, *CoverageInfo))
1170 return true;
1171 return selectIntrinsicCmp(I);
1172 case Intrinsic::amdgcn_ballot:
1173 return selectBallot(I);
1174 case Intrinsic::amdgcn_reloc_constant:
1175 return selectRelocConstant(I);
1176 case Intrinsic::amdgcn_groupstaticsize:
1177 return selectGroupStaticSize(I);
1178 case Intrinsic::returnaddress:
1179 return selectReturnAddress(I);
1180 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1181 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1182 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1183 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1184 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1185 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1186 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1187 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1188 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1189 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1190 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1191 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1192 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1193 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1194 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1195 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1196 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1197 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1198 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1199 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1200 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1201 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1202 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1203 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1204 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1205 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1206 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1207 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1208 return selectSMFMACIntrin(I);
1209 case Intrinsic::amdgcn_permlane16_swap:
1210 case Intrinsic::amdgcn_permlane32_swap:
1211 return selectPermlaneSwapIntrin(I, IntrinsicID);
1212 default:
1213 return selectImpl(I, *CoverageInfo);
1214 }
1215}
1216
1217static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1218 const GCNSubtarget &ST) {
1219 if (Size != 16 && Size != 32 && Size != 64)
1220 return -1;
1221
1222 if (Size == 16 && !ST.has16BitInsts())
1223 return -1;
1224
1225 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1226 unsigned FakeS16Opc, unsigned S32Opc,
1227 unsigned S64Opc) {
1228 if (Size == 16)
1229 return ST.hasTrue16BitInsts()
1230 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1231 : S16Opc;
1232 if (Size == 32)
1233 return S32Opc;
1234 return S64Opc;
1235 };
1236
1237 switch (P) {
1238 default:
1239 llvm_unreachable("Unknown condition code!");
1240 case CmpInst::ICMP_NE:
1241 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1242 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1243 AMDGPU::V_CMP_NE_U64_e64);
1244 case CmpInst::ICMP_EQ:
1245 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1246 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1247 AMDGPU::V_CMP_EQ_U64_e64);
1248 case CmpInst::ICMP_SGT:
1249 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1250 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1251 AMDGPU::V_CMP_GT_I64_e64);
1252 case CmpInst::ICMP_SGE:
1253 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1254 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1255 AMDGPU::V_CMP_GE_I64_e64);
1256 case CmpInst::ICMP_SLT:
1257 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1258 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1259 AMDGPU::V_CMP_LT_I64_e64);
1260 case CmpInst::ICMP_SLE:
1261 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1262 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1263 AMDGPU::V_CMP_LE_I64_e64);
1264 case CmpInst::ICMP_UGT:
1265 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1266 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1267 AMDGPU::V_CMP_GT_U64_e64);
1268 case CmpInst::ICMP_UGE:
1269 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1270 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1271 AMDGPU::V_CMP_GE_U64_e64);
1272 case CmpInst::ICMP_ULT:
1273 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1274 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1275 AMDGPU::V_CMP_LT_U64_e64);
1276 case CmpInst::ICMP_ULE:
1277 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1278 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1279 AMDGPU::V_CMP_LE_U64_e64);
1280
1281 case CmpInst::FCMP_OEQ:
1282 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1283 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1284 AMDGPU::V_CMP_EQ_F64_e64);
1285 case CmpInst::FCMP_OGT:
1286 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1287 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1288 AMDGPU::V_CMP_GT_F64_e64);
1289 case CmpInst::FCMP_OGE:
1290 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1291 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1292 AMDGPU::V_CMP_GE_F64_e64);
1293 case CmpInst::FCMP_OLT:
1294 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1295 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1296 AMDGPU::V_CMP_LT_F64_e64);
1297 case CmpInst::FCMP_OLE:
1298 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1299 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1300 AMDGPU::V_CMP_LE_F64_e64);
1301 case CmpInst::FCMP_ONE:
1302 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1303 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1304 AMDGPU::V_CMP_NEQ_F64_e64);
1305 case CmpInst::FCMP_ORD:
1306 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1307 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1308 AMDGPU::V_CMP_O_F64_e64);
1309 case CmpInst::FCMP_UNO:
1310 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1311 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1312 AMDGPU::V_CMP_U_F64_e64);
1313 case CmpInst::FCMP_UEQ:
1314 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1315 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1316 AMDGPU::V_CMP_NLG_F64_e64);
1317 case CmpInst::FCMP_UGT:
1318 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1319 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1320 AMDGPU::V_CMP_NLE_F64_e64);
1321 case CmpInst::FCMP_UGE:
1322 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1323 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1324 AMDGPU::V_CMP_NLT_F64_e64);
1325 case CmpInst::FCMP_ULT:
1326 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1327 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1328 AMDGPU::V_CMP_NGE_F64_e64);
1329 case CmpInst::FCMP_ULE:
1330 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1331 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1332 AMDGPU::V_CMP_NGT_F64_e64);
1333 case CmpInst::FCMP_UNE:
1334 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1335 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1336 AMDGPU::V_CMP_NEQ_F64_e64);
1337 case CmpInst::FCMP_TRUE:
1338 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1339 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1340 AMDGPU::V_CMP_TRU_F64_e64);
1341 case CmpInst::FCMP_FALSE:
1342 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1343 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1344 AMDGPU::V_CMP_F_F64_e64);
1345 }
1346}
1347
1348int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1349 unsigned Size) const {
1350 if (Size == 64) {
1351 if (!STI.hasScalarCompareEq64())
1352 return -1;
1353
1354 switch (P) {
1355 case CmpInst::ICMP_NE:
1356 return AMDGPU::S_CMP_LG_U64;
1357 case CmpInst::ICMP_EQ:
1358 return AMDGPU::S_CMP_EQ_U64;
1359 default:
1360 return -1;
1361 }
1362 }
1363
1364 if (Size == 32) {
1365 switch (P) {
1366 case CmpInst::ICMP_NE:
1367 return AMDGPU::S_CMP_LG_U32;
1368 case CmpInst::ICMP_EQ:
1369 return AMDGPU::S_CMP_EQ_U32;
1370 case CmpInst::ICMP_SGT:
1371 return AMDGPU::S_CMP_GT_I32;
1372 case CmpInst::ICMP_SGE:
1373 return AMDGPU::S_CMP_GE_I32;
1374 case CmpInst::ICMP_SLT:
1375 return AMDGPU::S_CMP_LT_I32;
1376 case CmpInst::ICMP_SLE:
1377 return AMDGPU::S_CMP_LE_I32;
1378 case CmpInst::ICMP_UGT:
1379 return AMDGPU::S_CMP_GT_U32;
1380 case CmpInst::ICMP_UGE:
1381 return AMDGPU::S_CMP_GE_U32;
1382 case CmpInst::ICMP_ULT:
1383 return AMDGPU::S_CMP_LT_U32;
1384 case CmpInst::ICMP_ULE:
1385 return AMDGPU::S_CMP_LE_U32;
1386 case CmpInst::FCMP_OEQ:
1387 return AMDGPU::S_CMP_EQ_F32;
1388 case CmpInst::FCMP_OGT:
1389 return AMDGPU::S_CMP_GT_F32;
1390 case CmpInst::FCMP_OGE:
1391 return AMDGPU::S_CMP_GE_F32;
1392 case CmpInst::FCMP_OLT:
1393 return AMDGPU::S_CMP_LT_F32;
1394 case CmpInst::FCMP_OLE:
1395 return AMDGPU::S_CMP_LE_F32;
1396 case CmpInst::FCMP_ONE:
1397 return AMDGPU::S_CMP_LG_F32;
1398 case CmpInst::FCMP_ORD:
1399 return AMDGPU::S_CMP_O_F32;
1400 case CmpInst::FCMP_UNO:
1401 return AMDGPU::S_CMP_U_F32;
1402 case CmpInst::FCMP_UEQ:
1403 return AMDGPU::S_CMP_NLG_F32;
1404 case CmpInst::FCMP_UGT:
1405 return AMDGPU::S_CMP_NLE_F32;
1406 case CmpInst::FCMP_UGE:
1407 return AMDGPU::S_CMP_NLT_F32;
1408 case CmpInst::FCMP_ULT:
1409 return AMDGPU::S_CMP_NGE_F32;
1410 case CmpInst::FCMP_ULE:
1411 return AMDGPU::S_CMP_NGT_F32;
1412 case CmpInst::FCMP_UNE:
1413 return AMDGPU::S_CMP_NEQ_F32;
1414 default:
1415 llvm_unreachable("Unknown condition code!");
1416 }
1417 }
1418
1419 if (Size == 16) {
1420 if (!STI.hasSALUFloatInsts())
1421 return -1;
1422
1423 switch (P) {
1424 case CmpInst::FCMP_OEQ:
1425 return AMDGPU::S_CMP_EQ_F16;
1426 case CmpInst::FCMP_OGT:
1427 return AMDGPU::S_CMP_GT_F16;
1428 case CmpInst::FCMP_OGE:
1429 return AMDGPU::S_CMP_GE_F16;
1430 case CmpInst::FCMP_OLT:
1431 return AMDGPU::S_CMP_LT_F16;
1432 case CmpInst::FCMP_OLE:
1433 return AMDGPU::S_CMP_LE_F16;
1434 case CmpInst::FCMP_ONE:
1435 return AMDGPU::S_CMP_LG_F16;
1436 case CmpInst::FCMP_ORD:
1437 return AMDGPU::S_CMP_O_F16;
1438 case CmpInst::FCMP_UNO:
1439 return AMDGPU::S_CMP_U_F16;
1440 case CmpInst::FCMP_UEQ:
1441 return AMDGPU::S_CMP_NLG_F16;
1442 case CmpInst::FCMP_UGT:
1443 return AMDGPU::S_CMP_NLE_F16;
1444 case CmpInst::FCMP_UGE:
1445 return AMDGPU::S_CMP_NLT_F16;
1446 case CmpInst::FCMP_ULT:
1447 return AMDGPU::S_CMP_NGE_F16;
1448 case CmpInst::FCMP_ULE:
1449 return AMDGPU::S_CMP_NGT_F16;
1450 case CmpInst::FCMP_UNE:
1451 return AMDGPU::S_CMP_NEQ_F16;
1452 default:
1453 llvm_unreachable("Unknown condition code!");
1454 }
1455 }
1456
1457 return -1;
1458}
1459
1460bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1461
1462 MachineBasicBlock *BB = I.getParent();
1463 const DebugLoc &DL = I.getDebugLoc();
1464
1465 Register SrcReg = I.getOperand(2).getReg();
1466 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1467
1468 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1469
1470 Register CCReg = I.getOperand(0).getReg();
1471 if (!isVCC(CCReg, *MRI)) {
1472 int Opcode = getS_CMPOpcode(Pred, Size);
1473 if (Opcode == -1)
1474 return false;
1475 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1476 .add(I.getOperand(2))
1477 .add(I.getOperand(3));
1478 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1479 .addReg(AMDGPU::SCC);
1480 bool Ret =
1481 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1482 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1483 I.eraseFromParent();
1484 return Ret;
1485 }
1486
1487 if (I.getOpcode() == AMDGPU::G_FCMP)
1488 return false;
1489
1490 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1491 if (Opcode == -1)
1492 return false;
1493
1494 MachineInstrBuilder ICmp;
1495 // t16 instructions
1496 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1497 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1498 .addImm(0)
1499 .add(I.getOperand(2))
1500 .addImm(0)
1501 .add(I.getOperand(3))
1502 .addImm(0); // op_sel
1503 } else {
1504 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1505 .add(I.getOperand(2))
1506 .add(I.getOperand(3));
1507 }
1508
1509 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1510 *TRI.getBoolRC(), *MRI);
1511 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1512 I.eraseFromParent();
1513 return Ret;
1514}
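// Illustrative sketch (annotation, not from the source): a uniform compare
// with a non-VCC result is selected, roughly, as
//   S_CMP_LT_I32 %a, %b
//   %cc:sreg_32 = COPY $scc
// while a divergent compare producing a lane mask uses the corresponding
// V_CMP_*_e64 instruction directly.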
1515
1516bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1517 Register Dst = I.getOperand(0).getReg();
1518 if (isVCC(Dst, *MRI))
1519 return false;
1520
1521 LLT DstTy = MRI->getType(Dst);
1522 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1523 return false;
1524
1525 MachineBasicBlock *BB = I.getParent();
1526 const DebugLoc &DL = I.getDebugLoc();
1527 Register SrcReg = I.getOperand(2).getReg();
1528 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1529
1530 // i1 inputs are not supported in GlobalISel.
1531 if (Size == 1)
1532 return false;
1533
1534 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1535 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1536 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1537 I.eraseFromParent();
1538 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1539 }
1540
1541 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1542 if (Opcode == -1)
1543 return false;
1544
1545 MachineInstrBuilder SelectedMI;
1546 MachineOperand &LHS = I.getOperand(2);
1547 MachineOperand &RHS = I.getOperand(3);
1548 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1549 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1550 Register Src0Reg =
1551 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1552 Register Src1Reg =
1553 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1554 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1555 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1556 SelectedMI.addImm(Src0Mods);
1557 SelectedMI.addReg(Src0Reg);
1558 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1559 SelectedMI.addImm(Src1Mods);
1560 SelectedMI.addReg(Src1Reg);
1561 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1562 SelectedMI.addImm(0); // clamp
1563 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1564 SelectedMI.addImm(0); // op_sel
1565
1566 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1567 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1568 return false;
1569
1570 I.eraseFromParent();
1571 return true;
1572}
1573
1574// Ballot has to zero bits in the input lane-mask that are zero in the current
1575// exec; this is done as an AND with exec. For inputs that are results of an
1576// instruction that implicitly uses the same exec, for example a compare in the
1577// same basic block or an SCC-to-VCC copy, a plain copy is used instead.
1578static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1579 MachineBasicBlock *MBB) {
1580 MachineInstr *MI = MRI.getVRegDef(Reg);
1581 if (MI->getParent() != MBB)
1582 return false;
1583
1584 // Lane mask generated by SCC to VCC copy.
1585 if (MI->getOpcode() == AMDGPU::COPY) {
1586 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1587 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1588 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1589 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1590 return true;
1591 }
1592
1593 // Lane mask generated using compare with same exec.
1594 if (isa<GAnyCmp>(MI))
1595 return true;
1596
1597 Register LHS, RHS;
1598 // Look through AND.
1599 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1600 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1601        isLaneMaskFromSameBlock(RHS, MRI, MBB);
1602
1603 return false;
1604}
1605
1606bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1607 MachineBasicBlock *BB = I.getParent();
1608 const DebugLoc &DL = I.getDebugLoc();
1609 Register DstReg = I.getOperand(0).getReg();
1610 Register SrcReg = I.getOperand(2).getReg();
1611 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1612 const unsigned WaveSize = STI.getWavefrontSize();
1613
1614 // In the common case, the return type matches the wave size.
1615 // However we also support emitting i64 ballots in wave32 mode.
1616 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1617 return false;
1618
1619 std::optional<ValueAndVReg> Arg =
1620      getIConstantVRegValWithLookThrough(SrcReg, *MRI);
1621
1622 Register Dst = DstReg;
1623 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1624 if (BallotSize != WaveSize) {
1625 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1626 }
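  // Descriptive note: for a 64-bit ballot on wave32, the 32-bit result is
  // computed into this temporary and widened to 64 bits with the REG_SEQUENCE
  // at the end of the function.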
1627
1628 if (Arg) {
1629 const int64_t Value = Arg->Value.getZExtValue();
1630 if (Value == 0) {
1631 // Dst = S_MOV 0
1632 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1633 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1634 } else {
1635 // Dst = COPY EXEC
1636 assert(Value == 1);
1637 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1638 }
1639 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1640 return false;
1641 } else {
1642 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1643 // Dst = COPY SrcReg
1644 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1645 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1646 return false;
1647 } else {
1648 // Dst = S_AND SrcReg, EXEC
1649 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1650 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1651 .addReg(SrcReg)
1652 .addReg(TRI.getExec())
1653 .setOperandDead(3); // Dead scc
1654 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1655 return false;
1656 }
1657 }
1658
1659 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1660 if (BallotSize != WaveSize) {
1661 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1662 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1663 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1664 .addReg(Dst)
1665 .addImm(AMDGPU::sub0)
1666 .addReg(HiReg)
1667 .addImm(AMDGPU::sub1);
1668 }
1669
1670 I.eraseFromParent();
1671 return true;
1672}
1673
1674bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1675 Register DstReg = I.getOperand(0).getReg();
1676 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1677 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1678 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1679 return false;
1680
1681 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1682
1683 Module *M = MF->getFunction().getParent();
1684 const MDNode *Metadata = I.getOperand(2).getMetadata();
1685 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1686 auto *RelocSymbol = cast<GlobalVariable>(
1687 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
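  // Descriptive note: the symbol named by the metadata is created (or found)
  // as an i32 global, and the mov below references it with a low-32-bit
  // absolute relocation, so the value is resolved when the code object is
  // linked or loaded.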
1688
1689 MachineBasicBlock *BB = I.getParent();
1690 BuildMI(*BB, &I, I.getDebugLoc(),
1691 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1692    .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1693
1694 I.eraseFromParent();
1695 return true;
1696}
1697
1698bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1699 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1700
1701 Register DstReg = I.getOperand(0).getReg();
1702 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1703 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1704 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1705
1706 MachineBasicBlock *MBB = I.getParent();
1707 const DebugLoc &DL = I.getDebugLoc();
1708
1709 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1710
1711 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1712 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1713 MIB.addImm(MFI->getLDSSize());
1714 } else {
1715 Module *M = MF->getFunction().getParent();
1716 const GlobalValue *GV =
1717 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1718    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1719 }
1720
1721 I.eraseFromParent();
1722 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1723}
1724
1725bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1726 MachineBasicBlock *MBB = I.getParent();
1727 MachineFunction &MF = *MBB->getParent();
1728 const DebugLoc &DL = I.getDebugLoc();
1729
1730 MachineOperand &Dst = I.getOperand(0);
1731 Register DstReg = Dst.getReg();
1732 unsigned Depth = I.getOperand(2).getImm();
1733
1734 const TargetRegisterClass *RC
1735 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1736 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1737 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1738 return false;
1739
1740 // Check for kernel and shader functions
1741 if (Depth != 0 ||
1742 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1743 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1744 .addImm(0);
1745 I.eraseFromParent();
1746 return true;
1747 }
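  // Descriptive note: entry functions (kernels and shaders) have no caller,
  // and nonzero depths are not implemented, so the return address folds to 0.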
1748
1749 MachineFrameInfo &MFI = MF.getFrameInfo();
1750 // There is a call to @llvm.returnaddress in this function
1751 MFI.setReturnAddressIsTaken(true);
1752
1753 // Get the return address reg and mark it as an implicit live-in
1754 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1755 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1756 AMDGPU::SReg_64RegClass, DL);
1757 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1758 .addReg(LiveIn);
1759 I.eraseFromParent();
1760 return true;
1761}
1762
1763bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1764 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1765 // SelectionDAG uses for wave32 vs wave64.
1766 MachineBasicBlock *BB = MI.getParent();
1767 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1768 .add(MI.getOperand(1));
1769
1770 Register Reg = MI.getOperand(1).getReg();
1771 MI.eraseFromParent();
1772
1773 if (!MRI->getRegClassOrNull(Reg))
1774 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1775 return true;
1776}
1777
1778bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1779 MachineInstr &MI, Intrinsic::ID IntrID) const {
1780 MachineBasicBlock *MBB = MI.getParent();
1781 MachineFunction *MF = MBB->getParent();
1782 const DebugLoc &DL = MI.getDebugLoc();
1783
1784 unsigned IndexOperand = MI.getOperand(7).getImm();
1785 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1786 bool WaveDone = MI.getOperand(9).getImm() != 0;
1787
1788 if (WaveDone && !WaveRelease) {
1789 // TODO: Move this to IR verifier
1790 const Function &Fn = MF->getFunction();
1791 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1792 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1793 }
1794
1795 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1796 IndexOperand &= ~0x3f;
1797 unsigned CountDw = 0;
1798
1799 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1800 CountDw = (IndexOperand >> 24) & 0xf;
1801 IndexOperand &= ~(0xf << 24);
1802
1803 if (CountDw < 1 || CountDw > 4) {
1804 const Function &Fn = MF->getFunction();
1805 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1806 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1807 CountDw = 1;
1808 }
1809 }
1810
1811 if (IndexOperand) {
1812 const Function &Fn = MF->getFunction();
1813 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1814 Fn, "ds_ordered_count: bad index operand", DL));
1815 }
1816
1817 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1818 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1819
1820 unsigned Offset0 = OrderedCountIndex << 2;
1821 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1822
1823 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1824 Offset1 |= (CountDw - 1) << 6;
1825
1826 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1827 Offset1 |= ShaderType << 2;
1828
1829 unsigned Offset = Offset0 | (Offset1 << 8);
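  // Layout of the packed 16-bit DS offset built above:
  //   bits [7:2]   ordered-count index * 4
  //   bit  8       wave_release, bit 9 wave_done
  //   bits [11:10] shader type (pre-GFX11 only)
  //   bit  12      instruction (0 = add, 1 = swap)
  //   bits [15:14] dword count - 1 (GFX10+ only)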
1830
1831 Register M0Val = MI.getOperand(2).getReg();
1832 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1833 .addReg(M0Val);
1834
1835 Register DstReg = MI.getOperand(0).getReg();
1836 Register ValReg = MI.getOperand(3).getReg();
1837 MachineInstrBuilder DS =
1838 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1839 .addReg(ValReg)
1840 .addImm(Offset)
1841 .cloneMemRefs(MI);
1842
1843 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1844 return false;
1845
1846 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1847 MI.eraseFromParent();
1848 return Ret;
1849}
1850
1851static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1852 switch (IntrID) {
1853 case Intrinsic::amdgcn_ds_gws_init:
1854 return AMDGPU::DS_GWS_INIT;
1855 case Intrinsic::amdgcn_ds_gws_barrier:
1856 return AMDGPU::DS_GWS_BARRIER;
1857 case Intrinsic::amdgcn_ds_gws_sema_v:
1858 return AMDGPU::DS_GWS_SEMA_V;
1859 case Intrinsic::amdgcn_ds_gws_sema_br:
1860 return AMDGPU::DS_GWS_SEMA_BR;
1861 case Intrinsic::amdgcn_ds_gws_sema_p:
1862 return AMDGPU::DS_GWS_SEMA_P;
1863 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1864 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1865 default:
1866 llvm_unreachable("not a gws intrinsic");
1867 }
1868}
1869
1870bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1871 Intrinsic::ID IID) const {
1872 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1873 !STI.hasGWSSemaReleaseAll()))
1874 return false;
1875
1876 // intrinsic ID, vsrc, offset
1877 const bool HasVSrc = MI.getNumOperands() == 3;
1878 assert(HasVSrc || MI.getNumOperands() == 2);
1879
1880 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1881 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1882 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1883 return false;
1884
1885 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1886 unsigned ImmOffset;
1887
1888 MachineBasicBlock *MBB = MI.getParent();
1889 const DebugLoc &DL = MI.getDebugLoc();
1890
1891 MachineInstr *Readfirstlane = nullptr;
1892
1893 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1894 // incoming offset, in case there's an add of a constant. We'll have to put it
1895 // back later.
1896 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1897 Readfirstlane = OffsetDef;
1898 BaseOffset = OffsetDef->getOperand(1).getReg();
1899 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1900 }
1901
1902 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1903 // If we have a constant offset, try to use the 0 in m0 as the base.
1904 // TODO: Look into changing the default m0 initialization value. If the
1905 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1906 // the immediate offset.
1907
1908 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1909 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1910 .addImm(0);
1911 } else {
1912 std::tie(BaseOffset, ImmOffset) =
1913 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1914
1915 if (Readfirstlane) {
1916 // We have the constant offset now, so put the readfirstlane back on the
1917 // variable component.
1918 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1919 return false;
1920
1921 Readfirstlane->getOperand(1).setReg(BaseOffset);
1922 BaseOffset = Readfirstlane->getOperand(0).getReg();
1923 } else {
1924 if (!RBI.constrainGenericRegister(BaseOffset,
1925 AMDGPU::SReg_32RegClass, *MRI))
1926 return false;
1927 }
1928
1929 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1930 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1931 .addReg(BaseOffset)
1932 .addImm(16)
1933 .setOperandDead(3); // Dead scc
1934
1935 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1936 .addReg(M0Base);
1937 }
1938
1939 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1940 // offset field) % 64. Some versions of the programming guide omit the m0
1941 // part, or claim it's from offset 0.
1942 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1943
1944 if (HasVSrc) {
1945 Register VSrc = MI.getOperand(1).getReg();
1946 MIB.addReg(VSrc);
1947
1948 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1949 return false;
1950 }
1951
1952 MIB.addImm(ImmOffset)
1953 .cloneMemRefs(MI);
1954
1955 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1956
1957 MI.eraseFromParent();
1958 return true;
1959}
1960
1961bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1962 bool IsAppend) const {
1963 Register PtrBase = MI.getOperand(2).getReg();
1964 LLT PtrTy = MRI->getType(PtrBase);
1965 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1966
1967 unsigned Offset;
1968 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1969
1970 // TODO: Should this try to look through readfirstlane like GWS?
1971 if (!isDSOffsetLegal(PtrBase, Offset)) {
1972 PtrBase = MI.getOperand(2).getReg();
1973 Offset = 0;
1974 }
1975
1976 MachineBasicBlock *MBB = MI.getParent();
1977 const DebugLoc &DL = MI.getDebugLoc();
1978 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1979
1980 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1981 .addReg(PtrBase);
1982 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1983 return false;
1984
1985 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1986 .addImm(Offset)
1987 .addImm(IsGDS ? -1 : 0)
1988 .cloneMemRefs(MI);
1989 MI.eraseFromParent();
1990 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1991}
1992
1993bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
1994 MachineFunction *MF = MI.getParent()->getParent();
1995 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1996
1997 MFInfo->setInitWholeWave();
1998 return selectImpl(MI, *CoverageInfo);
1999}
2000
2001static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2002 bool &IsTexFail) {
2003 if (TexFailCtrl)
2004 IsTexFail = true;
2005
2006 TFE = TexFailCtrl & 0x1;
2007 TexFailCtrl &= ~(uint64_t)0x1;
2008 LWE = TexFailCtrl & 0x2;
2009 TexFailCtrl &= ~(uint64_t)0x2;
2010
2011 return TexFailCtrl == 0;
2012}
2013
2014bool AMDGPUInstructionSelector::selectImageIntrinsic(
2015 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2016 MachineBasicBlock *MBB = MI.getParent();
2017 const DebugLoc &DL = MI.getDebugLoc();
2018 unsigned IntrOpcode = Intr->BaseOpcode;
2019
2020 // For image atomic: use no-return opcode if result is unused.
2021 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2022 Register ResultDef = MI.getOperand(0).getReg();
2023 if (MRI->use_nodbg_empty(ResultDef))
2024 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2025 }
2026
2027 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2028      AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
2029
2030 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2031 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2032 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2033 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2034
2035 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2036
2037 Register VDataIn = AMDGPU::NoRegister;
2038 Register VDataOut = AMDGPU::NoRegister;
2039 LLT VDataTy;
2040 int NumVDataDwords = -1;
2041 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2042 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2043
2044 bool Unorm;
2045 if (!BaseOpcode->Sampler)
2046 Unorm = true;
2047 else
2048 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2049
2050 bool TFE;
2051 bool LWE;
2052 bool IsTexFail = false;
2053 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2054 TFE, LWE, IsTexFail))
2055 return false;
2056
2057 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2058 const bool IsA16 = (Flags & 1) != 0;
2059 const bool IsG16 = (Flags & 2) != 0;
2060
2061 // A16 implies 16 bit gradients if subtarget doesn't support G16
2062 if (IsA16 && !STI.hasG16() && !IsG16)
2063 return false;
2064
2065 unsigned DMask = 0;
2066 unsigned DMaskLanes = 0;
2067
2068 if (BaseOpcode->Atomic) {
2069 if (!BaseOpcode->NoReturn)
2070 VDataOut = MI.getOperand(0).getReg();
2071 VDataIn = MI.getOperand(2).getReg();
2072 LLT Ty = MRI->getType(VDataIn);
2073
2074 // Be careful to allow atomic swap on 16-bit element vectors.
2075 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2076 Ty.getSizeInBits() == 128 :
2077 Ty.getSizeInBits() == 64;
2078
2079 if (BaseOpcode->AtomicX2) {
2080 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2081
2082 DMask = Is64Bit ? 0xf : 0x3;
2083 NumVDataDwords = Is64Bit ? 4 : 2;
2084 } else {
2085 DMask = Is64Bit ? 0x3 : 0x1;
2086 NumVDataDwords = Is64Bit ? 2 : 1;
2087 }
2088 } else {
2089 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2090 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2091
2092 if (BaseOpcode->Store) {
2093 VDataIn = MI.getOperand(1).getReg();
2094 VDataTy = MRI->getType(VDataIn);
2095 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2096 } else if (BaseOpcode->NoReturn) {
2097 NumVDataDwords = 0;
2098 } else {
2099 VDataOut = MI.getOperand(0).getReg();
2100 VDataTy = MRI->getType(VDataOut);
2101 NumVDataDwords = DMaskLanes;
2102
2103 if (IsD16 && !STI.hasUnpackedD16VMem())
2104 NumVDataDwords = (DMaskLanes + 1) / 2;
2105 }
2106 }
2107
2108 // Set G16 opcode
2109 if (Subtarget->hasG16() && IsG16) {
2110 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2111        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2112 assert(G16MappingInfo);
2113 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2114 }
2115
2116 // TODO: Check this in verifier.
2117 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2118
2119 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2120 // Keep GLC only when the atomic's result is actually used.
2121 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2122   CPol |= AMDGPU::CPol::GLC;
2123 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2124              AMDGPU::CPol::VOLATILE))
2125   return false;
2126
2127 int NumVAddrRegs = 0;
2128 int NumVAddrDwords = 0;
2129 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2130 // Skip the $noregs and 0s inserted during legalization.
2131 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2132 if (!AddrOp.isReg())
2133 continue; // XXX - Break?
2134
2135 Register Addr = AddrOp.getReg();
2136 if (!Addr)
2137 break;
2138
2139 ++NumVAddrRegs;
2140 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2141 }
2142
2143 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2144 // NSA, these should have been packed into a single value in the first
2145 // address register
2146 const bool UseNSA =
2147 NumVAddrRegs != 1 &&
2148 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2149 : NumVAddrDwords == NumVAddrRegs);
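  // Descriptive note: with the non-sequential-address (NSA) encodings each
  // address component can live in its own VGPR instead of one contiguous
  // tuple; on partial-NSA subtargets the trailing components may still be
  // packed, hence the relaxed dword check.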
2150 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2151 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2152 return false;
2153 }
2154
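  // A TFE/LWE request returns an extra dword holding the texture-fail status,
  // so the destination needs one more register.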
2155 if (IsTexFail)
2156 ++NumVDataDwords;
2157
2158 int Opcode = -1;
2159 if (IsGFX12Plus) {
2160 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2161 NumVDataDwords, NumVAddrDwords);
2162 } else if (IsGFX11Plus) {
2163 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2164 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2165 : AMDGPU::MIMGEncGfx11Default,
2166 NumVDataDwords, NumVAddrDwords);
2167 } else if (IsGFX10Plus) {
2168 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2169 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2170 : AMDGPU::MIMGEncGfx10Default,
2171 NumVDataDwords, NumVAddrDwords);
2172 } else {
2173 if (Subtarget->hasGFX90AInsts()) {
2174 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2175 NumVDataDwords, NumVAddrDwords);
2176 if (Opcode == -1) {
2177 LLVM_DEBUG(
2178 dbgs()
2179 << "requested image instruction is not supported on this GPU\n");
2180 return false;
2181 }
2182 }
2183 if (Opcode == -1 &&
2184 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2185 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2186 NumVDataDwords, NumVAddrDwords);
2187 if (Opcode == -1)
2188 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2189 NumVDataDwords, NumVAddrDwords);
2190 }
2191 if (Opcode == -1)
2192 return false;
2193
2194 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2195 .cloneMemRefs(MI);
2196
2197 if (VDataOut) {
2198 if (BaseOpcode->AtomicX2) {
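      // Descriptive note: X2 (cmpswap) atomics define a double-width
      // destination tuple; only the low half holds the returned value, so it
      // is copied out through a subregister below.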
2199 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2200
2201 Register TmpReg = MRI->createVirtualRegister(
2202 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2203 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2204
2205 MIB.addDef(TmpReg);
2206 if (!MRI->use_empty(VDataOut)) {
2207 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2208 .addReg(TmpReg, RegState::Kill, SubReg);
2209 }
2210
2211 } else {
2212 MIB.addDef(VDataOut); // vdata output
2213 }
2214 }
2215
2216 if (VDataIn)
2217 MIB.addReg(VDataIn); // vdata input
2218
2219 for (int I = 0; I != NumVAddrRegs; ++I) {
2220 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2221 if (SrcOp.isReg()) {
2222 assert(SrcOp.getReg() != 0);
2223 MIB.addReg(SrcOp.getReg());
2224 }
2225 }
2226
2227 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2228 if (BaseOpcode->Sampler)
2229 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2230
2231 MIB.addImm(DMask); // dmask
2232
2233 if (IsGFX10Plus)
2234 MIB.addImm(DimInfo->Encoding);
2235 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2236 MIB.addImm(Unorm);
2237
2238 MIB.addImm(CPol);
2239 MIB.addImm(IsA16 && // a16 or r128
2240 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2241 if (IsGFX10Plus)
2242 MIB.addImm(IsA16 ? -1 : 0);
2243
2244 if (!Subtarget->hasGFX90AInsts()) {
2245 MIB.addImm(TFE); // tfe
2246 } else if (TFE) {
2247 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2248 return false;
2249 }
2250
2251 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2252 MIB.addImm(LWE); // lwe
2253 if (!IsGFX10Plus)
2254 MIB.addImm(DimInfo->DA ? -1 : 0);
2255 if (BaseOpcode->HasD16)
2256 MIB.addImm(IsD16 ? -1 : 0);
2257
2258 MI.eraseFromParent();
2259 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2260 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2261 return true;
2262}
2263
2264// We need to handle this here because tablegen doesn't support matching
2265// instructions with multiple outputs.
2266bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2267 MachineInstr &MI) const {
2268 Register Dst0 = MI.getOperand(0).getReg();
2269 Register Dst1 = MI.getOperand(1).getReg();
2270
2271 const DebugLoc &DL = MI.getDebugLoc();
2272 MachineBasicBlock *MBB = MI.getParent();
2273
2274 Register Addr = MI.getOperand(3).getReg();
2275 Register Data0 = MI.getOperand(4).getReg();
2276 Register Data1 = MI.getOperand(5).getReg();
2277 unsigned Offset = MI.getOperand(6).getImm();
2278
2279 unsigned Opc;
2280 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2281 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2282 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2283 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2284 break;
2285 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2286 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2287 break;
2288 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2289 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2290 break;
2291 }
2292
2293 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2294 .addDef(Dst1)
2295 .addUse(Addr)
2296 .addUse(Data0)
2297 .addUse(Data1)
2298 .addImm(Offset)
2299 .cloneMemRefs(MI);
2300
2301 MI.eraseFromParent();
2302 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2303}
2304
2305bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2306 MachineInstr &I) const {
2307 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2308 switch (IntrinsicID) {
2309 case Intrinsic::amdgcn_end_cf:
2310 return selectEndCfIntrinsic(I);
2311 case Intrinsic::amdgcn_ds_ordered_add:
2312 case Intrinsic::amdgcn_ds_ordered_swap:
2313 return selectDSOrderedIntrinsic(I, IntrinsicID);
2314 case Intrinsic::amdgcn_ds_gws_init:
2315 case Intrinsic::amdgcn_ds_gws_barrier:
2316 case Intrinsic::amdgcn_ds_gws_sema_v:
2317 case Intrinsic::amdgcn_ds_gws_sema_br:
2318 case Intrinsic::amdgcn_ds_gws_sema_p:
2319 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2320 return selectDSGWSIntrinsic(I, IntrinsicID);
2321 case Intrinsic::amdgcn_ds_append:
2322 return selectDSAppendConsume(I, true);
2323 case Intrinsic::amdgcn_ds_consume:
2324 return selectDSAppendConsume(I, false);
2325 case Intrinsic::amdgcn_init_whole_wave:
2326 return selectInitWholeWave(I);
2327 case Intrinsic::amdgcn_raw_buffer_load_lds:
2328 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2329 case Intrinsic::amdgcn_struct_buffer_load_lds:
2330 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2331 return selectBufferLoadLds(I);
2332 // Until we can store both the address space of the global and the LDS
2333 // arguments by having two MachineMemOperands on an intrinsic, we just trust
2334 // that the argument is a global pointer (buffer pointers have been handled by
2335 // an LLVM IR-level lowering).
2336 case Intrinsic::amdgcn_load_to_lds:
2337 case Intrinsic::amdgcn_global_load_lds:
2338 return selectGlobalLoadLds(I);
2339 case Intrinsic::amdgcn_exp_compr:
2340 if (!STI.hasCompressedExport()) {
2341 Function &F = I.getMF()->getFunction();
2342 F.getContext().diagnose(
2343 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2344 I.getDebugLoc(), DS_Error));
2345 return false;
2346 }
2347 break;
2348 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2349 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2350 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2351 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2352 return selectDSBvhStackIntrinsic(I);
2353 case Intrinsic::amdgcn_s_barrier_init:
2354 case Intrinsic::amdgcn_s_barrier_signal_var:
2355 return selectNamedBarrierInit(I, IntrinsicID);
2356 case Intrinsic::amdgcn_s_barrier_join:
2357 case Intrinsic::amdgcn_s_get_named_barrier_state:
2358 return selectNamedBarrierInst(I, IntrinsicID);
2359 case Intrinsic::amdgcn_s_get_barrier_state:
2360 return selectSGetBarrierState(I, IntrinsicID);
2361 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2362 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2363 }
2364 return selectImpl(I, *CoverageInfo);
2365}
2366
2367bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2368 if (selectImpl(I, *CoverageInfo))
2369 return true;
2370
2371 MachineBasicBlock *BB = I.getParent();
2372 const DebugLoc &DL = I.getDebugLoc();
2373
2374 Register DstReg = I.getOperand(0).getReg();
2375 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2376 assert(Size <= 32 || Size == 64);
2377 const MachineOperand &CCOp = I.getOperand(1);
2378 Register CCReg = CCOp.getReg();
2379 if (!isVCC(CCReg, *MRI)) {
2380 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2381 AMDGPU::S_CSELECT_B32;
2382 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2383 .addReg(CCReg);
2384
2385 // The generic constrainSelectedInstRegOperands doesn't work for the scc
2386 // register bank, because it does not cover the register class used to
2387 // represent it, so the register class has to be set manually here.
2388 if (!MRI->getRegClassOrNull(CCReg))
2389 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2390 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2391 .add(I.getOperand(2))
2392 .add(I.getOperand(3));
2393
2394 bool Ret = false;
2395 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2396 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2397 I.eraseFromParent();
2398 return Ret;
2399 }
2400
2401 // Wide VGPR select should have been split in RegBankSelect.
2402 if (Size > 32)
2403 return false;
2404
2405 MachineInstr *Select =
2406 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2407 .addImm(0)
2408 .add(I.getOperand(3))
2409 .addImm(0)
2410 .add(I.getOperand(2))
2411 .add(I.getOperand(1));
2412
2413 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2414 I.eraseFromParent();
2415 return Ret;
2416}
2417
2418bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2419 Register DstReg = I.getOperand(0).getReg();
2420 Register SrcReg = I.getOperand(1).getReg();
2421 const LLT DstTy = MRI->getType(DstReg);
2422 const LLT SrcTy = MRI->getType(SrcReg);
2423 const LLT S1 = LLT::scalar(1);
2424
2425 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2426 const RegisterBank *DstRB;
2427 if (DstTy == S1) {
2428 // This is a special case. We don't treat s1 for legalization artifacts as
2429 // vcc booleans.
2430 DstRB = SrcRB;
2431 } else {
2432 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2433 if (SrcRB != DstRB)
2434 return false;
2435 }
2436
2437 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2438
2439 unsigned DstSize = DstTy.getSizeInBits();
2440 unsigned SrcSize = SrcTy.getSizeInBits();
2441
2442 const TargetRegisterClass *SrcRC =
2443 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2444 const TargetRegisterClass *DstRC =
2445 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2446 if (!SrcRC || !DstRC)
2447 return false;
2448
2449 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2450 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2451 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2452 return false;
2453 }
2454
2455 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2456 assert(STI.useRealTrue16Insts());
2457 const DebugLoc &DL = I.getDebugLoc();
2458 MachineBasicBlock *MBB = I.getParent();
2459 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2460 .addReg(SrcReg, 0, AMDGPU::lo16);
2461 I.eraseFromParent();
2462 return true;
2463 }
2464
2465 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2466 MachineBasicBlock *MBB = I.getParent();
2467 const DebugLoc &DL = I.getDebugLoc();
2468
2469 Register LoReg = MRI->createVirtualRegister(DstRC);
2470 Register HiReg = MRI->createVirtualRegister(DstRC);
2471 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2472 .addReg(SrcReg, 0, AMDGPU::sub0);
2473 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2474 .addReg(SrcReg, 0, AMDGPU::sub1);
2475
2476 if (IsVALU && STI.hasSDWA()) {
2477 // Write the low 16-bits of the high element into the high 16-bits of the
2478 // low element.
2479 MachineInstr *MovSDWA =
2480 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2481 .addImm(0) // $src0_modifiers
2482 .addReg(HiReg) // $src0
2483 .addImm(0) // $clamp
2484 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2485 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2486 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2487 .addReg(LoReg, RegState::Implicit);
2488 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
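      // With dst_sel WORD_1 and dst_unused UNUSED_PRESERVE, the SDWA mov only
      // writes the high 16 bits; the implicit LoReg use is tied to the def so
      // the preserved low half is modeled as an input of the instruction.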
2489 } else {
2490 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2491 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2492 Register ImmReg = MRI->createVirtualRegister(DstRC);
2493 if (IsVALU) {
2494 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2495 .addImm(16)
2496 .addReg(HiReg);
2497 } else {
2498 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2499 .addReg(HiReg)
2500 .addImm(16)
2501 .setOperandDead(3); // Dead scc
2502 }
2503
2504 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2505 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2506 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2507
2508 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2509 .addImm(0xffff);
2510 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2511 .addReg(LoReg)
2512 .addReg(ImmReg);
2513 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2514 .addReg(TmpReg0)
2515 .addReg(TmpReg1);
2516
2517 if (!IsVALU) {
2518 And.setOperandDead(3); // Dead scc
2519 Or.setOperandDead(3); // Dead scc
2520 }
2521 }
2522
2523 I.eraseFromParent();
2524 return true;
2525 }
2526
2527 if (!DstTy.isScalar())
2528 return false;
2529
2530 if (SrcSize > 32) {
2531 unsigned SubRegIdx = DstSize < 32
2532 ? static_cast<unsigned>(AMDGPU::sub0)
2533 : TRI.getSubRegFromChannel(0, DstSize / 32);
2534 if (SubRegIdx == AMDGPU::NoSubRegister)
2535 return false;
2536
2537 // Deal with weird cases where the class only partially supports the subreg
2538 // index.
2539 const TargetRegisterClass *SrcWithSubRC
2540 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2541 if (!SrcWithSubRC)
2542 return false;
2543
2544 if (SrcWithSubRC != SrcRC) {
2545 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2546 return false;
2547 }
2548
2549 I.getOperand(1).setSubReg(SubRegIdx);
2550 }
2551
2552 I.setDesc(TII.get(TargetOpcode::COPY));
2553 return true;
2554}
2555
2556/// \returns true if a bitmask for \p Size bits will be an inline immediate.
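/// Integer inline immediates cover [-16, 64], so only masks whose signed value
/// falls in that range avoid a 32-bit literal.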
2557static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2558 Mask = maskTrailingOnes<unsigned>(Size);
2559 int SignedMask = static_cast<int>(Mask);
2560 return SignedMask >= -16 && SignedMask <= 64;
2561}
2562
2563// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2564const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2565    Register Reg, const MachineRegisterInfo &MRI,
2566    const TargetRegisterInfo &TRI) const {
2567 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2568 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2569 return RB;
2570
2571 // Ignore the type, since we don't use vcc in artifacts.
2572 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2573 return &RBI.getRegBankFromRegClass(*RC, LLT());
2574 return nullptr;
2575}
2576
2577bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2578 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2579 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2580 const DebugLoc &DL = I.getDebugLoc();
2581 MachineBasicBlock &MBB = *I.getParent();
2582 const Register DstReg = I.getOperand(0).getReg();
2583 const Register SrcReg = I.getOperand(1).getReg();
2584
2585 const LLT DstTy = MRI->getType(DstReg);
2586 const LLT SrcTy = MRI->getType(SrcReg);
2587 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2588 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2589 const unsigned DstSize = DstTy.getSizeInBits();
2590 if (!DstTy.isScalar())
2591 return false;
2592
2593 // Artifact casts should never use vcc.
2594 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2595
2596 // FIXME: This should probably be illegal and split earlier.
2597 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2598 if (DstSize <= 32)
2599 return selectCOPY(I);
2600
2601 const TargetRegisterClass *SrcRC =
2602 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2603 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2604 const TargetRegisterClass *DstRC =
2605 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2606
2607 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2608 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2609 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2610 .addReg(SrcReg)
2611 .addImm(AMDGPU::sub0)
2612 .addReg(UndefReg)
2613 .addImm(AMDGPU::sub1);
2614 I.eraseFromParent();
2615
2616 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2617 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2618 }
2619
2620 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2621 // 64-bit should have been split up in RegBankSelect
2622
2623 // Try to use an and with a mask if it will save code size.
2624 unsigned Mask;
2625 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2626 MachineInstr *ExtI =
2627 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2628 .addImm(Mask)
2629 .addReg(SrcReg);
2630 I.eraseFromParent();
2631 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2632 }
2633
2634 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2635 MachineInstr *ExtI =
2636 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2637 .addReg(SrcReg)
2638 .addImm(0) // Offset
2639 .addImm(SrcSize); // Width
2640 I.eraseFromParent();
2641 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2642 }
2643
2644 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2645 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2646 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2647 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2648 return false;
2649
2650 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2651 const unsigned SextOpc = SrcSize == 8 ?
2652 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2653 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2654 .addReg(SrcReg);
2655 I.eraseFromParent();
2656 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2657 }
2658
2659 // Using a single 32-bit SALU to calculate the high half is smaller than
2660 // S_BFE with a literal constant operand.
2661 if (DstSize > 32 && SrcSize == 32) {
2662 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2663 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2664 if (Signed) {
2665 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2666 .addReg(SrcReg, 0, SubReg)
2667 .addImm(31)
2668 .setOperandDead(3); // Dead scc
2669 } else {
2670 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2671 .addImm(0);
2672 }
2673 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2674 .addReg(SrcReg, 0, SubReg)
2675 .addImm(AMDGPU::sub0)
2676 .addReg(HiReg)
2677 .addImm(AMDGPU::sub1);
2678 I.eraseFromParent();
2679 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2680 *MRI);
2681 }
2682
2683 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2684 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2685
2686 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
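    // e.g. a signed extension from 8 bits uses an immediate of (8 << 16):
    // offset 0, width 8.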
2687 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2688 // We need a 64-bit register source, but the high bits don't matter.
2689 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2690 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2691 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2692
2693 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2694 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2695 .addReg(SrcReg, 0, SubReg)
2696 .addImm(AMDGPU::sub0)
2697 .addReg(UndefReg)
2698 .addImm(AMDGPU::sub1);
2699
2700 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2701 .addReg(ExtReg)
2702 .addImm(SrcSize << 16);
2703
2704 I.eraseFromParent();
2705 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2706 }
2707
2708 unsigned Mask;
2709 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2710 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2711 .addReg(SrcReg)
2712 .addImm(Mask)
2713 .setOperandDead(3); // Dead scc
2714 } else {
2715 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2716 .addReg(SrcReg)
2717 .addImm(SrcSize << 16);
2718 }
2719
2720 I.eraseFromParent();
2721 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2722 }
2723
2724 return false;
2725}
2726
2727static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2728 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2729}
2730
2731static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2732 Register BitcastSrc;
2733 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2734 Reg = BitcastSrc;
2735 return Reg;
2736}
2737
2738static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2739                           Register &Out) {
2740 Register Trunc;
2741 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2742 return false;
2743
2744 Register LShlSrc;
2745 Register Cst;
2746 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2747 Cst = stripCopy(Cst, MRI);
2748 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2749 Out = stripBitCast(LShlSrc, MRI);
2750 return true;
2751 }
2752 }
2753
2754 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2755 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2756 return false;
2757
2758 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2759 LLT::fixed_vector(2, 16));
2760
2761 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2762 assert(Mask.size() == 2);
2763
2764 if (Mask[0] == 1 && Mask[1] <= 1) {
2765 Out = Shuffle->getOperand(0).getReg();
2766 return true;
2767 }
2768
2769 return false;
2770}
2771
2772bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2773 if (!Subtarget->hasSALUFloatInsts())
2774 return false;
2775
2776 Register Dst = I.getOperand(0).getReg();
2777 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2778 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2779 return false;
2780
2781 Register Src = I.getOperand(1).getReg();
2782
2783 if (MRI->getType(Dst) == LLT::scalar(32) &&
2784 MRI->getType(Src) == LLT::scalar(16)) {
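    // If the f16 source is the high half of a 32-bit value, S_CVT_HI_F32_F16
    // converts it in place, folding away the shift or shuffle that would
    // otherwise extract it.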
2785 if (isExtractHiElt(*MRI, Src, Src)) {
2786 MachineBasicBlock *BB = I.getParent();
2787 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2788 .addUse(Src);
2789 I.eraseFromParent();
2790 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2791 }
2792 }
2793
2794 return false;
2795}
2796
2797bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2798 // Only manually handle the f64 SGPR case.
2799 //
2800 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2801 // the bit ops theoretically have a second result due to the implicit def of
2802 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2803 // that is easy by disabling the check. The result works, but uses a
2804 // nonsensical sreg32orlds_and_sreg_1 regclass.
2805 //
2806 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2807 // the variadic REG_SEQUENCE operands.
2808
2809 Register Dst = MI.getOperand(0).getReg();
2810 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2811 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2812 MRI->getType(Dst) != LLT::scalar(64))
2813 return false;
2814
2815 Register Src = MI.getOperand(1).getReg();
2816 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2817 if (Fabs)
2818 Src = Fabs->getOperand(1).getReg();
2819
2820 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2821 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2822 return false;
2823
2824 MachineBasicBlock *BB = MI.getParent();
2825 const DebugLoc &DL = MI.getDebugLoc();
2826 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2827 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2828 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2829 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2830
2831 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2832 .addReg(Src, 0, AMDGPU::sub0);
2833 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2834 .addReg(Src, 0, AMDGPU::sub1);
2835 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2836 .addImm(0x80000000);
2837
2838 // Set or toggle sign bit.
2839 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2840 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2841 .addReg(HiReg)
2842 .addReg(ConstReg)
2843 .setOperandDead(3); // Dead scc
2844 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2845 .addReg(LoReg)
2846 .addImm(AMDGPU::sub0)
2847 .addReg(OpReg)
2848 .addImm(AMDGPU::sub1);
2849 MI.eraseFromParent();
2850 return true;
2851}
2852
2853// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2854bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2855 Register Dst = MI.getOperand(0).getReg();
2856 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2857 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2858 MRI->getType(Dst) != LLT::scalar(64))
2859 return false;
2860
2861 Register Src = MI.getOperand(1).getReg();
2862 MachineBasicBlock *BB = MI.getParent();
2863 const DebugLoc &DL = MI.getDebugLoc();
2864 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2865 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2866 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2867 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2868
2869 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2870 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2871 return false;
2872
2873 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2874 .addReg(Src, 0, AMDGPU::sub0);
2875 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2876 .addReg(Src, 0, AMDGPU::sub1);
2877 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2878 .addImm(0x7fffffff);
2879
2880 // Clear sign bit.
2881 // TODO: Should this use S_BITSET0_*?
2882 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2883 .addReg(HiReg)
2884 .addReg(ConstReg)
2885 .setOperandDead(3); // Dead scc
2886 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2887 .addReg(LoReg)
2888 .addImm(AMDGPU::sub0)
2889 .addReg(OpReg)
2890 .addImm(AMDGPU::sub1);
2891
2892 MI.eraseFromParent();
2893 return true;
2894}
2895
2896static bool isConstant(const MachineInstr &MI) {
2897 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2898}
2899
2900void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2901 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2902
2903 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2904 const MachineInstr *PtrMI =
2905 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2906
2907 assert(PtrMI);
2908
2909 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2910 return;
2911
2912 GEPInfo GEPInfo;
2913
2914 for (unsigned i = 1; i != 3; ++i) {
2915 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2916 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2917 assert(OpDef);
2918 if (i == 2 && isConstant(*OpDef)) {
2919 // TODO: Could handle constant base + variable offset, but a combine
2920 // probably should have commuted it.
2921 assert(GEPInfo.Imm == 0);
2922 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2923 continue;
2924 }
2925 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2926 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2927 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2928 else
2929 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2930 }
2931
2932 AddrInfo.push_back(GEPInfo);
2933 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2934}
2935
2936bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2937 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2938}
2939
2940bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2941 if (!MI.hasOneMemOperand())
2942 return false;
2943
2944 const MachineMemOperand *MMO = *MI.memoperands_begin();
2945 const Value *Ptr = MMO->getValue();
2946
2947 // UndefValue means this is a load of a kernel input. These are uniform.
2948 // Sometimes LDS instructions have constant pointers.
2949 // If Ptr is null, then that means this mem operand contains a
2950 // PseudoSourceValue like GOT.
2951 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2952 return true;
2953
2954 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2955 return true;
2956
2957 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2958 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2959 AMDGPU::SGPRRegBankID;
2960
2961 const Instruction *I = dyn_cast<Instruction>(Ptr);
2962 return I && I->getMetadata("amdgpu.uniform");
2963}
2964
2965bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2966 for (const GEPInfo &GEPInfo : AddrInfo) {
2967 if (!GEPInfo.VgprParts.empty())
2968 return true;
2969 }
2970 return false;
2971}
2972
2973void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2974 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2975 unsigned AS = PtrTy.getAddressSpace();
2976 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2977 STI.ldsRequiresM0Init()) {
2978 MachineBasicBlock *BB = I.getParent();
2979
2980 // If DS instructions require M0 initialization, insert it before selecting.
2981 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2982 .addImm(-1);
2983 }
2984}
2985
2986bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2987 MachineInstr &I) const {
2988 initM0(I);
2989 return selectImpl(I, *CoverageInfo);
2990}
2991
2992static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2993 if (Reg.isPhysical())
2994 return false;
2995
2996 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2997 const unsigned Opcode = MI.getOpcode();
2998
2999 if (Opcode == AMDGPU::COPY)
3000 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3001
3002 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3003 Opcode == AMDGPU::G_XOR)
3004 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3005 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3006
3007 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3008 return GI->is(Intrinsic::amdgcn_class);
3009
3010 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3011}
3012
3013bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3014 MachineBasicBlock *BB = I.getParent();
3015 MachineOperand &CondOp = I.getOperand(0);
3016 Register CondReg = CondOp.getReg();
3017 const DebugLoc &DL = I.getDebugLoc();
3018
3019 unsigned BrOpcode;
3020 Register CondPhysReg;
3021 const TargetRegisterClass *ConstrainRC;
3022
3023 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3024 // whether the branch is uniform when selecting the instruction. In
3025 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3026 // RegBankSelect knows what it's doing if the branch condition is scc, even
3027 // though it currently does not.
3028 if (!isVCC(CondReg, *MRI)) {
3029 if (MRI->getType(CondReg) != LLT::scalar(32))
3030 return false;
3031
3032 CondPhysReg = AMDGPU::SCC;
3033 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3034 ConstrainRC = &AMDGPU::SReg_32RegClass;
3035 } else {
3036 // FIXME: Should scc->vcc copies AND with exec?
3037
3038 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3039 // need to insert an and with exec.
3040 if (!isVCmpResult(CondReg, *MRI)) {
3041 const bool Is64 = STI.isWave64();
3042 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3043 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3044
3045 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3046 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3047 .addReg(CondReg)
3048 .addReg(Exec)
3049 .setOperandDead(3); // Dead scc
3050 CondReg = TmpReg;
3051 }
3052
3053 CondPhysReg = TRI.getVCC();
3054 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3055 ConstrainRC = TRI.getBoolRC();
3056 }
3057
3058 if (!MRI->getRegClassOrNull(CondReg))
3059 MRI->setRegClass(CondReg, ConstrainRC);
3060
3061 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3062 .addReg(CondReg);
3063 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3064 .addMBB(I.getOperand(1).getMBB());
3065
3066 I.eraseFromParent();
3067 return true;
3068}
3069
3070bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3071 MachineInstr &I) const {
3072 Register DstReg = I.getOperand(0).getReg();
3073 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3074 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3075 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3076 if (IsVGPR)
3077 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3078
3079 return RBI.constrainGenericRegister(
3080 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3081}
3082
3083bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3084 Register DstReg = I.getOperand(0).getReg();
3085 Register SrcReg = I.getOperand(1).getReg();
3086 Register MaskReg = I.getOperand(2).getReg();
3087 LLT Ty = MRI->getType(DstReg);
3088 LLT MaskTy = MRI->getType(MaskReg);
3089 MachineBasicBlock *BB = I.getParent();
3090 const DebugLoc &DL = I.getDebugLoc();
3091
3092 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3093 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3094 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3095 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3096 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3097 return false;
3098
3099 // Try to avoid emitting a bit operation when we only need to touch half of
3100 // the 64-bit pointer.
3101 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3102 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3103 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3104
3105 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3106 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
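  // If one 32-bit half of the mask is known to be all ones, that half of the
  // pointer is unchanged and a plain copy suffices for it.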
3107
3108 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3109 !CanCopyLow32 && !CanCopyHi32) {
3110 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3111 .addReg(SrcReg)
3112 .addReg(MaskReg)
3113 .setOperandDead(3); // Dead scc
3114 I.eraseFromParent();
3115 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3116 }
3117
3118 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3119 const TargetRegisterClass &RegRC
3120 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3121
3122 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3123 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3124 const TargetRegisterClass *MaskRC =
3125 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3126
3127 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3128 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3129 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3130 return false;
3131
3132 if (Ty.getSizeInBits() == 32) {
3133 assert(MaskTy.getSizeInBits() == 32 &&
3134 "ptrmask should have been narrowed during legalize");
3135
3136 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3137 .addReg(SrcReg)
3138 .addReg(MaskReg);
3139
3140 if (!IsVGPR)
3141 NewOp.setOperandDead(3); // Dead scc
3142 I.eraseFromParent();
3143 return true;
3144 }
3145
3146 Register HiReg = MRI->createVirtualRegister(&RegRC);
3147 Register LoReg = MRI->createVirtualRegister(&RegRC);
3148
3149 // Extract the subregisters from the source pointer.
3150 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3151 .addReg(SrcReg, 0, AMDGPU::sub0);
3152 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3153 .addReg(SrcReg, 0, AMDGPU::sub1);
3154
3155 Register MaskedLo, MaskedHi;
3156
3157 if (CanCopyLow32) {
3158 // If all the bits in the low half are 1, we only need a copy for it.
3159 MaskedLo = LoReg;
3160 } else {
3161 // Extract the mask subregister and apply the and.
3162 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3163 MaskedLo = MRI->createVirtualRegister(&RegRC);
3164
3165 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3166 .addReg(MaskReg, 0, AMDGPU::sub0);
3167 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3168 .addReg(LoReg)
3169 .addReg(MaskLo);
3170 }
3171
3172 if (CanCopyHi32) {
3173 // If all the bits in the high half are 1, we only need a copy for it.
3174 MaskedHi = HiReg;
3175 } else {
3176 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3177 MaskedHi = MRI->createVirtualRegister(&RegRC);
3178
3179 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3180 .addReg(MaskReg, 0, AMDGPU::sub1);
3181 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3182 .addReg(HiReg)
3183 .addReg(MaskHi);
3184 }
3185
3186 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3187 .addReg(MaskedLo)
3188 .addImm(AMDGPU::sub0)
3189 .addReg(MaskedHi)
3190 .addImm(AMDGPU::sub1);
3191 I.eraseFromParent();
3192 return true;
3193}
3194
3195/// Return the register to use for the index value, and the subregister to use
3196/// for the indirectly accessed register.
3197static std::pair<Register, unsigned>
3199 const TargetRegisterClass *SuperRC, Register IdxReg,
3200 unsigned EltSize, GISelValueTracking &ValueTracking) {
3201 Register IdxBaseReg;
3202 int Offset;
3203
3204 std::tie(IdxBaseReg, Offset) =
3205 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3206 if (IdxBaseReg == AMDGPU::NoRegister) {
3207 // This will happen if the index is a known constant. This should ordinarily
3208 // be legalized out, but handle it as a register just in case.
3209 assert(Offset == 0);
3210 IdxBaseReg = IdxReg;
3211 }
3212
3213 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3214
3215 // Skip out of bounds offsets, or else we would end up using an undefined
3216 // register.
3217 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3218 return std::pair(IdxReg, SubRegs[0]);
3219 return std::pair(IdxBaseReg, SubRegs[Offset]);
3220}
3221
3222bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3223 MachineInstr &MI) const {
3224 Register DstReg = MI.getOperand(0).getReg();
3225 Register SrcReg = MI.getOperand(1).getReg();
3226 Register IdxReg = MI.getOperand(2).getReg();
3227
3228 LLT DstTy = MRI->getType(DstReg);
3229 LLT SrcTy = MRI->getType(SrcReg);
3230
3231 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3232 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3233 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3234
3235 // The index must be scalar. If it wasn't, RegBankSelect should have moved
3236 // this into a waterfall loop.
3237 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3238 return false;
3239
3240 const TargetRegisterClass *SrcRC =
3241 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3242 const TargetRegisterClass *DstRC =
3243 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3244 if (!SrcRC || !DstRC)
3245 return false;
3246 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3247 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3248 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3249 return false;
3250
3251 MachineBasicBlock *BB = MI.getParent();
3252 const DebugLoc &DL = MI.getDebugLoc();
3253 const bool Is64 = DstTy.getSizeInBits() == 64;
3254
3255 unsigned SubReg;
3256 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3257 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3258
3259 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3260 if (DstTy.getSizeInBits() != 32 && !Is64)
3261 return false;
3262
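// Scalar source: indirect indexing goes through M0. S_MOVRELS_B32/B64 read
// their source relative to M0, so the index is copied into M0 first; the
// implicit use of SrcReg keeps the whole source tuple alive.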
3263 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3264 .addReg(IdxReg);
3265
3266 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3267 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3268 .addReg(SrcReg, 0, SubReg)
3269 .addReg(SrcReg, RegState::Implicit);
3270 MI.eraseFromParent();
3271 return true;
3272 }
3273
3274 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3275 return false;
3276
3277 if (!STI.useVGPRIndexMode()) {
3278 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3279 .addReg(IdxReg);
3280 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3281 .addReg(SrcReg, 0, SubReg)
3282 .addReg(SrcReg, RegState::Implicit);
3283 MI.eraseFromParent();
3284 return true;
3285 }
3286
3287 const MCInstrDesc &GPRIDXDesc =
3288 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3289 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3290 .addReg(SrcReg)
3291 .addReg(IdxReg)
3292 .addImm(SubReg);
3293
3294 MI.eraseFromParent();
3295 return true;
3296}
3297
3298// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3299bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3300 MachineInstr &MI) const {
3301 Register DstReg = MI.getOperand(0).getReg();
3302 Register VecReg = MI.getOperand(1).getReg();
3303 Register ValReg = MI.getOperand(2).getReg();
3304 Register IdxReg = MI.getOperand(3).getReg();
3305
3306 LLT VecTy = MRI->getType(DstReg);
3307 LLT ValTy = MRI->getType(ValReg);
3308 unsigned VecSize = VecTy.getSizeInBits();
3309 unsigned ValSize = ValTy.getSizeInBits();
3310
3311 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3312 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3313 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3314
3315 assert(VecTy.getElementType() == ValTy);
3316
3317 // The index must be scalar. If it wasn't, RegBankSelect should have moved
3318 // this into a waterfall loop.
3319 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3320 return false;
3321
3322 const TargetRegisterClass *VecRC =
3323 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3324 const TargetRegisterClass *ValRC =
3325 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3326
3327 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3328 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3329 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3330 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3331 return false;
3332
3333 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3334 return false;
3335
3336 unsigned SubReg;
3337 std::tie(IdxReg, SubReg) =
3338 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3339
3340 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3341 STI.useVGPRIndexMode();
3342
3343 MachineBasicBlock *BB = MI.getParent();
3344 const DebugLoc &DL = MI.getDebugLoc();
3345
3346 if (!IndexMode) {
3347 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3348 .addReg(IdxReg);
3349
3350 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3351 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3352 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3353 .addReg(VecReg)
3354 .addReg(ValReg)
3355 .addImm(SubReg);
3356 MI.eraseFromParent();
3357 return true;
3358 }
3359
3360 const MCInstrDesc &GPRIDXDesc =
3361 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3362 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3363 .addReg(VecReg)
3364 .addReg(ValReg)
3365 .addReg(IdxReg)
3366 .addImm(SubReg);
3367
3368 MI.eraseFromParent();
3369 return true;
3370}
3371
3372bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3373 if (!Subtarget->hasVMemToLDSLoad())
3374 return false;
3375 unsigned Opc;
3376 unsigned Size = MI.getOperand(3).getImm();
3377
3378 // The struct intrinsic variants add one additional operand over raw.
3379 const bool HasVIndex = MI.getNumOperands() == 9;
3380 Register VIndex;
3381 int OpOffset = 0;
3382 if (HasVIndex) {
3383 VIndex = MI.getOperand(4).getReg();
3384 OpOffset = 1;
3385 }
3386
3387 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3388 std::optional<ValueAndVReg> MaybeVOffset =
3389 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3390 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3391
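// Pick the opcode form from the addressing operands: BOTHEN needs both a
// vindex and a voffset, IDXEN only a vindex, OFFEN only a voffset, and
// OFFSET neither (a voffset that is provably zero counts as absent).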
3392 switch (Size) {
3393 default:
3394 return false;
3395 case 1:
3396 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3397 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3398 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3399 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3400 break;
3401 case 2:
3402 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3403 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3404 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3405 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3406 break;
3407 case 4:
3408 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3409 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3410 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3411 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3412 break;
3413 case 12:
3414 if (!Subtarget->hasLDSLoadB96_B128())
3415 return false;
3416
3417 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3418 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3419 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3420 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3421 break;
3422 case 16:
3423 if (!Subtarget->hasLDSLoadB96_B128())
3424 return false;
3425
3426 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3427 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3428 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3429 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3430 break;
3431 }
3432
3433 MachineBasicBlock *MBB = MI.getParent();
3434 const DebugLoc &DL = MI.getDebugLoc();
3435 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3436 .add(MI.getOperand(2));
3437
3438 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3439
3440 if (HasVIndex && HasVOffset) {
3441 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3442 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3443 .addReg(VIndex)
3444 .addImm(AMDGPU::sub0)
3445 .addReg(VOffset)
3446 .addImm(AMDGPU::sub1);
3447
3448 MIB.addReg(IdxReg);
3449 } else if (HasVIndex) {
3450 MIB.addReg(VIndex);
3451 } else if (HasVOffset) {
3452 MIB.addReg(VOffset);
3453 }
3454
3455 MIB.add(MI.getOperand(1)); // rsrc
3456 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3457 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3458 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3459 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3460 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3461 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3462 MIB.addImm(
3463 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3464 ? 1
3465 : 0); // swz
3466
3467 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3468 // Don't set the offset value here because the pointer points to the base of
3469 // the buffer.
3470 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3471
3472 MachinePointerInfo StorePtrI = LoadPtrI;
3473 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3474 AMDGPUAS::BUFFER_RESOURCE));
3475 LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
3476 StorePtrI.V = nullptr;
3477 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3478 auto F = LoadMMO->getFlags() &
3479 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3480 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3481 Size, LoadMMO->getBaseAlign());
3482
3483 MachineMemOperand *StoreMMO =
3484 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3485 sizeof(int32_t), LoadMMO->getBaseAlign());
3486
3487 MIB.setMemRefs({LoadMMO, StoreMMO});
3488
3489 MI.eraseFromParent();
3490 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3491}
3492
3493/// Match a zero extend from a 32-bit value to 64-bits.
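/// For example, %r:_(s64) = G_ZEXT %x:_(s32) returns %x, as does the
/// legalized form %r:_(s64) = G_MERGE_VALUES %x:_(s32), 0.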
3494Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3495 Register ZExtSrc;
3496 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3497 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3498
3499 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3500 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3501 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3502 return Register();
3503
3504 assert(Def->getNumOperands() == 3 &&
3505 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3506 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3507 return Def->getOperand(1).getReg();
3508 }
3509
3510 return Register();
3511}
3512
3513/// Match a sign extend from a 32-bit value to 64-bits.
3514Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3515 Register SExtSrc;
3516 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3517 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3518
3519 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (s32 G_ASHR %x, 31)
3520 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3521 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3522 return Register();
3523
3524 assert(Def->getNumOperands() == 3 &&
3525 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3526 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3527 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3528 m_SpecificICst(31))))
3529 return Def->getOperand(1).getReg();
3530
3531 if (VT->signBitIsZero(Reg))
3532 return matchZeroExtendFromS32(Reg);
3533
3534 return Register();
3535}
3536
3537/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3538/// is 32-bit.
3539 Register
3540 AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3541 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3542 : matchZeroExtendFromS32(Reg);
3543}
3544
3545/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3546/// is 32-bit.
3547 Register
3548 AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3549 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3550 : matchSignExtendFromS32(Reg);
3551}
3552
3553 Register
3554 AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3555 bool IsSigned) const {
3556 if (IsSigned)
3557 return matchSignExtendFromS32OrS32(Reg);
3558
3559 return matchZeroExtendFromS32OrS32(Reg);
3560}
3561
3562Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3563 Register AnyExtSrc;
3564 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3565 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3566
3567 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3568 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3569 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3570 return Register();
3571
3572 assert(Def->getNumOperands() == 3 &&
3573 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3574
3575 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3576 return Def->getOperand(1).getReg();
3577
3578 return Register();
3579}
3580
3581bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3582 if (!Subtarget->hasVMemToLDSLoad())
3583 return false;
3584
3585 unsigned Opc;
3586 unsigned Size = MI.getOperand(3).getImm();
3587
3588 switch (Size) {
3589 default:
3590 return false;
3591 case 1:
3592 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3593 break;
3594 case 2:
3595 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3596 break;
3597 case 4:
3598 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3599 break;
3600 case 12:
3601 if (!Subtarget->hasLDSLoadB96_B128())
3602 return false;
3603 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3604 break;
3605 case 16:
3606 if (!Subtarget->hasLDSLoadB96_B128())
3607 return false;
3608 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3609 break;
3610 }
3611
3612 MachineBasicBlock *MBB = MI.getParent();
3613 const DebugLoc &DL = MI.getDebugLoc();
3614 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3615 .add(MI.getOperand(2));
3616
3617 Register Addr = MI.getOperand(1).getReg();
3618 Register VOffset;
3619 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3620 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
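// For example, %addr = G_PTR_ADD %sgpr_base, (G_ZEXT %voff32) is split below
// into Addr = %sgpr_base and VOffset = %voff32 (register names are
// illustrative).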
3621 if (!isSGPR(Addr)) {
3622 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3623 if (isSGPR(AddrDef->Reg)) {
3624 Addr = AddrDef->Reg;
3625 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3626 Register SAddr =
3627 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3628 if (isSGPR(SAddr)) {
3629 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3630 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3631 Addr = SAddr;
3632 VOffset = Off;
3633 }
3634 }
3635 }
3636 }
3637
3638 if (isSGPR(Addr)) {
3639 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3640 if (!VOffset) {
3641 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3642 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3643 .addImm(0);
3644 }
3645 }
3646
3647 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3648 .addReg(Addr);
3649
3650 if (isSGPR(Addr))
3651 MIB.addReg(VOffset);
3652
3653 MIB.add(MI.getOperand(4)); // offset
3654
3655 unsigned Aux = MI.getOperand(5).getImm();
3656 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3657
3658 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3659 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3660 LoadPtrI.Offset = MI.getOperand(4).getImm();
3661 MachinePointerInfo StorePtrI = LoadPtrI;
3662 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3663 AMDGPUAS::GLOBAL_ADDRESS));
3664 StorePtrI.V = nullptr;
3665 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3666 auto F = LoadMMO->getFlags() &
3667 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3668 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3669 Size, LoadMMO->getBaseAlign());
3670 MachineMemOperand *StoreMMO =
3671 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3672 sizeof(int32_t), Align(4));
3673
3674 MIB.setMemRefs({LoadMMO, StoreMMO});
3675
3676 MI.eraseFromParent();
3677 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3678}
3679
3680bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3681 MachineInstr &MI) const {
3682 unsigned OpcodeOpIdx =
3683 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3684 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3685 MI.removeOperand(OpcodeOpIdx);
3686 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3687 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3688}
3689
3690 // FIXME: This should be removed and the patterns should do the selecting. We
3691 // just need the AGPR/VGPR combination versions.
3692bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3693 unsigned Opc;
3694 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3695 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3696 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3697 break;
3698 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3699 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3700 break;
3701 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3702 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3703 break;
3704 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3705 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3706 break;
3707 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3708 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3709 break;
3710 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3711 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3712 break;
3713 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3714 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3715 break;
3716 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3717 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3718 break;
3719 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3720 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3721 break;
3722 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3723 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3724 break;
3725 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3726 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3727 break;
3728 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3729 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3730 break;
3731 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3732 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3733 break;
3734 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3735 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3736 break;
3737 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3738 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3739 break;
3740 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3741 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3742 break;
3743 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3744 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3745 break;
3746 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3747 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3748 break;
3749 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3750 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3751 break;
3752 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3753 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3754 break;
3755 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3756 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3757 break;
3758 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3759 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3760 break;
3761 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3762 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3763 break;
3764 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3765 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3766 break;
3767 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3768 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3769 break;
3770 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3771 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3772 break;
3773 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3774 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3775 break;
3776 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3777 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3778 break;
3779 default:
3780 llvm_unreachable("unhandled smfmac intrinsic");
3781 }
3782
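// Rewrite the intrinsic into the selected _e64 pseudo: drop the intrinsic ID
// operand and move VDst_In to the end, where the pseudo expects its
// accumulator input.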
3783 auto VDst_In = MI.getOperand(4);
3784
3785 MI.setDesc(TII.get(Opc));
3786 MI.removeOperand(4); // VDst_In
3787 MI.removeOperand(1); // Intrinsic ID
3788 MI.addOperand(VDst_In); // Re-add VDst_In to the end
3789 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3790 return true;
3791}
3792
3793bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3794 MachineInstr &MI, Intrinsic::ID IntrID) const {
3795 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3796 !Subtarget->hasPermlane16Swap())
3797 return false;
3798 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3799 !Subtarget->hasPermlane32Swap())
3800 return false;
3801
3802 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3803 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3804 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3805
3806 MI.removeOperand(2);
3807 MI.setDesc(TII.get(Opcode));
3808 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3809
3810 MachineOperand &FI = MI.getOperand(4);
3812
3813 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3814}
3815
3816bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3817 Register DstReg = MI.getOperand(0).getReg();
3818 Register SrcReg = MI.getOperand(1).getReg();
3819 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3820 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3821 MachineBasicBlock *MBB = MI.getParent();
3822 const DebugLoc &DL = MI.getDebugLoc();
3823
3824 if (IsVALU) {
3825 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3826 .addImm(Subtarget->getWavefrontSizeLog2())
3827 .addReg(SrcReg);
3828 } else {
3829 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3830 .addReg(SrcReg)
3831 .addImm(Subtarget->getWavefrontSizeLog2())
3832 .setOperandDead(3); // Dead scc
3833 }
3834
3835 const TargetRegisterClass &RC =
3836 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3837 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3838 return false;
3839
3840 MI.eraseFromParent();
3841 return true;
3842}
3843
3844 // Match a BITOP3 operation and return the number of matched instructions plus
3845 // the truth table.
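// The three possible sources map to the constant masks 0xf0, 0xcc and 0xaa
// (the columns of a 3-input truth table). Worked example (illustrative): for
// (a & b) | c with Src = [a, b, c], the AND yields 0xf0 & 0xcc = 0xc0 and the
// final table is 0xc0 | 0xaa = 0xea, so BitOp3_Op returns {2, 0xea}.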
3846 static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3847 SmallVectorImpl<Register> &Src,
3848 const MachineRegisterInfo &MRI) {
3849 unsigned NumOpcodes = 0;
3850 uint8_t LHSBits, RHSBits;
3851
3852 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3853 // Define truth table given Src0, Src1, Src2 bits permutations:
3854 // 0 0 0
3855 // 0 0 1
3856 // 0 1 0
3857 // 0 1 1
3858 // 1 0 0
3859 // 1 0 1
3860 // 1 1 0
3861 // 1 1 1
3862 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3863
3864 if (mi_match(Op, MRI, m_AllOnesInt())) {
3865 Bits = 0xff;
3866 return true;
3867 }
3868 if (mi_match(Op, MRI, m_ZeroInt())) {
3869 Bits = 0;
3870 return true;
3871 }
3872
3873 for (unsigned I = 0; I < Src.size(); ++I) {
3874 // Try to find existing reused operand
3875 if (Src[I] == Op) {
3876 Bits = SrcBits[I];
3877 return true;
3878 }
3879 // Try to replace parent operator
3880 if (Src[I] == R) {
3881 Bits = SrcBits[I];
3882 Src[I] = Op;
3883 return true;
3884 }
3885 }
3886
3887 if (Src.size() == 3) {
3888 // No room left for operands. Try one last time; there can be a 'not' of
3889 // one of our source operands. In this case we can compute the bits
3890 // without growing the Src vector.
3891 Register LHS;
3892 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3893 LHS = getSrcRegIgnoringCopies(LHS, MRI);
3894 for (unsigned I = 0; I < Src.size(); ++I) {
3895 if (Src[I] == LHS) {
3896 Bits = ~SrcBits[I];
3897 return true;
3898 }
3899 }
3900 }
3901
3902 return false;
3903 }
3904
3905 Bits = SrcBits[Src.size()];
3906 Src.push_back(Op);
3907 return true;
3908 };
3909
3910 MachineInstr *MI = MRI.getVRegDef(R);
3911 switch (MI->getOpcode()) {
3912 case TargetOpcode::G_AND:
3913 case TargetOpcode::G_OR:
3914 case TargetOpcode::G_XOR: {
3915 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3916 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3917
3918 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3919 if (!getOperandBits(LHS, LHSBits) ||
3920 !getOperandBits(RHS, RHSBits)) {
3921 Src = Backup;
3922 return std::make_pair(0, 0);
3923 }
3924
3925 // Recursion is naturally limited by the size of the operand vector.
3926 auto Op = BitOp3_Op(LHS, Src, MRI);
3927 if (Op.first) {
3928 NumOpcodes += Op.first;
3929 LHSBits = Op.second;
3930 }
3931
3932 Op = BitOp3_Op(RHS, Src, MRI);
3933 if (Op.first) {
3934 NumOpcodes += Op.first;
3935 RHSBits = Op.second;
3936 }
3937 break;
3938 }
3939 default:
3940 return std::make_pair(0, 0);
3941 }
3942
3943 uint8_t TTbl;
3944 switch (MI->getOpcode()) {
3945 case TargetOpcode::G_AND:
3946 TTbl = LHSBits & RHSBits;
3947 break;
3948 case TargetOpcode::G_OR:
3949 TTbl = LHSBits | RHSBits;
3950 break;
3951 case TargetOpcode::G_XOR:
3952 TTbl = LHSBits ^ RHSBits;
3953 break;
3954 default:
3955 break;
3956 }
3957
3958 return std::make_pair(NumOpcodes + 1, TTbl);
3959}
3960
3961bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
3962 if (!Subtarget->hasBitOp3Insts())
3963 return false;
3964
3965 Register DstReg = MI.getOperand(0).getReg();
3966 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3967 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3968 if (!IsVALU)
3969 return false;
3970
3971 SmallVector<Register, 3> Src;
3972 uint8_t TTbl;
3973 unsigned NumOpcodes;
3974
3975 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
3976
3977 // The Src.empty() case can happen if all operands are constant zeros or ones.
3978 // Normally this is optimized out before reaching this point.
3979 if (NumOpcodes < 2 || Src.empty())
3980 return false;
3981
3982 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
3983 if (NumOpcodes == 2 && IsB32) {
3984 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster, but it makes
3985 // the asm more readable. This cannot be modeled with AddedComplexity because
3986 // the selector does not know how many operations we matched.
3987 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
3988 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
3989 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
3990 return false;
3991 } else if (NumOpcodes < 4) {
3992 // For a uniform case the threshold should be higher to account for moves
3993 // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two can
3994 // be in SGPRs, with a readfirstlane afterwards.
3995 return false;
3996 }
3997
3998 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
3999 if (!IsB32 && STI.hasTrue16BitInsts())
4000 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4001 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4002 unsigned CBL = STI.getConstantBusLimit(Opc);
4003 MachineBasicBlock *MBB = MI.getParent();
4004 const DebugLoc &DL = MI.getDebugLoc();
4005
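// V_BITOP3 can only read a limited number of scalar operands directly (the
// constant bus limit); any SGPR sources beyond that limit are copied into
// VGPRs first.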
4006 for (unsigned I = 0; I < Src.size(); ++I) {
4007 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4008 if (RB->getID() != AMDGPU::SGPRRegBankID)
4009 continue;
4010 if (CBL > 0) {
4011 --CBL;
4012 continue;
4013 }
4014 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4015 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4016 .addReg(Src[I]);
4017 Src[I] = NewReg;
4018 }
4019
4020 // The last operand can be ignored, turning a ternary operation into a binary
4021 // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4022 // 'c' with 'a' here without changing the answer. In some pathological cases
4023 // it should even be possible to end up with a single-operand operation if the
4024 // optimizer does not catch it.
4025 while (Src.size() < 3)
4026 Src.push_back(Src[0]);
4027
4028 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4029 if (!IsB32)
4030 MIB.addImm(0); // src_mod0
4031 MIB.addReg(Src[0]);
4032 if (!IsB32)
4033 MIB.addImm(0); // src_mod1
4034 MIB.addReg(Src[1]);
4035 if (!IsB32)
4036 MIB.addImm(0); // src_mod2
4037 MIB.addReg(Src[2])
4038 .addImm(TTbl);
4039 if (!IsB32)
4040 MIB.addImm(0); // op_sel
4041
4042 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4043 MI.eraseFromParent();
4044
4045 return true;
4046}
4047
4048bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4049 Register SrcReg = MI.getOperand(0).getReg();
4050 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4051 return false;
4052
4053 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4054 Register SP =
4055 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4056 Register WaveAddr = getWaveAddress(DefMI);
4057 MachineBasicBlock *MBB = MI.getParent();
4058 const DebugLoc &DL = MI.getDebugLoc();
4059
4060 if (!WaveAddr) {
4061 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4062 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4063 .addReg(SrcReg)
4064 .addImm(Subtarget->getWavefrontSizeLog2())
4065 .setOperandDead(3); // Dead scc
4066 }
4067
4068 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4069 .addReg(WaveAddr);
4070
4071 MI.eraseFromParent();
4072 return true;
4073}
4074
4075 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4076
4077 if (!I.isPreISelOpcode()) {
4078 if (I.isCopy())
4079 return selectCOPY(I);
4080 return true;
4081 }
4082
4083 switch (I.getOpcode()) {
4084 case TargetOpcode::G_AND:
4085 case TargetOpcode::G_OR:
4086 case TargetOpcode::G_XOR:
4087 if (selectBITOP3(I))
4088 return true;
4089 if (selectImpl(I, *CoverageInfo))
4090 return true;
4091 return selectG_AND_OR_XOR(I);
4092 case TargetOpcode::G_ADD:
4093 case TargetOpcode::G_SUB:
4094 case TargetOpcode::G_PTR_ADD:
4095 if (selectImpl(I, *CoverageInfo))
4096 return true;
4097 return selectG_ADD_SUB(I);
4098 case TargetOpcode::G_UADDO:
4099 case TargetOpcode::G_USUBO:
4100 case TargetOpcode::G_UADDE:
4101 case TargetOpcode::G_USUBE:
4102 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4103 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4104 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4105 return selectG_AMDGPU_MAD_64_32(I);
4106 case TargetOpcode::G_INTTOPTR:
4107 case TargetOpcode::G_BITCAST:
4108 case TargetOpcode::G_PTRTOINT:
4109 case TargetOpcode::G_FREEZE:
4110 return selectCOPY(I);
4111 case TargetOpcode::G_FNEG:
4112 if (selectImpl(I, *CoverageInfo))
4113 return true;
4114 return selectG_FNEG(I);
4115 case TargetOpcode::G_FABS:
4116 if (selectImpl(I, *CoverageInfo))
4117 return true;
4118 return selectG_FABS(I);
4119 case TargetOpcode::G_EXTRACT:
4120 return selectG_EXTRACT(I);
4121 case TargetOpcode::G_MERGE_VALUES:
4122 case TargetOpcode::G_CONCAT_VECTORS:
4123 return selectG_MERGE_VALUES(I);
4124 case TargetOpcode::G_UNMERGE_VALUES:
4125 return selectG_UNMERGE_VALUES(I);
4126 case TargetOpcode::G_BUILD_VECTOR:
4127 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4128 return selectG_BUILD_VECTOR(I);
4129 case TargetOpcode::G_IMPLICIT_DEF:
4130 return selectG_IMPLICIT_DEF(I);
4131 case TargetOpcode::G_INSERT:
4132 return selectG_INSERT(I);
4133 case TargetOpcode::G_INTRINSIC:
4134 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4135 return selectG_INTRINSIC(I);
4136 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4137 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4138 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4139 case TargetOpcode::G_ICMP:
4140 case TargetOpcode::G_FCMP:
4141 if (selectG_ICMP_or_FCMP(I))
4142 return true;
4143 return selectImpl(I, *CoverageInfo);
4144 case TargetOpcode::G_LOAD:
4145 case TargetOpcode::G_ZEXTLOAD:
4146 case TargetOpcode::G_SEXTLOAD:
4147 case TargetOpcode::G_STORE:
4148 case TargetOpcode::G_ATOMIC_CMPXCHG:
4149 case TargetOpcode::G_ATOMICRMW_XCHG:
4150 case TargetOpcode::G_ATOMICRMW_ADD:
4151 case TargetOpcode::G_ATOMICRMW_SUB:
4152 case TargetOpcode::G_ATOMICRMW_AND:
4153 case TargetOpcode::G_ATOMICRMW_OR:
4154 case TargetOpcode::G_ATOMICRMW_XOR:
4155 case TargetOpcode::G_ATOMICRMW_MIN:
4156 case TargetOpcode::G_ATOMICRMW_MAX:
4157 case TargetOpcode::G_ATOMICRMW_UMIN:
4158 case TargetOpcode::G_ATOMICRMW_UMAX:
4159 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4160 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4161 case TargetOpcode::G_ATOMICRMW_FADD:
4162 case TargetOpcode::G_ATOMICRMW_FMIN:
4163 case TargetOpcode::G_ATOMICRMW_FMAX:
4164 return selectG_LOAD_STORE_ATOMICRMW(I);
4165 case TargetOpcode::G_SELECT:
4166 return selectG_SELECT(I);
4167 case TargetOpcode::G_TRUNC:
4168 return selectG_TRUNC(I);
4169 case TargetOpcode::G_SEXT:
4170 case TargetOpcode::G_ZEXT:
4171 case TargetOpcode::G_ANYEXT:
4172 case TargetOpcode::G_SEXT_INREG:
4173 // This is a workaround. For extension from type i1, `selectImpl()` uses
4174 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
4175 // type i1 can only be held in an SGPR class.
4176 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4177 selectImpl(I, *CoverageInfo))
4178 return true;
4179 return selectG_SZA_EXT(I);
4180 case TargetOpcode::G_FPEXT:
4181 if (selectG_FPEXT(I))
4182 return true;
4183 return selectImpl(I, *CoverageInfo);
4184 case TargetOpcode::G_BRCOND:
4185 return selectG_BRCOND(I);
4186 case TargetOpcode::G_GLOBAL_VALUE:
4187 return selectG_GLOBAL_VALUE(I);
4188 case TargetOpcode::G_PTRMASK:
4189 return selectG_PTRMASK(I);
4190 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4191 return selectG_EXTRACT_VECTOR_ELT(I);
4192 case TargetOpcode::G_INSERT_VECTOR_ELT:
4193 return selectG_INSERT_VECTOR_ELT(I);
4194 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4195 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4196 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4197 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4198 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4199 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4201 assert(Intr && "not an image intrinsic with image pseudo");
4202 return selectImageIntrinsic(I, Intr);
4203 }
4204 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4205 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4206 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4207 return selectBVHIntersectRayIntrinsic(I);
4208 case AMDGPU::G_SBFX:
4209 case AMDGPU::G_UBFX:
4210 return selectG_SBFX_UBFX(I);
4211 case AMDGPU::G_SI_CALL:
4212 I.setDesc(TII.get(AMDGPU::SI_CALL));
4213 return true;
4214 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4215 return selectWaveAddress(I);
4216 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4217 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4218 return true;
4219 }
4220 case AMDGPU::G_STACKRESTORE:
4221 return selectStackRestore(I);
4222 case AMDGPU::G_PHI:
4223 return selectPHI(I);
4224 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4225 return selectCOPY_SCC_VCC(I);
4226 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4227 return selectCOPY_VCC_SCC(I);
4228 case AMDGPU::G_AMDGPU_READANYLANE:
4229 return selectReadAnyLane(I);
4230 case TargetOpcode::G_CONSTANT:
4231 case TargetOpcode::G_FCONSTANT:
4232 default:
4233 return selectImpl(I, *CoverageInfo);
4234 }
4235 return false;
4236}
4237
4238 InstructionSelector::ComplexRendererFns
4239 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4240 return {{
4241 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4242 }};
4243
4244}
4245
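// Fold source modifiers for a plain VOP3 operand: a G_FNEG (or, when
// canonicalizing, an fsub from +/-0) becomes the NEG modifier and a G_FABS
// becomes ABS. Returns the stripped source register plus the accumulated
// SISrcMods bits.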
4246std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4247 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4248 unsigned Mods = 0;
4249 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4250
4251 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4252 Src = MI->getOperand(1).getReg();
4253 Mods |= SISrcMods::NEG;
4254 MI = getDefIgnoringCopies(Src, *MRI);
4255 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4256 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4257 // denormal mode, but we're implicitly canonicalizing in a source operand.
4258 const ConstantFP *LHS =
4259 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4260 if (LHS && LHS->isZero()) {
4261 Mods |= SISrcMods::NEG;
4262 Src = MI->getOperand(2).getReg();
4263 }
4264 }
4265
4266 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4267 Src = MI->getOperand(1).getReg();
4268 Mods |= SISrcMods::ABS;
4269 }
4270
4271 if (OpSel)
4272 Mods |= SISrcMods::OP_SEL_0;
4273
4274 return std::pair(Src, Mods);
4275}
4276
4277Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4278 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4279 bool ForceVGPR) const {
4280 if ((Mods != 0 || ForceVGPR) &&
4281 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4282
4283 // If we looked through copies to find source modifiers on an SGPR operand,
4284 // we now have an SGPR register source. To avoid potentially violating the
4285 // constant bus restriction, we need to insert a copy to a VGPR.
4286 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4287 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4288 TII.get(AMDGPU::COPY), VGPRSrc)
4289 .addReg(Src);
4290 Src = VGPRSrc;
4291 }
4292
4293 return Src;
4294}
4295
4296///
4297/// This will select either an SGPR or VGPR operand and will save us from
4298/// having to write an extra tablegen pattern.
4299 InstructionSelector::ComplexRendererFns
4300 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4301 return {{
4302 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4303 }};
4304}
4305
4306 InstructionSelector::ComplexRendererFns
4307 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4308 Register Src;
4309 unsigned Mods;
4310 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4311
4312 return {{
4313 [=](MachineInstrBuilder &MIB) {
4314 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4315 },
4316 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4317 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4318 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4319 }};
4320}
4321
4322 InstructionSelector::ComplexRendererFns
4323 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4324 Register Src;
4325 unsigned Mods;
4326 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4327 /*IsCanonicalizing=*/true,
4328 /*AllowAbs=*/false);
4329
4330 return {{
4331 [=](MachineInstrBuilder &MIB) {
4332 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4333 },
4334 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4335 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4336 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4337 }};
4338}
4339
4340 InstructionSelector::ComplexRendererFns
4341 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4342 return {{
4343 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4344 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4345 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4346 }};
4347}
4348
4349 InstructionSelector::ComplexRendererFns
4350 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4351 Register Src;
4352 unsigned Mods;
4353 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4354
4355 return {{
4356 [=](MachineInstrBuilder &MIB) {
4357 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4358 },
4359 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4360 }};
4361}
4362
4363 InstructionSelector::ComplexRendererFns
4364 AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4365 MachineOperand &Root) const {
4366 Register Src;
4367 unsigned Mods;
4368 std::tie(Src, Mods) =
4369 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4370
4371 return {{
4372 [=](MachineInstrBuilder &MIB) {
4373 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4374 },
4375 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4376 }};
4377}
4378
4379 InstructionSelector::ComplexRendererFns
4380 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4381 Register Src;
4382 unsigned Mods;
4383 std::tie(Src, Mods) =
4384 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4385 /*AllowAbs=*/false);
4386
4387 return {{
4388 [=](MachineInstrBuilder &MIB) {
4389 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4390 },
4391 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4392 }};
4393}
4394
4395 InstructionSelector::ComplexRendererFns
4396 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4397 Register Reg = Root.getReg();
4398 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4399 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4400 return {};
4401 return {{
4402 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4403 }};
4404}
4405
4406enum class SrcStatus {
4411 // This means current op = [op_upper, op_lower] and src = -op_lower.
4414 // This means current op = [op_upper, op_lower] and src = [op_upper,
4415 // -op_lower].
4423};
4424/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4425static bool isTruncHalf(const MachineInstr *MI,
4426 const MachineRegisterInfo &MRI) {
4427 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4428 return false;
4429
4430 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4431 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4432 return DstSize * 2 == SrcSize;
4433}
4434
4435 /// Test if the MI is a logical shift right by half the bit width,
4436 /// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`
4437static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4438 if (MI->getOpcode() != AMDGPU::G_LSHR)
4439 return false;
4440
4441 Register ShiftSrc;
4442 std::optional<ValueAndVReg> ShiftAmt;
4443 if (mi_match(MI->getOperand(0).getReg(), MRI,
4444 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4445 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4446 unsigned Shift = ShiftAmt->Value.getZExtValue();
4447 return Shift * 2 == SrcSize;
4448 }
4449 return false;
4450}
4451
4452 /// Test if the MI is a shift left by half the bit width,
4453 /// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`
4454static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4455 if (MI->getOpcode() != AMDGPU::G_SHL)
4456 return false;
4457
4458 Register ShiftSrc;
4459 std::optional<ValueAndVReg> ShiftAmt;
4460 if (mi_match(MI->getOperand(0).getReg(), MRI,
4461 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4462 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4463 unsigned Shift = ShiftAmt->Value.getZExtValue();
4464 return Shift * 2 == SrcSize;
4465 }
4466 return false;
4467}
4468
4469 /// Test if the MI is of the form `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4470static bool isUnmergeHalf(const MachineInstr *MI,
4471 const MachineRegisterInfo &MRI) {
4472 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4473 return false;
4474 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4475 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4476}
4477
4479
4481 const MachineRegisterInfo &MRI) {
4482 LLT OpTy = MRI.getType(Reg);
4483 if (OpTy.isScalar())
4484 return TypeClass::SCALAR;
4485 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4488}
4489
4491 const MachineRegisterInfo &MRI) {
4493 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4494 return SrcStatus::INVALID;
4495
4496 switch (S) {
4497 case SrcStatus::IS_SAME:
4498 if (NegType == TypeClass::VECTOR_OF_TWO) {
4499 // Vector of 2:
4500 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4501 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4502 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4503 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4505 }
4506 if (NegType == TypeClass::SCALAR) {
4507 // Scalar:
4508 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4509 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4510 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4511 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4512 return SrcStatus::IS_HI_NEG;
4513 }
4514 break;
4516 if (NegType == TypeClass::VECTOR_OF_TWO) {
4517 // Vector of 2:
4518 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4519 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4520 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4521 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4522 return SrcStatus::IS_LO_NEG;
4523 }
4524 if (NegType == TypeClass::SCALAR) {
4525 // Scalar:
4526 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4527 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4528 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4529 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4530 return SrcStatus::IS_SAME;
4531 }
4532 break;
4534 if (NegType == TypeClass::VECTOR_OF_TWO) {
4535 // Vector of 2:
4536 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4537 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4538 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4539 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4540 return SrcStatus::IS_HI_NEG;
4541 }
4542 if (NegType == TypeClass::SCALAR) {
4543 // Scalar:
4544 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4545 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4546 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4547 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4549 }
4550 break;
4552 if (NegType == TypeClass::VECTOR_OF_TWO) {
4553 // Vector of 2:
4554 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4555 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4556 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4557 // [SrcHi, SrcLo] = [OpHi, OpLo]
4558 return SrcStatus::IS_SAME;
4559 }
4560 if (NegType == TypeClass::SCALAR) {
4561 // Scalar:
4562 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4563 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4564 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4565 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4566 return SrcStatus::IS_LO_NEG;
4567 }
4568 break;
4570 // Vector of 2:
4571 // Src = CurrUpper
4572 // Curr = [CurrUpper, CurrLower]
4573 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4574 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4575 // Src = -OpUpper
4576 //
4577 // Scalar:
4578 // Src = CurrUpper
4579 // Curr = [CurrUpper, CurrLower]
4580 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4581 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4582 // Src = -OpUpper
4585 if (NegType == TypeClass::VECTOR_OF_TWO) {
4586 // Vector of 2:
4587 // Src = CurrLower
4588 // Curr = [CurrUpper, CurrLower]
4589 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4590 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4591 // Src = -OpLower
4593 }
4594 if (NegType == TypeClass::SCALAR) {
4595 // Scalar:
4596 // Src = CurrLower
4597 // Curr = [CurrUpper, CurrLower]
4598 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4599 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4600 // Src = OpLower
4602 }
4603 break;
4605 // Vector of 2:
4606 // Src = -CurrUpper
4607 // Curr = [CurrUpper, CurrLower]
4608 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4609 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4610 // Src = -(-OpUpper) = OpUpper
4611 //
4612 // Scalar:
4613 // Src = -CurrUpper
4614 // Curr = [CurrUpper, CurrLower]
4615 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4616 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4617 // Src = -(-OpUpper) = OpUpper
4620 if (NegType == TypeClass::VECTOR_OF_TWO) {
4621 // Vector of 2:
4622 // Src = -CurrLower
4623 // Curr = [CurrUpper, CurrLower]
4624 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4625 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4626 // Src = -(-OpLower) = OpLower
4628 }
4629 if (NegType == TypeClass::SCALAR) {
4630 // Scalar:
4631 // Src = -CurrLower
4632 // Curr = [CurrUpper, CurrLower]
4633 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4634 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4635 // Src = -OpLower
4637 }
4638 break;
4639 default:
4640 break;
4641 }
4642 llvm_unreachable("unexpected SrcStatus & NegType combination");
4643}
4644
4645static std::optional<std::pair<Register, SrcStatus>>
4646calcNextStatus(std::pair<Register, SrcStatus> Curr,
4647 const MachineRegisterInfo &MRI) {
4648 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4649
4650 unsigned Opc = MI->getOpcode();
4651
4652 // Handle general Opc cases.
4653 switch (Opc) {
4654 case AMDGPU::G_BITCAST:
4655 return std::optional<std::pair<Register, SrcStatus>>(
4656 {MI->getOperand(1).getReg(), Curr.second});
4657 case AMDGPU::COPY:
4658 if (MI->getOperand(1).getReg().isPhysical())
4659 return std::nullopt;
4660 return std::optional<std::pair<Register, SrcStatus>>(
4661 {MI->getOperand(1).getReg(), Curr.second});
4662 case AMDGPU::G_FNEG: {
4663 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4664 if (Stat == SrcStatus::INVALID)
4665 return std::nullopt;
4666 return std::optional<std::pair<Register, SrcStatus>>(
4667 {MI->getOperand(1).getReg(), Stat});
4668 }
4669 default:
4670 break;
4671 }
4672
4673 // Calc next Stat from current Stat.
4674 switch (Curr.second) {
4675 case SrcStatus::IS_SAME:
4676 if (isTruncHalf(MI, MRI))
4677 return std::optional<std::pair<Register, SrcStatus>>(
4678 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4679 else if (isUnmergeHalf(MI, MRI)) {
4680 if (Curr.first == MI->getOperand(0).getReg())
4681 return std::optional<std::pair<Register, SrcStatus>>(
4682 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4683 return std::optional<std::pair<Register, SrcStatus>>(
4684 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4685 }
4686 break;
4688 if (isTruncHalf(MI, MRI)) {
4689 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4690 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4691 // = [OpLowerHi, OpLowerLo]
4692 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4693 // = [-OpLowerHi, OpLowerLo]
4694 // = -OpLower
4695 return std::optional<std::pair<Register, SrcStatus>>(
4696 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4697 }
4698 if (isUnmergeHalf(MI, MRI)) {
4699 if (Curr.first == MI->getOperand(0).getReg())
4700 return std::optional<std::pair<Register, SrcStatus>>(
4701 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4702 return std::optional<std::pair<Register, SrcStatus>>(
4703 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4704 }
4705 break;
4707 if (isShlHalf(MI, MRI))
4708 return std::optional<std::pair<Register, SrcStatus>>(
4709 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4710 break;
4712 if (isLshrHalf(MI, MRI))
4713 return std::optional<std::pair<Register, SrcStatus>>(
4714 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
4715 break;
4717 if (isShlHalf(MI, MRI))
4718 return std::optional<std::pair<Register, SrcStatus>>(
4719 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4720 break;
4722 if (isLshrHalf(MI, MRI))
4723 return std::optional<std::pair<Register, SrcStatus>>(
4724 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4725 break;
4726 default:
4727 break;
4728 }
4729 return std::nullopt;
4730}
4731
4732 /// This is used to control which source statuses the current MI supports. For
4733 /// example, a non-floating-point intrinsic such as @llvm.amdgcn.sdot2 does not
4734 /// support the NEG bit on VOP3P.
4735 /// The class can be further extended to recognize support for the SEL, NEG and
4736 /// ABS bits for different MIs on different architectures.
4738private:
4739 bool HasNeg = false;
4740 // Assume all VOP3P complex patterns have op_sel.
4741 bool HasOpsel = true;
4742
4743public:
4744 SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
4745 const MachineInstr *MI = MRI.getVRegDef(Reg);
4746 unsigned Opc = MI->getOpcode();
4747
4748 if (Opc < TargetOpcode::GENERIC_OP_END) {
4749 // Keep same for generic op.
4750 HasNeg = true;
4751 } else if (Opc == TargetOpcode::G_INTRINSIC) {
4752 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
4753 // Only floating-point intrinsics have the neg & neg_hi bits.
4754 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
4755 HasNeg = true;
4756 }
4757 }
4758 bool checkOptions(SrcStatus Stat) const {
4759 if (!HasNeg &&
4760 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
4761 return false;
4762 }
4763 if (!HasOpsel &&
4764 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
4765 return false;
4766 }
4767 return true;
4768 }
4769};
4770
4773 int MaxDepth = 3) {
4774 int Depth = 0;
4775 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
4777
4778 while (Depth <= MaxDepth && Curr.has_value()) {
4779 Depth++;
4780 if (SO.checkOptions(Curr.value().second))
4781 Statlist.push_back(Curr.value());
4782 Curr = calcNextStatus(Curr.value(), MRI);
4783 }
4784
4785 return Statlist;
4786}
4787
4788static std::pair<Register, SrcStatus>
4790 int MaxDepth = 3) {
4791 int Depth = 0;
4792 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
4793 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
4794
4795 while (Depth <= MaxDepth && Curr.has_value()) {
4796 Depth++;
4797 SrcStatus Stat = Curr.value().second;
4798 if (SO.checkOptions(Stat)) {
4799 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
4800 Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
4801 LastSameOrNeg = Curr.value();
4802 }
4803 Curr = calcNextStatus(Curr.value(), MRI);
4804 }
4805
4806 return LastSameOrNeg;
4807}
4808
4809static bool isSameBitWidth(Register Reg1, Register Reg2,
4810 const MachineRegisterInfo &MRI) {
4811 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
4812 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
4813 return Width1 == Width2;
4814}
4815
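// Map the matched status of each 16-bit half onto VOP3P modifier bits.
// Illustrative example: HiStat == IS_UPPER_HALF with LoStat == IS_LOWER_HALF
// produces only OP_SEL_1, i.e. the default "hi from hi, lo from lo" selection
// with no negation.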
4816static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
4817 // SrcStatus::IS_LOWER_HALF remains 0.
4818 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
4819 Mods ^= SISrcMods::NEG_HI;
4820 Mods |= SISrcMods::OP_SEL_1;
4821 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
4822 Mods |= SISrcMods::OP_SEL_1;
4823 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
4824 Mods ^= SISrcMods::NEG_HI;
4825 else if (HiStat == SrcStatus::IS_HI_NEG)
4826 Mods ^= SISrcMods::NEG_HI;
4827
4828 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
4829 Mods ^= SISrcMods::NEG;
4830 Mods |= SISrcMods::OP_SEL_0;
4831 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
4832 Mods |= SISrcMods::OP_SEL_0;
4833 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
4834 Mods |= SISrcMods::NEG;
4835 else if (LoStat == SrcStatus::IS_HI_NEG)
4836 Mods ^= SISrcMods::NEG;
4837
4838 return Mods;
4839}
4840
4841static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
4842 Register RootReg, const SIInstrInfo &TII,
4843 const MachineRegisterInfo &MRI) {
4844 auto IsHalfState = [](SrcStatus S) {
4845 return S == SrcStatus::IS_UPPER_HALF || S == SrcStatus::IS_UPPER_HALF_NEG ||
4846 S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
4847 };
4848 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
4849 IsHalfState(HiStat);
4850}
4851
4852std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
4853 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
4854 unsigned Mods = 0;
4855 // No modifiers if the Root type is not of the form <2 x Type>.
4856 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
4857 Mods |= SISrcMods::OP_SEL_1;
4858 return {RootReg, Mods};
4859 }
4860
4861 SearchOptions SO(RootReg, MRI);
4862
4863 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
4864
4865 if (Stat.second == SrcStatus::IS_BOTH_NEG)
4866 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4867 else if (Stat.second == SrcStatus::IS_HI_NEG)
4868 Mods ^= SISrcMods::NEG_HI;
4869 else if (Stat.second == SrcStatus::IS_LO_NEG)
4870 Mods ^= SISrcMods::NEG;
4871
4872 MachineInstr *MI = MRI.getVRegDef(Stat.first);
4873
4874 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
4875 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
4876 Mods |= SISrcMods::OP_SEL_1;
4877 return {Stat.first, Mods};
4878 }
4879
4881 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
4882
4883 if (StatlistHi.empty()) {
4884 Mods |= SISrcMods::OP_SEL_1;
4885 return {Stat.first, Mods};
4886 }
4887
4889 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
4890
4891 if (StatlistLo.empty()) {
4892 Mods |= SISrcMods::OP_SEL_1;
4893 return {Stat.first, Mods};
4894 }
4895
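// Walk both def chains from the deepest matched node outwards and take the
// first register that can supply both halves; updateMods() then encodes the
// matched half/neg statuses as op_sel/neg modifier bits.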
4896 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
4897 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
4898 if (StatlistHi[I].first == StatlistLo[J].first &&
4899 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
4900 StatlistHi[I].first, RootReg, TII, MRI))
4901 return {StatlistHi[I].first,
4902 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
4903 }
4904 }
4905 // Packed instructions do not have abs modifiers.
4906 Mods |= SISrcMods::OP_SEL_1;
4907
4908 return {Stat.first, Mods};
4909}
4910
4912
4913static bool checkRB(Register Reg, unsigned int RBNo,
4914 const AMDGPURegisterBankInfo &RBI,
4915 const MachineRegisterInfo &MRI,
4916 const TargetRegisterInfo &TRI) {
4917 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
4918 return RB->getID() == RBNo;
4919}
4920
4921// This function is used to get the correct register bank for returned reg.
4922// Assume:
4923// 1. VOP3P is always legal for VGPR.
4924// 2. RootOp's regbank is legal.
4925// Thus
4926// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
4927// 2. If RootOp is VGPR, then NewOp must be VGPR.
4928static Register getLegalRegBank(Register NewReg, Register RootReg,
4929 const AMDGPURegisterBankInfo &RBI,
4930 MachineRegisterInfo &MRI,
4931 const TargetRegisterInfo &TRI,
4932 const SIInstrInfo &TII) {
4933 // RootOp can only be VGPR or SGPR (some hand-written cases such as
4934 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
4935 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
4936 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
4937 return NewReg;
4938
4939 MachineInstr *MI = MRI.getVRegDef(RootReg);
4940 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
4941 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
4942 return RootReg;
4943 }
4944
4945 MachineBasicBlock *BB = MI->getParent();
4946 Register DstReg = MRI.cloneVirtualRegister(RootReg);
4947
4948 MachineInstrBuilder MIB =
4949 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
4950 .addReg(NewReg);
4951
4952 // Only accept VGPR.
4953 return MIB->getOperand(0).getReg();
4954}
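// Note: when RootReg is VGPR but NewReg is not (and RootReg is not simply a
// COPY of NewReg), the COPY built above moves NewReg into a register cloned
// from RootReg so the VOP3P operand remains legal on the VALU side.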
4955
4957AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
4958 bool IsDOT) const {
4959 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
4960 Register Reg;
4961 unsigned Mods;
4962 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
4963
4964 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
4965 return {{
4966 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4967 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4968 }};
4969}
4970
4972AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
4973
4974 return selectVOP3PRetHelper(Root);
4975}
4976
4978AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
4979
4980 return selectVOP3PRetHelper(Root, true);
4981}
4982
4984AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4985 MachineOperand &Root) const {
4986 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4987 "expected i1 value");
4988 unsigned Mods = SISrcMods::OP_SEL_1;
4989 if (Root.getImm() != 0)
4990 Mods |= SISrcMods::OP_SEL_0;
4991
4992 return {{
4993 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4994 }};
4995}
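// For illustration: an immediate of -1 (true) renders src_mods as
// OP_SEL_1 | OP_SEL_0, while 0 (false) renders OP_SEL_1 only.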
4996
4997static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
4998 MachineInstr *InsertPt,
4999 MachineRegisterInfo &MRI) {
5000 const TargetRegisterClass *DstRegClass;
5001 switch (Elts.size()) {
5002 case 8:
5003 DstRegClass = &AMDGPU::VReg_256RegClass;
5004 break;
5005 case 4:
5006 DstRegClass = &AMDGPU::VReg_128RegClass;
5007 break;
5008 case 2:
5009 DstRegClass = &AMDGPU::VReg_64RegClass;
5010 break;
5011 default:
5012 llvm_unreachable("unhandled Reg sequence size");
5013 }
5014
5015 MachineIRBuilder B(*InsertPt);
5016 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5017 .addDef(MRI.createVirtualRegister(DstRegClass));
5018 for (unsigned i = 0; i < Elts.size(); ++i) {
5019 MIB.addReg(Elts[i]);
5020 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
5021 }
5022 return MIB->getOperand(0).getReg();
5023}
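// E.g. (illustrative) eight elements form a VReg_256 REG_SEQUENCE, four a
// VReg_128, and two a VReg_64, with each element paired with its
// sub-register index.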
5024
5025static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5026 SmallVectorImpl<Register> &Elts, Register &Src,
5027 MachineInstr *InsertPt,
5028 MachineRegisterInfo &MRI) {
5029 if (ModOpcode == TargetOpcode::G_FNEG) {
5030 Mods |= SISrcMods::NEG;
5031 // Check if all elements also have abs modifier
5032 SmallVector<Register, 8> NegAbsElts;
5033 for (auto El : Elts) {
5034 Register FabsSrc;
5035 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5036 break;
5037 NegAbsElts.push_back(FabsSrc);
5038 }
5039 if (Elts.size() != NegAbsElts.size()) {
5040 // Neg
5041 Src = buildRegSequence(Elts, InsertPt, MRI);
5042 } else {
5043 // Neg and Abs
5044 Mods |= SISrcMods::NEG_HI;
5045 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5046 }
5047 } else {
5048 assert(ModOpcode == TargetOpcode::G_FABS);
5049 // Abs
5050 Mods |= SISrcMods::NEG_HI;
5051 Src = buildRegSequence(Elts, InsertPt, MRI);
5052 }
5053}
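// Rough example: if every element was fneg(fabs(x_i)), the fabs sources are
// re-packed and both NEG and NEG_HI are set; if only the fneg was common,
// only NEG is set and the fneg sources are re-packed as-is.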
5054
5056AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5057 Register Src = Root.getReg();
5058 unsigned Mods = SISrcMods::OP_SEL_1;
5059 SmallVector<Register, 8> EltsF32;
5060
5061 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5062 assert(BV->getNumSources() > 0);
5063 // Based on first element decide which mod we match, neg or abs
5064 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5065 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5066 ? AMDGPU::G_FNEG
5067 : AMDGPU::G_FABS;
5068 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5069 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5070 if (ElF32->getOpcode() != ModOpcode)
5071 break;
5072 EltsF32.push_back(ElF32->getOperand(1).getReg());
5073 }
5074
5075 // All elements had ModOpcode modifier
5076 if (BV->getNumSources() == EltsF32.size()) {
5077 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5078 *MRI);
5079 }
5080 }
5081
5082 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5083 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5084}
5085
5087AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5088 Register Src = Root.getReg();
5089 unsigned Mods = SISrcMods::OP_SEL_1;
5090 SmallVector<Register, 8> EltsV2F16;
5091
5092 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5093 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5094 Register FNegSrc;
5095 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5096 break;
5097 EltsV2F16.push_back(FNegSrc);
5098 }
5099
5100 // All elements had ModOpcode modifier
5101 if (CV->getNumSources() == EltsV2F16.size()) {
5102 Mods |= SISrcMods::NEG;
5103 Mods |= SISrcMods::NEG_HI;
5104 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5105 }
5106 }
5107
5108 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5109 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5110}
5111
5113AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5114 Register Src = Root.getReg();
5115 unsigned Mods = SISrcMods::OP_SEL_1;
5116 SmallVector<Register, 8> EltsV2F16;
5117
5118 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5119 assert(CV->getNumSources() > 0);
5120 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5121 // Based on first element decide which mod we match, neg or abs
5122 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5123 ? AMDGPU::G_FNEG
5124 : AMDGPU::G_FABS;
5125
5126 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5127 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5128 if (ElV2F16->getOpcode() != ModOpcode)
5129 break;
5130 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5131 }
5132
5133 // All elements had ModOpcode modifier
5134 if (CV->getNumSources() == EltsV2F16.size()) {
5135 MachineIRBuilder B(*Root.getParent());
5136 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5137 *MRI);
5138 }
5139 }
5140
5141 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5142 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5143}
5144
5146AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5147 std::optional<FPValueAndVReg> FPValReg;
5148 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5149 if (TII.isInlineConstant(FPValReg->Value)) {
5150 return {{[=](MachineInstrBuilder &MIB) {
5151 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5152 }}};
5153 }
5154 // Non-inlineable splat floats should not fall through to the integer immediate
5155 // checks.
5156 return {};
5157 }
5158
5159 APInt ICst;
5160 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5161 if (TII.isInlineConstant(ICst)) {
5162 return {
5163 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5164 }
5165 }
5166
5167 return {};
5168}
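// Rough behaviour: a splat of an inline-constant float or integer renders as
// a single immediate operand; a non-inlineable float splat intentionally
// fails instead of falling through to the integer check.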
5169
5171AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5172 Register Src =
5173 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5174 unsigned Key = 0;
5175
5176 Register ShiftSrc;
5177 std::optional<ValueAndVReg> ShiftAmt;
5178 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5179 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5180 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5181 Key = ShiftAmt->Value.getZExtValue() / 8;
5182 Src = ShiftSrc;
5183 }
5184
5185 return {{
5186 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5187 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5188 }};
5189}
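// E.g. (illustrative) Src = (lshr %x, 16) with a 32-bit %x selects %x and
// index_key = 2 (16 / 8); anything else keeps index_key = 0.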
5190
5192AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5193
5194 Register Src =
5195 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5196 unsigned Key = 0;
5197
5198 Register ShiftSrc;
5199 std::optional<ValueAndVReg> ShiftAmt;
5200 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5201 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5202 ShiftAmt->Value.getZExtValue() == 16) {
5203 Src = ShiftSrc;
5204 Key = 1;
5205 }
5206
5207 return {{
5208 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5209 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5210 }};
5211}
5212
5214AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5215 Register Src =
5216 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5217 unsigned Key = 0;
5218
5219 Register S32 = matchZeroExtendFromS32(Src);
5220 if (!S32)
5221 S32 = matchAnyExtendFromS32(Src);
5222
5223 if (S32) {
5224 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5225 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5226 assert(Def->getNumOperands() == 3);
5227 Register DstReg1 = Def->getOperand(1).getReg();
5228 if (mi_match(S32, *MRI,
5229 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5230 Src = Def->getOperand(2).getReg();
5231 Key = 1;
5232 }
5233 }
5234 }
5235
5236 return {{
5237 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5238 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5239 }};
5240}
5241
5243AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5244 Register Src;
5245 unsigned Mods;
5246 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5247
5248 // FIXME: Handle op_sel
5249 return {{
5250 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5251 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5252 }};
5253}
5254
5255// FIXME-TRUE16 remove when fake16 is removed
5257AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5258 Register Src;
5259 unsigned Mods;
5260 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5261 /*IsCanonicalizing=*/true,
5262 /*AllowAbs=*/false,
5263 /*OpSel=*/false);
5264
5265 return {{
5266 [=](MachineInstrBuilder &MIB) {
5267 MIB.addReg(
5268 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5269 },
5270 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5271 }};
5272}
5273
5275AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5276 Register Src;
5277 unsigned Mods;
5278 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5279 /*IsCanonicalizing=*/true,
5280 /*AllowAbs=*/false,
5281 /*OpSel=*/true);
5282
5283 return {{
5284 [=](MachineInstrBuilder &MIB) {
5285 MIB.addReg(
5286 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5287 },
5288 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5289 }};
5290}
5291
5292// Given \p Offset and the load specified by the \p Root operand, check if \p Offset
5293// is a multiple of the load byte size. If it is, update \p Offset to a
5294// pre-scaled value and return true.
5295bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5296 Register &Offset,
5297 bool IsSigned) const {
5298 if (!Subtarget->hasScaleOffset())
5299 return false;
5300
5301 const MachineInstr &MI = *Root.getParent();
5302 MachineMemOperand *MMO = *MI.memoperands_begin();
5303
5304 if (!MMO->getSize().hasValue())
5305 return false;
5306
5307 uint64_t Size = MMO->getSize().getValue();
5308
5309 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5310 if (!OffsetReg)
5311 OffsetReg = Offset;
5312
5313 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5314 OffsetReg = Def->Reg;
5315
5316 Register Op0;
5317 MachineInstr *Mul;
5318 bool ScaleOffset =
5319 (isPowerOf2_64(Size) &&
5320 mi_match(OffsetReg, *MRI,
5321 m_GShl(m_Reg(Op0),
5322 m_any_of(m_SpecificICst(Log2_64(Size)),
5323 m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
5324 mi_match(OffsetReg, *MRI,
5325 m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
5326 m_Copy(m_SpecificICst(Size))))) ||
5327 mi_match(
5328 OffsetReg, *MRI,
5329 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5330 m_Reg(Op0), m_SpecificICst(Size))) ||
5331 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5332 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5333 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5334 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5335 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5336 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5337 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5338 mi_match(Mul->getOperand(3).getReg(), *MRI,
5339 m_any_of(m_SpecificICst(Size),
5340 m_Copy(m_SpecificICst(Size))))) &&
5341 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5342
5343 if (ScaleOffset)
5344 Offset = Op0;
5345
5346 return ScaleOffset;
5347}
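// Rough example: for a 4-byte access, an offset built as (%idx << 2) or
// (%idx * 4) is rewritten to plain %idx here, and the caller then sets the
// SCAL cpol bit so the hardware re-applies the scaling.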
5348
5349bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5350 Register &Base,
5351 Register *SOffset,
5352 int64_t *Offset,
5353 bool *ScaleOffset) const {
5354 MachineInstr *MI = Root.getParent();
5355 MachineBasicBlock *MBB = MI->getParent();
5356
5357 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5358 // then we can select all ptr + 32-bit offsets.
5359 SmallVector<GEPInfo, 4> AddrInfo;
5360 getAddrModeInfo(*MI, *MRI, AddrInfo);
5361
5362 if (AddrInfo.empty())
5363 return false;
5364
5365 const GEPInfo &GEPI = AddrInfo[0];
5366 std::optional<int64_t> EncodedImm;
5367
5368 if (ScaleOffset)
5369 *ScaleOffset = false;
5370
5371 if (SOffset && Offset) {
5372 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5373 /*HasSOffset=*/true);
5374 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5375 AddrInfo.size() > 1) {
5376 const GEPInfo &GEPI2 = AddrInfo[1];
5377 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5378 Register OffsetReg = GEPI2.SgprParts[1];
5379 if (ScaleOffset)
5380 *ScaleOffset =
5381 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5382 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5383 if (OffsetReg) {
5384 Base = GEPI2.SgprParts[0];
5385 *SOffset = OffsetReg;
5386 *Offset = *EncodedImm;
5387 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5388 return true;
5389
5390 // For unbuffered smem loads, it is illegal for the Immediate Offset
5391 // to be negative if the resulting (Offset + (M0 or SOffset or zero))
5392 // is negative. Handle the case where the Immediate Offset + SOffset
5393 // is negative.
5394 auto SKnown = VT->getKnownBits(*SOffset);
5395 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5396 return false;
5397
5398 return true;
5399 }
5400 }
5401 }
5402 return false;
5403 }
5404
5405 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5406 /*HasSOffset=*/false);
5407 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5408 Base = GEPI.SgprParts[0];
5409 *Offset = *EncodedImm;
5410 return true;
5411 }
5412
5413 // SGPR offset is unsigned.
5414 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5415 GEPI.Imm != 0) {
5416 // If we make it this far we have a load with a 32-bit immediate offset.
5417 // It is OK to select this using a sgpr offset, because we have already
5418 // failed trying to select this load into one of the _IMM variants since
5419 // the _IMM Patterns are considered before the _SGPR patterns.
5420 Base = GEPI.SgprParts[0];
5421 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5422 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5423 .addImm(GEPI.Imm);
5424 return true;
5425 }
5426
5427 if (SOffset && GEPI.SgprParts.size() == 2 && GEPI.Imm == 0) {
5428 Register OffsetReg = GEPI.SgprParts[1];
5429 if (ScaleOffset)
5430 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5431 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5432 if (OffsetReg) {
5433 Base = GEPI.SgprParts[0];
5434 *SOffset = OffsetReg;
5435 return true;
5436 }
5437 }
5438
5439 return false;
5440}
5441
5443AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5444 Register Base;
5445 int64_t Offset;
5446 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5447 /* ScaleOffset */ nullptr))
5448 return std::nullopt;
5449
5450 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5451 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5452}
5453
5455AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5456 SmallVector<GEPInfo, 4> AddrInfo;
5457 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5458
5459 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5460 return std::nullopt;
5461
5462 const GEPInfo &GEPInfo = AddrInfo[0];
5463 Register PtrReg = GEPInfo.SgprParts[0];
5464 std::optional<int64_t> EncodedImm =
5465 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5466 if (!EncodedImm)
5467 return std::nullopt;
5468
5469 return {{
5470 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5471 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5472 }};
5473}
5474
5476AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5477 Register Base, SOffset;
5478 bool ScaleOffset;
5479 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5480 &ScaleOffset))
5481 return std::nullopt;
5482
5483 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5484 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5485 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5486 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5487}
5488
5490AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5491 Register Base, SOffset;
5492 int64_t Offset;
5493 bool ScaleOffset;
5494 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5495 return std::nullopt;
5496
5497 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5498 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5499 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5500 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5501 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5502}
5503
5504std::pair<Register, int>
5505AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5506 uint64_t FlatVariant) const {
5507 MachineInstr *MI = Root.getParent();
5508
5509 auto Default = std::pair(Root.getReg(), 0);
5510
5511 if (!STI.hasFlatInstOffsets())
5512 return Default;
5513
5514 Register PtrBase;
5515 int64_t ConstOffset;
5516 bool IsInBounds;
5517 std::tie(PtrBase, ConstOffset, IsInBounds) =
5518 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5519
5520 // Adding the offset to the base address with an immediate in a FLAT
5521 // instruction must not change the memory aperture in which the address falls.
5522 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5523 // instructions.
5524 if (ConstOffset == 0 ||
5525 (FlatVariant == SIInstrFlags::FlatScratch &&
5526 !isFlatScratchBaseLegal(Root.getReg())) ||
5527 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5528 return Default;
5529
5530 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5531 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5532 return Default;
5533
5534 return std::pair(PtrBase, ConstOffset);
5535}
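// Illustrative: an inbounds (G_PTR_ADD %base, 64) becomes {%base, 64} when 64
// fits the FLAT immediate-offset field for this address space and variant;
// otherwise the unsplit root pointer is returned with offset 0.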
5536
5538AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5539 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5540
5541 return {{
5542 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5543 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5544 }};
5545}
5546
5548AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5549 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5550
5551 return {{
5552 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5553 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5554 }};
5555}
5556
5558AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5559 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5560
5561 return {{
5562 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5563 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5564 }};
5565}
5566
5567// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5569AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5570 unsigned CPolBits,
5571 bool NeedIOffset) const {
5572 Register Addr = Root.getReg();
5573 Register PtrBase;
5574 int64_t ConstOffset;
5575 int64_t ImmOffset = 0;
5576
5577 // Match the immediate offset first, which canonically is moved as low as
5578 // possible.
5579 std::tie(PtrBase, ConstOffset, std::ignore) =
5580 getPtrBaseWithConstantOffset(Addr, *MRI);
5581
5582 if (ConstOffset != 0) {
5583 if (NeedIOffset &&
5584 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5585 SIInstrFlags::FlatGlobal)) {
5586 Addr = PtrBase;
5587 ImmOffset = ConstOffset;
5588 } else {
5589 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5590 if (isSGPR(PtrBaseDef->Reg)) {
5591 if (ConstOffset > 0) {
5592 // Offset is too large.
5593 //
5594 // saddr + large_offset -> saddr +
5595 // (voffset = large_offset & ~MaxOffset) +
5596 // (large_offset & MaxOffset);
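 // Worked example (hypothetical field width): if the maximum immediate
 // offset were 0xfff, large_offset = 0x12345 would split into
 // voffset = 0x12000 and an immediate offset of 0x345.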
5597 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5598 if (NeedIOffset) {
5599 std::tie(SplitImmOffset, RemainderOffset) =
5600 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5601 SIInstrFlags::FlatGlobal);
5602 }
5603
5604 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5605 : isUInt<32>(RemainderOffset)) {
5606 MachineInstr *MI = Root.getParent();
5607 MachineBasicBlock *MBB = MI->getParent();
5608 Register HighBits =
5609 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5610
5611 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5612 HighBits)
5613 .addImm(RemainderOffset);
5614
5615 if (NeedIOffset)
5616 return {{
5617 [=](MachineInstrBuilder &MIB) {
5618 MIB.addReg(PtrBase);
5619 }, // saddr
5620 [=](MachineInstrBuilder &MIB) {
5621 MIB.addReg(HighBits);
5622 }, // voffset
5623 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5624 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5625 }};
5626 return {{
5627 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5628 [=](MachineInstrBuilder &MIB) {
5629 MIB.addReg(HighBits);
5630 }, // voffset
5631 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5632 }};
5633 }
5634 }
5635
5636 // We are adding a 64 bit SGPR and a constant. If constant bus limit
5637 // is 1, we would need to perform 1 or 2 extra moves for each half of
5638 // the constant, and it is better to do a scalar add and then issue a
5639 // single VALU instruction to materialize zero. Otherwise it takes fewer
5640 // instructions to perform VALU adds with immediates or inline literals.
5641 unsigned NumLiterals =
5642 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5643 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5644 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5645 return std::nullopt;
5646 }
5647 }
5648 }
5649
5650 // Match the variable offset.
5651 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5652 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5653 // Look through the SGPR->VGPR copy.
5654 Register SAddr =
5655 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
5656
5657 if (isSGPR(SAddr)) {
5658 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
5659
5660 // It's possible voffset is an SGPR here, but the copy to VGPR will be
5661 // inserted later.
5662 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
5663 Subtarget->hasSignedGVSOffset());
5664 if (Register VOffset = matchExtendFromS32OrS32(
5665 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
5666 if (NeedIOffset)
5667 return {{[=](MachineInstrBuilder &MIB) { // saddr
5668 MIB.addReg(SAddr);
5669 },
5670 [=](MachineInstrBuilder &MIB) { // voffset
5671 MIB.addReg(VOffset);
5672 },
5673 [=](MachineInstrBuilder &MIB) { // offset
5674 MIB.addImm(ImmOffset);
5675 },
5676 [=](MachineInstrBuilder &MIB) { // cpol
5677 MIB.addImm(CPolBits |
5678 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5679 }}};
5680 return {{[=](MachineInstrBuilder &MIB) { // saddr
5681 MIB.addReg(SAddr);
5682 },
5683 [=](MachineInstrBuilder &MIB) { // voffset
5684 MIB.addReg(VOffset);
5685 },
5686 [=](MachineInstrBuilder &MIB) { // cpol
5687 MIB.addImm(CPolBits |
5688 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5689 }}};
5690 }
5691 }
5692 }
5693
5694 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
5695 // drop this.
5696 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
5697 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
5698 return std::nullopt;
5699
5700 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
5701 // moves required to copy a 64-bit SGPR to VGPR.
5702 MachineInstr *MI = Root.getParent();
5703 MachineBasicBlock *MBB = MI->getParent();
5704 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5705
5706 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
5707 .addImm(0);
5708
5709 if (NeedIOffset)
5710 return {{
5711 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5712 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5713 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5714 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5715 }};
5716 return {{
5717 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5718 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5719 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5720 }};
5721}
5722
5724AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
5725 return selectGlobalSAddr(Root, 0);
5726}
5727
5729AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
5730 const MachineInstr &I = *Root.getParent();
5731
5732 // We are assuming CPol is always the last operand of the intrinsic.
5733 auto PassedCPol =
5734 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5735 return selectGlobalSAddr(Root, PassedCPol);
5736}
5737
5739AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
5740 const MachineInstr &I = *Root.getParent();
5741
5742 // We are assuming CPol is the second-to-last operand of the intrinsic.
5743 auto PassedCPol =
5744 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5745 return selectGlobalSAddr(Root, PassedCPol);
5746}
5747
5749AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
5750 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
5751}
5752
5754AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
5755 MachineOperand &Root) const {
5756 const MachineInstr &I = *Root.getParent();
5757
5758 // We are assuming CPol is always the last operand of the intrinsic.
5759 auto PassedCPol =
5760 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5761 return selectGlobalSAddr(Root, PassedCPol, false);
5762}
5763
5765AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
5766 MachineOperand &Root) const {
5767 const MachineInstr &I = *Root.getParent();
5768
5769 // We are assuming CPol is the second-to-last operand of the intrinsic.
5770 auto PassedCPol =
5771 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5772 return selectGlobalSAddr(Root, PassedCPol, false);
5773}
5774
5776AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
5777 Register Addr = Root.getReg();
5778 Register PtrBase;
5779 int64_t ConstOffset;
5780 int64_t ImmOffset = 0;
5781
5782 // Match the immediate offset first, which canonically is moved as low as
5783 // possible.
5784 std::tie(PtrBase, ConstOffset, std::ignore) =
5785 getPtrBaseWithConstantOffset(Addr, *MRI);
5786
5787 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
5788 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5789 SIInstrFlags::FlatScratch)) {
5790 Addr = PtrBase;
5791 ImmOffset = ConstOffset;
5792 }
5793
5794 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5795 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5796 int FI = AddrDef->MI->getOperand(1).getIndex();
5797 return {{
5798 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5799 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5800 }};
5801 }
5802
5803 Register SAddr = AddrDef->Reg;
5804
5805 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5806 Register LHS = AddrDef->MI->getOperand(1).getReg();
5807 Register RHS = AddrDef->MI->getOperand(2).getReg();
5808 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5809 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
5810
5811 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
5812 isSGPR(RHSDef->Reg)) {
5813 int FI = LHSDef->MI->getOperand(1).getIndex();
5814 MachineInstr &I = *Root.getParent();
5815 MachineBasicBlock *BB = I.getParent();
5816 const DebugLoc &DL = I.getDebugLoc();
5817 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5818
5819 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
5820 .addFrameIndex(FI)
5821 .addReg(RHSDef->Reg)
5822 .setOperandDead(3); // Dead scc
5823 }
5824 }
5825
5826 if (!isSGPR(SAddr))
5827 return std::nullopt;
5828
5829 return {{
5830 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
5831 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5832 }};
5833}
5834
5835// Check whether the flat scratch SVS swizzle bug affects this access.
5836bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
5837 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
5838 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
5839 return false;
5840
5841 // The bug affects the swizzling of SVS accesses if there is any carry out
5842 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
5843 // voffset to (soffset + inst_offset).
5844 auto VKnown = VT->getKnownBits(VAddr);
5845 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
5846 KnownBits::makeConstant(APInt(32, ImmOffset)));
5847 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5848 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
5849 return (VMax & 3) + (SMax & 3) >= 4;
5850}
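// Illustrative: if VAddr's known maximum ends in binary ...11 (3) and the
// known maximum of SAddr + inst_offset ends in ...01 (1), then 3 + 1 >= 4,
// a carry out of bit 1 is possible, and the workaround applies.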
5851
5853AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
5854 Register Addr = Root.getReg();
5855 Register PtrBase;
5856 int64_t ConstOffset;
5857 int64_t ImmOffset = 0;
5858
5859 // Match the immediate offset first, which canonically is moved as low as
5860 // possible.
5861 std::tie(PtrBase, ConstOffset, std::ignore) =
5862 getPtrBaseWithConstantOffset(Addr, *MRI);
5863
5864 Register OrigAddr = Addr;
5865 if (ConstOffset != 0 &&
5866 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5867 SIInstrFlags::FlatScratch)) {
5868 Addr = PtrBase;
5869 ImmOffset = ConstOffset;
5870 }
5871
5872 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5873 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
5874 return std::nullopt;
5875
5876 Register RHS = AddrDef->MI->getOperand(2).getReg();
5877 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
5878 return std::nullopt;
5879
5880 Register LHS = AddrDef->MI->getOperand(1).getReg();
5881 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5882
5883 if (OrigAddr != Addr) {
5884 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
5885 return std::nullopt;
5886 } else {
5887 if (!isFlatScratchBaseLegalSV(OrigAddr))
5888 return std::nullopt;
5889 }
5890
5891 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
5892 return std::nullopt;
5893
5894 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
5895 ? AMDGPU::CPol::SCAL
5896 : 0;
5897
5898 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5899 int FI = LHSDef->MI->getOperand(1).getIndex();
5900 return {{
5901 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5902 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5903 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5904 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5905 }};
5906 }
5907
5908 if (!isSGPR(LHS))
5909 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
5910 LHS = Def->Reg;
5911
5912 if (!isSGPR(LHS))
5913 return std::nullopt;
5914
5915 return {{
5916 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5917 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
5918 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5919 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5920 }};
5921}
5922
5924AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
5925 MachineInstr *MI = Root.getParent();
5926 MachineBasicBlock *MBB = MI->getParent();
5927 MachineFunction *MF = MBB->getParent();
5928 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5929
5930 int64_t Offset = 0;
5931 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
5932 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
5933 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5934
5935 // TODO: Should this be inside the render function? The iterator seems to
5936 // move.
5937 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
5938 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5939 HighBits)
5940 .addImm(Offset & ~MaxOffset);
5941
5942 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5943 MIB.addReg(Info->getScratchRSrcReg());
5944 },
5945 [=](MachineInstrBuilder &MIB) { // vaddr
5946 MIB.addReg(HighBits);
5947 },
5948 [=](MachineInstrBuilder &MIB) { // soffset
5949 // Use constant zero for soffset and rely on eliminateFrameIndex
5950 // to choose the appropriate frame register if need be.
5951 MIB.addImm(0);
5952 },
5953 [=](MachineInstrBuilder &MIB) { // offset
5954 MIB.addImm(Offset & MaxOffset);
5955 }}};
5956 }
5957
5958 assert(Offset == 0 || Offset == -1);
5959
5960 // Try to fold a frame index directly into the MUBUF vaddr field, and any
5961 // offsets.
5962 std::optional<int> FI;
5963 Register VAddr = Root.getReg();
5964
5965 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5966 Register PtrBase;
5967 int64_t ConstOffset;
5968 std::tie(PtrBase, ConstOffset, std::ignore) =
5969 getPtrBaseWithConstantOffset(VAddr, *MRI);
5970 if (ConstOffset != 0) {
5971 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
5972 (!STI.privateMemoryResourceIsRangeChecked() ||
5973 VT->signBitIsZero(PtrBase))) {
5974 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
5975 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
5976 FI = PtrBaseDef->getOperand(1).getIndex();
5977 else
5978 VAddr = PtrBase;
5979 Offset = ConstOffset;
5980 }
5981 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5982 FI = RootDef->getOperand(1).getIndex();
5983 }
5984
5985 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5986 MIB.addReg(Info->getScratchRSrcReg());
5987 },
5988 [=](MachineInstrBuilder &MIB) { // vaddr
5989 if (FI)
5990 MIB.addFrameIndex(*FI);
5991 else
5992 MIB.addReg(VAddr);
5993 },
5994 [=](MachineInstrBuilder &MIB) { // soffset
5995 // Use constant zero for soffset and rely on eliminateFrameIndex
5996 // to choose the appropriate frame register if need be.
5997 MIB.addImm(0);
5998 },
5999 [=](MachineInstrBuilder &MIB) { // offset
6000 MIB.addImm(Offset);
6001 }}};
6002}
6003
6004bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6005 int64_t Offset) const {
6006 if (!isUInt<16>(Offset))
6007 return false;
6008
6009 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6010 return true;
6011
6012 // On Southern Islands, instructions with a negative base value and an offset
6013 // don't seem to work.
6014 return VT->signBitIsZero(Base);
6015}
6016
6017bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6018 int64_t Offset1,
6019 unsigned Size) const {
6020 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6021 return false;
6022 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6023 return false;
6024
6025 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6026 return true;
6027
6028 // On Southern Islands, instructions with a negative base value and an offset
6029 // don't seem to work.
6030 return VT->signBitIsZero(Base);
6031}
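// E.g. with Size = 4, offsets 0 and 1020 encode as slots 0 and 255 (both fit
// the 8-bit fields), whereas 1024 (slot 256) would not.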
6032
6033// Return whether the operation has NoUnsignedWrap property.
6034static bool isNoUnsignedWrap(MachineInstr *Addr) {
6035 return Addr->getOpcode() == TargetOpcode::G_OR ||
6036 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6037 Addr->getFlag(MachineInstr::MIFlag::NoUWrap));
6038}
6039
6040// Check that the base address of flat scratch load/store in the form of `base +
6041// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6042// requirement). We always treat the first operand as the base address here.
6043bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6044 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6045
6046 if (isNoUnsignedWrap(AddrMI))
6047 return true;
6048
6049 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6050 // values.
6051 if (STI.hasSignedScratchOffsets())
6052 return true;
6053
6054 Register LHS = AddrMI->getOperand(1).getReg();
6055 Register RHS = AddrMI->getOperand(2).getReg();
6056
6057 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6058 std::optional<ValueAndVReg> RhsValReg =
6059 getIConstantVRegValWithLookThrough(RHS, *MRI);
6060 // If the immediate offset is negative and within certain range, the base
6061 // address cannot also be negative. If the base is also negative, the sum
6062 // would be either negative or much larger than the valid range of scratch
6063 // memory a thread can access.
6064 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6065 RhsValReg->Value.getSExtValue() > -0x40000000)
6066 return true;
6067 }
6068
6069 return VT->signBitIsZero(LHS);
6070}
6071
6072// Check address value in SGPR/VGPR are legal for flat scratch in the form
6073// of: SGPR + VGPR.
6074bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6075 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6076
6077 if (isNoUnsignedWrap(AddrMI))
6078 return true;
6079
6080 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6081 // values.
6082 if (STI.hasSignedScratchOffsets())
6083 return true;
6084
6085 Register LHS = AddrMI->getOperand(1).getReg();
6086 Register RHS = AddrMI->getOperand(2).getReg();
6087 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6088}
6089
6090// Check address value in SGPR/VGPR are legal for flat scratch in the form
6091// of: SGPR + VGPR + Imm.
6092bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6093 Register Addr) const {
6094 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6095 // values.
6096 if (STI.hasSignedScratchOffsets())
6097 return true;
6098
6099 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6100 Register Base = AddrMI->getOperand(1).getReg();
6101 std::optional<DefinitionAndSourceRegister> BaseDef =
6102 getDefSrcRegIgnoringCopies(Base, *MRI);
6103 std::optional<ValueAndVReg> RHSOffset =
6104 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
6105 assert(RHSOffset);
6106
6107 // If the immediate offset is negative and within certain range, the base
6108 // address cannot also be negative. If the base is also negative, the sum
6109 // would be either negative or much larger than the valid range of scratch
6110 // memory a thread can access.
6111 if (isNoUnsignedWrap(BaseDef->MI) &&
6112 (isNoUnsignedWrap(AddrMI) ||
6113 (RHSOffset->Value.getSExtValue() < 0 &&
6114 RHSOffset->Value.getSExtValue() > -0x40000000)))
6115 return true;
6116
6117 Register LHS = BaseDef->MI->getOperand(1).getReg();
6118 Register RHS = BaseDef->MI->getOperand(2).getReg();
6119 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6120}
6121
6122bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6123 unsigned ShAmtBits) const {
6124 assert(MI.getOpcode() == TargetOpcode::G_AND);
6125
6126 std::optional<APInt> RHS =
6127 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6128 if (!RHS)
6129 return false;
6130
6131 if (RHS->countr_one() >= ShAmtBits)
6132 return true;
6133
6134 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6135 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6136}
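// E.g. (G_AND %amt, 31) feeding a 32-bit shift (ShAmtBits = 5): 31 has five
// trailing ones, so the mask is redundant and can be looked through.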
6137
6139AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6140 MachineOperand &Root) const {
6141 Register Reg = Root.getReg();
6142 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6143
6144 std::optional<DefinitionAndSourceRegister> Def =
6145 getDefSrcRegIgnoringCopies(Reg, *MRI);
6146 assert(Def && "this shouldn't be an optional result");
6147 Reg = Def->Reg;
6148
6149 if (Register WaveBase = getWaveAddress(Def->MI)) {
6150 return {{
6151 [=](MachineInstrBuilder &MIB) { // rsrc
6152 MIB.addReg(Info->getScratchRSrcReg());
6153 },
6154 [=](MachineInstrBuilder &MIB) { // soffset
6155 MIB.addReg(WaveBase);
6156 },
6157 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6158 }};
6159 }
6160
6161 int64_t Offset = 0;
6162
6163 // FIXME: Copy check is a hack
6164 Register BasePtr;
6165 if (mi_match(Reg, *MRI,
6166 m_GPtrAdd(m_Reg(BasePtr),
6167 m_Copy(m_ICst(Offset))))) {
6168 if (!TII.isLegalMUBUFImmOffset(Offset))
6169 return {};
6170 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6171 Register WaveBase = getWaveAddress(BasePtrDef);
6172 if (!WaveBase)
6173 return {};
6174
6175 return {{
6176 [=](MachineInstrBuilder &MIB) { // rsrc
6177 MIB.addReg(Info->getScratchRSrcReg());
6178 },
6179 [=](MachineInstrBuilder &MIB) { // soffset
6180 MIB.addReg(WaveBase);
6181 },
6182 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6183 }};
6184 }
6185
6186 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6187 !TII.isLegalMUBUFImmOffset(Offset))
6188 return {};
6189
6190 return {{
6191 [=](MachineInstrBuilder &MIB) { // rsrc
6192 MIB.addReg(Info->getScratchRSrcReg());
6193 },
6194 [=](MachineInstrBuilder &MIB) { // soffset
6195 MIB.addImm(0);
6196 },
6197 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6198 }};
6199}
6200
6201std::pair<Register, unsigned>
6202AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6203 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6204 int64_t ConstAddr = 0;
6205
6206 Register PtrBase;
6207 int64_t Offset;
6208 std::tie(PtrBase, Offset, std::ignore) =
6209 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6210
6211 if (Offset) {
6212 if (isDSOffsetLegal(PtrBase, Offset)) {
6213 // (add n0, c0)
6214 return std::pair(PtrBase, Offset);
6215 }
6216 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6217 // TODO
6218
6219
6220 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6221 // TODO
6222
6223 }
6224
6225 return std::pair(Root.getReg(), 0);
6226}
6227
6229AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6230 Register Reg;
6231 unsigned Offset;
6232 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6233 return {{
6234 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6235 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6236 }};
6237}
6238
6240AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6241 return selectDSReadWrite2(Root, 4);
6242}
6243
6245AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6246 return selectDSReadWrite2(Root, 8);
6247}
6248
6250AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6251 unsigned Size) const {
6252 Register Reg;
6253 unsigned Offset;
6254 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6255 return {{
6256 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6257 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6258 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6259 }};
6260}
6261
6262std::pair<Register, unsigned>
6263AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6264 unsigned Size) const {
6265 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6266 int64_t ConstAddr = 0;
6267
6268 Register PtrBase;
6269 int64_t Offset;
6270 std::tie(PtrBase, Offset, std::ignore) =
6271 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6272
6273 if (Offset) {
6274 int64_t OffsetValue0 = Offset;
6275 int64_t OffsetValue1 = Offset + Size;
6276 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6277 // (add n0, c0)
6278 return std::pair(PtrBase, OffsetValue0 / Size);
6279 }
6280 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6281 // TODO
6282
6283 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6284 // TODO
6285
6286 }
6287
6288 return std::pair(Root.getReg(), 0);
6289}
6290
6291/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6292/// the base value with the constant offset, and if the offset computation is
6293/// known to be inbounds. There may be intervening copies between \p Root and
6294/// the identified constant. Returns \p Root, 0, false if this does not match
6295/// the pattern.
6296std::tuple<Register, int64_t, bool>
6297AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6298 Register Root, const MachineRegisterInfo &MRI) const {
6299 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6300 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6301 return {Root, 0, false};
6302
6303 MachineOperand &RHS = RootI->getOperand(2);
6304 std::optional<ValueAndVReg> MaybeOffset =
6305 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
6306 if (!MaybeOffset)
6307 return {Root, 0, false};
6308 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6309 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6310 IsInBounds};
6311}
6312
6313static void addZeroImm(MachineInstrBuilder &MIB) {
6314 MIB.addImm(0);
6315}
6316
6317/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6318/// BasePtr is not valid, a null base pointer will be used.
6319static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6320 uint32_t FormatLo, uint32_t FormatHi,
6321 Register BasePtr) {
6322 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6323 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6324 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6325 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6326
6327 B.buildInstr(AMDGPU::S_MOV_B32)
6328 .addDef(RSrc2)
6329 .addImm(FormatLo);
6330 B.buildInstr(AMDGPU::S_MOV_B32)
6331 .addDef(RSrc3)
6332 .addImm(FormatHi);
6333
6334 // Build the half of the subregister with the constants before building the
6335 // full 128-bit register. If we are building multiple resource descriptors,
6336 // this will allow CSEing of the 2-component register.
6337 B.buildInstr(AMDGPU::REG_SEQUENCE)
6338 .addDef(RSrcHi)
6339 .addReg(RSrc2)
6340 .addImm(AMDGPU::sub0)
6341 .addReg(RSrc3)
6342 .addImm(AMDGPU::sub1);
6343
6344 Register RSrcLo = BasePtr;
6345 if (!BasePtr) {
6346 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6347 B.buildInstr(AMDGPU::S_MOV_B64)
6348 .addDef(RSrcLo)
6349 .addImm(0);
6350 }
6351
6352 B.buildInstr(AMDGPU::REG_SEQUENCE)
6353 .addDef(RSrc)
6354 .addReg(RSrcLo)
6355 .addImm(AMDGPU::sub0_sub1)
6356 .addReg(RSrcHi)
6357 .addImm(AMDGPU::sub2_sub3);
6358
6359 return RSrc;
6360}
6361
6362static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6363 const SIInstrInfo &TII, Register BasePtr) {
6364 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6365
6366 // FIXME: Why are half the "default" bits ignored based on the addressing
6367 // mode?
6368 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6369}
6370
6371static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6372 const SIInstrInfo &TII, Register BasePtr) {
6373 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6374
6375 // FIXME: Why are half the "default" bits ignored based on the addressing
6376 // mode?
6377 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6378}
6379
6380AMDGPUInstructionSelector::MUBUFAddressData
6381AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6382 MUBUFAddressData Data;
6383 Data.N0 = Src;
6384
6385 Register PtrBase;
6386 int64_t Offset;
6387
6388 std::tie(PtrBase, Offset, std::ignore) =
6389 getPtrBaseWithConstantOffset(Src, *MRI);
6390 if (isUInt<32>(Offset)) {
6391 Data.N0 = PtrBase;
6392 Data.Offset = Offset;
6393 }
6394
6395 if (MachineInstr *InputAdd
6396 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6397 Data.N2 = InputAdd->getOperand(1).getReg();
6398 Data.N3 = InputAdd->getOperand(2).getReg();
6399
6400 // FIXME: Need to fix extra SGPR->VGPR copies inserted
6401 // FIXME: Don't know this was defined by operand 0
6402 //
6403 // TODO: Remove this when we have copy folding optimizations after
6404 // RegBankSelect.
6405 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6406 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6407 }
6408
6409 return Data;
6410}
6411
6412/// Return if the addr64 mubuf mode should be used for the given address.
6413bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6414 // (ptr_add N2, N3) -> addr64, or
6415 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6416 if (Addr.N2)
6417 return true;
6418
6419 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6420 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6421}
6422
6423/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6424/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6425/// component.
6426void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6427 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6428 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6429 return;
6430
6431 // Illegal offset, store it in soffset.
6432 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6433 B.buildInstr(AMDGPU::S_MOV_B32)
6434 .addDef(SOffset)
6435 .addImm(ImmOffset);
6436 ImmOffset = 0;
6437}
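// Illustrative (limit is subtarget-dependent): with a 4095-byte immediate
// limit, an offset of 5000 cannot be encoded directly, so it is moved whole
// into soffset via S_MOV_B32 and the immediate field is reset to 0.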
6438
6439bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6440 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6441 Register &SOffset, int64_t &Offset) const {
6442 // FIXME: Predicates should stop this from reaching here.
6443 // addr64 bit was removed for volcanic islands.
6444 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6445 return false;
6446
6447 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6448 if (!shouldUseAddr64(AddrData))
6449 return false;
6450
6451 Register N0 = AddrData.N0;
6452 Register N2 = AddrData.N2;
6453 Register N3 = AddrData.N3;
6454 Offset = AddrData.Offset;
6455
6456 // Base pointer for the SRD.
6457 Register SRDPtr;
6458
6459 if (N2) {
6460 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6461 assert(N3);
6462 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6463 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6464 // addr64, and construct the default resource from a 0 address.
6465 VAddr = N0;
6466 } else {
6467 SRDPtr = N3;
6468 VAddr = N2;
6469 }
6470 } else {
6471 // N2 is not divergent.
6472 SRDPtr = N2;
6473 VAddr = N3;
6474 }
6475 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6476 // Use the default null pointer in the resource
6477 VAddr = N0;
6478 } else {
6479 // N0 -> offset, or
6480 // (N0 + C1) -> offset
6481 SRDPtr = N0;
6482 }
6483
6484 MachineIRBuilder B(*Root.getParent());
6485 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6486 splitIllegalMUBUFOffset(B, SOffset, Offset);
6487 return true;
6488}
6489
6490bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6491 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6492 int64_t &Offset) const {
6493
6494 // FIXME: Pattern should not reach here.
6495 if (STI.useFlatForGlobal())
6496 return false;
6497
6498 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6499 if (shouldUseAddr64(AddrData))
6500 return false;
6501
6502 // N0 -> offset, or
6503 // (N0 + C1) -> offset
6504 Register SRDPtr = AddrData.N0;
6505 Offset = AddrData.Offset;
6506
6507 // TODO: Look through extensions for 32-bit soffset.
6508 MachineIRBuilder B(*Root.getParent());
6509
6510 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6511 splitIllegalMUBUFOffset(B, SOffset, Offset);
6512 return true;
6513}
6514
6516AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6517 Register VAddr;
6518 Register RSrcReg;
6519 Register SOffset;
6520 int64_t Offset = 0;
6521
6522 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6523 return {};
6524
6525 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6526 // pattern.
6527 return {{
6528 [=](MachineInstrBuilder &MIB) { // rsrc
6529 MIB.addReg(RSrcReg);
6530 },
6531 [=](MachineInstrBuilder &MIB) { // vaddr
6532 MIB.addReg(VAddr);
6533 },
6534 [=](MachineInstrBuilder &MIB) { // soffset
6535 if (SOffset)
6536 MIB.addReg(SOffset);
6537 else if (STI.hasRestrictedSOffset())
6538 MIB.addReg(AMDGPU::SGPR_NULL);
6539 else
6540 MIB.addImm(0);
6541 },
6542 [=](MachineInstrBuilder &MIB) { // offset
6543 MIB.addImm(Offset);
6544 },
6545 addZeroImm, // cpol
6546 addZeroImm, // tfe
6547 addZeroImm // swz
6548 }};
6549}
6550
6552AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6553 Register RSrcReg;
6554 Register SOffset;
6555 int64_t Offset = 0;
6556
6557 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6558 return {};
6559
6560 return {{
6561 [=](MachineInstrBuilder &MIB) { // rsrc
6562 MIB.addReg(RSrcReg);
6563 },
6564 [=](MachineInstrBuilder &MIB) { // soffset
6565 if (SOffset)
6566 MIB.addReg(SOffset);
6567 else if (STI.hasRestrictedSOffset())
6568 MIB.addReg(AMDGPU::SGPR_NULL);
6569 else
6570 MIB.addImm(0);
6571 },
6572 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6573 addZeroImm, // cpol
6574 addZeroImm, // tfe
6575 addZeroImm, // swz
6576 }};
6577}
6578
6580AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6581
6582 Register SOffset = Root.getReg();
6583
6584 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6585 SOffset = AMDGPU::SGPR_NULL;
6586
6587 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
6588}
6589
6590/// Get an immediate that must be 32-bits, and treated as zero extended.
6591static std::optional<uint64_t>
6593 // getIConstantVRegVal sexts any values, so see if that matters.
6594 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
6595 if (!OffsetVal || !isInt<32>(*OffsetVal))
6596 return std::nullopt;
6597 return Lo_32(*OffsetVal);
6598}
6599
6601AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6602 std::optional<uint64_t> OffsetVal =
6603 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
6604 if (!OffsetVal)
6605 return {};
6606
6607 std::optional<int64_t> EncodedImm =
6608 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
6609 if (!EncodedImm)
6610 return {};
6611
6612 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6613}
6614
6616AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6617 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6618
6619 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
6620 if (!OffsetVal)
6621 return {};
6622
6623 std::optional<int64_t> EncodedImm =
6624 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
6625 if (!EncodedImm)
6626 return {};
6627
6628 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6629}
6630
6632AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6633 // Match the (soffset + offset) pair as a 32-bit register base and
6634 // an immediate offset.
6635 Register SOffset;
6636 unsigned Offset;
6637 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
6638 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
6639 if (!SOffset)
6640 return std::nullopt;
6641
6642 std::optional<int64_t> EncodedOffset =
6643 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
6644 if (!EncodedOffset)
6645 return std::nullopt;
6646
6647 assert(MRI->getType(SOffset) == LLT::scalar(32));
6648 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6649 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
6650}
6651
6652std::pair<Register, unsigned>
6653AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6654 bool &Matched) const {
6655 Matched = false;
6656
6657 Register Src;
6658 unsigned Mods;
6659 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
6660
6661 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
6662 assert(MRI->getType(Src) == LLT::scalar(16));
6663
6664 // Only change Src if a source modifier could be gained. In such cases
6665 // the new Src could be an SGPR, but this does not violate the constant
6666 // bus restriction for the instruction being selected.
6667 Src = stripBitCast(Src, *MRI);
6668
6669 const auto CheckAbsNeg = [&]() {
6670 // Be careful about folding modifiers if we already have an abs. fneg is
6671 // applied last, so we don't want to apply an earlier fneg.
6672 if ((Mods & SISrcMods::ABS) == 0) {
6673 unsigned ModsTmp;
6674 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
6675
6676 if ((ModsTmp & SISrcMods::NEG) != 0)
6677 Mods ^= SISrcMods::NEG;
6678
6679 if ((ModsTmp & SISrcMods::ABS) != 0)
6680 Mods |= SISrcMods::ABS;
6681 }
6682 };
6683
6684 CheckAbsNeg();
6685
6686 // op_sel/op_sel_hi decide the source type and source.
6687 // If the source's op_sel_hi is set, it indicates a conversion from
6688 // fp16. If the source's op_sel is set, it picks the high half of the
6689 // source register.
6690
6691 Mods |= SISrcMods::OP_SEL_1;
6692
6693 if (isExtractHiElt(*MRI, Src, Src)) {
6694 Mods |= SISrcMods::OP_SEL_0;
6695 CheckAbsNeg();
6696 }
6697
6698 Matched = true;
6699 }
6700
6701 return {Src, Mods};
6702}
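// Worked example: a mad-mix operand that is a G_FPEXT of the high half of a
// packed 2 x f16 register ends up with OP_SEL_1 set (the operand is fp16
// data) and OP_SEL_0 set (take the high 16 bits), with any abs/neg modifiers
// folded in by CheckAbsNeg.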
6703
6704InstructionSelector::ComplexRendererFns
6705AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
6706 MachineOperand &Root) const {
6707 Register Src;
6708 unsigned Mods;
6709 bool Matched;
6710 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6711 if (!Matched)
6712 return {};
6713
6714 return {{
6715 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6716 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6717 }};
6718}
6719
6720InstructionSelector::ComplexRendererFns
6721AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
6722 Register Src;
6723 unsigned Mods;
6724 bool Matched;
6725 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6726
6727 return {{
6728 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6729 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6730 }};
6731}
6732
6733bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
6734 MachineInstr &I, Intrinsic::ID IntrID) const {
6735 MachineBasicBlock *MBB = I.getParent();
6736 const DebugLoc &DL = I.getDebugLoc();
6737 Register CCReg = I.getOperand(0).getReg();
6738
6739 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6740 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
6741
6742 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
6743 .addImm(I.getOperand(2).getImm());
6744
6745 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
6746
6747 I.eraseFromParent();
6748 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
6749 *MRI);
6750}
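// For illustration: S_CMP_EQ_U32 0, 0 always sets SCC to 1, so the COPY of
// SCC into CCReg still reports "is first" as true even if a later pass turns
// the signal instruction itself into a no-op, as the comment above
// anticipates.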
6751
6752bool AMDGPUInstructionSelector::selectSGetBarrierState(
6753 MachineInstr &I, Intrinsic::ID IntrID) const {
6754 MachineBasicBlock *MBB = I.getParent();
6755 const DebugLoc &DL = I.getDebugLoc();
6756 MachineOperand BarOp = I.getOperand(2);
6757 std::optional<int64_t> BarValImm =
6758 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6759
6760 if (!BarValImm) {
6761 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6762 .addReg(BarOp.getReg());
6763 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6764 }
6765 MachineInstrBuilder MIB;
6766 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
6767 : AMDGPU::S_GET_BARRIER_STATE_M0;
6768 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6769
6770 auto DstReg = I.getOperand(0).getReg();
6771 const TargetRegisterClass *DstRC =
6772 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6773 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6774 return false;
6775 MIB.addDef(DstReg);
6776 if (BarValImm) {
6777 MIB.addImm(*BarValImm);
6778 }
6779 I.eraseFromParent();
6780 return true;
6781}
6782
6783unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
6784 if (HasInlineConst) {
6785 switch (IntrID) {
6786 default:
6787 llvm_unreachable("not a named barrier op");
6788 case Intrinsic::amdgcn_s_barrier_join:
6789 return AMDGPU::S_BARRIER_JOIN_IMM;
6790 case Intrinsic::amdgcn_s_get_named_barrier_state:
6791 return AMDGPU::S_GET_BARRIER_STATE_IMM;
6792 };
6793 } else {
6794 switch (IntrID) {
6795 default:
6796 llvm_unreachable("not a named barrier op");
6797 case Intrinsic::amdgcn_s_barrier_join:
6798 return AMDGPU::S_BARRIER_JOIN_M0;
6799 case Intrinsic::amdgcn_s_get_named_barrier_state:
6800 return AMDGPU::S_GET_BARRIER_STATE_M0;
6801 };
6802 }
6803}
6804
6805bool AMDGPUInstructionSelector::selectNamedBarrierInit(
6806 MachineInstr &I, Intrinsic::ID IntrID) const {
6807 MachineBasicBlock *MBB = I.getParent();
6808 const DebugLoc &DL = I.getDebugLoc();
6809 MachineOperand BarOp = I.getOperand(1);
6810 MachineOperand CntOp = I.getOperand(2);
6811
6812 // BarID = (BarOp >> 4) & 0x3F
6813 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6814 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6815 .add(BarOp)
6816 .addImm(4u)
6817 .setOperandDead(3); // Dead scc
6818
6819 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6820 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6821 .addReg(TmpReg0)
6822 .addImm(0x3F)
6823 .setOperandDead(3); // Dead scc
6824
6825 // MO = ((CntOp & 0x3F) << shAmt) | BarID
6826 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6827 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
6828 .add(CntOp)
6829 .addImm(0x3F)
6830 .setOperandDead(3); // Dead scc
6831
6832 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6833 constexpr unsigned ShAmt = 16;
6834 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
6835 .addReg(TmpReg2)
6836 .addImm(ShAmt)
6837 .setOperandDead(3); // Dead scc
6838
6839 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6840 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
6841 .addReg(TmpReg1)
6842 .addReg(TmpReg3)
6843 .setOperandDead(3); // Dead scc
6844
6845 auto CopyMIB =
6846 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
6847 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6848
6849 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
6850 ? AMDGPU::S_BARRIER_INIT_M0
6851 : AMDGPU::S_BARRIER_SIGNAL_M0;
6852 MachineInstrBuilder MIB;
6853 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6854
6855 I.eraseFromParent();
6856 return true;
6857}
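// Worked example with BarOp = 0x230 and CntOp = 5 as sample inputs:
// BarID = (0x230 >> 4) & 0x3F = 0x23, and
// M0 = ((5 & 0x3F) << 16) | 0x23 = 0x50023, which is what the M0 copy above
// feeds to S_BARRIER_INIT_M0 / S_BARRIER_SIGNAL_M0.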
6858
6859bool AMDGPUInstructionSelector::selectNamedBarrierInst(
6860 MachineInstr &I, Intrinsic::ID IntrID) const {
6861 MachineBasicBlock *MBB = I.getParent();
6862 const DebugLoc &DL = I.getDebugLoc();
6863 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
6864 ? I.getOperand(2)
6865 : I.getOperand(1);
6866 std::optional<int64_t> BarValImm =
6867 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6868
6869 if (!BarValImm) {
6870 // BarID = (BarOp >> 4) & 0x3F
6871 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6872 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6873 .addReg(BarOp.getReg())
6874 .addImm(4u)
6875 .setOperandDead(3); // Dead scc
6876
6877 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6878 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6879 .addReg(TmpReg0)
6880 .addImm(0x3F)
6881 .setOperandDead(3); // Dead scc
6882
6883 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6884 .addReg(TmpReg1);
6885 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6886 }
6887
6888 MachineInstrBuilder MIB;
6889 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
6890 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6891
6892 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
6893 auto DstReg = I.getOperand(0).getReg();
6894 const TargetRegisterClass *DstRC =
6895 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6896 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6897 return false;
6898 MIB.addDef(DstReg);
6899 }
6900
6901 if (BarValImm) {
6902 auto BarId = ((*BarValImm) >> 4) & 0x3F;
6903 MIB.addImm(BarId);
6904 }
6905
6906 I.eraseFromParent();
6907 return true;
6908}
6909
6910void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
6911 const MachineInstr &MI,
6912 int OpIdx) const {
6913 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6914 "Expected G_CONSTANT");
6915 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
6916}
6917
6918void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
6919 const MachineInstr &MI,
6920 int OpIdx) const {
6921 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6922 "Expected G_CONSTANT");
6923 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
6924}
6925
6926void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
6927 const MachineInstr &MI,
6928 int OpIdx) const {
6929 const MachineOperand &Op = MI.getOperand(1);
6930 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
6931 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
6932}
6933
6934void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
6935 const MachineInstr &MI,
6936 int OpIdx) const {
6937 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6938 "Expected G_CONSTANT");
6939 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
6940}
6941
6942/// This only really exists to satisfy DAG type checking machinery, so is a
6943/// no-op here.
6944void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
6945 const MachineInstr &MI,
6946 int OpIdx) const {
6947 const MachineOperand &Op = MI.getOperand(OpIdx);
6948 int64_t Imm;
6949 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
6950 MIB.addImm(Imm);
6951 else
6952 MIB.addImm(Op.getImm());
6953}
6954
6955void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
6956 const MachineInstr &MI,
6957 int OpIdx) const {
6958 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
6959}
6960
6961void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
6962 const MachineInstr &MI,
6963 int OpIdx) const {
6964 assert(OpIdx >= 0 && "expected to match an immediate operand");
6965 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6966}
6967
6968void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
6969 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6970 assert(OpIdx >= 0 && "expected to match an immediate operand");
6971 MIB.addImm(
6972 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6973}
6974
6975void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
6976 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6977 assert(OpIdx >= 0 && "expected to match an immediate operand");
6978 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
6979 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
6980 : (int64_t)SISrcMods::DST_OP_SEL);
6981}
6982
6983void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
6984 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6985 assert(OpIdx >= 0 && "expected to match an immediate operand");
6986 MIB.addImm(
6987 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6988}
6989
6990void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
6991 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6992 assert(OpIdx >= 0 && "expected to match an immediate operand");
6993 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
6994 ? (int64_t)(SISrcMods::OP_SEL_0)
6995 : 0);
6996}
6997
6998void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
6999 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7000 assert(OpIdx >= 0 && "expected to match an immediate operand");
7001 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7002 : 0);
7003}
7004
7005void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7006 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7007 assert(OpIdx >= 0 && "expected to match an immediate operand");
7008 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7009 : 0);
7010}
7011
7012void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7013 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7014 assert(OpIdx >= 0 && "expected to match an immediate operand");
7015 MIB.addImm(
7016 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7017}
7018
7019void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7020 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7021 assert(OpIdx >= 0 && "expected to match an immediate operand");
7022 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7023 ? (int64_t)SISrcMods::DST_OP_SEL
7024 : 0);
7025}
7026
7027void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7028 const MachineInstr &MI,
7029 int OpIdx) const {
7030 assert(OpIdx >= 0 && "expected to match an immediate operand");
7031 MIB.addImm(MI.getOperand(OpIdx).getImm() &
7032 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7033 : AMDGPU::CPol::ALL_pregfx12));
7034}
7035
7036void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7037 const MachineInstr &MI,
7038 int OpIdx) const {
7039 assert(OpIdx >= 0 && "expected to match an immediate operand");
7040 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7041 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
7042 : AMDGPU::CPol::SWZ_pregfx12);
7043 MIB.addImm(Swizzle);
7044}
7045
7046void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7047 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7048 assert(OpIdx >= 0 && "expected to match an immediate operand");
7049 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7050 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7051 : AMDGPU::CPol::ALL_pregfx12);
7052 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7053}
7054
7055void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7056 const MachineInstr &MI,
7057 int OpIdx) const {
7058 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7059}
7060
7061void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7062 const MachineInstr &MI,
7063 int OpIdx) const {
7064 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7065 int ExpVal = APF.getExactLog2Abs();
7066 assert(ExpVal != INT_MIN);
7067 MIB.addImm(ExpVal);
7068}
7069
7070void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7071 const MachineInstr &MI,
7072 int OpIdx) const {
7073 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7074 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7075 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7076 // "round.downward" -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7077 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7078}
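// For illustration, the (Imm + 3) % 4 rotation realizes the table above:
// 0 (towardzero) -> 3, 1 (tonearest) -> 0, 2 (upward) -> 1, 3 (downward) -> 2.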
7079
7080void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7081 const MachineInstr &MI,
7082 int OpIdx) const {
7083 unsigned Mods = SISrcMods::OP_SEL_1;
7084 if (MI.getOperand(OpIdx).getImm())
7085 Mods ^= SISrcMods::NEG;
7086 MIB.addImm((int64_t)Mods);
7087}
7088
7089void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7090 const MachineInstr &MI,
7091 int OpIdx) const {
7092 unsigned Mods = SISrcMods::OP_SEL_1;
7093 if (MI.getOperand(OpIdx).getImm())
7094 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7095 MIB.addImm((int64_t)Mods);
7096}
7097
7098void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7099 const MachineInstr &MI,
7100 int OpIdx) const {
7101 unsigned Val = MI.getOperand(OpIdx).getImm();
7102 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7103 if (Val == 1) // neg
7104 Mods ^= SISrcMods::NEG;
7105 if (Val == 2) // abs
7106 Mods ^= SISrcMods::ABS;
7107 if (Val == 3) // neg and abs
7108 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7109 MIB.addImm((int64_t)Mods);
7110}
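// For illustration, the operand decodes as: 0 -> no modifier, 1 -> NEG,
// 2 -> ABS, 3 -> NEG | ABS, always combined with the default OP_SEL_1.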
7111
7112void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7113 const MachineInstr &MI,
7114 int OpIdx) const {
7115 uint32_t V = MI.getOperand(2).getImm();
7118 if (!Subtarget->hasSafeCUPrefetch())
7119 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7120 MIB.addImm(V);
7121}
7122
7123/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7124void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7125 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7126 unsigned Val = MI.getOperand(OpIdx).getImm();
7127 unsigned New = 0;
7128 if (Val & 0x1)
7129 New |= SISrcMods::OP_SEL_0;
7130 if (Val & 0x2)
7131 New |= SISrcMods::OP_SEL_1;
7132 MIB.addImm(New);
7133}
7134
7135bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7136 return TII.isInlineConstant(Imm);
7137}
7138
7139bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7140 return TII.isInlineConstant(Imm);
7141}