AMDGPUInstructionSelector.cpp
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
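// Return true if Reg holds a wave-sized lane mask (i.e. it lives in the VCC
// register bank or a boolean register class), as opposed to a plain 1-bit
// SGPR value.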
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
120 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
121}
122
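// Select a generic COPY. Copies that produce a VCC lane mask need special
// handling: an SCC source only needs its class constrained, constants become
// 0/-1 moves, and any other 1-bit source is masked to bit 0 and compared
// against zero to form a real lane mask.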
123bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
124 const DebugLoc &DL = I.getDebugLoc();
125 MachineBasicBlock *BB = I.getParent();
126 I.setDesc(TII.get(TargetOpcode::COPY));
127
128 const MachineOperand &Src = I.getOperand(1);
129 MachineOperand &Dst = I.getOperand(0);
130 Register DstReg = Dst.getReg();
131 Register SrcReg = Src.getReg();
132
133 if (isVCC(DstReg, *MRI)) {
134 if (SrcReg == AMDGPU::SCC) {
135 const TargetRegisterClass *RC
136 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
137 if (!RC)
138 return true;
139 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
140 }
141
142 if (!isVCC(SrcReg, *MRI)) {
143 // TODO: Should probably leave the copy and let copyPhysReg expand it.
144 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
145 return false;
146
147 const TargetRegisterClass *SrcRC
148 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
149
150 std::optional<ValueAndVReg> ConstVal =
151 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
152 if (ConstVal) {
153 unsigned MovOpc =
154 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
155 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
156 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
157 } else {
158 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
159
160 // We can't trust the high bits at this point, so clear them.
161
162 // TODO: Skip masking high bits if def is known boolean.
163
164 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
165 assert(Subtarget->useRealTrue16Insts());
166 const int64_t NoMods = 0;
167 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
168 .addImm(NoMods)
169 .addImm(1)
170 .addImm(NoMods)
171 .addReg(SrcReg)
172 .addImm(NoMods);
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
174 .addImm(NoMods)
175 .addImm(0)
176 .addImm(NoMods)
177 .addReg(MaskedReg)
178 .addImm(NoMods);
179 } else {
180 bool IsSGPR = TRI.isSGPRClass(SrcRC);
181 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
182 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
183 .addImm(1)
184 .addReg(SrcReg);
185 if (IsSGPR)
186 And.setOperandDead(3); // Dead scc
187
188 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
189 .addImm(0)
190 .addReg(MaskedReg);
191 }
192 }
193
194 if (!MRI->getRegClassOrNull(SrcReg))
195 MRI->setRegClass(SrcReg, SrcRC);
196 I.eraseFromParent();
197 return true;
198 }
199
200 const TargetRegisterClass *RC =
201 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
202 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
203 return false;
204
205 return true;
206 }
207
208 for (const MachineOperand &MO : I.operands()) {
209 if (MO.getReg().isPhysical())
210 continue;
211
212 const TargetRegisterClass *RC =
213 TRI.getConstrainedRegClassForOperand(MO, *MRI);
214 if (!RC)
215 continue;
216 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
217 }
218 return true;
219}
220
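// Lower a copy from a wave-sized lane mask to an SCC-style boolean: compare
// the mask against zero with S_CMP_LG and copy SCC into a 32-bit SGPR.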
221bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
222 const DebugLoc &DL = I.getDebugLoc();
223 MachineBasicBlock *BB = I.getParent();
224
225 unsigned CmpOpc =
226 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
227 MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
228 .addReg(I.getOperand(1).getReg())
229 .addImm(0);
230 if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
231 return false;
232
233 Register DstReg = I.getOperand(0).getReg();
234 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
235
236 I.eraseFromParent();
237 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
238}
239
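// Lower a copy from an SCC-style boolean to a wave-sized lane mask: known
// constants become 0 or EXEC, otherwise the value is copied into SCC and
// S_CSELECT picks between EXEC and 0.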
240bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
241 const DebugLoc &DL = I.getDebugLoc();
242 MachineBasicBlock *BB = I.getParent();
243
244 Register DstReg = I.getOperand(0).getReg();
245 Register SrcReg = I.getOperand(1).getReg();
246 std::optional<ValueAndVReg> Arg =
247 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
248
249 if (Arg) {
250 const int64_t Value = Arg->Value.getZExtValue();
251 if (Value == 0) {
252 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
253 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
254 } else {
255 assert(Value == 1);
256 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
257 }
258 I.eraseFromParent();
259 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
260 }
261
262 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
263 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
264
265 unsigned SelectOpcode =
266 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
267 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
268 .addReg(TRI.getExec())
269 .addImm(0);
270
271 I.eraseFromParent();
272 return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
273}
274
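// Any active lane may provide the value, so reading the first active lane
// with V_READFIRSTLANE_B32 is sufficient.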
275bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
276 Register DstReg = I.getOperand(0).getReg();
277 Register SrcReg = I.getOperand(1).getReg();
278
279 const DebugLoc &DL = I.getDebugLoc();
280 MachineBasicBlock *BB = I.getParent();
281
282 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
283 .addReg(SrcReg);
284
285 I.eraseFromParent();
286 return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
287}
288
289bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
290 const Register DefReg = I.getOperand(0).getReg();
291 const LLT DefTy = MRI->getType(DefReg);
292
293 // S1 G_PHIs should not be selected in instruction-select, instead:
294 // - divergent S1 G_PHI should go through lane mask merging algorithm
295 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
296 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
297 if (DefTy == LLT::scalar(1))
298 return false;
299
300 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
301
302 const RegClassOrRegBank &RegClassOrBank =
303 MRI->getRegClassOrRegBank(DefReg);
304
305 const TargetRegisterClass *DefRC =
306 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
307 if (!DefRC) {
308 if (!DefTy.isValid()) {
309 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
310 return false;
311 }
312
313 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
314 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
315 if (!DefRC) {
316 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
317 return false;
318 }
319 }
320
321 // If inputs have register bank, assign corresponding reg class.
322 // Note: registers don't need to have the same reg bank.
323 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
324 const Register SrcReg = I.getOperand(i).getReg();
325
326 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
327 if (RB) {
328 const LLT SrcTy = MRI->getType(SrcReg);
329 const TargetRegisterClass *SrcRC =
330 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
331 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
332 return false;
333 }
334 }
335
336 I.setDesc(TII.get(TargetOpcode::PHI));
337 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
338}
339
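// Return a 32-bit half of a 64-bit source operand: register operands get a
// subregister copy into a fresh register, immediates are split into their
// low/high 32 bits.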
340MachineOperand
341AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
342 const TargetRegisterClass &SubRC,
343 unsigned SubIdx) const {
344
345 MachineInstr *MI = MO.getParent();
346 MachineBasicBlock *BB = MO.getParent()->getParent();
347 Register DstReg = MRI->createVirtualRegister(&SubRC);
348
349 if (MO.isReg()) {
350 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
351 Register Reg = MO.getReg();
352 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
353 .addReg(Reg, 0, ComposedSubIdx);
354
355 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
356 MO.isKill(), MO.isDead(), MO.isUndef(),
357 MO.isEarlyClobber(), 0, MO.isDebug(),
358 MO.isInternalRead());
359 }
360
361 assert(MO.isImm());
362
363 APInt Imm(64, MO.getImm());
364
365 switch (SubIdx) {
366 default:
367 llvm_unreachable("do not know how to split immediate with this sub index.");
368 case AMDGPU::sub0:
369 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
370 case AMDGPU::sub1:
371 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
372 }
373}
374
375static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
376 switch (Opc) {
377 case AMDGPU::G_AND:
378 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
379 case AMDGPU::G_OR:
380 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
381 case AMDGPU::G_XOR:
382 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
383 default:
384 llvm_unreachable("not a bit op");
385 }
386}
387
388bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
389 Register DstReg = I.getOperand(0).getReg();
390 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
391
392 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
393 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
394 DstRB->getID() != AMDGPU::VCCRegBankID)
395 return false;
396
397 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
398 STI.isWave64());
399 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
400
401 // Dead implicit-def of scc
402 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
403 true, // isImp
404 false, // isKill
405 true)); // isDead
406 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
407}
408
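// Select integer add/sub. 32-bit operations map directly onto SALU or VALU
// opcodes; 64-bit adds are split into low/high halves chained through carry
// and recombined with a REG_SEQUENCE.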
409bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
410 MachineBasicBlock *BB = I.getParent();
411 MachineFunction *MF = BB->getParent();
412 Register DstReg = I.getOperand(0).getReg();
413 const DebugLoc &DL = I.getDebugLoc();
414 LLT Ty = MRI->getType(DstReg);
415 if (Ty.isVector())
416 return false;
417
418 unsigned Size = Ty.getSizeInBits();
419 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
420 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
421 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
422
423 if (Size == 32) {
424 if (IsSALU) {
425 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
426 MachineInstr *Add =
427 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
428 .add(I.getOperand(1))
429 .add(I.getOperand(2))
430 .setOperandDead(3); // Dead scc
431 I.eraseFromParent();
432 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
433 }
434
435 if (STI.hasAddNoCarry()) {
436 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
437 I.setDesc(TII.get(Opc));
438 I.addOperand(*MF, MachineOperand::CreateImm(0));
439 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
440 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
441 }
442
443 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
444
445 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
446 MachineInstr *Add
447 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
448 .addDef(UnusedCarry, RegState::Dead)
449 .add(I.getOperand(1))
450 .add(I.getOperand(2))
451 .addImm(0);
452 I.eraseFromParent();
453 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
454 }
455
456 assert(!Sub && "illegal sub should not reach here");
457
458 const TargetRegisterClass &RC
459 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
460 const TargetRegisterClass &HalfRC
461 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
462
463 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
464 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
465 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
466 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
467
468 Register DstLo = MRI->createVirtualRegister(&HalfRC);
469 Register DstHi = MRI->createVirtualRegister(&HalfRC);
470
471 if (IsSALU) {
472 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
473 .add(Lo1)
474 .add(Lo2);
475 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
476 .add(Hi1)
477 .add(Hi2)
478 .setOperandDead(3); // Dead scc
479 } else {
480 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
481 Register CarryReg = MRI->createVirtualRegister(CarryRC);
482 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
483 .addDef(CarryReg)
484 .add(Lo1)
485 .add(Lo2)
486 .addImm(0);
487 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
488 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
489 .add(Hi1)
490 .add(Hi2)
491 .addReg(CarryReg, RegState::Kill)
492 .addImm(0);
493
494 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
495 return false;
496 }
497
498 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
499 .addReg(DstLo)
500 .addImm(AMDGPU::sub0)
501 .addReg(DstHi)
502 .addImm(AMDGPU::sub1);
503
504
505 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
506 return false;
507
508 I.eraseFromParent();
509 return true;
510}
511
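// Select add/sub with carry-out (and optional carry-in). A VCC carry uses the
// VALU carry opcodes directly; an SGPR carry is routed through SCC with
// S_ADD/S_SUB and explicit SCC copies.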
512bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
513 MachineInstr &I) const {
514 MachineBasicBlock *BB = I.getParent();
515 MachineFunction *MF = BB->getParent();
516 const DebugLoc &DL = I.getDebugLoc();
517 Register Dst0Reg = I.getOperand(0).getReg();
518 Register Dst1Reg = I.getOperand(1).getReg();
519 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
520 I.getOpcode() == AMDGPU::G_UADDE;
521 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
522 I.getOpcode() == AMDGPU::G_USUBE;
523
524 if (isVCC(Dst1Reg, *MRI)) {
525 unsigned NoCarryOpc =
526 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
527 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
528 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
529 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
530 I.addOperand(*MF, MachineOperand::CreateImm(0));
531 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
532 }
533
534 Register Src0Reg = I.getOperand(2).getReg();
535 Register Src1Reg = I.getOperand(3).getReg();
536
537 if (HasCarryIn) {
538 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
539 .addReg(I.getOperand(4).getReg());
540 }
541
542 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
543 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
544
545 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
546 .add(I.getOperand(2))
547 .add(I.getOperand(3));
548
549 if (MRI->use_nodbg_empty(Dst1Reg)) {
550 CarryInst.setOperandDead(3); // Dead scc
551 } else {
552 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
553 .addReg(AMDGPU::SCC);
554 if (!MRI->getRegClassOrNull(Dst1Reg))
555 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
556 }
557
558 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
559 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
560 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
561 return false;
562
563 if (HasCarryIn &&
564 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
565 AMDGPU::SReg_32RegClass, *MRI))
566 return false;
567
568 I.eraseFromParent();
569 return true;
570}
571
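// Select the 64-bit mad pseudos. Subtargets with the MAD intra-forwarding bug
// use the gfx11 encodings; otherwise the no-carry variants are preferred when
// the carry-out is unused and the subtarget provides them.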
572bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
573 MachineInstr &I) const {
574 MachineBasicBlock *BB = I.getParent();
575 MachineFunction *MF = BB->getParent();
576 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
577 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
578 MRI->use_nodbg_empty(I.getOperand(1).getReg());
579
580 unsigned Opc;
581 if (Subtarget->hasMADIntraFwdBug())
582 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
583 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
584 else if (UseNoCarry)
585 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
586 : AMDGPU::V_MAD_NC_I64_I32_e64;
587 else
588 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
589
590 if (UseNoCarry)
591 I.removeOperand(1);
592
593 I.setDesc(TII.get(Opc));
594 I.addOperand(*MF, MachineOperand::CreateImm(0));
595 I.addImplicitDefUseOperands(*MF);
596 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
597}
598
599// TODO: We should probably legalize these to only using 32-bit results.
600bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
601 MachineBasicBlock *BB = I.getParent();
602 Register DstReg = I.getOperand(0).getReg();
603 Register SrcReg = I.getOperand(1).getReg();
604 LLT DstTy = MRI->getType(DstReg);
605 LLT SrcTy = MRI->getType(SrcReg);
606 const unsigned SrcSize = SrcTy.getSizeInBits();
607 unsigned DstSize = DstTy.getSizeInBits();
608
609 // TODO: Should handle any multiple of 32 offset.
610 unsigned Offset = I.getOperand(2).getImm();
611 if (Offset % 32 != 0 || DstSize > 128)
612 return false;
613
614 // 16-bit operations really use 32-bit registers.
615 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
616 if (DstSize == 16)
617 DstSize = 32;
618
619 const TargetRegisterClass *DstRC =
620 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
621 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
622 return false;
623
624 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
625 const TargetRegisterClass *SrcRC =
626 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
627 if (!SrcRC)
628 return false;
629 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
630 DstSize / 32);
631 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
632 if (!SrcRC)
633 return false;
634
635 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
636 *SrcRC, I.getOperand(1));
637 const DebugLoc &DL = I.getDebugLoc();
638 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
639 .addReg(SrcReg, 0, SubReg);
640
641 I.eraseFromParent();
642 return true;
643}
644
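// Concatenate 32-bit-or-wider pieces into one wide register with a
// REG_SEQUENCE; narrower sources are left to the imported TableGen patterns.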
645bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
646 MachineBasicBlock *BB = MI.getParent();
647 Register DstReg = MI.getOperand(0).getReg();
648 LLT DstTy = MRI->getType(DstReg);
649 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
650
651 const unsigned SrcSize = SrcTy.getSizeInBits();
652 if (SrcSize < 32)
653 return selectImpl(MI, *CoverageInfo);
654
655 const DebugLoc &DL = MI.getDebugLoc();
656 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
657 const unsigned DstSize = DstTy.getSizeInBits();
658 const TargetRegisterClass *DstRC =
659 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
660 if (!DstRC)
661 return false;
662
663 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
664 MachineInstrBuilder MIB =
665 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
666 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
667 MachineOperand &Src = MI.getOperand(I + 1);
668 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
669 MIB.addImm(SubRegs[I]);
670
671 const TargetRegisterClass *SrcRC
672 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
673 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
674 return false;
675 }
676
677 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
678 return false;
679
680 MI.eraseFromParent();
681 return true;
682}
683
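// Split a wide source register into its destination pieces with subregister
// copies.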
684bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
685 MachineBasicBlock *BB = MI.getParent();
686 const int NumDst = MI.getNumOperands() - 1;
687
688 MachineOperand &Src = MI.getOperand(NumDst);
689
690 Register SrcReg = Src.getReg();
691 Register DstReg0 = MI.getOperand(0).getReg();
692 LLT DstTy = MRI->getType(DstReg0);
693 LLT SrcTy = MRI->getType(SrcReg);
694
695 const unsigned DstSize = DstTy.getSizeInBits();
696 const unsigned SrcSize = SrcTy.getSizeInBits();
697 const DebugLoc &DL = MI.getDebugLoc();
698 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
699
700 const TargetRegisterClass *SrcRC =
701 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
702 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
703 return false;
704
705 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
706 // source, and this relies on the fact that the same subregister indices are
707 // used for both.
708 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
709 for (int I = 0, E = NumDst; I != E; ++I) {
710 MachineOperand &Dst = MI.getOperand(I);
711 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
712 .addReg(SrcReg, 0, SubRegs[I]);
713
714 // Make sure the subregister index is valid for the source register.
715 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
716 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
717 return false;
718
719 const TargetRegisterClass *DstRC =
720 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
721 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
722 return false;
723 }
724
725 MI.eraseFromParent();
726 return true;
727}
728
729bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
730 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
731 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
732
733 Register Src0 = MI.getOperand(1).getReg();
734 Register Src1 = MI.getOperand(2).getReg();
735 LLT SrcTy = MRI->getType(Src0);
736 const unsigned SrcSize = SrcTy.getSizeInBits();
737
739 // BUILD_VECTOR with sources of 32 bits or more is handled as G_MERGE_VALUES.
739 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
740 return selectG_MERGE_VALUES(MI);
741 }
742
743 // Selection logic below is for V2S16 only.
744 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
745 Register Dst = MI.getOperand(0).getReg();
746 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
747 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
748 SrcTy != LLT::scalar(32)))
749 return selectImpl(MI, *CoverageInfo);
750
751 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
752 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
753 return false;
754
755 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
756 DstBank->getID() == AMDGPU::VGPRRegBankID);
757 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
758
759 const DebugLoc &DL = MI.getDebugLoc();
760 MachineBasicBlock *BB = MI.getParent();
761
762 // First, before trying TableGen patterns, check if both sources are
763 // constants. In those cases, we can trivially compute the final constant
764 // and emit a simple move.
765 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
766 if (ConstSrc1) {
767 auto ConstSrc0 =
768 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
769 if (ConstSrc0) {
770 const int64_t K0 = ConstSrc0->Value.getSExtValue();
771 const int64_t K1 = ConstSrc1->Value.getSExtValue();
772 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
773 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
774 uint32_t Imm = Lo16 | (Hi16 << 16);
775
776 // VALU
777 if (IsVector) {
778 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
779 MI.eraseFromParent();
780 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
781 }
782
783 // SALU
784 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
785 MI.eraseFromParent();
786 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
787 }
788 }
789
790 // Now try TableGen patterns.
791 if (selectImpl(MI, *CoverageInfo))
792 return true;
793
794 // TODO: This should probably be a combine somewhere
795 // (build_vector $src0, undef) -> copy $src0
796 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
797 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
798 MI.setDesc(TII.get(AMDGPU::COPY));
799 MI.removeOperand(2);
800 const auto &RC =
801 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
802 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
803 RBI.constrainGenericRegister(Src0, RC, *MRI);
804 }
805
806 // TODO: Can be improved?
807 if (IsVector) {
808 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
809 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
810 .addImm(0xFFFF)
811 .addReg(Src0);
812 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
813 return false;
814
815 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
816 .addReg(Src1)
817 .addImm(16)
818 .addReg(TmpReg);
819 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
820 return false;
821
822 MI.eraseFromParent();
823 return true;
824 }
825
826 Register ShiftSrc0;
827 Register ShiftSrc1;
828
829 // With multiple uses of the shift, this will duplicate the shift and
830 // increase register pressure.
831 //
832 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
833 // => (S_PACK_HH_B32_B16 $src0, $src1)
834 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
835 // => (S_PACK_HL_B32_B16 $src0, $src1)
836 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
837 // => (S_PACK_LH_B32_B16 $src0, $src1)
838 // (build_vector $src0, $src1)
839 // => (S_PACK_LL_B32_B16 $src0, $src1)
840
841 bool Shift0 = mi_match(
842 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
843
844 bool Shift1 = mi_match(
845 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
846
847 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
848 if (Shift0 && Shift1) {
849 Opc = AMDGPU::S_PACK_HH_B32_B16;
850 MI.getOperand(1).setReg(ShiftSrc0);
851 MI.getOperand(2).setReg(ShiftSrc1);
852 } else if (Shift1) {
853 Opc = AMDGPU::S_PACK_LH_B32_B16;
854 MI.getOperand(2).setReg(ShiftSrc1);
855 } else if (Shift0) {
856 auto ConstSrc1 =
857 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
858 if (ConstSrc1 && ConstSrc1->Value == 0) {
859 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
860 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
861 .addReg(ShiftSrc0)
862 .addImm(16)
863 .setOperandDead(3); // Dead scc
864
865 MI.eraseFromParent();
866 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
867 }
868 if (STI.hasSPackHL()) {
869 Opc = AMDGPU::S_PACK_HL_B32_B16;
870 MI.getOperand(1).setReg(ShiftSrc0);
871 }
872 }
873
874 MI.setDesc(TII.get(Opc));
875 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
876}
877
878bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
879 const MachineOperand &MO = I.getOperand(0);
880
881 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
882 // regbank check here is to know why getConstrainedRegClassForOperand failed.
883 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
884 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
885 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
886 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
887 return true;
888 }
889
890 return false;
891}
892
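// Select G_INSERT as INSERT_SUBREG; only 32-bit aligned offsets and insert
// sizes that map to a subregister index are handled.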
893bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
894 MachineBasicBlock *BB = I.getParent();
895
896 Register DstReg = I.getOperand(0).getReg();
897 Register Src0Reg = I.getOperand(1).getReg();
898 Register Src1Reg = I.getOperand(2).getReg();
899 LLT Src1Ty = MRI->getType(Src1Reg);
900
901 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
902 unsigned InsSize = Src1Ty.getSizeInBits();
903
904 int64_t Offset = I.getOperand(3).getImm();
905
906 // FIXME: These cases should have been illegal and unnecessary to check here.
907 if (Offset % 32 != 0 || InsSize % 32 != 0)
908 return false;
909
910 // Currently not handled by getSubRegFromChannel.
911 if (InsSize > 128)
912 return false;
913
914 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
915 if (SubReg == AMDGPU::NoSubRegister)
916 return false;
917
918 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
919 const TargetRegisterClass *DstRC =
920 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
921 if (!DstRC)
922 return false;
923
924 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
925 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
926 const TargetRegisterClass *Src0RC =
927 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
928 const TargetRegisterClass *Src1RC =
929 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
930
931 // Deal with weird cases where the class only partially supports the subreg
932 // index.
933 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
934 if (!Src0RC || !Src1RC)
935 return false;
936
937 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
938 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
939 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
940 return false;
941
942 const DebugLoc &DL = I.getDebugLoc();
943 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
944 .addReg(Src0Reg)
945 .addReg(Src1Reg)
946 .addImm(SubReg);
947
948 I.eraseFromParent();
949 return true;
950}
951
952bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
953 Register DstReg = MI.getOperand(0).getReg();
954 Register SrcReg = MI.getOperand(1).getReg();
955 Register OffsetReg = MI.getOperand(2).getReg();
956 Register WidthReg = MI.getOperand(3).getReg();
957
958 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
959 "scalar BFX instructions are expanded in regbankselect");
960 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
961 "64-bit vector BFX instructions are expanded in regbankselect");
962
963 const DebugLoc &DL = MI.getDebugLoc();
964 MachineBasicBlock *MBB = MI.getParent();
965
966 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
967 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
968 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
969 .addReg(SrcReg)
970 .addReg(OffsetReg)
971 .addReg(WidthReg);
972 MI.eraseFromParent();
973 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
974}
975
976bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
977 if (STI.getLDSBankCount() != 16)
978 return selectImpl(MI, *CoverageInfo);
979
980 Register Dst = MI.getOperand(0).getReg();
981 Register Src0 = MI.getOperand(2).getReg();
982 Register M0Val = MI.getOperand(6).getReg();
983 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
984 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
985 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
986 return false;
987
988 // This requires 2 instructions. It is possible to write a pattern to support
989 // this, but the generated isel emitter doesn't correctly deal with multiple
990 // output instructions using the same physical register input. The copy to m0
991 // is incorrectly placed before the second instruction.
992 //
993 // TODO: Match source modifiers.
994
995 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
996 const DebugLoc &DL = MI.getDebugLoc();
997 MachineBasicBlock *MBB = MI.getParent();
998
999 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1000 .addReg(M0Val);
1001 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1002 .addImm(2)
1003 .addImm(MI.getOperand(4).getImm()) // $attr
1004 .addImm(MI.getOperand(3).getImm()); // $attrchan
1005
1006 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1007 .addImm(0) // $src0_modifiers
1008 .addReg(Src0) // $src0
1009 .addImm(MI.getOperand(4).getImm()) // $attr
1010 .addImm(MI.getOperand(3).getImm()) // $attrchan
1011 .addImm(0) // $src2_modifiers
1012 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1013 .addImm(MI.getOperand(5).getImm()) // $high
1014 .addImm(0) // $clamp
1015 .addImm(0); // $omod
1016
1017 MI.eraseFromParent();
1018 return true;
1019}
1020
1021// Writelane is special in that it can use SGPR and M0 (which would normally
1022// count as using the constant bus twice - but in this case it is allowed since
1023// the lane selector doesn't count as a use of the constant bus). However, it is
1024// still required to abide by the 1 SGPR rule. Fix this up if we might have
1025// multiple SGPRs.
1026bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1027 // With a constant bus limit of at least 2, there's no issue.
1028 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1029 return selectImpl(MI, *CoverageInfo);
1030
1031 MachineBasicBlock *MBB = MI.getParent();
1032 const DebugLoc &DL = MI.getDebugLoc();
1033 Register VDst = MI.getOperand(0).getReg();
1034 Register Val = MI.getOperand(2).getReg();
1035 Register LaneSelect = MI.getOperand(3).getReg();
1036 Register VDstIn = MI.getOperand(4).getReg();
1037
1038 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1039
1040 std::optional<ValueAndVReg> ConstSelect =
1041 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1042 if (ConstSelect) {
1043 // The selector has to be an inline immediate, so we can use whatever for
1044 // the other operands.
1045 MIB.addReg(Val);
1046 MIB.addImm(ConstSelect->Value.getSExtValue() &
1047 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1048 } else {
1049 std::optional<ValueAndVReg> ConstVal =
1050 getIConstantVRegValWithLookThrough(Val, *MRI);
1051
1052 // If the value written is an inline immediate, we can get away without a
1053 // copy to m0.
1054 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1055 STI.hasInv2PiInlineImm())) {
1056 MIB.addImm(ConstVal->Value.getSExtValue());
1057 MIB.addReg(LaneSelect);
1058 } else {
1059 MIB.addReg(Val);
1060
1061 // If the lane selector was originally in a VGPR and copied with
1062 // readfirstlane, there's a hazard to read the same SGPR from the
1063 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1064 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1065
1066 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1067 .addReg(LaneSelect);
1068 MIB.addReg(AMDGPU::M0);
1069 }
1070 }
1071
1072 MIB.addReg(VDstIn);
1073
1074 MI.eraseFromParent();
1075 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1076}
1077
1078// We need to handle this here because tablegen doesn't support matching
1079// instructions with multiple outputs.
1080bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1081 Register Dst0 = MI.getOperand(0).getReg();
1082 Register Dst1 = MI.getOperand(1).getReg();
1083
1084 LLT Ty = MRI->getType(Dst0);
1085 unsigned Opc;
1086 if (Ty == LLT::scalar(32))
1087 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1088 else if (Ty == LLT::scalar(64))
1089 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1090 else
1091 return false;
1092
1093 // TODO: Match source modifiers.
1094
1095 const DebugLoc &DL = MI.getDebugLoc();
1096 MachineBasicBlock *MBB = MI.getParent();
1097
1098 Register Numer = MI.getOperand(3).getReg();
1099 Register Denom = MI.getOperand(4).getReg();
1100 unsigned ChooseDenom = MI.getOperand(5).getImm();
1101
1102 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1103
1104 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1105 .addDef(Dst1)
1106 .addImm(0) // $src0_modifiers
1107 .addUse(Src0) // $src0
1108 .addImm(0) // $src1_modifiers
1109 .addUse(Denom) // $src1
1110 .addImm(0) // $src2_modifiers
1111 .addUse(Numer) // $src2
1112 .addImm(0) // $clamp
1113 .addImm(0); // $omod
1114
1115 MI.eraseFromParent();
1116 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1117}
1118
1119bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1120 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1121 switch (IntrinsicID) {
1122 case Intrinsic::amdgcn_if_break: {
1123 MachineBasicBlock *BB = I.getParent();
1124
1125 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1126 // SelectionDAG uses for wave32 vs wave64.
1127 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1128 .add(I.getOperand(0))
1129 .add(I.getOperand(2))
1130 .add(I.getOperand(3));
1131
1132 Register DstReg = I.getOperand(0).getReg();
1133 Register Src0Reg = I.getOperand(2).getReg();
1134 Register Src1Reg = I.getOperand(3).getReg();
1135
1136 I.eraseFromParent();
1137
1138 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1139 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1140
1141 return true;
1142 }
1143 case Intrinsic::amdgcn_interp_p1_f16:
1144 return selectInterpP1F16(I);
1145 case Intrinsic::amdgcn_wqm:
1146 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1147 case Intrinsic::amdgcn_softwqm:
1148 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1149 case Intrinsic::amdgcn_strict_wwm:
1150 case Intrinsic::amdgcn_wwm:
1151 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1152 case Intrinsic::amdgcn_strict_wqm:
1153 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1154 case Intrinsic::amdgcn_writelane:
1155 return selectWritelane(I);
1156 case Intrinsic::amdgcn_div_scale:
1157 return selectDivScale(I);
1158 case Intrinsic::amdgcn_icmp:
1159 case Intrinsic::amdgcn_fcmp:
1160 if (selectImpl(I, *CoverageInfo))
1161 return true;
1162 return selectIntrinsicCmp(I);
1163 case Intrinsic::amdgcn_ballot:
1164 return selectBallot(I);
1165 case Intrinsic::amdgcn_reloc_constant:
1166 return selectRelocConstant(I);
1167 case Intrinsic::amdgcn_groupstaticsize:
1168 return selectGroupStaticSize(I);
1169 case Intrinsic::returnaddress:
1170 return selectReturnAddress(I);
1171 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1172 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1173 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1174 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1175 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1176 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1177 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1178 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1179 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1180 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1181 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1182 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1183 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1184 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1185 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1186 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1187 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1188 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1189 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1190 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1191 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1192 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1193 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1194 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1195 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1196 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1197 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1198 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1199 return selectSMFMACIntrin(I);
1200 case Intrinsic::amdgcn_permlane16_swap:
1201 case Intrinsic::amdgcn_permlane32_swap:
1202 return selectPermlaneSwapIntrin(I, IntrinsicID);
1203 default:
1204 return selectImpl(I, *CoverageInfo);
1205 }
1206}
1207
1208static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1209 const GCNSubtarget &ST) {
1210 if (Size != 16 && Size != 32 && Size != 64)
1211 return -1;
1212
1213 if (Size == 16 && !ST.has16BitInsts())
1214 return -1;
1215
1216 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1217 unsigned FakeS16Opc, unsigned S32Opc,
1218 unsigned S64Opc) {
1219 if (Size == 16)
1220 return ST.hasTrue16BitInsts()
1221 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1222 : S16Opc;
1223 if (Size == 32)
1224 return S32Opc;
1225 return S64Opc;
1226 };
1227
1228 switch (P) {
1229 default:
1230 llvm_unreachable("Unknown condition code!");
1231 case CmpInst::ICMP_NE:
1232 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1233 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1234 AMDGPU::V_CMP_NE_U64_e64);
1235 case CmpInst::ICMP_EQ:
1236 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1237 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1238 AMDGPU::V_CMP_EQ_U64_e64);
1239 case CmpInst::ICMP_SGT:
1240 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1241 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1242 AMDGPU::V_CMP_GT_I64_e64);
1243 case CmpInst::ICMP_SGE:
1244 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1245 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1246 AMDGPU::V_CMP_GE_I64_e64);
1247 case CmpInst::ICMP_SLT:
1248 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1249 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1250 AMDGPU::V_CMP_LT_I64_e64);
1251 case CmpInst::ICMP_SLE:
1252 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1253 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1254 AMDGPU::V_CMP_LE_I64_e64);
1255 case CmpInst::ICMP_UGT:
1256 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1257 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1258 AMDGPU::V_CMP_GT_U64_e64);
1259 case CmpInst::ICMP_UGE:
1260 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1261 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1262 AMDGPU::V_CMP_GE_U64_e64);
1263 case CmpInst::ICMP_ULT:
1264 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1265 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1266 AMDGPU::V_CMP_LT_U64_e64);
1267 case CmpInst::ICMP_ULE:
1268 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1269 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1270 AMDGPU::V_CMP_LE_U64_e64);
1271
1272 case CmpInst::FCMP_OEQ:
1273 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1274 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1275 AMDGPU::V_CMP_EQ_F64_e64);
1276 case CmpInst::FCMP_OGT:
1277 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1278 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1279 AMDGPU::V_CMP_GT_F64_e64);
1280 case CmpInst::FCMP_OGE:
1281 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1282 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1283 AMDGPU::V_CMP_GE_F64_e64);
1284 case CmpInst::FCMP_OLT:
1285 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1286 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1287 AMDGPU::V_CMP_LT_F64_e64);
1288 case CmpInst::FCMP_OLE:
1289 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1290 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1291 AMDGPU::V_CMP_LE_F64_e64);
1292 case CmpInst::FCMP_ONE:
1293 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1294 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1295 AMDGPU::V_CMP_NEQ_F64_e64);
1296 case CmpInst::FCMP_ORD:
1297 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1298 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1299 AMDGPU::V_CMP_O_F64_e64);
1300 case CmpInst::FCMP_UNO:
1301 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1302 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1303 AMDGPU::V_CMP_U_F64_e64);
1304 case CmpInst::FCMP_UEQ:
1305 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1306 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1307 AMDGPU::V_CMP_NLG_F64_e64);
1308 case CmpInst::FCMP_UGT:
1309 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1310 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1311 AMDGPU::V_CMP_NLE_F64_e64);
1312 case CmpInst::FCMP_UGE:
1313 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1314 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1315 AMDGPU::V_CMP_NLT_F64_e64);
1316 case CmpInst::FCMP_ULT:
1317 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1318 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1319 AMDGPU::V_CMP_NGE_F64_e64);
1320 case CmpInst::FCMP_ULE:
1321 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1322 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1323 AMDGPU::V_CMP_NGT_F64_e64);
1324 case CmpInst::FCMP_UNE:
1325 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1326 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1327 AMDGPU::V_CMP_NEQ_F64_e64);
1328 case CmpInst::FCMP_TRUE:
1329 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1330 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1331 AMDGPU::V_CMP_TRU_F64_e64);
1332 case CmpInst::FCMP_FALSE:
1333 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1334 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1335 AMDGPU::V_CMP_F_F64_e64);
1336 }
1337}
1338
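// Map a predicate and operand size to a scalar S_CMP opcode, or return -1 if
// no SALU form exists (e.g. most 64-bit compares, or f16 compares without
// SALU float instructions).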
1339int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1340 unsigned Size) const {
1341 if (Size == 64) {
1342 if (!STI.hasScalarCompareEq64())
1343 return -1;
1344
1345 switch (P) {
1346 case CmpInst::ICMP_NE:
1347 return AMDGPU::S_CMP_LG_U64;
1348 case CmpInst::ICMP_EQ:
1349 return AMDGPU::S_CMP_EQ_U64;
1350 default:
1351 return -1;
1352 }
1353 }
1354
1355 if (Size == 32) {
1356 switch (P) {
1357 case CmpInst::ICMP_NE:
1358 return AMDGPU::S_CMP_LG_U32;
1359 case CmpInst::ICMP_EQ:
1360 return AMDGPU::S_CMP_EQ_U32;
1361 case CmpInst::ICMP_SGT:
1362 return AMDGPU::S_CMP_GT_I32;
1363 case CmpInst::ICMP_SGE:
1364 return AMDGPU::S_CMP_GE_I32;
1365 case CmpInst::ICMP_SLT:
1366 return AMDGPU::S_CMP_LT_I32;
1367 case CmpInst::ICMP_SLE:
1368 return AMDGPU::S_CMP_LE_I32;
1369 case CmpInst::ICMP_UGT:
1370 return AMDGPU::S_CMP_GT_U32;
1371 case CmpInst::ICMP_UGE:
1372 return AMDGPU::S_CMP_GE_U32;
1373 case CmpInst::ICMP_ULT:
1374 return AMDGPU::S_CMP_LT_U32;
1375 case CmpInst::ICMP_ULE:
1376 return AMDGPU::S_CMP_LE_U32;
1377 case CmpInst::FCMP_OEQ:
1378 return AMDGPU::S_CMP_EQ_F32;
1379 case CmpInst::FCMP_OGT:
1380 return AMDGPU::S_CMP_GT_F32;
1381 case CmpInst::FCMP_OGE:
1382 return AMDGPU::S_CMP_GE_F32;
1383 case CmpInst::FCMP_OLT:
1384 return AMDGPU::S_CMP_LT_F32;
1385 case CmpInst::FCMP_OLE:
1386 return AMDGPU::S_CMP_LE_F32;
1387 case CmpInst::FCMP_ONE:
1388 return AMDGPU::S_CMP_LG_F32;
1389 case CmpInst::FCMP_ORD:
1390 return AMDGPU::S_CMP_O_F32;
1391 case CmpInst::FCMP_UNO:
1392 return AMDGPU::S_CMP_U_F32;
1393 case CmpInst::FCMP_UEQ:
1394 return AMDGPU::S_CMP_NLG_F32;
1395 case CmpInst::FCMP_UGT:
1396 return AMDGPU::S_CMP_NLE_F32;
1397 case CmpInst::FCMP_UGE:
1398 return AMDGPU::S_CMP_NLT_F32;
1399 case CmpInst::FCMP_ULT:
1400 return AMDGPU::S_CMP_NGE_F32;
1401 case CmpInst::FCMP_ULE:
1402 return AMDGPU::S_CMP_NGT_F32;
1403 case CmpInst::FCMP_UNE:
1404 return AMDGPU::S_CMP_NEQ_F32;
1405 default:
1406 llvm_unreachable("Unknown condition code!");
1407 }
1408 }
1409
1410 if (Size == 16) {
1411 if (!STI.hasSALUFloatInsts())
1412 return -1;
1413
1414 switch (P) {
1415 case CmpInst::FCMP_OEQ:
1416 return AMDGPU::S_CMP_EQ_F16;
1417 case CmpInst::FCMP_OGT:
1418 return AMDGPU::S_CMP_GT_F16;
1419 case CmpInst::FCMP_OGE:
1420 return AMDGPU::S_CMP_GE_F16;
1421 case CmpInst::FCMP_OLT:
1422 return AMDGPU::S_CMP_LT_F16;
1423 case CmpInst::FCMP_OLE:
1424 return AMDGPU::S_CMP_LE_F16;
1425 case CmpInst::FCMP_ONE:
1426 return AMDGPU::S_CMP_LG_F16;
1427 case CmpInst::FCMP_ORD:
1428 return AMDGPU::S_CMP_O_F16;
1429 case CmpInst::FCMP_UNO:
1430 return AMDGPU::S_CMP_U_F16;
1431 case CmpInst::FCMP_UEQ:
1432 return AMDGPU::S_CMP_NLG_F16;
1433 case CmpInst::FCMP_UGT:
1434 return AMDGPU::S_CMP_NLE_F16;
1435 case CmpInst::FCMP_UGE:
1436 return AMDGPU::S_CMP_NLT_F16;
1437 case CmpInst::FCMP_ULT:
1438 return AMDGPU::S_CMP_NGE_F16;
1439 case CmpInst::FCMP_ULE:
1440 return AMDGPU::S_CMP_NGT_F16;
1441 case CmpInst::FCMP_UNE:
1442 return AMDGPU::S_CMP_NEQ_F16;
1443 default:
1444 llvm_unreachable("Unknown condition code!");
1445 }
1446 }
1447
1448 return -1;
1449}
1450
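// Uniform compares are selected to S_CMP* followed by a copy of SCC into the
// result register; divergent compares use the VALU V_CMP* forms that write a
// lane mask.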
1451bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1452
1453 MachineBasicBlock *BB = I.getParent();
1454 const DebugLoc &DL = I.getDebugLoc();
1455
1456 Register SrcReg = I.getOperand(2).getReg();
1457 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1458
1459 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1460
1461 Register CCReg = I.getOperand(0).getReg();
1462 if (!isVCC(CCReg, *MRI)) {
1463 int Opcode = getS_CMPOpcode(Pred, Size);
1464 if (Opcode == -1)
1465 return false;
1466 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1467 .add(I.getOperand(2))
1468 .add(I.getOperand(3));
1469 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1470 .addReg(AMDGPU::SCC);
1471 bool Ret =
1472 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1473 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1474 I.eraseFromParent();
1475 return Ret;
1476 }
1477
1478 if (I.getOpcode() == AMDGPU::G_FCMP)
1479 return false;
1480
1481 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1482 if (Opcode == -1)
1483 return false;
1484
1485 MachineInstrBuilder ICmp;
1486 // t16 instructions
1487 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1488 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1489 .addImm(0)
1490 .add(I.getOperand(2))
1491 .addImm(0)
1492 .add(I.getOperand(3))
1493 .addImm(0); // op_sel
1494 } else {
1495 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1496 .add(I.getOperand(2))
1497 .add(I.getOperand(3));
1498 }
1499
1500 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1501 *TRI.getBoolRC(), *MRI);
1502 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1503 I.eraseFromParent();
1504 return Ret;
1505}
1506
1507bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1508 Register Dst = I.getOperand(0).getReg();
1509 if (isVCC(Dst, *MRI))
1510 return false;
1511
1512 LLT DstTy = MRI->getType(Dst);
1513 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1514 return false;
1515
1516 MachineBasicBlock *BB = I.getParent();
1517 const DebugLoc &DL = I.getDebugLoc();
1518 Register SrcReg = I.getOperand(2).getReg();
1519 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1520
1521 // i1 inputs are not supported in GlobalISel.
1522 if (Size == 1)
1523 return false;
1524
1525 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1526 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1527 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1528 I.eraseFromParent();
1529 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1530 }
1531
1532 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1533 if (Opcode == -1)
1534 return false;
1535
1536 MachineInstrBuilder SelectedMI;
1537 MachineOperand &LHS = I.getOperand(2);
1538 MachineOperand &RHS = I.getOperand(3);
1539 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1540 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1541 Register Src0Reg =
1542 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1543 Register Src1Reg =
1544 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1545 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1546 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1547 SelectedMI.addImm(Src0Mods);
1548 SelectedMI.addReg(Src0Reg);
1549 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1550 SelectedMI.addImm(Src1Mods);
1551 SelectedMI.addReg(Src1Reg);
1552 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1553 SelectedMI.addImm(0); // clamp
1554 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1555 SelectedMI.addImm(0); // op_sel
1556
1557 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1558 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1559 return false;
1560
1561 I.eraseFromParent();
1562 return true;
1563}
1564
1565// Ballot has to zero the bits of the input lane mask that are zero in the
1566// current exec; this is done as an AND with exec. For inputs produced by an
1567// instruction that implicitly uses the same exec (e.g. a compare in the same
1568// basic block, or an SCC-to-VCC copy), a plain copy is used instead.
1569static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1570 MachineBasicBlock *MBB) {
1571 MachineInstr *MI = MRI.getVRegDef(Reg);
1572 if (MI->getParent() != MBB)
1573 return false;
1574
1575 // Lane mask generated by SCC to VCC copy.
1576 if (MI->getOpcode() == AMDGPU::COPY) {
1577 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1578 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1579 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1580 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1581 return true;
1582 }
1583
1584 // Lane mask generated using compare with same exec.
1585 if (isa<GAnyCmp>(MI))
1586 return true;
1587
1588 Register LHS, RHS;
1589 // Look through AND.
1590 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1591 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1592 isLaneMaskFromSameBlock(RHS, MRI, MBB);
1593
1594 return false;
1595}
1596
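// amdgcn.ballot: constant inputs fold to 0 or EXEC, lane masks produced under
// the same EXEC in this block are copied directly, and anything else is ANDed
// with EXEC. i64 ballots in wave32 are zero-extended to 64 bits.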
1597bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1598 MachineBasicBlock *BB = I.getParent();
1599 const DebugLoc &DL = I.getDebugLoc();
1600 Register DstReg = I.getOperand(0).getReg();
1601 Register SrcReg = I.getOperand(2).getReg();
1602 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1603 const unsigned WaveSize = STI.getWavefrontSize();
1604
1605 // In the common case, the return type matches the wave size.
1606 // However we also support emitting i64 ballots in wave32 mode.
1607 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1608 return false;
1609
1610  std::optional<ValueAndVReg> Arg =
1611      getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1612
1613 Register Dst = DstReg;
1614  // i64 ballot on Wave32: emit the ballot itself into a new wave-size (i32) Dst.
1615 if (BallotSize != WaveSize) {
1616 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1617 }
1618
1619 if (Arg) {
1620 const int64_t Value = Arg->Value.getZExtValue();
1621 if (Value == 0) {
1622 // Dst = S_MOV 0
1623 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1624 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1625 } else {
1626 // Dst = COPY EXEC
1627 assert(Value == 1);
1628 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1629 }
1630 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1631 return false;
1632 } else {
1633 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1634 // Dst = COPY SrcReg
1635 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1636 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1637 return false;
1638 } else {
1639 // Dst = S_AND SrcReg, EXEC
1640 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1641 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1642 .addReg(SrcReg)
1643 .addReg(TRI.getExec())
1644 .setOperandDead(3); // Dead scc
1645 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1646 return false;
1647 }
1648 }
1649
1650 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1651 if (BallotSize != WaveSize) {
1652 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1653 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1654 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1655 .addReg(Dst)
1656 .addImm(AMDGPU::sub0)
1657 .addReg(HiReg)
1658 .addImm(AMDGPU::sub1);
1659 }
1660
1661 I.eraseFromParent();
1662 return true;
1663}
1664
1665bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1666 Register DstReg = I.getOperand(0).getReg();
1667 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1668 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1669 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1670 return false;
1671
1672 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1673
1674 Module *M = MF->getFunction().getParent();
1675 const MDNode *Metadata = I.getOperand(2).getMetadata();
1676 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1677 auto *RelocSymbol = cast<GlobalVariable>(
1678 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1679
1680 MachineBasicBlock *BB = I.getParent();
1681 BuildMI(*BB, &I, I.getDebugLoc(),
1682          TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1683      .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1684
1685 I.eraseFromParent();
1686 return true;
1687}
1688
1689bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1690 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1691
1692 Register DstReg = I.getOperand(0).getReg();
1693 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1694 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1695 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1696
1697 MachineBasicBlock *MBB = I.getParent();
1698 const DebugLoc &DL = I.getDebugLoc();
1699
1700 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1701
1702 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1703 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1704 MIB.addImm(MFI->getLDSSize());
1705 } else {
1706 Module *M = MF->getFunction().getParent();
1707 const GlobalValue *GV =
1708        Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1709    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1710  }
1711
1712 I.eraseFromParent();
1713 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1714}
1715
1716bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1717 MachineBasicBlock *MBB = I.getParent();
1718 MachineFunction &MF = *MBB->getParent();
1719 const DebugLoc &DL = I.getDebugLoc();
1720
1721 MachineOperand &Dst = I.getOperand(0);
1722 Register DstReg = Dst.getReg();
1723 unsigned Depth = I.getOperand(2).getImm();
1724
1725 const TargetRegisterClass *RC
1726 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1727 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1728 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1729 return false;
1730
1731 // Check for kernel and shader functions
1732 if (Depth != 0 ||
1733 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1734 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1735 .addImm(0);
1736 I.eraseFromParent();
1737 return true;
1738 }
1739
1740 MachineFrameInfo &MFI = MF.getFrameInfo();
1741 // There is a call to @llvm.returnaddress in this function
1742 MFI.setReturnAddressIsTaken(true);
1743
1744 // Get the return address reg and mark it as an implicit live-in
1745 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1746 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1747 AMDGPU::SReg_64RegClass, DL);
1748 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1749 .addReg(LiveIn);
1750 I.eraseFromParent();
1751 return true;
1752}
1753
1754bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1755 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1756 // SelectionDAG uses for wave32 vs wave64.
1757 MachineBasicBlock *BB = MI.getParent();
1758 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1759 .add(MI.getOperand(1));
1760
1761 Register Reg = MI.getOperand(1).getReg();
1762 MI.eraseFromParent();
1763
1764 if (!MRI->getRegClassOrNull(Reg))
1765 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1766 return true;
1767}
1768
1769bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1770 MachineInstr &MI, Intrinsic::ID IntrID) const {
1771 MachineBasicBlock *MBB = MI.getParent();
1772 MachineFunction *MF = MBB->getParent();
1773 const DebugLoc &DL = MI.getDebugLoc();
1774
1775 unsigned IndexOperand = MI.getOperand(7).getImm();
1776 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1777 bool WaveDone = MI.getOperand(9).getImm() != 0;
1778
1779 if (WaveDone && !WaveRelease) {
1780 // TODO: Move this to IR verifier
1781 const Function &Fn = MF->getFunction();
1782 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1783 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1784 }
1785
1786 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1787 IndexOperand &= ~0x3f;
1788 unsigned CountDw = 0;
1789
1790 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1791 CountDw = (IndexOperand >> 24) & 0xf;
1792 IndexOperand &= ~(0xf << 24);
1793
1794 if (CountDw < 1 || CountDw > 4) {
1795 const Function &Fn = MF->getFunction();
1796 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1797 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1798 CountDw = 1;
1799 }
1800 }
1801
1802 if (IndexOperand) {
1803 const Function &Fn = MF->getFunction();
1804 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1805 Fn, "ds_ordered_count: bad index operand", DL));
1806 }
1807
1808 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1809 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1810
1811 unsigned Offset0 = OrderedCountIndex << 2;
1812 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1813
1814 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1815 Offset1 |= (CountDw - 1) << 6;
1816
1817 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1818 Offset1 |= ShaderType << 2;
1819
1820 unsigned Offset = Offset0 | (Offset1 << 8);
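  // Illustrative example of the packing above (not from the original source):
  // for an amdgcn_ds_ordered_add (Instruction = 0) with OrderedCountIndex = 1,
  // WaveRelease = 1 and WaveDone = 0, Offset0 = 1 << 2 = 4 and Offset1 = 0x1
  // before the count/shader-type bits are OR'd in, giving
  // Offset = 4 | (0x1 << 8) = 0x104.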
1821
1822 Register M0Val = MI.getOperand(2).getReg();
1823 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1824 .addReg(M0Val);
1825
1826 Register DstReg = MI.getOperand(0).getReg();
1827 Register ValReg = MI.getOperand(3).getReg();
1828 MachineInstrBuilder DS =
1829 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1830 .addReg(ValReg)
1831 .addImm(Offset)
1832 .cloneMemRefs(MI);
1833
1834 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1835 return false;
1836
1837 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1838 MI.eraseFromParent();
1839 return Ret;
1840}
1841
1842static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1843 switch (IntrID) {
1844 case Intrinsic::amdgcn_ds_gws_init:
1845 return AMDGPU::DS_GWS_INIT;
1846 case Intrinsic::amdgcn_ds_gws_barrier:
1847 return AMDGPU::DS_GWS_BARRIER;
1848 case Intrinsic::amdgcn_ds_gws_sema_v:
1849 return AMDGPU::DS_GWS_SEMA_V;
1850 case Intrinsic::amdgcn_ds_gws_sema_br:
1851 return AMDGPU::DS_GWS_SEMA_BR;
1852 case Intrinsic::amdgcn_ds_gws_sema_p:
1853 return AMDGPU::DS_GWS_SEMA_P;
1854 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1855 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1856 default:
1857 llvm_unreachable("not a gws intrinsic");
1858 }
1859}
1860
1861bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1862 Intrinsic::ID IID) const {
1863 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1864 !STI.hasGWSSemaReleaseAll()))
1865 return false;
1866
1867 // intrinsic ID, vsrc, offset
1868 const bool HasVSrc = MI.getNumOperands() == 3;
1869 assert(HasVSrc || MI.getNumOperands() == 2);
1870
1871 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1872 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1873 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1874 return false;
1875
1876 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1877 unsigned ImmOffset;
1878
1879 MachineBasicBlock *MBB = MI.getParent();
1880 const DebugLoc &DL = MI.getDebugLoc();
1881
1882 MachineInstr *Readfirstlane = nullptr;
1883
1884 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1885 // incoming offset, in case there's an add of a constant. We'll have to put it
1886 // back later.
1887 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1888 Readfirstlane = OffsetDef;
1889 BaseOffset = OffsetDef->getOperand(1).getReg();
1890 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1891 }
1892
1893 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1894 // If we have a constant offset, try to use the 0 in m0 as the base.
1895 // TODO: Look into changing the default m0 initialization value. If the
1896 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1897 // the immediate offset.
1898
1899 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1900 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1901 .addImm(0);
1902 } else {
1903 std::tie(BaseOffset, ImmOffset) =
1904 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1905
1906 if (Readfirstlane) {
1907 // We have the constant offset now, so put the readfirstlane back on the
1908 // variable component.
1909 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1910 return false;
1911
1912 Readfirstlane->getOperand(1).setReg(BaseOffset);
1913 BaseOffset = Readfirstlane->getOperand(0).getReg();
1914 } else {
1915 if (!RBI.constrainGenericRegister(BaseOffset,
1916 AMDGPU::SReg_32RegClass, *MRI))
1917 return false;
1918 }
1919
1920 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1921 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1922 .addReg(BaseOffset)
1923 .addImm(16)
1924 .setOperandDead(3); // Dead scc
1925
1926 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1927 .addReg(M0Base);
1928 }
1929
1930 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1931 // offset field) % 64. Some versions of the programming guide omit the m0
1932 // part, or claim it's from offset 0.
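  // In other words (illustrative summary): when the offset has a variable
  // part, it is shifted into bits [21:16] of m0 by the S_LSHL_B32 above, any
  // constant part is folded into the instruction's offset field, and hardware
  // adds the two (plus the opaque base) modulo 64.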
1933 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1934
1935 if (HasVSrc) {
1936 Register VSrc = MI.getOperand(1).getReg();
1937 MIB.addReg(VSrc);
1938
1939 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1940 return false;
1941 }
1942
1943 MIB.addImm(ImmOffset)
1944 .cloneMemRefs(MI);
1945
1946 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1947
1948 MI.eraseFromParent();
1949 return true;
1950}
1951
1952bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1953 bool IsAppend) const {
1954 Register PtrBase = MI.getOperand(2).getReg();
1955 LLT PtrTy = MRI->getType(PtrBase);
1956 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1957
1958 unsigned Offset;
1959 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1960
1961 // TODO: Should this try to look through readfirstlane like GWS?
1962 if (!isDSOffsetLegal(PtrBase, Offset)) {
1963 PtrBase = MI.getOperand(2).getReg();
1964 Offset = 0;
1965 }
1966
1967 MachineBasicBlock *MBB = MI.getParent();
1968 const DebugLoc &DL = MI.getDebugLoc();
1969 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1970
1971 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1972 .addReg(PtrBase);
1973 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1974 return false;
1975
1976 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1977 .addImm(Offset)
1978 .addImm(IsGDS ? -1 : 0)
1979 .cloneMemRefs(MI);
1980 MI.eraseFromParent();
1981 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1982}
1983
1984bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
1985 MachineFunction *MF = MI.getParent()->getParent();
1986 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1987
1988 MFInfo->setInitWholeWave();
1989 return selectImpl(MI, *CoverageInfo);
1990}
1991
1992static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1993 bool &IsTexFail) {
1994 if (TexFailCtrl)
1995 IsTexFail = true;
1996
1997 TFE = TexFailCtrl & 0x1;
1998 TexFailCtrl &= ~(uint64_t)0x1;
1999 LWE = TexFailCtrl & 0x2;
2000 TexFailCtrl &= ~(uint64_t)0x2;
2001
2002 return TexFailCtrl == 0;
2003}
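// For example (illustrative), TexFailCtrl = 0x3 sets both TFE and LWE and
// marks IsTexFail; any bit outside the low two leaves a nonzero remainder, so
// parseTexFail returns false and the image intrinsic is rejected.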
2004
2005bool AMDGPUInstructionSelector::selectImageIntrinsic(
2006 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2007 MachineBasicBlock *MBB = MI.getParent();
2008 const DebugLoc &DL = MI.getDebugLoc();
2009
2010  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2011      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
2012
2013 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2014 unsigned IntrOpcode = Intr->BaseOpcode;
2015 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2016 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2017 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2018
2019 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2020
2021 Register VDataIn, VDataOut;
2022 LLT VDataTy;
2023 int NumVDataDwords = -1;
2024 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2025 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2026
2027 bool Unorm;
2028 if (!BaseOpcode->Sampler)
2029 Unorm = true;
2030 else
2031 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2032
2033 bool TFE;
2034 bool LWE;
2035 bool IsTexFail = false;
2036 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2037 TFE, LWE, IsTexFail))
2038 return false;
2039
2040 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2041 const bool IsA16 = (Flags & 1) != 0;
2042 const bool IsG16 = (Flags & 2) != 0;
2043
2044 // A16 implies 16 bit gradients if subtarget doesn't support G16
2045 if (IsA16 && !STI.hasG16() && !IsG16)
2046 return false;
2047
2048 unsigned DMask = 0;
2049 unsigned DMaskLanes = 0;
2050
2051 if (BaseOpcode->Atomic) {
2052 VDataOut = MI.getOperand(0).getReg();
2053 VDataIn = MI.getOperand(2).getReg();
2054 LLT Ty = MRI->getType(VDataIn);
2055
2056 // Be careful to allow atomic swap on 16-bit element vectors.
2057 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2058 Ty.getSizeInBits() == 128 :
2059 Ty.getSizeInBits() == 64;
2060
2061 if (BaseOpcode->AtomicX2) {
2062 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2063
2064 DMask = Is64Bit ? 0xf : 0x3;
2065 NumVDataDwords = Is64Bit ? 4 : 2;
2066 } else {
2067 DMask = Is64Bit ? 0x3 : 0x1;
2068 NumVDataDwords = Is64Bit ? 2 : 1;
2069 }
2070 } else {
2071 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2072 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2073
2074 if (BaseOpcode->Store) {
2075 VDataIn = MI.getOperand(1).getReg();
2076 VDataTy = MRI->getType(VDataIn);
2077 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2078 } else if (BaseOpcode->NoReturn) {
2079 NumVDataDwords = 0;
2080 } else {
2081 VDataOut = MI.getOperand(0).getReg();
2082 VDataTy = MRI->getType(VDataOut);
2083 NumVDataDwords = DMaskLanes;
2084
2085 if (IsD16 && !STI.hasUnpackedD16VMem())
2086 NumVDataDwords = (DMaskLanes + 1) / 2;
2087 }
2088 }
2089
2090 // Set G16 opcode
2091 if (Subtarget->hasG16() && IsG16) {
2092    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2093        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2094    assert(G16MappingInfo);
2095 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2096 }
2097
2098 // TODO: Check this in verifier.
2099 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2100
2101 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2102 if (BaseOpcode->Atomic)
2103 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
2104  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2105               AMDGPU::CPol::VOLATILE))
2106    return false;
2107
2108 int NumVAddrRegs = 0;
2109 int NumVAddrDwords = 0;
2110 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2111 // Skip the $noregs and 0s inserted during legalization.
2112 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2113 if (!AddrOp.isReg())
2114 continue; // XXX - Break?
2115
2116 Register Addr = AddrOp.getReg();
2117 if (!Addr)
2118 break;
2119
2120 ++NumVAddrRegs;
2121 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2122 }
2123
2124 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2125 // NSA, these should have been packed into a single value in the first
2126 // address register
2127 const bool UseNSA =
2128 NumVAddrRegs != 1 &&
2129 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2130 : NumVAddrDwords == NumVAddrRegs);
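  // E.g. (illustrative): three separate 32-bit address registers give
  // NumVAddrRegs == NumVAddrDwords == 3 and pick an NSA encoding, while a
  // single packed 96-bit address register gives NumVAddrRegs == 1 and keeps
  // the contiguous form.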
2131 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2132 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2133 return false;
2134 }
2135
2136 if (IsTexFail)
2137 ++NumVDataDwords;
2138
2139 int Opcode = -1;
2140 if (IsGFX12Plus) {
2141 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2142 NumVDataDwords, NumVAddrDwords);
2143 } else if (IsGFX11Plus) {
2144 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2145 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2146 : AMDGPU::MIMGEncGfx11Default,
2147 NumVDataDwords, NumVAddrDwords);
2148 } else if (IsGFX10Plus) {
2149 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2150 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2151 : AMDGPU::MIMGEncGfx10Default,
2152 NumVDataDwords, NumVAddrDwords);
2153 } else {
2154 if (Subtarget->hasGFX90AInsts()) {
2155 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2156 NumVDataDwords, NumVAddrDwords);
2157 if (Opcode == -1) {
2158 LLVM_DEBUG(
2159 dbgs()
2160 << "requested image instruction is not supported on this GPU\n");
2161 return false;
2162 }
2163 }
2164 if (Opcode == -1 &&
2165 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2166 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2167 NumVDataDwords, NumVAddrDwords);
2168 if (Opcode == -1)
2169 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2170 NumVDataDwords, NumVAddrDwords);
2171 }
2172 if (Opcode == -1)
2173 return false;
2174
2175 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2176 .cloneMemRefs(MI);
2177
2178 if (VDataOut) {
2179 if (BaseOpcode->AtomicX2) {
2180 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2181
2182 Register TmpReg = MRI->createVirtualRegister(
2183 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2184 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2185
2186 MIB.addDef(TmpReg);
2187 if (!MRI->use_empty(VDataOut)) {
2188 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2189 .addReg(TmpReg, RegState::Kill, SubReg);
2190 }
2191
2192 } else {
2193 MIB.addDef(VDataOut); // vdata output
2194 }
2195 }
2196
2197 if (VDataIn)
2198 MIB.addReg(VDataIn); // vdata input
2199
2200 for (int I = 0; I != NumVAddrRegs; ++I) {
2201 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2202 if (SrcOp.isReg()) {
2203 assert(SrcOp.getReg() != 0);
2204 MIB.addReg(SrcOp.getReg());
2205 }
2206 }
2207
2208 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2209 if (BaseOpcode->Sampler)
2210 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2211
2212 MIB.addImm(DMask); // dmask
2213
2214 if (IsGFX10Plus)
2215 MIB.addImm(DimInfo->Encoding);
2216 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2217 MIB.addImm(Unorm);
2218
2219 MIB.addImm(CPol);
2220 MIB.addImm(IsA16 && // a16 or r128
2221 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2222 if (IsGFX10Plus)
2223 MIB.addImm(IsA16 ? -1 : 0);
2224
2225 if (!Subtarget->hasGFX90AInsts()) {
2226 MIB.addImm(TFE); // tfe
2227 } else if (TFE) {
2228 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2229 return false;
2230 }
2231
2232 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2233 MIB.addImm(LWE); // lwe
2234 if (!IsGFX10Plus)
2235 MIB.addImm(DimInfo->DA ? -1 : 0);
2236 if (BaseOpcode->HasD16)
2237 MIB.addImm(IsD16 ? -1 : 0);
2238
2239 MI.eraseFromParent();
2240 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2241 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2242 return true;
2243}
2244
2245// We need to handle this here because tablegen doesn't support matching
2246// instructions with multiple outputs.
2247bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2248 MachineInstr &MI) const {
2249 Register Dst0 = MI.getOperand(0).getReg();
2250 Register Dst1 = MI.getOperand(1).getReg();
2251
2252 const DebugLoc &DL = MI.getDebugLoc();
2253 MachineBasicBlock *MBB = MI.getParent();
2254
2255 Register Addr = MI.getOperand(3).getReg();
2256 Register Data0 = MI.getOperand(4).getReg();
2257 Register Data1 = MI.getOperand(5).getReg();
2258 unsigned Offset = MI.getOperand(6).getImm();
2259
2260 unsigned Opc;
2261 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2262 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2263 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2264 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2265 break;
2266 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2267 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2268 break;
2269 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2270 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2271 break;
2272 }
2273
2274 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2275 .addDef(Dst1)
2276 .addUse(Addr)
2277 .addUse(Data0)
2278 .addUse(Data1)
2279 .addImm(Offset)
2280 .cloneMemRefs(MI);
2281
2282 MI.eraseFromParent();
2283 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2284}
2285
2286bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2287 MachineInstr &I) const {
2288 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2289 switch (IntrinsicID) {
2290 case Intrinsic::amdgcn_end_cf:
2291 return selectEndCfIntrinsic(I);
2292 case Intrinsic::amdgcn_ds_ordered_add:
2293 case Intrinsic::amdgcn_ds_ordered_swap:
2294 return selectDSOrderedIntrinsic(I, IntrinsicID);
2295 case Intrinsic::amdgcn_ds_gws_init:
2296 case Intrinsic::amdgcn_ds_gws_barrier:
2297 case Intrinsic::amdgcn_ds_gws_sema_v:
2298 case Intrinsic::amdgcn_ds_gws_sema_br:
2299 case Intrinsic::amdgcn_ds_gws_sema_p:
2300 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2301 return selectDSGWSIntrinsic(I, IntrinsicID);
2302 case Intrinsic::amdgcn_ds_append:
2303 return selectDSAppendConsume(I, true);
2304 case Intrinsic::amdgcn_ds_consume:
2305 return selectDSAppendConsume(I, false);
2306 case Intrinsic::amdgcn_init_whole_wave:
2307 return selectInitWholeWave(I);
2308 case Intrinsic::amdgcn_raw_buffer_load_lds:
2309 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2310 case Intrinsic::amdgcn_struct_buffer_load_lds:
2311 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2312 return selectBufferLoadLds(I);
2313 // Until we can store both the address space of the global and the LDS
2314  // arguments by having two MachineMemOperands on an intrinsic, we just trust
2315  // that the argument is a global pointer (buffer pointers have been handled
2316  // by an LLVM IR-level lowering).
2317 case Intrinsic::amdgcn_load_to_lds:
2318 case Intrinsic::amdgcn_global_load_lds:
2319 return selectGlobalLoadLds(I);
2320 case Intrinsic::amdgcn_exp_compr:
2321 if (!STI.hasCompressedExport()) {
2322 Function &F = I.getMF()->getFunction();
2323 F.getContext().diagnose(
2324 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2325 I.getDebugLoc(), DS_Error));
2326 return false;
2327 }
2328 break;
2329 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2330 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2331 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2332 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2333 return selectDSBvhStackIntrinsic(I);
2334 case Intrinsic::amdgcn_s_barrier_init:
2335 case Intrinsic::amdgcn_s_barrier_signal_var:
2336 return selectNamedBarrierInit(I, IntrinsicID);
2337 case Intrinsic::amdgcn_s_barrier_join:
2338 case Intrinsic::amdgcn_s_get_named_barrier_state:
2339 return selectNamedBarrierInst(I, IntrinsicID);
2340 case Intrinsic::amdgcn_s_get_barrier_state:
2341 return selectSGetBarrierState(I, IntrinsicID);
2342 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2343 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2344 }
2345 return selectImpl(I, *CoverageInfo);
2346}
2347
2348bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2349 if (selectImpl(I, *CoverageInfo))
2350 return true;
2351
2352 MachineBasicBlock *BB = I.getParent();
2353 const DebugLoc &DL = I.getDebugLoc();
2354
2355 Register DstReg = I.getOperand(0).getReg();
2356 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2357 assert(Size <= 32 || Size == 64);
2358 const MachineOperand &CCOp = I.getOperand(1);
2359 Register CCReg = CCOp.getReg();
2360 if (!isVCC(CCReg, *MRI)) {
2361 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2362 AMDGPU::S_CSELECT_B32;
2363 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2364 .addReg(CCReg);
2365
2366    // The generic constrainSelectedInstRegOperands doesn't work for the scc
2367    // register bank, because it does not cover the register class we use to
2368    // represent it, so the register class has to be set manually here.
2369 if (!MRI->getRegClassOrNull(CCReg))
2370 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2371 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2372 .add(I.getOperand(2))
2373 .add(I.getOperand(3));
2374
2375 bool Ret = false;
2376 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2377 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2378 I.eraseFromParent();
2379 return Ret;
2380 }
2381
2382 // Wide VGPR select should have been split in RegBankSelect.
2383 if (Size > 32)
2384 return false;
2385
2386 MachineInstr *Select =
2387 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2388 .addImm(0)
2389 .add(I.getOperand(3))
2390 .addImm(0)
2391 .add(I.getOperand(2))
2392 .add(I.getOperand(1));
2393
2394 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2395 I.eraseFromParent();
2396 return Ret;
2397}
2398
2399bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2400 Register DstReg = I.getOperand(0).getReg();
2401 Register SrcReg = I.getOperand(1).getReg();
2402 const LLT DstTy = MRI->getType(DstReg);
2403 const LLT SrcTy = MRI->getType(SrcReg);
2404 const LLT S1 = LLT::scalar(1);
2405
2406 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2407 const RegisterBank *DstRB;
2408 if (DstTy == S1) {
2409 // This is a special case. We don't treat s1 for legalization artifacts as
2410 // vcc booleans.
2411 DstRB = SrcRB;
2412 } else {
2413 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2414 if (SrcRB != DstRB)
2415 return false;
2416 }
2417
2418 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2419
2420 unsigned DstSize = DstTy.getSizeInBits();
2421 unsigned SrcSize = SrcTy.getSizeInBits();
2422
2423 const TargetRegisterClass *SrcRC =
2424 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2425 const TargetRegisterClass *DstRC =
2426 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2427 if (!SrcRC || !DstRC)
2428 return false;
2429
2430 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2431 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2432 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2433 return false;
2434 }
2435
2436 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2437 assert(STI.useRealTrue16Insts());
2438 const DebugLoc &DL = I.getDebugLoc();
2439 MachineBasicBlock *MBB = I.getParent();
2440 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2441 .addReg(SrcReg, 0, AMDGPU::lo16);
2442 I.eraseFromParent();
2443 return true;
2444 }
2445
2446 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2447 MachineBasicBlock *MBB = I.getParent();
2448 const DebugLoc &DL = I.getDebugLoc();
2449
2450 Register LoReg = MRI->createVirtualRegister(DstRC);
2451 Register HiReg = MRI->createVirtualRegister(DstRC);
2452 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2453 .addReg(SrcReg, 0, AMDGPU::sub0);
2454 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2455 .addReg(SrcReg, 0, AMDGPU::sub1);
2456
2457 if (IsVALU && STI.hasSDWA()) {
2458 // Write the low 16-bits of the high element into the high 16-bits of the
2459 // low element.
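      // Illustrative reading of the SDWA operands below: dst_sel WORD_1 writes
      // only the high 16 bits of DstReg, src0_sel WORD_0 reads the low 16 bits
      // of HiReg, and UNUSED_PRESERVE keeps the low half coming from the tied
      // LoReg operand.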
2460 MachineInstr *MovSDWA =
2461 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2462 .addImm(0) // $src0_modifiers
2463 .addReg(HiReg) // $src0
2464 .addImm(0) // $clamp
2465 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2466 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2467 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2468 .addReg(LoReg, RegState::Implicit);
2469 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2470 } else {
2471 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2472 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2473 Register ImmReg = MRI->createVirtualRegister(DstRC);
2474 if (IsVALU) {
2475 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2476 .addImm(16)
2477 .addReg(HiReg);
2478 } else {
2479 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2480 .addReg(HiReg)
2481 .addImm(16)
2482 .setOperandDead(3); // Dead scc
2483 }
2484
2485 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2486 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2487 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2488
2489 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2490 .addImm(0xffff);
2491 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2492 .addReg(LoReg)
2493 .addReg(ImmReg);
2494 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2495 .addReg(TmpReg0)
2496 .addReg(TmpReg1);
2497
2498 if (!IsVALU) {
2499 And.setOperandDead(3); // Dead scc
2500 Or.setOperandDead(3); // Dead scc
2501 }
2502 }
2503
2504 I.eraseFromParent();
2505 return true;
2506 }
2507
2508 if (!DstTy.isScalar())
2509 return false;
2510
2511 if (SrcSize > 32) {
2512 unsigned SubRegIdx = DstSize < 32
2513 ? static_cast<unsigned>(AMDGPU::sub0)
2514 : TRI.getSubRegFromChannel(0, DstSize / 32);
2515 if (SubRegIdx == AMDGPU::NoSubRegister)
2516 return false;
2517
2518 // Deal with weird cases where the class only partially supports the subreg
2519 // index.
2520 const TargetRegisterClass *SrcWithSubRC
2521 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2522 if (!SrcWithSubRC)
2523 return false;
2524
2525 if (SrcWithSubRC != SrcRC) {
2526 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2527 return false;
2528 }
2529
2530 I.getOperand(1).setSubReg(SubRegIdx);
2531 }
2532
2533 I.setDesc(TII.get(TargetOpcode::COPY));
2534 return true;
2535}
2536
2537/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2538static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2539  Mask = maskTrailingOnes<unsigned>(Size);
2540  int SignedMask = static_cast<int>(Mask);
2541 return SignedMask >= -16 && SignedMask <= 64;
2542}
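// For example (illustrative), Size == 4 gives Mask = 0xf, which fits in an
// inline immediate so the cheaper AND form is preferred, while Size == 16
// gives Mask = 0xffff, which would need a literal, so the BFE form is used
// instead.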
2543
2544// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2545 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2546    Register Reg, const MachineRegisterInfo &MRI,
2547    const TargetRegisterInfo &TRI) const {
2548 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2549 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2550 return RB;
2551
2552 // Ignore the type, since we don't use vcc in artifacts.
2553 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2554 return &RBI.getRegBankFromRegClass(*RC, LLT());
2555 return nullptr;
2556}
2557
2558bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2559 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2560 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2561 const DebugLoc &DL = I.getDebugLoc();
2562 MachineBasicBlock &MBB = *I.getParent();
2563 const Register DstReg = I.getOperand(0).getReg();
2564 const Register SrcReg = I.getOperand(1).getReg();
2565
2566 const LLT DstTy = MRI->getType(DstReg);
2567 const LLT SrcTy = MRI->getType(SrcReg);
2568 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2569 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2570 const unsigned DstSize = DstTy.getSizeInBits();
2571 if (!DstTy.isScalar())
2572 return false;
2573
2574 // Artifact casts should never use vcc.
2575 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2576
2577 // FIXME: This should probably be illegal and split earlier.
2578 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2579 if (DstSize <= 32)
2580 return selectCOPY(I);
2581
2582 const TargetRegisterClass *SrcRC =
2583 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2584 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2585 const TargetRegisterClass *DstRC =
2586 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2587
2588 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2589 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2590 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2591 .addReg(SrcReg)
2592 .addImm(AMDGPU::sub0)
2593 .addReg(UndefReg)
2594 .addImm(AMDGPU::sub1);
2595 I.eraseFromParent();
2596
2597 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2598 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2599 }
2600
2601 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2602 // 64-bit should have been split up in RegBankSelect
2603
2604 // Try to use an and with a mask if it will save code size.
2605 unsigned Mask;
2606 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2607 MachineInstr *ExtI =
2608 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2609 .addImm(Mask)
2610 .addReg(SrcReg);
2611 I.eraseFromParent();
2612 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2613 }
2614
2615 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2616 MachineInstr *ExtI =
2617 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2618 .addReg(SrcReg)
2619 .addImm(0) // Offset
2620 .addImm(SrcSize); // Width
2621 I.eraseFromParent();
2622 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2623 }
2624
2625 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2626 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2627 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2628 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2629 return false;
2630
2631 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2632 const unsigned SextOpc = SrcSize == 8 ?
2633 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2634 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2635 .addReg(SrcReg);
2636 I.eraseFromParent();
2637 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2638 }
2639
2640 // Using a single 32-bit SALU to calculate the high half is smaller than
2641 // S_BFE with a literal constant operand.
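    // E.g. (illustrative): a 32-to-64-bit sext becomes S_ASHR_I32 hi, src, 31
    // plus a REG_SEQUENCE, instead of an S_BFE_I64 that would need a 32-bit
    // literal for its combined width/offset operand.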
2642 if (DstSize > 32 && SrcSize == 32) {
2643 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2644 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2645 if (Signed) {
2646 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2647 .addReg(SrcReg, 0, SubReg)
2648 .addImm(31)
2649 .setOperandDead(3); // Dead scc
2650 } else {
2651 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2652 .addImm(0);
2653 }
2654 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2655 .addReg(SrcReg, 0, SubReg)
2656 .addImm(AMDGPU::sub0)
2657 .addReg(HiReg)
2658 .addImm(AMDGPU::sub1);
2659 I.eraseFromParent();
2660 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2661 *MRI);
2662 }
2663
2664 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2665 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2666
2667 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
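    // For example (illustrative), extending from 8 bits passes 8 << 16 =
    // 0x80000 as the S1 immediate: offset 0 in bits [5:0], width 8 in bits
    // [22:16].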
2668 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2669 // We need a 64-bit register source, but the high bits don't matter.
2670 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2671 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2672 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2673
2674 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2675 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2676 .addReg(SrcReg, 0, SubReg)
2677 .addImm(AMDGPU::sub0)
2678 .addReg(UndefReg)
2679 .addImm(AMDGPU::sub1);
2680
2681 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2682 .addReg(ExtReg)
2683 .addImm(SrcSize << 16);
2684
2685 I.eraseFromParent();
2686 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2687 }
2688
2689 unsigned Mask;
2690 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2691 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2692 .addReg(SrcReg)
2693 .addImm(Mask)
2694 .setOperandDead(3); // Dead scc
2695 } else {
2696 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2697 .addReg(SrcReg)
2698 .addImm(SrcSize << 16);
2699 }
2700
2701 I.eraseFromParent();
2702 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2703 }
2704
2705 return false;
2706}
2707
2708static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2709  return getDefIgnoringCopies(Reg, MRI)->getOperand(0).getReg();
2710}
2711
2712static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2713  Register BitcastSrc;
2714 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2715 Reg = BitcastSrc;
2716 return Reg;
2717}
2718
2719static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2720                           Register &Out) {
2721 Register Trunc;
2722 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2723 return false;
2724
2725 Register LShlSrc;
2726 Register Cst;
2727 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2728 Cst = stripCopy(Cst, MRI);
2729 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2730 Out = stripBitCast(LShlSrc, MRI);
2731 return true;
2732 }
2733 }
2734
2735 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2736 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2737 return false;
2738
2739 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2740 LLT::fixed_vector(2, 16));
2741
2742 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2743 assert(Mask.size() == 2);
2744
2745 if (Mask[0] == 1 && Mask[1] <= 1) {
2746 Out = Shuffle->getOperand(0).getReg();
2747 return true;
2748 }
2749
2750 return false;
2751}
2752
2753bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2754 if (!Subtarget->hasSALUFloatInsts())
2755 return false;
2756
2757 Register Dst = I.getOperand(0).getReg();
2758 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2759 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2760 return false;
2761
2762 Register Src = I.getOperand(1).getReg();
2763
2764 if (MRI->getType(Dst) == LLT::scalar(32) &&
2765 MRI->getType(Src) == LLT::scalar(16)) {
2766 if (isExtractHiElt(*MRI, Src, Src)) {
2767 MachineBasicBlock *BB = I.getParent();
2768 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2769 .addUse(Src);
2770 I.eraseFromParent();
2771 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2772 }
2773 }
2774
2775 return false;
2776}
2777
2778bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2779 // Only manually handle the f64 SGPR case.
2780 //
2781 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2782 // the bit ops theoretically have a second result due to the implicit def of
2783 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2784 // that is easy by disabling the check. The result works, but uses a
2785 // nonsensical sreg32orlds_and_sreg_1 regclass.
2786 //
2787 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2788 // the variadic REG_SEQUENCE operands.
2789
2790 Register Dst = MI.getOperand(0).getReg();
2791 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2792 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2793 MRI->getType(Dst) != LLT::scalar(64))
2794 return false;
2795
2796 Register Src = MI.getOperand(1).getReg();
2797 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2798 if (Fabs)
2799 Src = Fabs->getOperand(1).getReg();
2800
2801 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2802 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2803 return false;
2804
2805 MachineBasicBlock *BB = MI.getParent();
2806 const DebugLoc &DL = MI.getDebugLoc();
2807 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2808 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2809 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2810 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2811
2812 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2813 .addReg(Src, 0, AMDGPU::sub0);
2814 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2815 .addReg(Src, 0, AMDGPU::sub1);
2816 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2817 .addImm(0x80000000);
2818
2819 // Set or toggle sign bit.
2820 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2821 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2822 .addReg(HiReg)
2823 .addReg(ConstReg)
2824 .setOperandDead(3); // Dead scc
2825 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2826 .addReg(LoReg)
2827 .addImm(AMDGPU::sub0)
2828 .addReg(OpReg)
2829 .addImm(AMDGPU::sub1);
2830 MI.eraseFromParent();
2831 return true;
2832}
2833
2834// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2835bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2836 Register Dst = MI.getOperand(0).getReg();
2837 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2838 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2839 MRI->getType(Dst) != LLT::scalar(64))
2840 return false;
2841
2842 Register Src = MI.getOperand(1).getReg();
2843 MachineBasicBlock *BB = MI.getParent();
2844 const DebugLoc &DL = MI.getDebugLoc();
2845 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2846 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2847 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2848 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2849
2850 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2851 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2852 return false;
2853
2854 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2855 .addReg(Src, 0, AMDGPU::sub0);
2856 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2857 .addReg(Src, 0, AMDGPU::sub1);
2858 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2859 .addImm(0x7fffffff);
2860
2861 // Clear sign bit.
2862  // TODO: Should this use S_BITSET0_*?
2863 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2864 .addReg(HiReg)
2865 .addReg(ConstReg)
2866 .setOperandDead(3); // Dead scc
2867 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2868 .addReg(LoReg)
2869 .addImm(AMDGPU::sub0)
2870 .addReg(OpReg)
2871 .addImm(AMDGPU::sub1);
2872
2873 MI.eraseFromParent();
2874 return true;
2875}
2876
2877static bool isConstant(const MachineInstr &MI) {
2878 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2879}
2880
2881void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2882 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2883
2884 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2885 const MachineInstr *PtrMI =
2886 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2887
2888 assert(PtrMI);
2889
2890 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2891 return;
2892
2893 GEPInfo GEPInfo;
2894
2895 for (unsigned i = 1; i != 3; ++i) {
2896 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2897 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2898 assert(OpDef);
2899 if (i == 2 && isConstant(*OpDef)) {
2900 // TODO: Could handle constant base + variable offset, but a combine
2901 // probably should have commuted it.
2902 assert(GEPInfo.Imm == 0);
2903 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2904 continue;
2905 }
2906 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2907 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2908 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2909 else
2910 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2911 }
2912
2913 AddrInfo.push_back(GEPInfo);
2914 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2915}
2916
2917bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2918 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2919}
2920
2921bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2922 if (!MI.hasOneMemOperand())
2923 return false;
2924
2925 const MachineMemOperand *MMO = *MI.memoperands_begin();
2926 const Value *Ptr = MMO->getValue();
2927
2928 // UndefValue means this is a load of a kernel input. These are uniform.
2929 // Sometimes LDS instructions have constant pointers.
2930 // If Ptr is null, then that means this mem operand contains a
2931 // PseudoSourceValue like GOT.
2932  if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Ptr))
2933    return true;
2934
2935  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2936    return true;
2937
2938 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2939 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2940 AMDGPU::SGPRRegBankID;
2941
2943 return I && I->getMetadata("amdgpu.uniform");
2944}
2945
2946bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2947 for (const GEPInfo &GEPInfo : AddrInfo) {
2948 if (!GEPInfo.VgprParts.empty())
2949 return true;
2950 }
2951 return false;
2952}
2953
2954void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2955 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2956  unsigned AS = PtrTy.getAddressSpace();
2957  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2958      STI.ldsRequiresM0Init()) {
2959 MachineBasicBlock *BB = I.getParent();
2960
2961 // If DS instructions require M0 initialization, insert it before selecting.
2962 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2963 .addImm(-1);
2964 }
2965}
2966
2967bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2968 MachineInstr &I) const {
2969 initM0(I);
2970 return selectImpl(I, *CoverageInfo);
2971}
2972
2974 if (Reg.isPhysical())
2975 return false;
2976
2977 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2978 const unsigned Opcode = MI.getOpcode();
2979
2980 if (Opcode == AMDGPU::COPY)
2981 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2982
2983 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2984 Opcode == AMDGPU::G_XOR)
2985 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2986 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2987
2988 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2989 return GI->is(Intrinsic::amdgcn_class);
2990
2991 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2992}
2993
2994bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2995 MachineBasicBlock *BB = I.getParent();
2996 MachineOperand &CondOp = I.getOperand(0);
2997 Register CondReg = CondOp.getReg();
2998 const DebugLoc &DL = I.getDebugLoc();
2999
3000 unsigned BrOpcode;
3001 Register CondPhysReg;
3002 const TargetRegisterClass *ConstrainRC;
3003
3004 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3005 // whether the branch is uniform when selecting the instruction. In
3006 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3007 // RegBankSelect knows what it's doing if the branch condition is scc, even
3008 // though it currently does not.
3009 if (!isVCC(CondReg, *MRI)) {
3010 if (MRI->getType(CondReg) != LLT::scalar(32))
3011 return false;
3012
3013 CondPhysReg = AMDGPU::SCC;
3014 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3015 ConstrainRC = &AMDGPU::SReg_32RegClass;
3016 } else {
3017    // FIXME: Should scc->vcc copies be ANDed with exec?
3018
3019 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3020 // need to insert an and with exec.
3021 if (!isVCmpResult(CondReg, *MRI)) {
3022 const bool Is64 = STI.isWave64();
3023 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3024 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3025
3026 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3027 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3028 .addReg(CondReg)
3029 .addReg(Exec)
3030 .setOperandDead(3); // Dead scc
3031 CondReg = TmpReg;
3032 }
3033
3034 CondPhysReg = TRI.getVCC();
3035 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3036 ConstrainRC = TRI.getBoolRC();
3037 }
3038
3039 if (!MRI->getRegClassOrNull(CondReg))
3040 MRI->setRegClass(CondReg, ConstrainRC);
3041
3042 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3043 .addReg(CondReg);
3044 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3045 .addMBB(I.getOperand(1).getMBB());
3046
3047 I.eraseFromParent();
3048 return true;
3049}
3050
3051bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3052 MachineInstr &I) const {
3053 Register DstReg = I.getOperand(0).getReg();
3054 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3055 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3056 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3057 if (IsVGPR)
3058 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3059
3060 return RBI.constrainGenericRegister(
3061 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3062}
3063
3064bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3065 Register DstReg = I.getOperand(0).getReg();
3066 Register SrcReg = I.getOperand(1).getReg();
3067 Register MaskReg = I.getOperand(2).getReg();
3068 LLT Ty = MRI->getType(DstReg);
3069 LLT MaskTy = MRI->getType(MaskReg);
3070 MachineBasicBlock *BB = I.getParent();
3071 const DebugLoc &DL = I.getDebugLoc();
3072
3073 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3074 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3075 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3076 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3077 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3078 return false;
3079
3080 // Try to avoid emitting a bit operation when we only need to touch half of
3081 // the 64-bit pointer.
3082 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3083 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3084 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3085
3086 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3087 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
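  // Illustrative case: masking a 64-bit pointer with 0xffffffff'fffff000 to
  // align it leaves every high-half bit set, so CanCopyHi32 is true and only
  // the low 32 bits need a real AND; the high half is just copied.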
3088
3089 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3090 !CanCopyLow32 && !CanCopyHi32) {
3091 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3092 .addReg(SrcReg)
3093 .addReg(MaskReg)
3094 .setOperandDead(3); // Dead scc
3095 I.eraseFromParent();
3096 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3097 }
3098
3099 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3100 const TargetRegisterClass &RegRC
3101 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3102
3103 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3104 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3105 const TargetRegisterClass *MaskRC =
3106 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3107
3108 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3109 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3110 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3111 return false;
3112
3113 if (Ty.getSizeInBits() == 32) {
3114 assert(MaskTy.getSizeInBits() == 32 &&
3115 "ptrmask should have been narrowed during legalize");
3116
3117 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3118 .addReg(SrcReg)
3119 .addReg(MaskReg);
3120
3121 if (!IsVGPR)
3122 NewOp.setOperandDead(3); // Dead scc
3123 I.eraseFromParent();
3124 return true;
3125 }
3126
3127 Register HiReg = MRI->createVirtualRegister(&RegRC);
3128 Register LoReg = MRI->createVirtualRegister(&RegRC);
3129
3130 // Extract the subregisters from the source pointer.
3131 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3132 .addReg(SrcReg, 0, AMDGPU::sub0);
3133 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3134 .addReg(SrcReg, 0, AMDGPU::sub1);
3135
3136 Register MaskedLo, MaskedHi;
3137
3138 if (CanCopyLow32) {
3139 // If all the bits in the low half are 1, we only need a copy for it.
3140 MaskedLo = LoReg;
3141 } else {
3142 // Extract the mask subregister and apply the and.
3143 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3144 MaskedLo = MRI->createVirtualRegister(&RegRC);
3145
3146 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3147 .addReg(MaskReg, 0, AMDGPU::sub0);
3148 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3149 .addReg(LoReg)
3150 .addReg(MaskLo);
3151 }
3152
3153 if (CanCopyHi32) {
3154 // If all the bits in the high half are 1, we only need a copy for it.
3155 MaskedHi = HiReg;
3156 } else {
3157 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3158 MaskedHi = MRI->createVirtualRegister(&RegRC);
3159
3160 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3161 .addReg(MaskReg, 0, AMDGPU::sub1);
3162 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3163 .addReg(HiReg)
3164 .addReg(MaskHi);
3165 }
3166
3167 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3168 .addReg(MaskedLo)
3169 .addImm(AMDGPU::sub0)
3170 .addReg(MaskedHi)
3171 .addImm(AMDGPU::sub1);
3172 I.eraseFromParent();
3173 return true;
3174}
3175
3176/// Return the register to use for the index value, and the subregister to use
3177/// for the indirectly accessed register.
3178static std::pair<Register, unsigned>
3179computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3180                        const TargetRegisterClass *SuperRC, Register IdxReg,
3181 unsigned EltSize, GISelValueTracking &ValueTracking) {
3182 Register IdxBaseReg;
3183 int Offset;
3184
3185 std::tie(IdxBaseReg, Offset) =
3186 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3187 if (IdxBaseReg == AMDGPU::NoRegister) {
3188 // This will happen if the index is a known constant. This should ordinarily
3189 // be legalized out, but handle it as a register just in case.
3190 assert(Offset == 0);
3191 IdxBaseReg = IdxReg;
3192 }
3193
3194 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3195
3196 // Skip out of bounds offsets, or else we would end up using an undefined
3197 // register.
3198 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3199 return std::pair(IdxReg, SubRegs[0]);
3200 return std::pair(IdxBaseReg, SubRegs[Offset]);
3201}
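// For example (illustrative), extracting a 32-bit element from a v4i32 with
// an index of base + 2 folds the constant part and returns {base, sub2}, so
// only the variable part of the index ends up in M0 (or the GPR index mode).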
3202
3203bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3204 MachineInstr &MI) const {
3205 Register DstReg = MI.getOperand(0).getReg();
3206 Register SrcReg = MI.getOperand(1).getReg();
3207 Register IdxReg = MI.getOperand(2).getReg();
3208
3209 LLT DstTy = MRI->getType(DstReg);
3210 LLT SrcTy = MRI->getType(SrcReg);
3211
3212 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3213 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3214 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3215
3216   // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3217   // into a waterfall loop.
3218 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3219 return false;
3220
3221 const TargetRegisterClass *SrcRC =
3222 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3223 const TargetRegisterClass *DstRC =
3224 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3225 if (!SrcRC || !DstRC)
3226 return false;
3227 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3228 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3229 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3230 return false;
3231
3232 MachineBasicBlock *BB = MI.getParent();
3233 const DebugLoc &DL = MI.getDebugLoc();
3234 const bool Is64 = DstTy.getSizeInBits() == 64;
3235
3236 unsigned SubReg;
3237 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3238 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3239
3240 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3241 if (DstTy.getSizeInBits() != 32 && !Is64)
3242 return false;
3243
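    // Note (editor): S_MOVRELS_B32/B64 read the dynamic index from M0; the
    // implicit use of SrcReg below keeps the whole source vector alive across
    // the subregister read.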
3244 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3245 .addReg(IdxReg);
3246
3247 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3248 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3249 .addReg(SrcReg, 0, SubReg)
3250 .addReg(SrcReg, RegState::Implicit);
3251 MI.eraseFromParent();
3252 return true;
3253 }
3254
3255 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3256 return false;
3257
3258 if (!STI.useVGPRIndexMode()) {
3259 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3260 .addReg(IdxReg);
3261 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3262 .addReg(SrcReg, 0, SubReg)
3263 .addReg(SrcReg, RegState::Implicit);
3264 MI.eraseFromParent();
3265 return true;
3266 }
3267
3268 const MCInstrDesc &GPRIDXDesc =
3269 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3270 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3271 .addReg(SrcReg)
3272 .addReg(IdxReg)
3273 .addImm(SubReg);
3274
3275 MI.eraseFromParent();
3276 return true;
3277}
3278
3279// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3280bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3281 MachineInstr &MI) const {
3282 Register DstReg = MI.getOperand(0).getReg();
3283 Register VecReg = MI.getOperand(1).getReg();
3284 Register ValReg = MI.getOperand(2).getReg();
3285 Register IdxReg = MI.getOperand(3).getReg();
3286
3287 LLT VecTy = MRI->getType(DstReg);
3288 LLT ValTy = MRI->getType(ValReg);
3289 unsigned VecSize = VecTy.getSizeInBits();
3290 unsigned ValSize = ValTy.getSizeInBits();
3291
3292 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3293 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3294 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3295
3296 assert(VecTy.getElementType() == ValTy);
3297
3298   // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3299   // into a waterfall loop.
3300 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3301 return false;
3302
3303 const TargetRegisterClass *VecRC =
3304 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3305 const TargetRegisterClass *ValRC =
3306 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3307
3308 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3309 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3310 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3311 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3312 return false;
3313
3314 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3315 return false;
3316
3317 unsigned SubReg;
3318 std::tie(IdxReg, SubReg) =
3319 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3320
3321 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3322 STI.useVGPRIndexMode();
3323
3324 MachineBasicBlock *BB = MI.getParent();
3325 const DebugLoc &DL = MI.getDebugLoc();
3326
3327 if (!IndexMode) {
3328 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3329 .addReg(IdxReg);
3330
3331 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3332 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3333 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3334 .addReg(VecReg)
3335 .addReg(ValReg)
3336 .addImm(SubReg);
3337 MI.eraseFromParent();
3338 return true;
3339 }
3340
3341 const MCInstrDesc &GPRIDXDesc =
3342 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3343 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3344 .addReg(VecReg)
3345 .addReg(ValReg)
3346 .addReg(IdxReg)
3347 .addImm(SubReg);
3348
3349 MI.eraseFromParent();
3350 return true;
3351}
3352
3353bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3354 if (!Subtarget->hasVMemToLDSLoad())
3355 return false;
3356 unsigned Opc;
3357 unsigned Size = MI.getOperand(3).getImm();
3358
3359 // The struct intrinsic variants add one additional operand over raw.
3360 const bool HasVIndex = MI.getNumOperands() == 9;
3361 Register VIndex;
3362 int OpOffset = 0;
3363 if (HasVIndex) {
3364 VIndex = MI.getOperand(4).getReg();
3365 OpOffset = 1;
3366 }
3367
3368 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3369 std::optional<ValueAndVReg> MaybeVOffset =
3370       getIConstantVRegValWithLookThrough(VOffset, *MRI);
3371   const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3372
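  // Note (editor) on the opcode suffixes chosen below: BOTHEN takes vindex and
  // voffset, IDXEN only vindex, OFFEN only voffset, and OFFSET neither
  // (immediate/soffset addressing only).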
3373 switch (Size) {
3374 default:
3375 return false;
3376 case 1:
3377 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3378 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3379 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3380 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3381 break;
3382 case 2:
3383 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3384 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3385 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3386 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3387 break;
3388 case 4:
3389 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3390 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3391 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3392 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3393 break;
3394 case 12:
3395 if (!Subtarget->hasLDSLoadB96_B128())
3396 return false;
3397
3398 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3399 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3400 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3401 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3402 break;
3403 case 16:
3404 if (!Subtarget->hasLDSLoadB96_B128())
3405 return false;
3406
3407 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3408 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3409 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3410 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3411 break;
3412 }
3413
3414 MachineBasicBlock *MBB = MI.getParent();
3415 const DebugLoc &DL = MI.getDebugLoc();
3416 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3417 .add(MI.getOperand(2));
3418
3419 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3420
3421 if (HasVIndex && HasVOffset) {
3422 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3423 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3424 .addReg(VIndex)
3425 .addImm(AMDGPU::sub0)
3426 .addReg(VOffset)
3427 .addImm(AMDGPU::sub1);
3428
3429 MIB.addReg(IdxReg);
3430 } else if (HasVIndex) {
3431 MIB.addReg(VIndex);
3432 } else if (HasVOffset) {
3433 MIB.addReg(VOffset);
3434 }
3435
3436 MIB.add(MI.getOperand(1)); // rsrc
3437 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3438 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3439 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3440 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3441 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3442 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3443 MIB.addImm(
3444 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3445 ? 1
3446 : 0); // swz
3447
3448 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3449 // Don't set the offset value here because the pointer points to the base of
3450 // the buffer.
3451 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3452
3453 MachinePointerInfo StorePtrI = LoadPtrI;
3454 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3455                                                  AMDGPUAS::BUFFER_RESOURCE));
3456   LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
3457   StorePtrI.V = nullptr;
3458   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3459   auto F = LoadMMO->getFlags() &
3460            ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3461   LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3462 Size, LoadMMO->getBaseAlign());
3463
3464 MachineMemOperand *StoreMMO =
3465 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3466 sizeof(int32_t), LoadMMO->getBaseAlign());
3467
3468 MIB.setMemRefs({LoadMMO, StoreMMO});
3469
3470 MI.eraseFromParent();
3471 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3472}
3473
3474/// Match a zero extend from a 32-bit value to 64-bits.
3475Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3476 Register ZExtSrc;
3477 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3478 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3479
3480 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3481 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3482 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3483 return Register();
3484
3485 assert(Def->getNumOperands() == 3 &&
3486 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3487 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3488 return Def->getOperand(1).getReg();
3489 }
3490
3491 return Register();
3492}
3493
3494/// Match a sign extend from a 32-bit value to 64-bits.
3495Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3496 Register SExtSrc;
3497 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3498 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3499
3500   // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (s32 G_ASHR %x, 31)
3501 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3502 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3503 return Register();
3504
3505 assert(Def->getNumOperands() == 3 &&
3506 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3507 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3508 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3509 m_SpecificICst(31))))
3510 return Def->getOperand(1).getReg();
3511
3512 if (VT->signBitIsZero(Reg))
3513 return matchZeroExtendFromS32(Reg);
3514
3515 return Register();
3516}
3517
3518/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3519/// is 32-bit.
3520 Register
3521 AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3522 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3523 : matchZeroExtendFromS32(Reg);
3524}
3525
3526/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3527/// is 32-bit.
3528 Register
3529 AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3530 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3531 : matchSignExtendFromS32(Reg);
3532}
3533
3534 Register
3535 AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3536 bool IsSigned) const {
3537 if (IsSigned)
3538 return matchSignExtendFromS32OrS32(Reg);
3539
3540 return matchZeroExtendFromS32OrS32(Reg);
3541}
3542
3543Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3544 Register AnyExtSrc;
3545 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3546 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3547
3548   // Match legalized form %anyext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3549 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3550 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3551 return Register();
3552
3553 assert(Def->getNumOperands() == 3 &&
3554 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3555
3556 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3557 return Def->getOperand(1).getReg();
3558
3559 return Register();
3560}
3561
3562bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3563 if (!Subtarget->hasVMemToLDSLoad())
3564 return false;
3565
3566 unsigned Opc;
3567 unsigned Size = MI.getOperand(3).getImm();
3568
3569 switch (Size) {
3570 default:
3571 return false;
3572 case 1:
3573 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3574 break;
3575 case 2:
3576 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3577 break;
3578 case 4:
3579 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3580 break;
3581 case 12:
3582 if (!Subtarget->hasLDSLoadB96_B128())
3583 return false;
3584 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3585 break;
3586 case 16:
3587 if (!Subtarget->hasLDSLoadB96_B128())
3588 return false;
3589 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3590 break;
3591 }
3592
3593 MachineBasicBlock *MBB = MI.getParent();
3594 const DebugLoc &DL = MI.getDebugLoc();
3595 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3596 .add(MI.getOperand(2));
3597
3598 Register Addr = MI.getOperand(1).getReg();
3599 Register VOffset;
3600 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3601 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3602 if (!isSGPR(Addr)) {
3603 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3604 if (isSGPR(AddrDef->Reg)) {
3605 Addr = AddrDef->Reg;
3606 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3607 Register SAddr =
3608 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3609 if (isSGPR(SAddr)) {
3610 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3611 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3612 Addr = SAddr;
3613 VOffset = Off;
3614 }
3615 }
3616 }
3617 }
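  // For example, %addr = G_PTR_ADD %sgpr_base, (zext %vgpr_off) is split by the
  // code above into Addr = %sgpr_base and VOffset = %vgpr_off.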
3618
3619 if (isSGPR(Addr)) {
3620     Opc = AMDGPU::getGlobalSaddrOp(Opc);
3621     if (!VOffset) {
3622 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3623 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3624 .addImm(0);
3625 }
3626 }
3627
3628 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3629 .addReg(Addr);
3630
3631 if (isSGPR(Addr))
3632 MIB.addReg(VOffset);
3633
3634 MIB.add(MI.getOperand(4)); // offset
3635
3636 unsigned Aux = MI.getOperand(5).getImm();
3637 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3638
3639 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3640 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3641 LoadPtrI.Offset = MI.getOperand(4).getImm();
3642 MachinePointerInfo StorePtrI = LoadPtrI;
3643 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3644                                                  AMDGPUAS::GLOBAL_ADDRESS));
3645   StorePtrI.V = nullptr;
3646   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3647   auto F = LoadMMO->getFlags() &
3648            ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3649   LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3650 Size, LoadMMO->getBaseAlign());
3651 MachineMemOperand *StoreMMO =
3652 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3653 sizeof(int32_t), Align(4));
3654
3655 MIB.setMemRefs({LoadMMO, StoreMMO});
3656
3657 MI.eraseFromParent();
3658 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3659}
3660
3661bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3662 MachineInstr &MI) const {
3663 unsigned OpcodeOpIdx =
3664 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
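  // Note (editor): the concrete target opcode was chosen earlier, when the
  // intrinsic was legalized, and is carried as an immediate operand on the
  // pseudo; it is reused directly here.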
3665 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3666 MI.removeOperand(OpcodeOpIdx);
3667 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3668 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3669}
3670
3671// FIXME: This should be removed and let the patterns select. We just need the
3672// AGPR/VGPR combination versions.
3673bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3674 unsigned Opc;
3675 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3676 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3677 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3678 break;
3679 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3680 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3681 break;
3682 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3683 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3684 break;
3685 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3686 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3687 break;
3688 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3689 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3690 break;
3691 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3692 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3693 break;
3694 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3695 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3696 break;
3697 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3698 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3699 break;
3700 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3701 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3702 break;
3703 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3704 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3705 break;
3706 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3707 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3708 break;
3709 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3710 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3711 break;
3712 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3713 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3714 break;
3715 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3716 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3717 break;
3718 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3719 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3720 break;
3721 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3722 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3723 break;
3724 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3725 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3726 break;
3727 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3728 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3729 break;
3730 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3731 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3732 break;
3733 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3734 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3735 break;
3736 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3737 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3738 break;
3739 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3740 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3741 break;
3742 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3743 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3744 break;
3745 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3746 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3747 break;
3748 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3749 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3750 break;
3751 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3752 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3753 break;
3754 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3755 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3756 break;
3757 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3758 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3759 break;
3760 default:
3761 llvm_unreachable("unhandled smfmac intrinsic");
3762 }
3763
3764 auto VDst_In = MI.getOperand(4);
3765
3766 MI.setDesc(TII.get(Opc));
3767 MI.removeOperand(4); // VDst_In
3768 MI.removeOperand(1); // Intrinsic ID
3769 MI.addOperand(VDst_In); // Readd VDst_In to the end
3770 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3771 return true;
3772}
3773
3774bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3775 MachineInstr &MI, Intrinsic::ID IntrID) const {
3776 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3777 !Subtarget->hasPermlane16Swap())
3778 return false;
3779 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3780 !Subtarget->hasPermlane32Swap())
3781 return false;
3782
3783 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3784 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3785 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3786
3787 MI.removeOperand(2);
3788 MI.setDesc(TII.get(Opcode));
3789 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3790
3791 MachineOperand &FI = MI.getOperand(4);
3792   FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
3793 
3794 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3795}
3796
3797bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3798 Register DstReg = MI.getOperand(0).getReg();
3799 Register SrcReg = MI.getOperand(1).getReg();
3800 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3801 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3802 MachineBasicBlock *MBB = MI.getParent();
3803 const DebugLoc &DL = MI.getDebugLoc();
3804
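  // Note (editor): the incoming value is the swizzled (per-lane scaled) scratch
  // address; dividing by the wavefront size, i.e. shifting right by
  // getWavefrontSizeLog2() (6 on wave64, 5 on wave32), yields the wave-level
  // SGPR address.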
3805 if (IsVALU) {
3806 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3807 .addImm(Subtarget->getWavefrontSizeLog2())
3808 .addReg(SrcReg);
3809 } else {
3810 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3811 .addReg(SrcReg)
3812 .addImm(Subtarget->getWavefrontSizeLog2())
3813 .setOperandDead(3); // Dead scc
3814 }
3815
3816 const TargetRegisterClass &RC =
3817 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3818 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3819 return false;
3820
3821 MI.eraseFromParent();
3822 return true;
3823}
3824
3825// Match BITOP3 operation and return a number of matched instructions plus
3826// truth table.
3827static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3828                                               SmallVectorImpl<Register> &Src,
3829                                               const MachineRegisterInfo &MRI) {
3830 unsigned NumOpcodes = 0;
3831 uint8_t LHSBits, RHSBits;
3832
3833 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3834 // Define truth table given Src0, Src1, Src2 bits permutations:
3835 // 0 0 0
3836 // 0 0 1
3837 // 0 1 0
3838 // 0 1 1
3839 // 1 0 0
3840 // 1 0 1
3841 // 1 1 0
3842 // 1 1 1
3843 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
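    // Worked example (editor): with Src = {a, b, c}, the expression (a & b) | c
    // produces the truth table (0xf0 & 0xcc) | 0xaa = 0xea.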
3844
3845 if (mi_match(Op, MRI, m_AllOnesInt())) {
3846 Bits = 0xff;
3847 return true;
3848 }
3849 if (mi_match(Op, MRI, m_ZeroInt())) {
3850 Bits = 0;
3851 return true;
3852 }
3853
3854 for (unsigned I = 0; I < Src.size(); ++I) {
3855 // Try to find existing reused operand
3856 if (Src[I] == Op) {
3857 Bits = SrcBits[I];
3858 return true;
3859 }
3860 // Try to replace parent operator
3861 if (Src[I] == R) {
3862 Bits = SrcBits[I];
3863 Src[I] = Op;
3864 return true;
3865 }
3866 }
3867
3868 if (Src.size() == 3) {
3869 // No room left for operands. Try one last time, there can be a 'not' of
3870 // one of our source operands. In this case we can compute the bits
3871 // without growing Src vector.
3872 Register LHS;
3873 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3874         LHS = getSrcRegIgnoringCopies(LHS, MRI);
3875         for (unsigned I = 0; I < Src.size(); ++I) {
3876 if (Src[I] == LHS) {
3877 Bits = ~SrcBits[I];
3878 return true;
3879 }
3880 }
3881 }
3882
3883 return false;
3884 }
3885
3886 Bits = SrcBits[Src.size()];
3887 Src.push_back(Op);
3888 return true;
3889 };
3890
3891 MachineInstr *MI = MRI.getVRegDef(R);
3892 switch (MI->getOpcode()) {
3893 case TargetOpcode::G_AND:
3894 case TargetOpcode::G_OR:
3895 case TargetOpcode::G_XOR: {
3896 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3897 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3898
3899 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3900 if (!getOperandBits(LHS, LHSBits) ||
3901 !getOperandBits(RHS, RHSBits)) {
3902 Src = Backup;
3903 return std::make_pair(0, 0);
3904 }
3905
3906 // Recursion is naturally limited by the size of the operand vector.
3907 auto Op = BitOp3_Op(LHS, Src, MRI);
3908 if (Op.first) {
3909 NumOpcodes += Op.first;
3910 LHSBits = Op.second;
3911 }
3912
3913 Op = BitOp3_Op(RHS, Src, MRI);
3914 if (Op.first) {
3915 NumOpcodes += Op.first;
3916 RHSBits = Op.second;
3917 }
3918 break;
3919 }
3920 default:
3921 return std::make_pair(0, 0);
3922 }
3923
3924 uint8_t TTbl;
3925 switch (MI->getOpcode()) {
3926 case TargetOpcode::G_AND:
3927 TTbl = LHSBits & RHSBits;
3928 break;
3929 case TargetOpcode::G_OR:
3930 TTbl = LHSBits | RHSBits;
3931 break;
3932 case TargetOpcode::G_XOR:
3933 TTbl = LHSBits ^ RHSBits;
3934 break;
3935 default:
3936 break;
3937 }
3938
3939 return std::make_pair(NumOpcodes + 1, TTbl);
3940}
3941
3942bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
3943 if (!Subtarget->hasBitOp3Insts())
3944 return false;
3945
3946 Register DstReg = MI.getOperand(0).getReg();
3947 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3948 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3949 if (!IsVALU)
3950 return false;
3951
3952   SmallVector<Register, 3> Src;
3953   uint8_t TTbl;
3954 unsigned NumOpcodes;
3955
3956 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
3957
3958   // The Src.empty() case can happen if every operand is a constant zero or
3959   // all-ones value. Normally that is optimized out before reaching this point.
3960 if (NumOpcodes < 2 || Src.empty())
3961 return false;
3962
3963 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
3964 if (NumOpcodes == 2 && IsB32) {
3965 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3966 // asm more readable. This cannot be modeled with AddedComplexity because
3967     // the selector does not know how many operations we matched.
3968 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
3969 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
3970 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
3971 return false;
3972 } else if (NumOpcodes < 4) {
3973     // For a uniform case the threshold should be higher to account for moves
3974     // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two
3975     // can be in SGPRs, with a readfirstlane afterwards.
3976 return false;
3977 }
3978
3979 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
3980 if (!IsB32 && STI.hasTrue16BitInsts())
3981 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
3982 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
3983 unsigned CBL = STI.getConstantBusLimit(Opc);
3984 MachineBasicBlock *MBB = MI.getParent();
3985 const DebugLoc &DL = MI.getDebugLoc();
3986
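  // Copy SGPR sources that exceed the constant bus limit into VGPRs so the
  // resulting VALU instruction stays encodable.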
3987 for (unsigned I = 0; I < Src.size(); ++I) {
3988 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
3989 if (RB->getID() != AMDGPU::SGPRRegBankID)
3990 continue;
3991 if (CBL > 0) {
3992 --CBL;
3993 continue;
3994 }
3995 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3996 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
3997 .addReg(Src[I]);
3998 Src[I] = NewReg;
3999 }
4000
4001 // Last operand can be ignored, turning a ternary operation into a binary.
4002 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4003 // 'c' with 'a' here without changing the answer. In some pathological
4004 // cases it should be possible to get an operation with a single operand
4005   // too if the optimizer does not catch it.
4006 while (Src.size() < 3)
4007 Src.push_back(Src[0]);
4008
4009 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4010 if (!IsB32)
4011 MIB.addImm(0); // src_mod0
4012 MIB.addReg(Src[0]);
4013 if (!IsB32)
4014 MIB.addImm(0); // src_mod1
4015 MIB.addReg(Src[1]);
4016 if (!IsB32)
4017 MIB.addImm(0); // src_mod2
4018 MIB.addReg(Src[2])
4019 .addImm(TTbl);
4020 if (!IsB32)
4021 MIB.addImm(0); // op_sel
4022
4023 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4024 MI.eraseFromParent();
4025
4026 return true;
4027}
4028
4029bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4030 Register SrcReg = MI.getOperand(0).getReg();
4031 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4032 return false;
4033
4034 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4035 Register SP =
4036 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4037 Register WaveAddr = getWaveAddress(DefMI);
4038 MachineBasicBlock *MBB = MI.getParent();
4039 const DebugLoc &DL = MI.getDebugLoc();
4040
4041 if (!WaveAddr) {
4042 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4043 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4044 .addReg(SrcReg)
4045 .addImm(Subtarget->getWavefrontSizeLog2())
4046 .setOperandDead(3); // Dead scc
4047 }
4048
4049 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4050 .addReg(WaveAddr);
4051
4052 MI.eraseFromParent();
4053 return true;
4054}
4055
4056 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4057 
4058 if (!I.isPreISelOpcode()) {
4059 if (I.isCopy())
4060 return selectCOPY(I);
4061 return true;
4062 }
4063
4064 switch (I.getOpcode()) {
4065 case TargetOpcode::G_AND:
4066 case TargetOpcode::G_OR:
4067 case TargetOpcode::G_XOR:
4068 if (selectBITOP3(I))
4069 return true;
4070 if (selectImpl(I, *CoverageInfo))
4071 return true;
4072 return selectG_AND_OR_XOR(I);
4073 case TargetOpcode::G_ADD:
4074 case TargetOpcode::G_SUB:
4075 case TargetOpcode::G_PTR_ADD:
4076 if (selectImpl(I, *CoverageInfo))
4077 return true;
4078 return selectG_ADD_SUB(I);
4079 case TargetOpcode::G_UADDO:
4080 case TargetOpcode::G_USUBO:
4081 case TargetOpcode::G_UADDE:
4082 case TargetOpcode::G_USUBE:
4083 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4084 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4085 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4086 return selectG_AMDGPU_MAD_64_32(I);
4087 case TargetOpcode::G_INTTOPTR:
4088 case TargetOpcode::G_BITCAST:
4089 case TargetOpcode::G_PTRTOINT:
4090 case TargetOpcode::G_FREEZE:
4091 return selectCOPY(I);
4092 case TargetOpcode::G_FNEG:
4093 if (selectImpl(I, *CoverageInfo))
4094 return true;
4095 return selectG_FNEG(I);
4096 case TargetOpcode::G_FABS:
4097 if (selectImpl(I, *CoverageInfo))
4098 return true;
4099 return selectG_FABS(I);
4100 case TargetOpcode::G_EXTRACT:
4101 return selectG_EXTRACT(I);
4102 case TargetOpcode::G_MERGE_VALUES:
4103 case TargetOpcode::G_CONCAT_VECTORS:
4104 return selectG_MERGE_VALUES(I);
4105 case TargetOpcode::G_UNMERGE_VALUES:
4106 return selectG_UNMERGE_VALUES(I);
4107 case TargetOpcode::G_BUILD_VECTOR:
4108 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4109 return selectG_BUILD_VECTOR(I);
4110 case TargetOpcode::G_IMPLICIT_DEF:
4111 return selectG_IMPLICIT_DEF(I);
4112 case TargetOpcode::G_INSERT:
4113 return selectG_INSERT(I);
4114 case TargetOpcode::G_INTRINSIC:
4115 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4116 return selectG_INTRINSIC(I);
4117 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4118 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4119 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4120 case TargetOpcode::G_ICMP:
4121 case TargetOpcode::G_FCMP:
4122 if (selectG_ICMP_or_FCMP(I))
4123 return true;
4124 return selectImpl(I, *CoverageInfo);
4125 case TargetOpcode::G_LOAD:
4126 case TargetOpcode::G_ZEXTLOAD:
4127 case TargetOpcode::G_SEXTLOAD:
4128 case TargetOpcode::G_STORE:
4129 case TargetOpcode::G_ATOMIC_CMPXCHG:
4130 case TargetOpcode::G_ATOMICRMW_XCHG:
4131 case TargetOpcode::G_ATOMICRMW_ADD:
4132 case TargetOpcode::G_ATOMICRMW_SUB:
4133 case TargetOpcode::G_ATOMICRMW_AND:
4134 case TargetOpcode::G_ATOMICRMW_OR:
4135 case TargetOpcode::G_ATOMICRMW_XOR:
4136 case TargetOpcode::G_ATOMICRMW_MIN:
4137 case TargetOpcode::G_ATOMICRMW_MAX:
4138 case TargetOpcode::G_ATOMICRMW_UMIN:
4139 case TargetOpcode::G_ATOMICRMW_UMAX:
4140 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4141 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4142 case TargetOpcode::G_ATOMICRMW_FADD:
4143 case TargetOpcode::G_ATOMICRMW_FMIN:
4144 case TargetOpcode::G_ATOMICRMW_FMAX:
4145 return selectG_LOAD_STORE_ATOMICRMW(I);
4146 case TargetOpcode::G_SELECT:
4147 return selectG_SELECT(I);
4148 case TargetOpcode::G_TRUNC:
4149 return selectG_TRUNC(I);
4150 case TargetOpcode::G_SEXT:
4151 case TargetOpcode::G_ZEXT:
4152 case TargetOpcode::G_ANYEXT:
4153 case TargetOpcode::G_SEXT_INREG:
4154 // This is a workaround. For extension from type i1, `selectImpl()` uses
4155 // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
4156     // i1 can only be held in an SGPR class.
4157 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4158 selectImpl(I, *CoverageInfo))
4159 return true;
4160 return selectG_SZA_EXT(I);
4161 case TargetOpcode::G_FPEXT:
4162 if (selectG_FPEXT(I))
4163 return true;
4164 return selectImpl(I, *CoverageInfo);
4165 case TargetOpcode::G_BRCOND:
4166 return selectG_BRCOND(I);
4167 case TargetOpcode::G_GLOBAL_VALUE:
4168 return selectG_GLOBAL_VALUE(I);
4169 case TargetOpcode::G_PTRMASK:
4170 return selectG_PTRMASK(I);
4171 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4172 return selectG_EXTRACT_VECTOR_ELT(I);
4173 case TargetOpcode::G_INSERT_VECTOR_ELT:
4174 return selectG_INSERT_VECTOR_ELT(I);
4175 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4176 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4177 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4178 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4179 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4180 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4181         AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4182     assert(Intr && "not an image intrinsic with image pseudo");
4183 return selectImageIntrinsic(I, Intr);
4184 }
4185 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4186 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4187 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4188 return selectBVHIntersectRayIntrinsic(I);
4189 case AMDGPU::G_SBFX:
4190 case AMDGPU::G_UBFX:
4191 return selectG_SBFX_UBFX(I);
4192 case AMDGPU::G_SI_CALL:
4193 I.setDesc(TII.get(AMDGPU::SI_CALL));
4194 return true;
4195 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4196 return selectWaveAddress(I);
4197 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4198 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4199 return true;
4200 }
4201 case AMDGPU::G_STACKRESTORE:
4202 return selectStackRestore(I);
4203 case AMDGPU::G_PHI:
4204 return selectPHI(I);
4205 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4206 return selectCOPY_SCC_VCC(I);
4207 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4208 return selectCOPY_VCC_SCC(I);
4209 case AMDGPU::G_AMDGPU_READANYLANE:
4210 return selectReadAnyLane(I);
4211 case TargetOpcode::G_CONSTANT:
4212 case TargetOpcode::G_FCONSTANT:
4213 default:
4214 return selectImpl(I, *CoverageInfo);
4215 }
4216 return false;
4217}
4218
4219 InstructionSelector::ComplexRendererFns
4220 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4221 return {{
4222 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4223 }};
4224
4225}
4226
4227std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4228 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4229 unsigned Mods = 0;
4230 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4231
4232 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4233 Src = MI->getOperand(1).getReg();
4234 Mods |= SISrcMods::NEG;
4235 MI = getDefIgnoringCopies(Src, *MRI);
4236 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4237 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4238 // denormal mode, but we're implicitly canonicalizing in a source operand.
4239 const ConstantFP *LHS =
4240 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4241 if (LHS && LHS->isZero()) {
4242 Mods |= SISrcMods::NEG;
4243 Src = MI->getOperand(2).getReg();
4244 }
4245 }
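  // e.g. (fsub 0.0, %x) is selected as %x with the NEG source modifier applied.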
4246
4247 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4248 Src = MI->getOperand(1).getReg();
4249 Mods |= SISrcMods::ABS;
4250 }
4251
4252 if (OpSel)
4253 Mods |= SISrcMods::OP_SEL_0;
4254
4255 return std::pair(Src, Mods);
4256}
4257
4258Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4259 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4260 bool ForceVGPR) const {
4261 if ((Mods != 0 || ForceVGPR) &&
4262 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4263
4264 // If we looked through copies to find source modifiers on an SGPR operand,
4265 // we now have an SGPR register source. To avoid potentially violating the
4266 // constant bus restriction, we need to insert a copy to a VGPR.
4267 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4268 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4269 TII.get(AMDGPU::COPY), VGPRSrc)
4270 .addReg(Src);
4271 Src = VGPRSrc;
4272 }
4273
4274 return Src;
4275}
4276
4277///
4278/// This will select either an SGPR or VGPR operand and will save us from
4279/// having to write an extra tablegen pattern.
4280 InstructionSelector::ComplexRendererFns
4281 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4282 return {{
4283 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4284 }};
4285}
4286
4287 InstructionSelector::ComplexRendererFns
4288 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4289 Register Src;
4290 unsigned Mods;
4291 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4292
4293 return {{
4294 [=](MachineInstrBuilder &MIB) {
4295 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4296 },
4297 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4298 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4299 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4300 }};
4301}
4302
4303 InstructionSelector::ComplexRendererFns
4304 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4305 Register Src;
4306 unsigned Mods;
4307 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4308 /*IsCanonicalizing=*/true,
4309 /*AllowAbs=*/false);
4310
4311 return {{
4312 [=](MachineInstrBuilder &MIB) {
4313 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4314 },
4315 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4316 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4317 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4318 }};
4319}
4320
4321 InstructionSelector::ComplexRendererFns
4322 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4323 return {{
4324 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4325 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4326 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4327 }};
4328}
4329
4330 InstructionSelector::ComplexRendererFns
4331 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4332 Register Src;
4333 unsigned Mods;
4334 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4335
4336 return {{
4337 [=](MachineInstrBuilder &MIB) {
4338 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4339 },
4340 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4341 }};
4342}
4343
4344 InstructionSelector::ComplexRendererFns
4345 AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4346 MachineOperand &Root) const {
4347 Register Src;
4348 unsigned Mods;
4349 std::tie(Src, Mods) =
4350 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4351
4352 return {{
4353 [=](MachineInstrBuilder &MIB) {
4354 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4355 },
4356 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4357 }};
4358}
4359
4360 InstructionSelector::ComplexRendererFns
4361 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4362 Register Src;
4363 unsigned Mods;
4364 std::tie(Src, Mods) =
4365 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4366 /*AllowAbs=*/false);
4367
4368 return {{
4369 [=](MachineInstrBuilder &MIB) {
4370 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4371 },
4372 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4373 }};
4374}
4375
4376 InstructionSelector::ComplexRendererFns
4377 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4378 Register Reg = Root.getReg();
4379 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4380 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4381 return {};
4382 return {{
4383 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4384 }};
4385}
4386
4387enum class SrcStatus {
4388   IS_SAME,
4389   IS_UPPER_HALF,
4390   IS_LOWER_HALF,
4391   IS_UPPER_HALF_NEG,
4392   // This means current op = [op_upper, op_lower] and src = -op_lower.
4393   IS_LOWER_HALF_NEG,
4394   IS_HI_NEG,
4395   // This means current op = [op_upper, op_lower] and src = [op_upper,
4396   // -op_lower].
4397   IS_LO_NEG,
4398   IS_BOTH_NEG,
4399   INVALID,
4400   NEG_START = IS_UPPER_HALF_NEG,
4401   NEG_END = IS_BOTH_NEG,
4402   HALF_START = IS_UPPER_HALF,
4403   HALF_END = IS_LOWER_HALF_NEG
4404 };
4405/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4406static bool isTruncHalf(const MachineInstr *MI,
4407 const MachineRegisterInfo &MRI) {
4408 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4409 return false;
4410
4411 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4412 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4413 return DstSize * 2 == SrcSize;
4414}
4415
4416 /// Test if the MI is a logical shift right by half the bit width,
4417 /// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`
4418static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4419 if (MI->getOpcode() != AMDGPU::G_LSHR)
4420 return false;
4421
4422 Register ShiftSrc;
4423 std::optional<ValueAndVReg> ShiftAmt;
4424 if (mi_match(MI->getOperand(0).getReg(), MRI,
4425 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4426 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4427 unsigned Shift = ShiftAmt->Value.getZExtValue();
4428 return Shift * 2 == SrcSize;
4429 }
4430 return false;
4431}
4432
4434 /// Test if the MI is a shift left by half the bit width,
4435 /// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`
4435static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4436 if (MI->getOpcode() != AMDGPU::G_SHL)
4437 return false;
4438
4439 Register ShiftSrc;
4440 std::optional<ValueAndVReg> ShiftAmt;
4441 if (mi_match(MI->getOperand(0).getReg(), MRI,
4442 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4443 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4444 unsigned Shift = ShiftAmt->Value.getZExtValue();
4445 return Shift * 2 == SrcSize;
4446 }
4447 return false;
4448}
4449
4450 /// Test if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4451static bool isUnmergeHalf(const MachineInstr *MI,
4452 const MachineRegisterInfo &MRI) {
4453 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4454 return false;
4455 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4456 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4457}
4458
4460
4459 enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED };
4460 
4461 static TypeClass isVectorOfTwoOrScalar(Register Reg,
4462                                        const MachineRegisterInfo &MRI) {
4463   LLT OpTy = MRI.getType(Reg);
4464   if (OpTy.isScalar())
4465     return TypeClass::SCALAR;
4466   if (OpTy.isVector() && OpTy.getNumElements() == 2)
4467     return TypeClass::VECTOR_OF_TWO;
4468   return TypeClass::NONE_OF_LISTED;
4469 }
4470
4471 static SrcStatus getNegStatus(Register Reg, SrcStatus S,
4472                               const MachineRegisterInfo &MRI) {
4473   TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4474   if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4475 return SrcStatus::INVALID;
4476
4477 switch (S) {
4478 case SrcStatus::IS_SAME:
4479 if (NegType == TypeClass::VECTOR_OF_TWO) {
4480 // Vector of 2:
4481 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4482 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4483 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4484 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4485       return SrcStatus::IS_BOTH_NEG;
4486     }
4487 if (NegType == TypeClass::SCALAR) {
4488 // Scalar:
4489 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4490 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4491 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4492 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4493 return SrcStatus::IS_HI_NEG;
4494 }
4495 break;
4496   case SrcStatus::IS_HI_NEG:
4497     if (NegType == TypeClass::VECTOR_OF_TWO) {
4498 // Vector of 2:
4499 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4500 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4501 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4502 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4503 return SrcStatus::IS_LO_NEG;
4504 }
4505 if (NegType == TypeClass::SCALAR) {
4506 // Scalar:
4507 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4508 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4509 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4510 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4511 return SrcStatus::IS_SAME;
4512 }
4513 break;
4514   case SrcStatus::IS_LO_NEG:
4515     if (NegType == TypeClass::VECTOR_OF_TWO) {
4516 // Vector of 2:
4517 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4518 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4519 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4520 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4521 return SrcStatus::IS_HI_NEG;
4522 }
4523 if (NegType == TypeClass::SCALAR) {
4524 // Scalar:
4525 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4526 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4527 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4528 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4529       return SrcStatus::IS_BOTH_NEG;
4530     }
4531 break;
4532   case SrcStatus::IS_BOTH_NEG:
4533     if (NegType == TypeClass::VECTOR_OF_TWO) {
4534 // Vector of 2:
4535 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4536 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4537 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4538 // [SrcHi, SrcLo] = [OpHi, OpLo]
4539 return SrcStatus::IS_SAME;
4540 }
4541 if (NegType == TypeClass::SCALAR) {
4542 // Scalar:
4543 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4544 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4545 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4546 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4547 return SrcStatus::IS_LO_NEG;
4548 }
4549 break;
4550   case SrcStatus::IS_UPPER_HALF:
4551     // Vector of 2:
4552 // Src = CurrUpper
4553 // Curr = [CurrUpper, CurrLower]
4554 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4555 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4556 // Src = -OpUpper
4557 //
4558 // Scalar:
4559 // Src = CurrUpper
4560 // Curr = [CurrUpper, CurrLower]
4561 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4562 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4563 // Src = -OpUpper
4564     return SrcStatus::IS_UPPER_HALF_NEG;
4565   case SrcStatus::IS_LOWER_HALF:
4566     if (NegType == TypeClass::VECTOR_OF_TWO) {
4567 // Vector of 2:
4568 // Src = CurrLower
4569 // Curr = [CurrUpper, CurrLower]
4570 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4571 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4572 // Src = -OpLower
4573       return SrcStatus::IS_LOWER_HALF_NEG;
4574     }
4575 if (NegType == TypeClass::SCALAR) {
4576 // Scalar:
4577 // Src = CurrLower
4578 // Curr = [CurrUpper, CurrLower]
4579 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4580 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4581 // Src = OpLower
4582       return SrcStatus::IS_LOWER_HALF;
4583     }
4584 break;
4585   case SrcStatus::IS_UPPER_HALF_NEG:
4586     // Vector of 2:
4587 // Src = -CurrUpper
4588 // Curr = [CurrUpper, CurrLower]
4589 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4590 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4591 // Src = -(-OpUpper) = OpUpper
4592 //
4593 // Scalar:
4594 // Src = -CurrUpper
4595 // Curr = [CurrUpper, CurrLower]
4596 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4597 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4598 // Src = -(-OpUpper) = OpUpper
4599     return SrcStatus::IS_UPPER_HALF;
4600   case SrcStatus::IS_LOWER_HALF_NEG:
4601     if (NegType == TypeClass::VECTOR_OF_TWO) {
4602 // Vector of 2:
4603 // Src = -CurrLower
4604 // Curr = [CurrUpper, CurrLower]
4605 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4606 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4607 // Src = -(-OpLower) = OpLower
4608       return SrcStatus::IS_LOWER_HALF;
4609     }
4610 if (NegType == TypeClass::SCALAR) {
4611 // Scalar:
4612 // Src = -CurrLower
4613 // Curr = [CurrUpper, CurrLower]
4614 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4615 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4616 // Src = -OpLower
4617       return SrcStatus::IS_LOWER_HALF_NEG;
4618     }
4619 break;
4620 default:
4621 break;
4622 }
4623 llvm_unreachable("unexpected SrcStatus & NegType combination");
4624}
4625
4626static std::optional<std::pair<Register, SrcStatus>>
4627calcNextStatus(std::pair<Register, SrcStatus> Curr,
4628 const MachineRegisterInfo &MRI) {
4629 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4630
4631 unsigned Opc = MI->getOpcode();
4632
4633 // Handle general Opc cases.
4634 switch (Opc) {
4635 case AMDGPU::G_BITCAST:
4636 return std::optional<std::pair<Register, SrcStatus>>(
4637 {MI->getOperand(1).getReg(), Curr.second});
4638 case AMDGPU::COPY:
4639 if (MI->getOperand(1).getReg().isPhysical())
4640 return std::nullopt;
4641 return std::optional<std::pair<Register, SrcStatus>>(
4642 {MI->getOperand(1).getReg(), Curr.second});
4643 case AMDGPU::G_FNEG: {
4644 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4645 if (Stat == SrcStatus::INVALID)
4646 return std::nullopt;
4647 return std::optional<std::pair<Register, SrcStatus>>(
4648 {MI->getOperand(1).getReg(), Stat});
4649 }
4650 default:
4651 break;
4652 }
4653
4654 // Calc next Stat from current Stat.
4655 switch (Curr.second) {
4656 case SrcStatus::IS_SAME:
4657 if (isTruncHalf(MI, MRI))
4658 return std::optional<std::pair<Register, SrcStatus>>(
4659 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4660 else if (isUnmergeHalf(MI, MRI)) {
4661 if (Curr.first == MI->getOperand(0).getReg())
4662 return std::optional<std::pair<Register, SrcStatus>>(
4663 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4664 return std::optional<std::pair<Register, SrcStatus>>(
4665 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4666 }
4667 break;
4668   case SrcStatus::IS_HI_NEG:
4669     if (isTruncHalf(MI, MRI)) {
4670 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4671 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4672 // = [OpLowerHi, OpLowerLo]
4673 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4674 // = [-OpLowerHi, OpLowerLo]
4675 // = -OpLower
4676 return std::optional<std::pair<Register, SrcStatus>>(
4677 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4678 }
4679 if (isUnmergeHalf(MI, MRI)) {
4680 if (Curr.first == MI->getOperand(0).getReg())
4681 return std::optional<std::pair<Register, SrcStatus>>(
4682 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4683 return std::optional<std::pair<Register, SrcStatus>>(
4684 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4685 }
4686 break;
4687   case SrcStatus::IS_UPPER_HALF:
4688     if (isShlHalf(MI, MRI))
4689 return std::optional<std::pair<Register, SrcStatus>>(
4690 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4691 break;
4692   case SrcStatus::IS_LOWER_HALF:
4693     if (isLshrHalf(MI, MRI))
4694 return std::optional<std::pair<Register, SrcStatus>>(
4695 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
4696 break;
4697   case SrcStatus::IS_UPPER_HALF_NEG:
4698     if (isShlHalf(MI, MRI))
4699 return std::optional<std::pair<Register, SrcStatus>>(
4700 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4701 break;
4702   case SrcStatus::IS_LOWER_HALF_NEG:
4703     if (isLshrHalf(MI, MRI))
4704 return std::optional<std::pair<Register, SrcStatus>>(
4705 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4706 break;
4707 default:
4708 break;
4709 }
4710 return std::nullopt;
4711}
4712
4713 /// This is used to control which source statuses the current MI supports. For
4714 /// example, a non-floating-point intrinsic such as @llvm.amdgcn.sdot2 does not
4715 /// support the NEG bit on VOP3P.
4716 /// The class can be further extended to recognize SEL, NEG and ABS bit support
4717 /// for different MIs on different architectures.
4718 class SearchOptions {
4719 private:
4720 bool HasNeg = false;
4721   // Assume all VOP3P complex patterns have op_sel.
4722 bool HasOpsel = true;
4723
4724public:
4725   SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
4726     const MachineInstr *MI = MRI.getVRegDef(Reg);
4727 unsigned Opc = MI->getOpcode();
4728
4729 if (Opc < TargetOpcode::GENERIC_OP_END) {
4730 // Keep same for generic op.
4731 HasNeg = true;
4732 } else if (Opc == TargetOpcode::G_INTRINSIC) {
4733 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
4734       // Only floating-point intrinsics have neg & neg_hi bits.
4735 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
4736 HasNeg = true;
4737 }
4738 }
4739 bool checkOptions(SrcStatus Stat) const {
4740 if (!HasNeg &&
4741 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
4742 return false;
4743 }
4744 if (!HasOpsel &&
4745 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
4746 return false;
4747 }
4748 return true;
4749 }
4750};
4751
4752 static SmallVector<std::pair<Register, SrcStatus>>
4753 getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
4754             int MaxDepth = 3) {
4755 int Depth = 0;
4756 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
4757   SmallVector<std::pair<Register, SrcStatus>> Statlist;
4758 
4759 while (Depth <= MaxDepth && Curr.has_value()) {
4760 Depth++;
4761 if (SO.checkOptions(Curr.value().second))
4762 Statlist.push_back(Curr.value());
4763 Curr = calcNextStatus(Curr.value(), MRI);
4764 }
4765
4766 return Statlist;
4767}
4768
4769static std::pair<Register, SrcStatus>
4770 getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
4771                  int MaxDepth = 3) {
4772 int Depth = 0;
4773 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
4774 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
4775
4776 while (Depth <= MaxDepth && Curr.has_value()) {
4777 Depth++;
4778 SrcStatus Stat = Curr.value().second;
4779 if (SO.checkOptions(Stat)) {
4780 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
4781           Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
4782         LastSameOrNeg = Curr.value();
4783 }
4784 Curr = calcNextStatus(Curr.value(), MRI);
4785 }
4786
4787 return LastSameOrNeg;
4788}
4789
4790static bool isSameBitWidth(Register Reg1, Register Reg2,
4791 const MachineRegisterInfo &MRI) {
4792 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
4793 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
4794 return Width1 == Width2;
4795}
4796
4797static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
4798   // SrcStatus::IS_LOWER_HALF leaves the modifier bits unchanged.
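  // For example, HiStat == IS_UPPER_HALF with LoStat == IS_LOWER_HALF only adds
  // OP_SEL_1, i.e. the standard packed layout with no negation.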
4799 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
4800 Mods ^= SISrcMods::NEG_HI;
4801 Mods |= SISrcMods::OP_SEL_1;
4802 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
4803 Mods |= SISrcMods::OP_SEL_1;
4804 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
4805 Mods ^= SISrcMods::NEG_HI;
4806 else if (HiStat == SrcStatus::IS_HI_NEG)
4807 Mods ^= SISrcMods::NEG_HI;
4808
4809 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
4810 Mods ^= SISrcMods::NEG;
4811 Mods |= SISrcMods::OP_SEL_0;
4812 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
4813 Mods |= SISrcMods::OP_SEL_0;
4814 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
4815 Mods |= SISrcMods::NEG;
4816 else if (LoStat == SrcStatus::IS_HI_NEG)
4817 Mods ^= SISrcMods::NEG;
4818
4819 return Mods;
4820}
4821
4822static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
4823 Register RootReg, const SIInstrInfo &TII,
4824 const MachineRegisterInfo &MRI) {
4825 auto IsHalfState = [](SrcStatus S) {
4828 };
4829 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
4830 IsHalfState(HiStat);
4831}
4832
4833std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
4834 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
4835 unsigned Mods = 0;
4836   // No modification if the Root type is not of the form <2 x Type>.
4837 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
4838 Mods |= SISrcMods::OP_SEL_1;
4839 return {RootReg, Mods};
4840 }
4841
4842 SearchOptions SO(RootReg, MRI);
4843
4844 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
4845
4846 if (Stat.second == SrcStatus::IS_BOTH_NEG)
4847     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4848   else if (Stat.second == SrcStatus::IS_HI_NEG)
4849 Mods ^= SISrcMods::NEG_HI;
4850 else if (Stat.second == SrcStatus::IS_LO_NEG)
4851 Mods ^= SISrcMods::NEG;
4852
4853 MachineInstr *MI = MRI.getVRegDef(Stat.first);
4854
4855 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
4856 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
4857 Mods |= SISrcMods::OP_SEL_1;
4858 return {Stat.first, Mods};
4859 }
4860
4861   SmallVector<std::pair<Register, SrcStatus>> StatlistHi =
4862       getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
4863
4864 if (StatlistHi.empty()) {
4865 Mods |= SISrcMods::OP_SEL_1;
4866 return {Stat.first, Mods};
4867 }
4868
4870 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
4871
4872 if (StatlistLo.empty()) {
4873 Mods |= SISrcMods::OP_SEL_1;
4874 return {Stat.first, Mods};
4875 }
4876
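  // Walk both status lists from the deepest definition outward and take the
  // first register that can provide both halves; updateMods() then folds the
  // required op_sel / neg bits.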
4877 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
4878 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
4879 if (StatlistHi[I].first == StatlistLo[J].first &&
4880 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
4881 StatlistHi[I].first, RootReg, TII, MRI))
4882 return {StatlistHi[I].first,
4883 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
4884 }
4885 }
4886 // Packed instructions do not have abs modifiers.
4887 Mods |= SISrcMods::OP_SEL_1;
4888
4889 return {Stat.first, Mods};
4890}
4891
4893
4894static bool checkRB(Register Reg, unsigned int RBNo,
4895 const AMDGPURegisterBankInfo &RBI,
4896 const MachineRegisterInfo &MRI,
4897 const TargetRegisterInfo &TRI) {
4898 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
4899 return RB->getID() == RBNo;
4900}
4901
4902 // This function is used to get the correct register bank for the returned
4903 // register.
4903// Assume:
4904// 1. VOP3P is always legal for VGPR.
4905// 2. RootOp's regbank is legal.
4906// Thus
4907// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
4908// 2. If RootOp is VGPR, then NewOp must be VGPR.
4909 static Register getLegalRegBank(Register NewReg, Register RootReg,
4910                                 const AMDGPURegisterBankInfo &RBI,
4911                                 MachineRegisterInfo &MRI,
4912                                 const TargetRegisterInfo &TRI,
4913 const SIInstrInfo &TII) {
4914   // RootOp can only be VGPR or SGPR (some hand-written cases such as
4915   // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
4916 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
4917 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
4918 return NewReg;
4919
4920 MachineInstr *MI = MRI.getVRegDef(RootReg);
4921 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
4922 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
4923 return RootReg;
4924 }
4925
4926 MachineBasicBlock *BB = MI->getParent();
4927 Register DstReg = MRI.cloneVirtualRegister(RootReg);
4928
4929   MachineInstrBuilder MIB =
4930       BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
4931 .addReg(NewReg);
4932
4933 // Only accept VGPR.
4934 return MIB->getOperand(0).getReg();
4935}
4936
4938AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
4939 bool IsDOT) const {
4940 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
4941 Register Reg;
4942 unsigned Mods;
4943 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
4944
4945 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
4946 return {{
4947 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4948 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4949 }};
4950}
4951
4953AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
4954
4955 return selectVOP3PRetHelper(Root);
4956}
4957
4959AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
4960
4961 return selectVOP3PRetHelper(Root, true);
4962}
4963
4965AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4966 MachineOperand &Root) const {
4967 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4968 "expected i1 value");
4969 unsigned Mods = SISrcMods::OP_SEL_1;
4970 if (Root.getImm() != 0)
4971 Mods |= SISrcMods::OP_SEL_0;
4972
4973 return {{
4974 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4975 }};
4976}
4977
4978 static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
4979                                  MachineInstr *InsertPt,
4980                                  MachineRegisterInfo &MRI) {
4981   const TargetRegisterClass *DstRegClass;
4982 switch (Elts.size()) {
4983 case 8:
4984 DstRegClass = &AMDGPU::VReg_256RegClass;
4985 break;
4986 case 4:
4987 DstRegClass = &AMDGPU::VReg_128RegClass;
4988 break;
4989 case 2:
4990 DstRegClass = &AMDGPU::VReg_64RegClass;
4991 break;
4992 default:
4993 llvm_unreachable("unhandled Reg sequence size");
4994 }
4995
4996 MachineIRBuilder B(*InsertPt);
4997 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
4998 .addDef(MRI.createVirtualRegister(DstRegClass));
4999 for (unsigned i = 0; i < Elts.size(); ++i) {
5000     MIB.addReg(Elts[i]);
5001     MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
5002 }
5003 return MIB->getOperand(0).getReg();
5004}
5005
5006static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5007                                  SmallVectorImpl<Register> &Elts, Register &Src,
5008                                  MachineInstr *InsertPt,
5009                                  MachineRegisterInfo &MRI) {
5010   if (ModOpcode == TargetOpcode::G_FNEG) {
5011 Mods |= SISrcMods::NEG;
5012 // Check if all elements also have abs modifier
5013 SmallVector<Register, 8> NegAbsElts;
5014 for (auto El : Elts) {
5015 Register FabsSrc;
5016 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5017 break;
5018 NegAbsElts.push_back(FabsSrc);
5019 }
5020 if (Elts.size() != NegAbsElts.size()) {
5021 // Neg
5022 Src = buildRegSequence(Elts, InsertPt, MRI);
5023 } else {
5024 // Neg and Abs
5025 Mods |= SISrcMods::NEG_HI;
5026 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5027 }
5028 } else {
5029 assert(ModOpcode == TargetOpcode::G_FABS);
5030 // Abs
5031 Mods |= SISrcMods::NEG_HI;
5032 Src = buildRegSequence(Elts, InsertPt, MRI);
5033 }
5034}
5035
5037AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5038 Register Src = Root.getReg();
5039 unsigned Mods = SISrcMods::OP_SEL_1;
5040   SmallVector<Register, 8> EltsF32;
5041
5042 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5043 assert(BV->getNumSources() > 0);
5044 // Based on first element decide which mod we match, neg or abs
5045 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5046 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5047 ? AMDGPU::G_FNEG
5048 : AMDGPU::G_FABS;
5049 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5050 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5051 if (ElF32->getOpcode() != ModOpcode)
5052 break;
5053 EltsF32.push_back(ElF32->getOperand(1).getReg());
5054 }
5055
5056 // All elements had ModOpcode modifier
5057 if (BV->getNumSources() == EltsF32.size()) {
5058 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5059 *MRI);
5060 }
5061 }
5062
5063 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5064 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5065}
5066
5068AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5069 Register Src = Root.getReg();
5070 unsigned Mods = SISrcMods::OP_SEL_1;
5071 SmallVector<Register, 8> EltsV2F16;
5072
5073 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5074 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5075 Register FNegSrc;
5076 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5077 break;
5078 EltsV2F16.push_back(FNegSrc);
5079 }
5080
5081 // All elements had ModOpcode modifier
5082 if (CV->getNumSources() == EltsV2F16.size()) {
5083 Mods |= SISrcMods::NEG;
5084 Mods |= SISrcMods::NEG_HI;
5085 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5086 }
5087 }
5088
5089 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5090 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5091}
5092
5094AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5095 Register Src = Root.getReg();
5096 unsigned Mods = SISrcMods::OP_SEL_1;
5097 SmallVector<Register, 8> EltsV2F16;
5098
5099 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5100 assert(CV->getNumSources() > 0);
5101 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5102 // Based on first element decide which mod we match, neg or abs
5103 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5104 ? AMDGPU::G_FNEG
5105 : AMDGPU::G_FABS;
5106
5107 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5108 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5109 if (ElV2F16->getOpcode() != ModOpcode)
5110 break;
5111 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5112 }
5113
5114 // All elements had ModOpcode modifier
5115 if (CV->getNumSources() == EltsV2F16.size()) {
5116 MachineIRBuilder B(*Root.getParent());
5117 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5118 *MRI);
5119 }
5120 }
5121
5122 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5123 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5124}
5125
5127AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5128 std::optional<FPValueAndVReg> FPValReg;
5129 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5130 if (TII.isInlineConstant(FPValReg->Value)) {
5131 return {{[=](MachineInstrBuilder &MIB) {
5132 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5133 }}};
5134 }
5135 // Non-inlineable splat floats should not fall-through for integer immediate
5136 // checks.
5137 return {};
5138 }
5139
5140 APInt ICst;
5141 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5142 if (TII.isInlineConstant(ICst)) {
5143 return {
5144 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5145 }
5146 }
5147
5148 return {};
5149}
5150
5152AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5153 Register Src =
5154 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5155 unsigned Key = 0;
5156
5157 Register ShiftSrc;
5158 std::optional<ValueAndVReg> ShiftAmt;
5159 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5160 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5161 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5162 Key = ShiftAmt->Value.getZExtValue() / 8;
5163 Src = ShiftSrc;
5164 }
5165
5166 return {{
5167 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5168 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5169 }};
5170}
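
selectSWMMACIndex8 above folds a right shift of the 32-bit index source into the index_key operand: a shift amount that is a multiple of 8 simply selects a byte of the register. A standalone sketch of that derivation (plain C++, hypothetical helper name):

#include <cstdint>
#include <optional>

// If the index is computed as (Src >> ShiftAmt) with ShiftAmt a multiple of 8
// (and inside a 32-bit register), the shift can be dropped and expressed as a
// byte index key instead.
std::optional<unsigned> byteIndexKey(uint64_t ShiftAmt) {
  if (ShiftAmt >= 32 || ShiftAmt % 8 != 0)
    return std::nullopt;
  return unsigned(ShiftAmt / 8); // key 0..3 selects the byte
}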
5171
5173AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5174
5175 Register Src =
5176 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5177 unsigned Key = 0;
5178
5179 Register ShiftSrc;
5180 std::optional<ValueAndVReg> ShiftAmt;
5181 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5182 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5183 ShiftAmt->Value.getZExtValue() == 16) {
5184 Src = ShiftSrc;
5185 Key = 1;
5186 }
5187
5188 return {{
5189 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5190 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5191 }};
5192}
5193
5195AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5196 Register Src =
5197 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5198 unsigned Key = 0;
5199
5200 Register S32 = matchZeroExtendFromS32(Src);
5201 if (!S32)
5202 S32 = matchAnyExtendFromS32(Src);
5203
5204 if (S32) {
5205 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5206 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5207 assert(Def->getNumOperands() == 3);
5208 Register DstReg1 = Def->getOperand(1).getReg();
5209 if (mi_match(S32, *MRI,
5210 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5211 Src = Def->getOperand(2).getReg();
5212 Key = 1;
5213 }
5214 }
5215 }
5216
5217 return {{
5218 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5219 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5220 }};
5221}
5222
5224AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5225 Register Src;
5226 unsigned Mods;
5227 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5228
5229 // FIXME: Handle op_sel
5230 return {{
5231 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5232 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5233 }};
5234}
5235
5236// FIXME-TRUE16 remove when fake16 is removed
5238AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5239 Register Src;
5240 unsigned Mods;
5241 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5242 /*IsCanonicalizing=*/true,
5243 /*AllowAbs=*/false,
5244 /*OpSel=*/false);
5245
5246 return {{
5247 [=](MachineInstrBuilder &MIB) {
5248 MIB.addReg(
5249 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5250 },
5251 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5252 }};
5253}
5254
5256AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5257 Register Src;
5258 unsigned Mods;
5259 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5260 /*IsCanonicalizing=*/true,
5261 /*AllowAbs=*/false,
5262 /*OpSel=*/true);
5263
5264 return {{
5265 [=](MachineInstrBuilder &MIB) {
5266 MIB.addReg(
5267 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5268 },
5269 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5270 }};
5271}
5272
5273 // Given \p Offset and the load specified by the \p Root operand, check if
5274 // \p Offset is a multiple of the load byte size. If it is, update \p Offset to
5275 // a pre-scaled value and return true.
5276bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5277                                                   Register &Offset,
5278                                                   bool IsSigned) const {
5279 if (!Subtarget->hasScaleOffset())
5280 return false;
5281
5282 const MachineInstr &MI = *Root.getParent();
5283 MachineMemOperand *MMO = *MI.memoperands_begin();
5284
5285 if (!MMO->getSize().hasValue())
5286 return false;
5287
5288 uint64_t Size = MMO->getSize().getValue();
5289
5290 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5291 if (!OffsetReg)
5292 OffsetReg = Offset;
5293
5294 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5295 OffsetReg = Def->Reg;
5296
5297 Register Op0;
5298 MachineInstr *Mul;
5299 bool ScaleOffset =
5300 (isPowerOf2_64(Size) &&
5301 mi_match(OffsetReg, *MRI,
5302                 m_GShl(m_Reg(Op0),
5303                        m_any_of(m_SpecificICst(Log2_64(Size)),
5304                                 m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
5305       mi_match(OffsetReg, *MRI,
5306                m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
5307                                            m_Copy(m_SpecificICst(Size))))) ||
5308 mi_match(
5309 OffsetReg, *MRI,
5310 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5311 m_Reg(Op0), m_SpecificICst(Size))) ||
5312 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5313 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5314 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5315 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5316 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5317 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5318 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5319 mi_match(Mul->getOperand(3).getReg(), *MRI,
5321 m_Copy(m_SpecificICst(Size))))) &&
5322 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5323
5324 if (ScaleOffset)
5325 Offset = Op0;
5326
5327 return ScaleOffset;
5328}
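
The matching above recognizes offsets of the form index * size (or index << log2(size) when the size is a power of two) so the hardware can apply the scale itself via the SCAL bit. A standalone arithmetic sketch of the pre-scaling decision, covering only the plain multiply form:

#include <cstdint>

// If Offset is an exact multiple of the access Size, report the unscaled
// index; the instruction then re-applies the scale when SCAL is set.
bool preScaleOffset(uint64_t Offset, uint64_t Size, uint64_t &Index) {
  if (Size == 0 || Offset % Size != 0)
    return false;
  Index = Offset / Size;
  return true;
}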
5329
5330bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5331 Register &Base,
5332 Register *SOffset,
5333 int64_t *Offset,
5334 bool *ScaleOffset) const {
5335 MachineInstr *MI = Root.getParent();
5336 MachineBasicBlock *MBB = MI->getParent();
5337
5338 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5339 // then we can select all ptr + 32-bit offsets.
5340 SmallVector<GEPInfo, 4> AddrInfo;
5341 getAddrModeInfo(*MI, *MRI, AddrInfo);
5342
5343 if (AddrInfo.empty())
5344 return false;
5345
5346 const GEPInfo &GEPI = AddrInfo[0];
5347 std::optional<int64_t> EncodedImm;
5348
5349 if (ScaleOffset)
5350 *ScaleOffset = false;
5351
5352 if (SOffset && Offset) {
5353 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5354 /*HasSOffset=*/true);
5355 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5356 AddrInfo.size() > 1) {
5357 const GEPInfo &GEPI2 = AddrInfo[1];
5358 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5359 Register OffsetReg = GEPI2.SgprParts[1];
5360 if (ScaleOffset)
5361 *ScaleOffset =
5362 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5363 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5364 if (OffsetReg) {
5365 Base = GEPI2.SgprParts[0];
5366 *SOffset = OffsetReg;
5367 *Offset = *EncodedImm;
5368 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5369 return true;
5370
5371 // For unbuffered smem loads, it is illegal for the Immediate Offset
5372             // to be negative if the resulting (Offset + (M0 or SOffset or zero))
5373 // is negative. Handle the case where the Immediate Offset + SOffset
5374 // is negative.
5375 auto SKnown = VT->getKnownBits(*SOffset);
5376 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5377 return false;
5378
5379 return true;
5380 }
5381 }
5382 }
5383 return false;
5384 }
5385
5386 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5387 /*HasSOffset=*/false);
5388 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5389 Base = GEPI.SgprParts[0];
5390 *Offset = *EncodedImm;
5391 return true;
5392 }
5393
5394 // SGPR offset is unsigned.
5395 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5396 GEPI.Imm != 0) {
5397     // If we make it this far we have a load with a 32-bit immediate offset.
5398 // It is OK to select this using a sgpr offset, because we have already
5399 // failed trying to select this load into one of the _IMM variants since
5400 // the _IMM Patterns are considered before the _SGPR patterns.
5401 Base = GEPI.SgprParts[0];
5402 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5403 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5404 .addImm(GEPI.Imm);
5405 return true;
5406 }
5407
5408 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5409 Register OffsetReg = GEPI.SgprParts[1];
5410 if (ScaleOffset)
5411 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5412 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5413 if (OffsetReg) {
5414 Base = GEPI.SgprParts[0];
5415 *SOffset = OffsetReg;
5416 return true;
5417 }
5418 }
5419
5420 return false;
5421}
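
The SOffset+immediate path above only accepts a negative encoded immediate when the known minimum of SOffset keeps the final sum non-negative. A minimal standalone sketch of that gate; the known-bits query is replaced by a plain parameter:

#include <cstdint>

// Accept a negative immediate only if even the smallest possible SOffset
// value keeps (ImmOffset + SOffset) non-negative.
bool smemImmOffsetIsSafe(int64_t ImmOffset, int64_t SOffsetKnownMin,
                         bool TargetUsesSignedImm) {
  if (ImmOffset >= 0 || !TargetUsesSignedImm)
    return true;
  return ImmOffset + SOffsetKnownMin >= 0;
}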
5422
5424AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5425 Register Base;
5426 int64_t Offset;
5427 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5428 /* ScaleOffset */ nullptr))
5429 return std::nullopt;
5430
5431 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5432 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5433}
5434
5436AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5437 SmallVector<GEPInfo, 4> AddrInfo;
5438 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5439
5440 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5441 return std::nullopt;
5442
5443 const GEPInfo &GEPInfo = AddrInfo[0];
5444 Register PtrReg = GEPInfo.SgprParts[0];
5445 std::optional<int64_t> EncodedImm =
5446 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5447 if (!EncodedImm)
5448 return std::nullopt;
5449
5450 return {{
5451 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5452 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5453 }};
5454}
5455
5457AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5458 Register Base, SOffset;
5459 bool ScaleOffset;
5460 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5461 &ScaleOffset))
5462 return std::nullopt;
5463
5464 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5465 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5466 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5467 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5468}
5469
5471AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5472 Register Base, SOffset;
5473 int64_t Offset;
5474 bool ScaleOffset;
5475 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5476 return std::nullopt;
5477
5478 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5479 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5480 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5481 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5482 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5483}
5484
5485std::pair<Register, int>
5486AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5487 uint64_t FlatVariant) const {
5488 MachineInstr *MI = Root.getParent();
5489
5490 auto Default = std::pair(Root.getReg(), 0);
5491
5492 if (!STI.hasFlatInstOffsets())
5493 return Default;
5494
5495 Register PtrBase;
5496 int64_t ConstOffset;
5497 bool IsInBounds;
5498 std::tie(PtrBase, ConstOffset, IsInBounds) =
5499 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5500
5501   // Adding an immediate offset to the base address in a FLAT instruction must
5502   // not change the memory aperture in which the address falls.
5503 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5504 // instructions.
5505 if (ConstOffset == 0 ||
5506 (FlatVariant == SIInstrFlags::FlatScratch &&
5507 !isFlatScratchBaseLegal(Root.getReg())) ||
5508 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5509 return Default;
5510
5511 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5512 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5513 return Default;
5514
5515 return std::pair(PtrBase, ConstOffset);
5516}
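
selectFlatOffsetImpl only folds a constant when, for plain FLAT, the pointer add is inbounds (so the fold cannot move the address into a different aperture) and the constant fits the offset encoding. A standalone sketch of that gate; the signed field width is passed in rather than taken from the target:

#include <cstdint>

// Decide whether a constant pointer offset may be folded into a flat-style
// access. ImmBits is the signed width of the offset field (target dependent).
bool canFoldFlatOffset(int64_t ConstOffset, bool IsInBounds, bool IsPlainFlat,
                       unsigned ImmBits) {
  if (ConstOffset == 0)
    return false; // nothing to fold
  if (IsPlainFlat && !IsInBounds)
    return false; // could change the memory aperture
  int64_t Lo = -(int64_t(1) << (ImmBits - 1));
  int64_t Hi = (int64_t(1) << (ImmBits - 1)) - 1;
  return ConstOffset >= Lo && ConstOffset <= Hi;
}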
5517
5519AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5520 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5521
5522 return {{
5523 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5524 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5525 }};
5526}
5527
5529AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5530 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5531
5532 return {{
5533 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5534 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5535 }};
5536}
5537
5539AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5540 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5541
5542 return {{
5543 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5544 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5545 }};
5546}
5547
5548// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5550AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5551 unsigned CPolBits,
5552 bool NeedIOffset) const {
5553 Register Addr = Root.getReg();
5554 Register PtrBase;
5555 int64_t ConstOffset;
5556 int64_t ImmOffset = 0;
5557
5558 // Match the immediate offset first, which canonically is moved as low as
5559 // possible.
5560 std::tie(PtrBase, ConstOffset, std::ignore) =
5561 getPtrBaseWithConstantOffset(Addr, *MRI);
5562
5563 if (ConstOffset != 0) {
5564 if (NeedIOffset &&
5565         TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5566                               SIInstrFlags::FlatGlobal)) {
5567       Addr = PtrBase;
5568 ImmOffset = ConstOffset;
5569 } else {
5570 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5571 if (isSGPR(PtrBaseDef->Reg)) {
5572 if (ConstOffset > 0) {
5573 // Offset is too large.
5574 //
5575 // saddr + large_offset -> saddr +
5576 // (voffset = large_offset & ~MaxOffset) +
5577 // (large_offset & MaxOffset);
5578 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5579 if (NeedIOffset) {
5580 std::tie(SplitImmOffset, RemainderOffset) =
5581                 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5582                                     SIInstrFlags::FlatGlobal);
5583           }
5584
5585 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5586 : isUInt<32>(RemainderOffset)) {
5587 MachineInstr *MI = Root.getParent();
5588 MachineBasicBlock *MBB = MI->getParent();
5589 Register HighBits =
5590 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5591
5592 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5593 HighBits)
5594 .addImm(RemainderOffset);
5595
5596 if (NeedIOffset)
5597 return {{
5598 [=](MachineInstrBuilder &MIB) {
5599 MIB.addReg(PtrBase);
5600 }, // saddr
5601 [=](MachineInstrBuilder &MIB) {
5602 MIB.addReg(HighBits);
5603 }, // voffset
5604 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5605 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5606 }};
5607 return {{
5608 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5609 [=](MachineInstrBuilder &MIB) {
5610 MIB.addReg(HighBits);
5611 }, // voffset
5612 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5613 }};
5614 }
5615 }
5616
5617 // We are adding a 64 bit SGPR and a constant. If constant bus limit
5618 // is 1 we would need to perform 1 or 2 extra moves for each half of
5619 // the constant and it is better to do a scalar add and then issue a
5620 // single VALU instruction to materialize zero. Otherwise it is less
5621 // instructions to perform VALU adds with immediates or inline literals.
5622 unsigned NumLiterals =
5623 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5624 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5625 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5626 return std::nullopt;
5627 }
5628 }
5629 }
5630
5631 // Match the variable offset.
5632 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5633 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5634 // Look through the SGPR->VGPR copy.
5635 Register SAddr =
5636 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
5637
5638 if (isSGPR(SAddr)) {
5639 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
5640
5641 // It's possible voffset is an SGPR here, but the copy to VGPR will be
5642 // inserted later.
5643 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
5644 Subtarget->hasSignedGVSOffset());
5645 if (Register VOffset = matchExtendFromS32OrS32(
5646 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
5647 if (NeedIOffset)
5648 return {{[=](MachineInstrBuilder &MIB) { // saddr
5649 MIB.addReg(SAddr);
5650 },
5651 [=](MachineInstrBuilder &MIB) { // voffset
5652 MIB.addReg(VOffset);
5653 },
5654 [=](MachineInstrBuilder &MIB) { // offset
5655 MIB.addImm(ImmOffset);
5656 },
5657 [=](MachineInstrBuilder &MIB) { // cpol
5658 MIB.addImm(CPolBits |
5659 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5660 }}};
5661 return {{[=](MachineInstrBuilder &MIB) { // saddr
5662 MIB.addReg(SAddr);
5663 },
5664 [=](MachineInstrBuilder &MIB) { // voffset
5665 MIB.addReg(VOffset);
5666 },
5667 [=](MachineInstrBuilder &MIB) { // cpol
5668 MIB.addImm(CPolBits |
5669 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5670 }}};
5671 }
5672 }
5673 }
5674
5675 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
5676 // drop this.
5677 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
5678 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
5679 return std::nullopt;
5680
5681 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
5682 // moves required to copy a 64-bit SGPR to VGPR.
5683 MachineInstr *MI = Root.getParent();
5684 MachineBasicBlock *MBB = MI->getParent();
5685 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5686
5687 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
5688 .addImm(0);
5689
5690 if (NeedIOffset)
5691 return {{
5692 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5693 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5694 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5695 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5696 }};
5697 return {{
5698 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5699 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5700 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5701 }};
5702}
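
When the constant is too large for the immediate field, the code above splits it into an encodable part and a remainder that is materialized into the 32-bit voffset register. A standalone sketch of the simplest, mask-based form of that split, assuming an unsigned ImmBits-wide field (the real splitFlatOffset also handles signed ranges and per-variant limits):

#include <cstdint>
#include <utility>

// Split Offset into {immediate part, remainder}: the immediate stays in the
// instruction, the remainder would be moved into a 32-bit register operand.
std::pair<int64_t, int64_t> splitLargeOffset(int64_t Offset, unsigned ImmBits) {
  int64_t MaxImm = (int64_t(1) << ImmBits) - 1;
  return {Offset & MaxImm, Offset & ~MaxImm};
}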
5703
5705AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
5706 return selectGlobalSAddr(Root, 0);
5707}
5708
5710AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
5711 const MachineInstr &I = *Root.getParent();
5712
5713 // We are assuming CPol is always the last operand of the intrinsic.
5714 auto PassedCPol =
5715 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5716 return selectGlobalSAddr(Root, PassedCPol);
5717}
5718
5720AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
5721 const MachineInstr &I = *Root.getParent();
5722
5723 // We are assuming CPol is second from last operand of the intrinsic.
5724 auto PassedCPol =
5725 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5726 return selectGlobalSAddr(Root, PassedCPol);
5727}
5728
5730AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
5731 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
5732}
5733
5735AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
5736 MachineOperand &Root) const {
5737 const MachineInstr &I = *Root.getParent();
5738
5739 // We are assuming CPol is always the last operand of the intrinsic.
5740 auto PassedCPol =
5741 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5742 return selectGlobalSAddr(Root, PassedCPol, false);
5743}
5744
5746AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
5747 MachineOperand &Root) const {
5748 const MachineInstr &I = *Root.getParent();
5749
5750 // We are assuming CPol is second from last operand of the intrinsic.
5751 auto PassedCPol =
5752 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5753 return selectGlobalSAddr(Root, PassedCPol, false);
5754}
5755
5757AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
5758 Register Addr = Root.getReg();
5759 Register PtrBase;
5760 int64_t ConstOffset;
5761 int64_t ImmOffset = 0;
5762
5763 // Match the immediate offset first, which canonically is moved as low as
5764 // possible.
5765 std::tie(PtrBase, ConstOffset, std::ignore) =
5766 getPtrBaseWithConstantOffset(Addr, *MRI);
5767
5768 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
5769       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5770                             SIInstrFlags::FlatScratch)) {
5771     Addr = PtrBase;
5772 ImmOffset = ConstOffset;
5773 }
5774
5775 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5776 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5777 int FI = AddrDef->MI->getOperand(1).getIndex();
5778 return {{
5779 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5780 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5781 }};
5782 }
5783
5784 Register SAddr = AddrDef->Reg;
5785
5786 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5787 Register LHS = AddrDef->MI->getOperand(1).getReg();
5788 Register RHS = AddrDef->MI->getOperand(2).getReg();
5789 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5790 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
5791
5792 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
5793 isSGPR(RHSDef->Reg)) {
5794 int FI = LHSDef->MI->getOperand(1).getIndex();
5795 MachineInstr &I = *Root.getParent();
5796 MachineBasicBlock *BB = I.getParent();
5797 const DebugLoc &DL = I.getDebugLoc();
5798 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5799
5800 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
5801 .addFrameIndex(FI)
5802 .addReg(RHSDef->Reg)
5803 .setOperandDead(3); // Dead scc
5804 }
5805 }
5806
5807 if (!isSGPR(SAddr))
5808 return std::nullopt;
5809
5810 return {{
5811 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
5812 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5813 }};
5814}
5815
5816// Check whether the flat scratch SVS swizzle bug affects this access.
5817bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
5818 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
5819 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
5820 return false;
5821
5822 // The bug affects the swizzling of SVS accesses if there is any carry out
5823 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
5824 // voffset to (soffset + inst_offset).
5825 auto VKnown = VT->getKnownBits(VAddr);
5826 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
5827 KnownBits::makeConstant(APInt(32, ImmOffset)));
5828 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5829 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
5830 return (VMax & 3) + (SMax & 3) >= 4;
5831}
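
The swizzle-bug check above asks whether adding voffset to (soffset + imm) can carry out of bit 1. Using only upper bounds on the two addends, such a carry is possible exactly when the low two bits can sum to 4 or more. A tiny standalone sketch:

#include <cstdint>

// Conservatively report whether V + S can carry from bit 1 into bit 2,
// given upper bounds VMax and SMax on the two addends.
bool mayCarryOutOfLowTwoBits(uint64_t VMax, uint64_t SMax) {
  return (VMax & 3) + (SMax & 3) >= 4;
}
// Example: VMax = 3, SMax = 1 -> 3 + 1 >= 4, so a carry into bit 2 is possible.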
5832
5834AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
5835 Register Addr = Root.getReg();
5836 Register PtrBase;
5837 int64_t ConstOffset;
5838 int64_t ImmOffset = 0;
5839
5840 // Match the immediate offset first, which canonically is moved as low as
5841 // possible.
5842 std::tie(PtrBase, ConstOffset, std::ignore) =
5843 getPtrBaseWithConstantOffset(Addr, *MRI);
5844
5845 Register OrigAddr = Addr;
5846 if (ConstOffset != 0 &&
5847       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5848                             SIInstrFlags::FlatScratch)) {
5849     Addr = PtrBase;
5850 ImmOffset = ConstOffset;
5851 }
5852
5853 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5854 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
5855 return std::nullopt;
5856
5857 Register RHS = AddrDef->MI->getOperand(2).getReg();
5858 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
5859 return std::nullopt;
5860
5861 Register LHS = AddrDef->MI->getOperand(1).getReg();
5862 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5863
5864 if (OrigAddr != Addr) {
5865 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
5866 return std::nullopt;
5867 } else {
5868 if (!isFlatScratchBaseLegalSV(OrigAddr))
5869 return std::nullopt;
5870 }
5871
5872 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
5873 return std::nullopt;
5874
5875 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
5876                       ? AMDGPU::CPol::SCAL
5877                       : 0;
5878
5879 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5880 int FI = LHSDef->MI->getOperand(1).getIndex();
5881 return {{
5882 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5883 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5884 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5885 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5886 }};
5887 }
5888
5889 if (!isSGPR(LHS))
5890 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
5891 LHS = Def->Reg;
5892
5893 if (!isSGPR(LHS))
5894 return std::nullopt;
5895
5896 return {{
5897 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5898 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
5899 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5900 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5901 }};
5902}
5903
5905AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
5906 MachineInstr *MI = Root.getParent();
5907 MachineBasicBlock *MBB = MI->getParent();
5908 MachineFunction *MF = MBB->getParent();
5909 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5910
5911 int64_t Offset = 0;
5912 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
5913 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
5914 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5915
5916 // TODO: Should this be inside the render function? The iterator seems to
5917 // move.
5918 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
5919 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5920 HighBits)
5921 .addImm(Offset & ~MaxOffset);
5922
5923 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5924 MIB.addReg(Info->getScratchRSrcReg());
5925 },
5926 [=](MachineInstrBuilder &MIB) { // vaddr
5927 MIB.addReg(HighBits);
5928 },
5929 [=](MachineInstrBuilder &MIB) { // soffset
5930 // Use constant zero for soffset and rely on eliminateFrameIndex
5931 // to choose the appropriate frame register if need be.
5932 MIB.addImm(0);
5933 },
5934 [=](MachineInstrBuilder &MIB) { // offset
5935 MIB.addImm(Offset & MaxOffset);
5936 }}};
5937 }
5938
5939 assert(Offset == 0 || Offset == -1);
5940
5941 // Try to fold a frame index directly into the MUBUF vaddr field, and any
5942 // offsets.
5943 std::optional<int> FI;
5944 Register VAddr = Root.getReg();
5945
5946 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5947 Register PtrBase;
5948 int64_t ConstOffset;
5949 std::tie(PtrBase, ConstOffset, std::ignore) =
5950 getPtrBaseWithConstantOffset(VAddr, *MRI);
5951 if (ConstOffset != 0) {
5952 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
5953 (!STI.privateMemoryResourceIsRangeChecked() ||
5954 VT->signBitIsZero(PtrBase))) {
5955 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
5956 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
5957 FI = PtrBaseDef->getOperand(1).getIndex();
5958 else
5959 VAddr = PtrBase;
5960 Offset = ConstOffset;
5961 }
5962 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5963 FI = RootDef->getOperand(1).getIndex();
5964 }
5965
5966 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5967 MIB.addReg(Info->getScratchRSrcReg());
5968 },
5969 [=](MachineInstrBuilder &MIB) { // vaddr
5970 if (FI)
5971 MIB.addFrameIndex(*FI);
5972 else
5973 MIB.addReg(VAddr);
5974 },
5975 [=](MachineInstrBuilder &MIB) { // soffset
5976 // Use constant zero for soffset and rely on eliminateFrameIndex
5977 // to choose the appropriate frame register if need be.
5978 MIB.addImm(0);
5979 },
5980 [=](MachineInstrBuilder &MIB) { // offset
5981 MIB.addImm(Offset);
5982 }}};
5983}
5984
5985bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
5986 int64_t Offset) const {
5987 if (!isUInt<16>(Offset))
5988 return false;
5989
5990 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5991 return true;
5992
5993   // On Southern Islands, instructions with a negative base value and an offset
5994   // don't seem to work.
5995 return VT->signBitIsZero(Base);
5996}
5997
5998bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
5999 int64_t Offset1,
6000 unsigned Size) const {
6001 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6002 return false;
6003 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6004 return false;
6005
6006 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6007 return true;
6008
6009   // On Southern Islands, instructions with a negative base value and an offset
6010   // don't seem to work.
6011 return VT->signBitIsZero(Base);
6012}
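
ds_read2/ds_write2-style instructions encode two offsets in units of the element size, each in an 8-bit field, which is exactly what isDSOffset2Legal enforces. A standalone sketch of the encodability check:

#include <cstdint>

// Both byte offsets must be Size-aligned and, once divided by Size, must fit
// the 8-bit offset0/offset1 fields.
bool offsets2Encodable(int64_t Offset0, int64_t Offset1, int64_t Size) {
  if (Size <= 0 || Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  int64_t O0 = Offset0 / Size, O1 = Offset1 / Size;
  return O0 >= 0 && O0 <= 255 && O1 >= 0 && O1 <= 255;
}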
6013
6014// Return whether the operation has NoUnsignedWrap property.
6015static bool isNoUnsignedWrap(MachineInstr *Addr) {
6016 return Addr->getOpcode() == TargetOpcode::G_OR ||
6017          (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6018           Addr->getFlag(MachineInstr::NoUWrap));
6019 }
6020
6021// Check that the base address of flat scratch load/store in the form of `base +
6022// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6023// requirement). We always treat the first operand as the base address here.
6024bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6025 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6026
6027 if (isNoUnsignedWrap(AddrMI))
6028 return true;
6029
6030 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6031 // values.
6032 if (STI.hasSignedScratchOffsets())
6033 return true;
6034
6035 Register LHS = AddrMI->getOperand(1).getReg();
6036 Register RHS = AddrMI->getOperand(2).getReg();
6037
6038 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6039 std::optional<ValueAndVReg> RhsValReg =
6040         getIConstantVRegValWithLookThrough(RHS, *MRI);
6041     // If the immediate offset is negative and within certain range, the base
6042 // address cannot also be negative. If the base is also negative, the sum
6043 // would be either negative or much larger than the valid range of scratch
6044 // memory a thread can access.
6045 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6046 RhsValReg->Value.getSExtValue() > -0x40000000)
6047 return true;
6048 }
6049
6050 return VT->signBitIsZero(LHS);
6051}
6052
6053 // Check that the address values in SGPR/VGPR are legal for flat scratch in the
6054 // form of: SGPR + VGPR.
6055bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6056 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6057
6058 if (isNoUnsignedWrap(AddrMI))
6059 return true;
6060
6061 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6062 // values.
6063 if (STI.hasSignedScratchOffsets())
6064 return true;
6065
6066 Register LHS = AddrMI->getOperand(1).getReg();
6067 Register RHS = AddrMI->getOperand(2).getReg();
6068 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6069}
6070
6071 // Check that the address values in SGPR/VGPR are legal for flat scratch in the
6072 // form of: SGPR + VGPR + Imm.
6073bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6074 Register Addr) const {
6075 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6076 // values.
6077 if (STI.hasSignedScratchOffsets())
6078 return true;
6079
6080 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6081 Register Base = AddrMI->getOperand(1).getReg();
6082   std::optional<DefinitionAndSourceRegister> BaseDef =
6083       getDefSrcRegIgnoringCopies(Base, *MRI);
6084   std::optional<ValueAndVReg> RHSOffset =
6085       getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
6086   assert(RHSOffset);
6087
6088 // If the immediate offset is negative and within certain range, the base
6089 // address cannot also be negative. If the base is also negative, the sum
6090 // would be either negative or much larger than the valid range of scratch
6091 // memory a thread can access.
6092 if (isNoUnsignedWrap(BaseDef->MI) &&
6093 (isNoUnsignedWrap(AddrMI) ||
6094 (RHSOffset->Value.getSExtValue() < 0 &&
6095 RHSOffset->Value.getSExtValue() > -0x40000000)))
6096 return true;
6097
6098 Register LHS = BaseDef->MI->getOperand(1).getReg();
6099 Register RHS = BaseDef->MI->getOperand(2).getReg();
6100 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6101}
6102
6103bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6104 unsigned ShAmtBits) const {
6105 assert(MI.getOpcode() == TargetOpcode::G_AND);
6106
6107 std::optional<APInt> RHS =
6108 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6109 if (!RHS)
6110 return false;
6111
6112 if (RHS->countr_one() >= ShAmtBits)
6113 return true;
6114
6115 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6116 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6117}
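
A mask applied to a shift amount is removable when every low bit the shift actually consumes is either kept by the mask or already known to be zero in the operand, which is what the countr_one test above computes. A standalone 32-bit sketch of the same idea:

#include <cstdint>

static unsigned countTrailingOnes32(uint32_t V) {
  unsigned N = 0;
  while (V & 1) {
    ++N;
    V >>= 1;
  }
  return N;
}

// The AND cannot change the low ShAmtBits bits (the only bits a shift-amount
// operand contributes) if, for each of them, the mask keeps the bit or the
// bit is already known zero.
bool shiftMaskIsRedundant(uint32_t Mask, uint32_t KnownZeroOfOperand,
                          unsigned ShAmtBits) {
  return countTrailingOnes32(Mask | KnownZeroOfOperand) >= ShAmtBits;
}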
6118
6120AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6121 MachineOperand &Root) const {
6122 Register Reg = Root.getReg();
6123 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6124
6125   std::optional<DefinitionAndSourceRegister> Def =
6126       getDefSrcRegIgnoringCopies(Reg, *MRI);
6127   assert(Def && "this shouldn't be an optional result");
6128 Reg = Def->Reg;
6129
6130 if (Register WaveBase = getWaveAddress(Def->MI)) {
6131 return {{
6132 [=](MachineInstrBuilder &MIB) { // rsrc
6133 MIB.addReg(Info->getScratchRSrcReg());
6134 },
6135 [=](MachineInstrBuilder &MIB) { // soffset
6136 MIB.addReg(WaveBase);
6137 },
6138 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6139 }};
6140 }
6141
6142 int64_t Offset = 0;
6143
6144 // FIXME: Copy check is a hack
6145   Register BasePtr;
6146   if (mi_match(Reg, *MRI,
6147                m_GPtrAdd(m_Reg(BasePtr),
6148                          m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
6149     if (!TII.isLegalMUBUFImmOffset(Offset))
6150 return {};
6151 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6152 Register WaveBase = getWaveAddress(BasePtrDef);
6153 if (!WaveBase)
6154 return {};
6155
6156 return {{
6157 [=](MachineInstrBuilder &MIB) { // rsrc
6158 MIB.addReg(Info->getScratchRSrcReg());
6159 },
6160 [=](MachineInstrBuilder &MIB) { // soffset
6161 MIB.addReg(WaveBase);
6162 },
6163 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6164 }};
6165 }
6166
6167 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6168 !TII.isLegalMUBUFImmOffset(Offset))
6169 return {};
6170
6171 return {{
6172 [=](MachineInstrBuilder &MIB) { // rsrc
6173 MIB.addReg(Info->getScratchRSrcReg());
6174 },
6175 [=](MachineInstrBuilder &MIB) { // soffset
6176 MIB.addImm(0);
6177 },
6178 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6179 }};
6180}
6181
6182std::pair<Register, unsigned>
6183AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6184 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6185 int64_t ConstAddr = 0;
6186
6187 Register PtrBase;
6188 int64_t Offset;
6189 std::tie(PtrBase, Offset, std::ignore) =
6190 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6191
6192 if (Offset) {
6193 if (isDSOffsetLegal(PtrBase, Offset)) {
6194 // (add n0, c0)
6195 return std::pair(PtrBase, Offset);
6196 }
6197 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6198 // TODO
6199
6200
6201 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6202 // TODO
6203
6204 }
6205
6206 return std::pair(Root.getReg(), 0);
6207}
6208
6210AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6211 Register Reg;
6212 unsigned Offset;
6213 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6214 return {{
6215 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6216 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6217 }};
6218}
6219
6221AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6222 return selectDSReadWrite2(Root, 4);
6223}
6224
6226AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6227 return selectDSReadWrite2(Root, 8);
6228}
6229
6231AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6232 unsigned Size) const {
6233 Register Reg;
6234 unsigned Offset;
6235 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6236 return {{
6237 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6238 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6239 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6240 }};
6241}
6242
6243std::pair<Register, unsigned>
6244AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6245 unsigned Size) const {
6246 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6247 int64_t ConstAddr = 0;
6248
6249 Register PtrBase;
6250 int64_t Offset;
6251 std::tie(PtrBase, Offset, std::ignore) =
6252 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6253
6254 if (Offset) {
6255 int64_t OffsetValue0 = Offset;
6256 int64_t OffsetValue1 = Offset + Size;
6257 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6258 // (add n0, c0)
6259 return std::pair(PtrBase, OffsetValue0 / Size);
6260 }
6261 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6262 // TODO
6263
6264 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6265 // TODO
6266
6267 }
6268
6269 return std::pair(Root.getReg(), 0);
6270}
6271
6272/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6273/// the base value with the constant offset, and if the offset computation is
6274/// known to be inbounds. There may be intervening copies between \p Root and
6275/// the identified constant. Returns \p Root, 0, false if this does not match
6276/// the pattern.
6277std::tuple<Register, int64_t, bool>
6278AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6279 Register Root, const MachineRegisterInfo &MRI) const {
6280 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6281 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6282 return {Root, 0, false};
6283
6284 MachineOperand &RHS = RootI->getOperand(2);
6285   std::optional<ValueAndVReg> MaybeOffset =
6286       getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
6287   if (!MaybeOffset)
6288 return {Root, 0, false};
6289 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6290 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6291 IsInBounds};
6292}
6293
6294 static void addZeroImm(MachineInstrBuilder &MIB) {
6295   MIB.addImm(0);
6296}
6297
6298/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6299/// BasePtr is not valid, a null base pointer will be used.
6300 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6301                           uint32_t FormatLo, uint32_t FormatHi,
6302 Register BasePtr) {
6303 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6304 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6305 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6306 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6307
6308 B.buildInstr(AMDGPU::S_MOV_B32)
6309 .addDef(RSrc2)
6310 .addImm(FormatLo);
6311 B.buildInstr(AMDGPU::S_MOV_B32)
6312 .addDef(RSrc3)
6313 .addImm(FormatHi);
6314
6315 // Build the half of the subregister with the constants before building the
6316 // full 128-bit register. If we are building multiple resource descriptors,
6317 // this will allow CSEing of the 2-component register.
6318 B.buildInstr(AMDGPU::REG_SEQUENCE)
6319 .addDef(RSrcHi)
6320 .addReg(RSrc2)
6321 .addImm(AMDGPU::sub0)
6322 .addReg(RSrc3)
6323 .addImm(AMDGPU::sub1);
6324
6325 Register RSrcLo = BasePtr;
6326 if (!BasePtr) {
6327 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6328 B.buildInstr(AMDGPU::S_MOV_B64)
6329 .addDef(RSrcLo)
6330 .addImm(0);
6331 }
6332
6333 B.buildInstr(AMDGPU::REG_SEQUENCE)
6334 .addDef(RSrc)
6335 .addReg(RSrcLo)
6336 .addImm(AMDGPU::sub0_sub1)
6337 .addReg(RSrcHi)
6338 .addImm(AMDGPU::sub2_sub3);
6339
6340 return RSrc;
6341}
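
buildRSRC assembles a 128-bit resource from a 64-bit base pointer in the low two dwords and two format dwords above it, mirroring the REG_SEQUENCE built here. A standalone sketch of that packing, with a plain dword array standing in for the SGPR quad:

#include <array>
#include <cstdint>

// Dwords 0-1 hold the 64-bit base pointer, dwords 2-3 hold the format words.
std::array<uint32_t, 4> packDescriptor(uint64_t BasePtr, uint32_t FormatLo,
                                       uint32_t FormatHi) {
  return {uint32_t(BasePtr & 0xffffffffu), uint32_t(BasePtr >> 32), FormatLo,
          FormatHi};
}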
6342
6343 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6344                                 const SIInstrInfo &TII, Register BasePtr) {
6345 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6346
6347 // FIXME: Why are half the "default" bits ignored based on the addressing
6348 // mode?
6349 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6350}
6351
6352 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6353                                const SIInstrInfo &TII, Register BasePtr) {
6354 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6355
6356 // FIXME: Why are half the "default" bits ignored based on the addressing
6357 // mode?
6358 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6359}
6360
6361AMDGPUInstructionSelector::MUBUFAddressData
6362AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6363 MUBUFAddressData Data;
6364 Data.N0 = Src;
6365
6366 Register PtrBase;
6367 int64_t Offset;
6368
6369 std::tie(PtrBase, Offset, std::ignore) =
6370 getPtrBaseWithConstantOffset(Src, *MRI);
6371 if (isUInt<32>(Offset)) {
6372 Data.N0 = PtrBase;
6373 Data.Offset = Offset;
6374 }
6375
6376 if (MachineInstr *InputAdd
6377 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6378 Data.N2 = InputAdd->getOperand(1).getReg();
6379 Data.N3 = InputAdd->getOperand(2).getReg();
6380
6381     // FIXME: Need to fix extra SGPR->VGPR copies inserted
6382 // FIXME: Don't know this was defined by operand 0
6383 //
6384 // TODO: Remove this when we have copy folding optimizations after
6385 // RegBankSelect.
6386 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6387 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6388 }
6389
6390 return Data;
6391}
6392
6393/// Return if the addr64 mubuf mode should be used for the given address.
6394bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6395 // (ptr_add N2, N3) -> addr64, or
6396 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6397 if (Addr.N2)
6398 return true;
6399
6400 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6401 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6402}
6403
6404/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6405/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6406/// component.
6407void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6408 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6409 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6410 return;
6411
6412 // Illegal offset, store it in soffset.
6413 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6414 B.buildInstr(AMDGPU::S_MOV_B32)
6415 .addDef(SOffset)
6416 .addImm(ImmOffset);
6417 ImmOffset = 0;
6418}
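
splitIllegalMUBUFOffset keeps the constant in the immediate field only when it is encodable; otherwise the whole value moves into SOffset and the immediate becomes zero. A standalone sketch of that decision with the legality limit passed in:

#include <cstdint>

// If Imm does not fit the unsigned MUBUF offset field, move the whole value
// into the separate register operand (conceptually an s_mov_b32) and clear
// the immediate.
void splitMubufOffset(int64_t &Imm, int64_t &SOffsetValue, int64_t MaxImm) {
  if (Imm >= 0 && Imm <= MaxImm)
    return; // already encodable
  SOffsetValue = Imm;
  Imm = 0;
}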
6419
6420bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6421 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6422 Register &SOffset, int64_t &Offset) const {
6423 // FIXME: Predicates should stop this from reaching here.
6424 // addr64 bit was removed for volcanic islands.
6425 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6426 return false;
6427
6428 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6429 if (!shouldUseAddr64(AddrData))
6430 return false;
6431
6432 Register N0 = AddrData.N0;
6433 Register N2 = AddrData.N2;
6434 Register N3 = AddrData.N3;
6435 Offset = AddrData.Offset;
6436
6437 // Base pointer for the SRD.
6438 Register SRDPtr;
6439
6440 if (N2) {
6441 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6442 assert(N3);
6443 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6444 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6445 // addr64, and construct the default resource from a 0 address.
6446 VAddr = N0;
6447 } else {
6448 SRDPtr = N3;
6449 VAddr = N2;
6450 }
6451 } else {
6452 // N2 is not divergent.
6453 SRDPtr = N2;
6454 VAddr = N3;
6455 }
6456 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6457 // Use the default null pointer in the resource
6458 VAddr = N0;
6459 } else {
6460 // N0 -> offset, or
6461 // (N0 + C1) -> offset
6462 SRDPtr = N0;
6463 }
6464
6465 MachineIRBuilder B(*Root.getParent());
6466 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6467 splitIllegalMUBUFOffset(B, SOffset, Offset);
6468 return true;
6469}
6470
6471bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6472 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6473 int64_t &Offset) const {
6474
6475 // FIXME: Pattern should not reach here.
6476 if (STI.useFlatForGlobal())
6477 return false;
6478
6479 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6480 if (shouldUseAddr64(AddrData))
6481 return false;
6482
6483 // N0 -> offset, or
6484 // (N0 + C1) -> offset
6485 Register SRDPtr = AddrData.N0;
6486 Offset = AddrData.Offset;
6487
6488 // TODO: Look through extensions for 32-bit soffset.
6489 MachineIRBuilder B(*Root.getParent());
6490
6491 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6492 splitIllegalMUBUFOffset(B, SOffset, Offset);
6493 return true;
6494}
6495
6497AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6498 Register VAddr;
6499 Register RSrcReg;
6500 Register SOffset;
6501 int64_t Offset = 0;
6502
6503 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6504 return {};
6505
6506 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6507 // pattern.
6508 return {{
6509 [=](MachineInstrBuilder &MIB) { // rsrc
6510 MIB.addReg(RSrcReg);
6511 },
6512 [=](MachineInstrBuilder &MIB) { // vaddr
6513 MIB.addReg(VAddr);
6514 },
6515 [=](MachineInstrBuilder &MIB) { // soffset
6516 if (SOffset)
6517 MIB.addReg(SOffset);
6518 else if (STI.hasRestrictedSOffset())
6519 MIB.addReg(AMDGPU::SGPR_NULL);
6520 else
6521 MIB.addImm(0);
6522 },
6523 [=](MachineInstrBuilder &MIB) { // offset
6524 MIB.addImm(Offset);
6525 },
6526 addZeroImm, // cpol
6527 addZeroImm, // tfe
6528 addZeroImm // swz
6529 }};
6530}
6531
6533AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6534 Register RSrcReg;
6535 Register SOffset;
6536 int64_t Offset = 0;
6537
6538 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6539 return {};
6540
6541 return {{
6542 [=](MachineInstrBuilder &MIB) { // rsrc
6543 MIB.addReg(RSrcReg);
6544 },
6545 [=](MachineInstrBuilder &MIB) { // soffset
6546 if (SOffset)
6547 MIB.addReg(SOffset);
6548 else if (STI.hasRestrictedSOffset())
6549 MIB.addReg(AMDGPU::SGPR_NULL);
6550 else
6551 MIB.addImm(0);
6552 },
6553 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6554 addZeroImm, // cpol
6555 addZeroImm, // tfe
6556 addZeroImm, // swz
6557 }};
6558}
6559
6561AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6562
6563 Register SOffset = Root.getReg();
6564
6565 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6566 SOffset = AMDGPU::SGPR_NULL;
6567
6568 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
6569}
6570
6571/// Get an immediate that must be 32-bits, and treated as zero extended.
6572static std::optional<uint64_t>
6573 getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
6574   // getIConstantVRegVal sexts any values, so see if that matters.
6575 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
6576 if (!OffsetVal || !isInt<32>(*OffsetVal))
6577 return std::nullopt;
6578 return Lo_32(*OffsetVal);
6579}
6580
6582AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6583 std::optional<uint64_t> OffsetVal =
6584 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
6585 if (!OffsetVal)
6586 return {};
6587
6588 std::optional<int64_t> EncodedImm =
6589 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
6590 if (!EncodedImm)
6591 return {};
6592
6593 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6594}
6595
6597AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6598 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6599
6600 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
6601 if (!OffsetVal)
6602 return {};
6603
6604   std::optional<int64_t> EncodedImm =
6605       AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
6606   if (!EncodedImm)
6607 return {};
6608
6609 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6610}
6611
6613AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6614 // Match the (soffset + offset) pair as a 32-bit register base and
6615 // an immediate offset.
6616 Register SOffset;
6617 unsigned Offset;
6618 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
6619 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
6620 if (!SOffset)
6621 return std::nullopt;
6622
6623 std::optional<int64_t> EncodedOffset =
6624 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
6625 if (!EncodedOffset)
6626 return std::nullopt;
6627
6628 assert(MRI->getType(SOffset) == LLT::scalar(32));
6629 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6630 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
6631}
6632
6633std::pair<Register, unsigned>
6634AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6635 bool &Matched) const {
6636 Matched = false;
6637
6638 Register Src;
6639 unsigned Mods;
6640 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
6641
6642 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
6643 assert(MRI->getType(Src) == LLT::scalar(16));
6644
6645     // Only change Src if a source modifier could be gained. In such cases the
6646     // new Src could be an SGPR, but this does not violate the constant bus
6647     // restriction for the instruction that is being selected.
6648 Src = stripBitCast(Src, *MRI);
6649
6650 const auto CheckAbsNeg = [&]() {
6651 // Be careful about folding modifiers if we already have an abs. fneg is
6652 // applied last, so we don't want to apply an earlier fneg.
6653 if ((Mods & SISrcMods::ABS) == 0) {
6654 unsigned ModsTmp;
6655 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
6656
6657 if ((ModsTmp & SISrcMods::NEG) != 0)
6658 Mods ^= SISrcMods::NEG;
6659
6660 if ((ModsTmp & SISrcMods::ABS) != 0)
6661 Mods |= SISrcMods::ABS;
6662 }
6663 };
6664
6665 CheckAbsNeg();
6666
6667 // op_sel/op_sel_hi decide the source type and source.
6668 // If the source's op_sel_hi is set, it indicates to do a conversion from
6669 // fp16. If the source's op_sel is set, it picks the high half of the
6670 // source register.
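// For example (illustrative): mixing in the high half of a packed 16-bit
// source sets OP_SEL_0 in addition to OP_SEL_1; a plain fp16 source sets only
// OP_SEL_1.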
6671
6672 Mods |= SISrcMods::OP_SEL_1;
6673
6674 if (isExtractHiElt(*MRI, Src, Src)) {
6675 Mods |= SISrcMods::OP_SEL_0;
6676 CheckAbsNeg();
6677 }
6678
6679 Matched = true;
6680 }
6681
6682 return {Src, Mods};
6683}
6684
6685 InstructionSelector::ComplexRendererFns
6686 AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
6687 MachineOperand &Root) const {
6688 Register Src;
6689 unsigned Mods;
6690 bool Matched;
6691 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6692 if (!Matched)
6693 return {};
6694
6695 return {{
6696 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6697 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6698 }};
6699}
6700
6701 InstructionSelector::ComplexRendererFns
6702 AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
6703 Register Src;
6704 unsigned Mods;
6705 bool Matched;
6706 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6707
6708 return {{
6709 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6710 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6711 }};
6712}
6713
6714bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
6715 MachineInstr &I, Intrinsic::ID IntrID) const {
6716 MachineBasicBlock *MBB = I.getParent();
6717 const DebugLoc &DL = I.getDebugLoc();
6718 Register CCReg = I.getOperand(0).getReg();
6719
6720 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
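// (S_CMP_EQ_U32 0, 0 always compares equal, so SCC is unconditionally set
// to 1 here.)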
6721 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
6722
6723 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
6724 .addImm(I.getOperand(2).getImm());
6725
6726 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
6727
6728 I.eraseFromParent();
6729 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
6730 *MRI);
6731}
6732
6733bool AMDGPUInstructionSelector::selectSGetBarrierState(
6734 MachineInstr &I, Intrinsic::ID IntrID) const {
6735 MachineBasicBlock *MBB = I.getParent();
6736 const DebugLoc &DL = I.getDebugLoc();
6737 MachineOperand BarOp = I.getOperand(2);
6738 std::optional<int64_t> BarValImm =
6739 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6740
6741 if (!BarValImm) {
6742 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6743 .addReg(BarOp.getReg());
6744 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6745 }
6746 MachineInstrBuilder MIB;
6747 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
6748 : AMDGPU::S_GET_BARRIER_STATE_M0;
6749 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6750
6751 auto DstReg = I.getOperand(0).getReg();
6752 const TargetRegisterClass *DstRC =
6753 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6754 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6755 return false;
6756 MIB.addDef(DstReg);
6757 if (BarValImm) {
6758 MIB.addImm(*BarValImm);
6759 }
6760 I.eraseFromParent();
6761 return true;
6762}
6763
6764unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
6765 if (HasInlineConst) {
6766 switch (IntrID) {
6767 default:
6768 llvm_unreachable("not a named barrier op");
6769 case Intrinsic::amdgcn_s_barrier_join:
6770 return AMDGPU::S_BARRIER_JOIN_IMM;
6771 case Intrinsic::amdgcn_s_get_named_barrier_state:
6772 return AMDGPU::S_GET_BARRIER_STATE_IMM;
6773 };
6774 } else {
6775 switch (IntrID) {
6776 default:
6777 llvm_unreachable("not a named barrier op");
6778 case Intrinsic::amdgcn_s_barrier_join:
6779 return AMDGPU::S_BARRIER_JOIN_M0;
6780 case Intrinsic::amdgcn_s_get_named_barrier_state:
6781 return AMDGPU::S_GET_BARRIER_STATE_M0;
6782 };
6783 }
6784}
6785
6786bool AMDGPUInstructionSelector::selectNamedBarrierInit(
6787 MachineInstr &I, Intrinsic::ID IntrID) const {
6788 MachineBasicBlock *MBB = I.getParent();
6789 const DebugLoc &DL = I.getDebugLoc();
6790 MachineOperand BarOp = I.getOperand(1);
6791 MachineOperand CntOp = I.getOperand(2);
6792
6793 // BarID = (BarOp >> 4) & 0x3F
6794 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6795 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6796 .add(BarOp)
6797 .addImm(4u)
6798 .setOperandDead(3); // Dead scc
6799
6800 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6801 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6802 .addReg(TmpReg0)
6803 .addImm(0x3F)
6804 .setOperandDead(3); // Dead scc
6805
6806 // M0 = ((CntOp & 0x3F) << ShAmt) | BarID
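// Worked example (illustrative): BarOp = 0x1A0, CntOp = 7 gives
// BarID = (0x1A0 >> 4) & 0x3F = 0x1A and M0 = (7 << 16) | 0x1A = 0x7001A.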
6807 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6808 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
6809 .add(CntOp)
6810 .addImm(0x3F)
6811 .setOperandDead(3); // Dead scc
6812
6813 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6814 constexpr unsigned ShAmt = 16;
6815 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
6816 .addReg(TmpReg2)
6817 .addImm(ShAmt)
6818 .setOperandDead(3); // Dead scc
6819
6820 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6821 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
6822 .addReg(TmpReg1)
6823 .addReg(TmpReg3)
6824 .setOperandDead(3); // Dead scc
6825
6826 auto CopyMIB =
6827 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
6828 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6829
6830 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
6831 ? AMDGPU::S_BARRIER_INIT_M0
6832 : AMDGPU::S_BARRIER_SIGNAL_M0;
6833 MachineInstrBuilder MIB;
6834 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6835
6836 I.eraseFromParent();
6837 return true;
6838}
6839
6840bool AMDGPUInstructionSelector::selectNamedBarrierInst(
6841 MachineInstr &I, Intrinsic::ID IntrID) const {
6842 MachineBasicBlock *MBB = I.getParent();
6843 const DebugLoc &DL = I.getDebugLoc();
6844 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
6845 ? I.getOperand(2)
6846 : I.getOperand(1);
6847 std::optional<int64_t> BarValImm =
6848 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6849
6850 if (!BarValImm) {
6851 // BarID = (BarOp >> 4) & 0x3F
6852 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6853 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6854 .addReg(BarOp.getReg())
6855 .addImm(4u)
6856 .setOperandDead(3); // Dead scc
6857
6858 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6859 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6860 .addReg(TmpReg0)
6861 .addImm(0x3F)
6862 .setOperandDead(3); // Dead scc
6863
6864 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6865 .addReg(TmpReg1);
6866 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6867 }
6868
6869 MachineInstrBuilder MIB;
6870 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
6871 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6872
6873 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
6874 auto DstReg = I.getOperand(0).getReg();
6875 const TargetRegisterClass *DstRC =
6876 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6877 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6878 return false;
6879 MIB.addDef(DstReg);
6880 }
6881
6882 if (BarValImm) {
6883 auto BarId = ((*BarValImm) >> 4) & 0x3F;
6884 MIB.addImm(BarId);
6885 }
6886
6887 I.eraseFromParent();
6888 return true;
6889}
6890
6891void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
6892 const MachineInstr &MI,
6893 int OpIdx) const {
6894 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6895 "Expected G_CONSTANT");
6896 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
6897}
6898
6899void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
6900 const MachineInstr &MI,
6901 int OpIdx) const {
6902 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6903 "Expected G_CONSTANT");
6904 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
6905}
6906
6907void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
6908 const MachineInstr &MI,
6909 int OpIdx) const {
6910 const MachineOperand &Op = MI.getOperand(1);
6911 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
6912 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
6913}
6914
6915void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
6916 const MachineInstr &MI,
6917 int OpIdx) const {
6918 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6919 "Expected G_CONSTANT");
6920 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
6921}
6922
6923 /// This only really exists to satisfy DAG type checking machinery, so it is a
6924 /// no-op here.
6925void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
6926 const MachineInstr &MI,
6927 int OpIdx) const {
6928 const MachineOperand &Op = MI.getOperand(OpIdx);
6929 int64_t Imm;
6930 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
6931 MIB.addImm(Imm);
6932 else
6933 MIB.addImm(Op.getImm());
6934}
6935
6936void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
6937 const MachineInstr &MI,
6938 int OpIdx) const {
6939 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
6940}
6941
6942void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
6943 const MachineInstr &MI,
6944 int OpIdx) const {
6945 assert(OpIdx >= 0 && "expected to match an immediate operand");
6946 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6947}
6948
6949void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
6950 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6951 assert(OpIdx >= 0 && "expected to match an immediate operand");
6952 MIB.addImm(
6953 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6954}
6955
6956void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
6957 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6958 assert(OpIdx >= 0 && "expected to match an immediate operand");
6959 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
6960 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
6961 : (int64_t)SISrcMods::DST_OP_SEL);
6962}
6963
6964void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
6965 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6966 assert(OpIdx >= 0 && "expected to match an immediate operand");
6967 MIB.addImm(
6968 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6969}
6970
6971void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
6972 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6973 assert(OpIdx >= 0 && "expected to match an immediate operand");
6974 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
6975 ? (int64_t)(SISrcMods::OP_SEL_0)
6976 : 0);
6977}
6978
6979void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
6980 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6981 assert(OpIdx >= 0 && "expected to match an immediate operand");
6982 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
6983 : 0);
6984}
6985
6986void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
6987 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6988 assert(OpIdx >= 0 && "expected to match an immediate operand");
6989 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
6990 : 0);
6991}
6992
6993void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
6994 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6995 assert(OpIdx >= 0 && "expected to match an immediate operand");
6996 MIB.addImm(
6997 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6998}
6999
7000void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7001 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7002 assert(OpIdx >= 0 && "expected to match an immediate operand");
7003 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7004 ? (int64_t)SISrcMods::DST_OP_SEL
7005 : 0);
7006}
7007
7008void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7009 const MachineInstr &MI,
7010 int OpIdx) const {
7011 assert(OpIdx >= 0 && "expected to match an immediate operand");
7012 MIB.addImm(MI.getOperand(OpIdx).getImm() &
7015}
7016
7017void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7018 const MachineInstr &MI,
7019 int OpIdx) const {
7020 assert(OpIdx >= 0 && "expected to match an immediate operand");
7021 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7022 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
7023 : AMDGPU::CPol::SWZ_pregfx12);
7024 MIB.addImm(Swizzle);
7025}
7026
7027void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7028 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7029 assert(OpIdx >= 0 && "expected to match an immediate operand");
7030 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7031 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7032 : AMDGPU::CPol::ALL_pregfx12);
7033 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7034}
7035
7036void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7037 const MachineInstr &MI,
7038 int OpIdx) const {
7039 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7040}
7041
7042void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7043 const MachineInstr &MI,
7044 int OpIdx) const {
7045 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7046 int ExpVal = APF.getExactLog2Abs();
7047 assert(ExpVal != INT_MIN);
7048 MIB.addImm(ExpVal);
7049}
7050
7051void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7052 const MachineInstr &MI,
7053 int OpIdx) const {
7054 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7055 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7056 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7057 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7058 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7059}
7060
7061void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7062 const MachineInstr &MI,
7063 int OpIdx) const {
7064 unsigned Mods = SISrcMods::OP_SEL_1;
7065 if (MI.getOperand(OpIdx).getImm())
7066 Mods ^= SISrcMods::NEG;
7067 MIB.addImm((int64_t)Mods);
7068}
7069
7070void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7071 const MachineInstr &MI,
7072 int OpIdx) const {
7073 unsigned Mods = SISrcMods::OP_SEL_1;
7074 if (MI.getOperand(OpIdx).getImm())
7075 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7076 MIB.addImm((int64_t)Mods);
7077}
7078
7079void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7080 const MachineInstr &MI,
7081 int OpIdx) const {
7082 unsigned Val = MI.getOperand(OpIdx).getImm();
7083 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7084 if (Val == 1) // neg
7085 Mods ^= SISrcMods::NEG;
7086 if (Val == 2) // abs
7087 Mods ^= SISrcMods::ABS;
7088 if (Val == 3) // neg and abs
7089 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7090 MIB.addImm((int64_t)Mods);
7091}
7092
7093void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7094 const MachineInstr &MI,
7095 int OpIdx) const {
7096 uint32_t V = MI.getOperand(2).getImm();
7099 if (!Subtarget->hasSafeCUPrefetch())
7100 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7101 MIB.addImm(V);
7102}
7103
7104/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7105void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7106 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7107 unsigned Val = MI.getOperand(OpIdx).getImm();
7108 unsigned New = 0;
7109 if (Val & 0x1)
7111 if (Val & 0x2)
7113 MIB.addImm(New);
7114}
7115
7116bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7117 return TII.isInlineConstant(Imm);
7118}
7119
7120bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7121 return TII.isInlineConstant(Imm);
7122}