AMDGPUInstructionSelector.cpp
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave-level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
120 !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
121 return false;
122 const MCInstrDesc &MCID = MI.getDesc();
123 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
124 MI.getOperand(0).setIsEarlyClobber(true);
125 }
126 return true;
127}
128
129bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
130 const DebugLoc &DL = I.getDebugLoc();
131 MachineBasicBlock *BB = I.getParent();
132 I.setDesc(TII.get(TargetOpcode::COPY));
133
134 const MachineOperand &Src = I.getOperand(1);
135 MachineOperand &Dst = I.getOperand(0);
136 Register DstReg = Dst.getReg();
137 Register SrcReg = Src.getReg();
138
139 if (isVCC(DstReg, *MRI)) {
140 if (SrcReg == AMDGPU::SCC) {
141 const TargetRegisterClass *RC
142 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
143 if (!RC)
144 return true;
145 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
146 }
147
148 if (!isVCC(SrcReg, *MRI)) {
149 // TODO: Should probably leave the copy and let copyPhysReg expand it.
150 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
151 return false;
152
153 const TargetRegisterClass *SrcRC
154 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
155
156 std::optional<ValueAndVReg> ConstVal =
157 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
158 if (ConstVal) {
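 // A constant bool source folds directly into a wave-wide mask move: all ones
 // (-1) for true, 0 for false.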
159 unsigned MovOpc =
160 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
161 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
162 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
163 } else {
164 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
165
166 // We can't trust the high bits at this point, so clear them.
167
168 // TODO: Skip masking high bits if def is known boolean.
169
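 // The AND/V_CMP pair below produces a canonical lane mask: compare-not-equal
 // against zero is true exactly for the lanes whose masked bit 0 is set.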
170 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
171 assert(Subtarget->useRealTrue16Insts());
172 const int64_t NoMods = 0;
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
174 .addImm(NoMods)
175 .addImm(1)
176 .addImm(NoMods)
177 .addReg(SrcReg)
178 .addImm(NoMods);
179 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
180 .addImm(NoMods)
181 .addImm(0)
182 .addImm(NoMods)
183 .addReg(MaskedReg)
184 .addImm(NoMods);
185 } else {
186 bool IsSGPR = TRI.isSGPRClass(SrcRC);
187 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
188 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
189 .addImm(1)
190 .addReg(SrcReg);
191 if (IsSGPR)
192 And.setOperandDead(3); // Dead scc
193
194 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
195 .addImm(0)
196 .addReg(MaskedReg);
197 }
198 }
199
200 if (!MRI->getRegClassOrNull(SrcReg))
201 MRI->setRegClass(SrcReg, SrcRC);
202 I.eraseFromParent();
203 return true;
204 }
205
206 const TargetRegisterClass *RC =
207 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
208 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
209 return false;
210
211 return true;
212 }
213
214 for (const MachineOperand &MO : I.operands()) {
215 if (MO.getReg().isPhysical())
216 continue;
217
218 const TargetRegisterClass *RC =
219 TRI.getConstrainedRegClassForOperand(MO, *MRI);
220 if (!RC)
221 continue;
222 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
223 }
224 return true;
225}
226
227bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
228 const DebugLoc &DL = I.getDebugLoc();
229 MachineBasicBlock *BB = I.getParent();
230 Register VCCReg = I.getOperand(1).getReg();
231 MachineInstr *Cmp;
232
233 // Set SCC as a side effect with S_CMP or S_OR.
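 // SCC ends up set iff the incoming lane mask has any bit set; the COPY below
 // then moves SCC into the 32-bit result.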
234 if (STI.hasScalarCompareEq64()) {
235 unsigned CmpOpc =
236 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
237 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
238 } else {
239 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
240 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
241 .addReg(VCCReg)
242 .addReg(VCCReg);
243 }
244
245 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
246
247 Register DstReg = I.getOperand(0).getReg();
248 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
249
250 I.eraseFromParent();
251 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
252}
253
254bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
255 const DebugLoc &DL = I.getDebugLoc();
256 MachineBasicBlock *BB = I.getParent();
257
258 Register DstReg = I.getOperand(0).getReg();
259 Register SrcReg = I.getOperand(1).getReg();
260 std::optional<ValueAndVReg> Arg =
261 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
262
263 if (Arg) {
264 const int64_t Value = Arg->Value.getZExtValue();
265 if (Value == 0) {
266 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
267 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
268 } else {
269 assert(Value == 1);
270 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
271 }
272 I.eraseFromParent();
273 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
274 }
275
276 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
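 // S_CSELECT picks EXEC when SCC is set and 0 otherwise, rebuilding a full
 // wave lane mask from the scalar bool.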
277 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
278
279 unsigned SelectOpcode =
280 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
281 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
282 .addReg(TRI.getExec())
283 .addImm(0);
284
285 I.eraseFromParent();
286 constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
287 return true;
288}
289
290bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 Register SrcReg = I.getOperand(1).getReg();
293
294 const DebugLoc &DL = I.getDebugLoc();
295 MachineBasicBlock *BB = I.getParent();
296
297 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
298 .addReg(SrcReg);
299
300 I.eraseFromParent();
301 constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
302 return true;
303}
304
305bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
306 const Register DefReg = I.getOperand(0).getReg();
307 const LLT DefTy = MRI->getType(DefReg);
308
309 // S1 G_PHIs should not be selected in instruction-select; instead:
310 // - divergent S1 G_PHI should go through lane mask merging algorithm
311 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
312 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
313 if (DefTy == LLT::scalar(1))
314 return false;
315
316 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
317
318 const RegClassOrRegBank &RegClassOrBank =
319 MRI->getRegClassOrRegBank(DefReg);
320
321 const TargetRegisterClass *DefRC =
322 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
323 if (!DefRC) {
324 if (!DefTy.isValid()) {
325 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
326 return false;
327 }
328
329 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
330 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
331 if (!DefRC) {
332 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
333 return false;
334 }
335 }
336
337 // If inputs have register bank, assign corresponding reg class.
338 // Note: registers don't need to have the same reg bank.
339 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
340 const Register SrcReg = I.getOperand(i).getReg();
341
342 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
343 if (RB) {
344 const LLT SrcTy = MRI->getType(SrcReg);
345 const TargetRegisterClass *SrcRC =
346 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
347 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
348 return false;
349 }
350 }
351
352 I.setDesc(TII.get(TargetOpcode::PHI));
353 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
354}
355
356MachineOperand
357AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
358 const TargetRegisterClass &SubRC,
359 unsigned SubIdx) const {
360
361 MachineInstr *MI = MO.getParent();
362 MachineBasicBlock *BB = MO.getParent()->getParent();
363 Register DstReg = MRI->createVirtualRegister(&SubRC);
364
365 if (MO.isReg()) {
366 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
367 Register Reg = MO.getReg();
368 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
369 .addReg(Reg, {}, ComposedSubIdx);
370
371 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
372 MO.isKill(), MO.isDead(), MO.isUndef(),
373 MO.isEarlyClobber(), 0, MO.isDebug(),
374 MO.isInternalRead());
375 }
376
377 assert(MO.isImm());
378
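 // Immediate operands are split into sign-extended 32-bit halves matching the
 // requested sub0/sub1 index.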
379 APInt Imm(64, MO.getImm());
380
381 switch (SubIdx) {
382 default:
383 llvm_unreachable("do not know to split immediate with this sub index.");
384 case AMDGPU::sub0:
385 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
386 case AMDGPU::sub1:
387 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
388 }
389}
390
391static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
392 switch (Opc) {
393 case AMDGPU::G_AND:
394 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
395 case AMDGPU::G_OR:
396 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
397 case AMDGPU::G_XOR:
398 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
399 default:
400 llvm_unreachable("not a bit op");
401 }
402}
403
404bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
405 Register DstReg = I.getOperand(0).getReg();
406 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
407
408 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
409 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
410 DstRB->getID() != AMDGPU::VCCRegBankID)
411 return false;
412
413 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
414 STI.isWave64());
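 // VCC-bank booleans are whole wave masks, so they need the 64-bit opcodes on
 // wave64 even though the IR type is only 1 bit wide.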
415 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
416
417 // Dead implicit-def of scc
418 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
419 true, // isImp
420 false, // isKill
421 true)); // isDead
422 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
423 return true;
424}
425
426bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
427 MachineBasicBlock *BB = I.getParent();
428 MachineFunction *MF = BB->getParent();
429 Register DstReg = I.getOperand(0).getReg();
430 const DebugLoc &DL = I.getDebugLoc();
431 LLT Ty = MRI->getType(DstReg);
432 if (Ty.isVector())
433 return false;
434
435 unsigned Size = Ty.getSizeInBits();
436 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
437 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
438 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
439
440 if (Size == 32) {
441 if (IsSALU) {
442 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
443 MachineInstr *Add =
444 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
445 .add(I.getOperand(1))
446 .add(I.getOperand(2))
447 .setOperandDead(3); // Dead scc
448 I.eraseFromParent();
449 constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
450 return true;
451 }
452
453 if (STI.hasAddNoCarryInsts()) {
454 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
455 I.setDesc(TII.get(Opc));
456 I.addOperand(*MF, MachineOperand::CreateImm(0));
457 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
458 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
459 return true;
460 }
461
462 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
463
464 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
465 MachineInstr *Add
466 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
467 .addDef(UnusedCarry, RegState::Dead)
468 .add(I.getOperand(1))
469 .add(I.getOperand(2))
470 .addImm(0);
471 I.eraseFromParent();
472 constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
473 return true;
474 }
475
476 assert(!Sub && "illegal sub should not reach here");
477
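 // 64-bit add: split into a low add that defines a carry and a high add that
 // consumes it, then stitch the halves back together with REG_SEQUENCE.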
478 const TargetRegisterClass &RC
479 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
480 const TargetRegisterClass &HalfRC
481 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
482
483 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
484 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
485 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
486 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
487
488 Register DstLo = MRI->createVirtualRegister(&HalfRC);
489 Register DstHi = MRI->createVirtualRegister(&HalfRC);
490
491 if (IsSALU) {
492 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
493 .add(Lo1)
494 .add(Lo2);
495 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
496 .add(Hi1)
497 .add(Hi2)
498 .setOperandDead(3); // Dead scc
499 } else {
500 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
501 Register CarryReg = MRI->createVirtualRegister(CarryRC);
502 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
503 .addDef(CarryReg)
504 .add(Lo1)
505 .add(Lo2)
506 .addImm(0);
507 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
508 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
509 .add(Hi1)
510 .add(Hi2)
511 .addReg(CarryReg, RegState::Kill)
512 .addImm(0);
513
514 constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI);
515 }
516
517 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
518 .addReg(DstLo)
519 .addImm(AMDGPU::sub0)
520 .addReg(DstHi)
521 .addImm(AMDGPU::sub1);
522
523
524 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
525 return false;
526
527 I.eraseFromParent();
528 return true;
529}
530
531bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
532 MachineInstr &I) const {
533 MachineBasicBlock *BB = I.getParent();
534 MachineFunction *MF = BB->getParent();
535 const DebugLoc &DL = I.getDebugLoc();
536 Register Dst0Reg = I.getOperand(0).getReg();
537 Register Dst1Reg = I.getOperand(1).getReg();
538 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
539 I.getOpcode() == AMDGPU::G_UADDE;
540 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
541 I.getOpcode() == AMDGPU::G_USUBE;
542
543 if (isVCC(Dst1Reg, *MRI)) {
544 unsigned NoCarryOpc =
545 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
546 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
547 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
548 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
549 I.addOperand(*MF, MachineOperand::CreateImm(0));
550 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
551 return true;
552 }
553
554 Register Src0Reg = I.getOperand(2).getReg();
555 Register Src1Reg = I.getOperand(3).getReg();
556
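 // Scalar path: the carry is modeled through SCC, so an incoming carry is
 // copied into SCC first, and SCC is copied back out below only if the
 // carry-out has uses.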
557 if (HasCarryIn) {
558 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
559 .addReg(I.getOperand(4).getReg());
560 }
561
562 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
563 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
564
565 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
566 .add(I.getOperand(2))
567 .add(I.getOperand(3));
568
569 if (MRI->use_nodbg_empty(Dst1Reg)) {
570 CarryInst.setOperandDead(3); // Dead scc
571 } else {
572 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
573 .addReg(AMDGPU::SCC);
574 if (!MRI->getRegClassOrNull(Dst1Reg))
575 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
576 }
577
578 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
579 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
580 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
581 return false;
582
583 if (HasCarryIn &&
584 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
585 AMDGPU::SReg_32RegClass, *MRI))
586 return false;
587
588 I.eraseFromParent();
589 return true;
590}
591
592bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
593 MachineInstr &I) const {
594 MachineBasicBlock *BB = I.getParent();
595 MachineFunction *MF = BB->getParent();
596 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
597 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
598 MRI->use_nodbg_empty(I.getOperand(1).getReg());
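 // The no-carry MAD variants are only usable when the carry-out (operand 1)
 // is unused; in that case the extra def is dropped below.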
599
600 unsigned Opc;
601 if (Subtarget->hasMADIntraFwdBug())
602 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
603 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
604 else if (UseNoCarry)
605 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
606 : AMDGPU::V_MAD_NC_I64_I32_e64;
607 else
608 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
609
610 if (UseNoCarry)
611 I.removeOperand(1);
612
613 I.setDesc(TII.get(Opc));
614 I.addOperand(*MF, MachineOperand::CreateImm(0));
615 I.addImplicitDefUseOperands(*MF);
616 I.getOperand(0).setIsEarlyClobber(true);
617 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
618 return true;
619}
620
621// TODO: We should probably legalize these to only using 32-bit results.
622bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
623 MachineBasicBlock *BB = I.getParent();
624 Register DstReg = I.getOperand(0).getReg();
625 Register SrcReg = I.getOperand(1).getReg();
626 LLT DstTy = MRI->getType(DstReg);
627 LLT SrcTy = MRI->getType(SrcReg);
628 const unsigned SrcSize = SrcTy.getSizeInBits();
629 unsigned DstSize = DstTy.getSizeInBits();
630
631 // TODO: Should handle any multiple of 32 offset.
632 unsigned Offset = I.getOperand(2).getImm();
633 if (Offset % 32 != 0 || DstSize > 128)
634 return false;
635
636 // 16-bit operations really use 32-bit registers.
637 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
638 if (DstSize == 16)
639 DstSize = 32;
640
641 const TargetRegisterClass *DstRC =
642 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
643 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
644 return false;
645
646 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
647 const TargetRegisterClass *SrcRC =
648 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
649 if (!SrcRC)
650 return false;
651 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
652 DstSize / 32);
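 // The extract then reduces to a plain subregister copy; e.g. offset 32 with
 // a 32-bit result reads sub1 of the source.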
653 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
654 if (!SrcRC)
655 return false;
656
657 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
658 *SrcRC, I.getOperand(1));
659 const DebugLoc &DL = I.getDebugLoc();
660 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
661 .addReg(SrcReg, {}, SubReg);
662
663 I.eraseFromParent();
664 return true;
665}
666
667bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
668 MachineBasicBlock *BB = MI.getParent();
669 Register DstReg = MI.getOperand(0).getReg();
670 LLT DstTy = MRI->getType(DstReg);
671 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
672
673 const unsigned SrcSize = SrcTy.getSizeInBits();
674 if (SrcSize < 32)
675 return selectImpl(MI, *CoverageInfo);
676
677 const DebugLoc &DL = MI.getDebugLoc();
678 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
679 const unsigned DstSize = DstTy.getSizeInBits();
680 const TargetRegisterClass *DstRC =
681 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
682 if (!DstRC)
683 return false;
684
685 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
686 MachineInstrBuilder MIB =
687 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
688 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
689 MachineOperand &Src = MI.getOperand(I + 1);
690 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
691 MIB.addImm(SubRegs[I]);
692
693 const TargetRegisterClass *SrcRC
694 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
695 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
696 return false;
697 }
698
699 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
700 return false;
701
702 MI.eraseFromParent();
703 return true;
704}
705
706bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
707 MachineBasicBlock *BB = MI.getParent();
708 const int NumDst = MI.getNumOperands() - 1;
709
710 MachineOperand &Src = MI.getOperand(NumDst);
711
712 Register SrcReg = Src.getReg();
713 Register DstReg0 = MI.getOperand(0).getReg();
714 LLT DstTy = MRI->getType(DstReg0);
715 LLT SrcTy = MRI->getType(SrcReg);
716
717 const unsigned DstSize = DstTy.getSizeInBits();
718 const unsigned SrcSize = SrcTy.getSizeInBits();
719 const DebugLoc &DL = MI.getDebugLoc();
720 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
721
722 const TargetRegisterClass *SrcRC =
723 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
724 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
725 return false;
726
727 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
728 // source, and this relies on the fact that the same subregister indices are
729 // used for both.
730 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
731 for (int I = 0, E = NumDst; I != E; ++I) {
732 MachineOperand &Dst = MI.getOperand(I);
733 // hi16:sreg_32 is not allowed, so explicitly shift the upper 16 bits.
734 if (SrcBank->getID() == AMDGPU::SGPRRegBankID &&
735 SubRegs[I] == AMDGPU::hi16) {
736 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst.getReg())
737 .addReg(SrcReg)
738 .addImm(16);
739 } else {
740 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
741 .addReg(SrcReg, {}, SubRegs[I]);
742 }
743
744 // Make sure the subregister index is valid for the source register.
745 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
746 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
747 return false;
748
749 const TargetRegisterClass *DstRC =
750 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
751 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
752 return false;
753 }
754
755 MI.eraseFromParent();
756 return true;
757}
758
759bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
760 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
761 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
762
763 Register Src0 = MI.getOperand(1).getReg();
764 Register Src1 = MI.getOperand(2).getReg();
765 LLT SrcTy = MRI->getType(Src0);
766 const unsigned SrcSize = SrcTy.getSizeInBits();
767
768 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
769 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
770 return selectG_MERGE_VALUES(MI);
771 }
772
773 // Selection logic below is for V2S16 only.
774 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
775 Register Dst = MI.getOperand(0).getReg();
776 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
777 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
778 SrcTy != LLT::scalar(32)))
779 return selectImpl(MI, *CoverageInfo);
780
781 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
782 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
783 return false;
784
785 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
786 DstBank->getID() == AMDGPU::VGPRRegBankID);
787 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
788
789 const DebugLoc &DL = MI.getDebugLoc();
790 MachineBasicBlock *BB = MI.getParent();
791
792 // First, before trying TableGen patterns, check if both sources are
793 // constants. In those cases, we can trivially compute the final constant
794 // and emit a simple move.
795 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
796 if (ConstSrc1) {
797 auto ConstSrc0 =
798 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
799 if (ConstSrc0) {
800 const int64_t K0 = ConstSrc0->Value.getSExtValue();
801 const int64_t K1 = ConstSrc1->Value.getSExtValue();
802 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
803 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
804 uint32_t Imm = Lo16 | (Hi16 << 16);
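 // e.g. building <2 x s16> from the constants 1 and 2 yields Imm = 0x00020001.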
805
806 // VALU
807 if (IsVector) {
808 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
809 MI.eraseFromParent();
810 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
811 }
812
813 // SALU
814 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
815 MI.eraseFromParent();
816 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
817 }
818 }
819
820 // Now try TableGen patterns.
821 if (selectImpl(MI, *CoverageInfo))
822 return true;
823
824 // TODO: This should probably be a combine somewhere
825 // (build_vector $src0, undef) -> copy $src0
826 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
827 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
828 MI.setDesc(TII.get(AMDGPU::COPY));
829 MI.removeOperand(2);
830 const auto &RC =
831 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
832 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
833 RBI.constrainGenericRegister(Src0, RC, *MRI);
834 }
835
836 // TODO: Can be improved?
837 if (IsVector) {
838 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
839 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
840 .addImm(0xFFFF)
841 .addReg(Src0);
842 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
843
844 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
845 .addReg(Src1)
846 .addImm(16)
847 .addReg(TmpReg);
848 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
849
850 MI.eraseFromParent();
851 return true;
852 }
853
854 Register ShiftSrc0;
855 Register ShiftSrc1;
856
857 // With multiple uses of the shift, this will duplicate the shift and
858 // increase register pressure.
859 //
860 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
861 // => (S_PACK_HH_B32_B16 $src0, $src1)
862 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
863 // => (S_PACK_HL_B32_B16 $src0, $src1)
864 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
865 // => (S_PACK_LH_B32_B16 $src0, $src1)
866 // (build_vector $src0, $src1)
867 // => (S_PACK_LL_B32_B16 $src0, $src1)
868
869 bool Shift0 = mi_match(
870 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
871
872 bool Shift1 = mi_match(
873 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
874
875 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
876 if (Shift0 && Shift1) {
877 Opc = AMDGPU::S_PACK_HH_B32_B16;
878 MI.getOperand(1).setReg(ShiftSrc0);
879 MI.getOperand(2).setReg(ShiftSrc1);
880 } else if (Shift1) {
881 Opc = AMDGPU::S_PACK_LH_B32_B16;
882 MI.getOperand(2).setReg(ShiftSrc1);
883 } else if (Shift0) {
884 auto ConstSrc1 =
885 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
886 if (ConstSrc1 && ConstSrc1->Value == 0) {
887 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
888 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
889 .addReg(ShiftSrc0)
890 .addImm(16)
891 .setOperandDead(3); // Dead scc
892
893 MI.eraseFromParent();
894 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
895 return true;
896 }
897 if (STI.hasSPackHL()) {
898 Opc = AMDGPU::S_PACK_HL_B32_B16;
899 MI.getOperand(1).setReg(ShiftSrc0);
900 }
901 }
902
903 MI.setDesc(TII.get(Opc));
904 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
905 return true;
906}
907
908bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
909 const MachineOperand &MO = I.getOperand(0);
910
911 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
912 // regbank check here is to know why getConstrainedRegClassForOperand failed.
913 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
914 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
915 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
916 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
917 return true;
918 }
919
920 return false;
921}
922
923bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
924 MachineBasicBlock *BB = I.getParent();
925
926 Register DstReg = I.getOperand(0).getReg();
927 Register Src0Reg = I.getOperand(1).getReg();
928 Register Src1Reg = I.getOperand(2).getReg();
929 LLT Src1Ty = MRI->getType(Src1Reg);
930
931 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
932 unsigned InsSize = Src1Ty.getSizeInBits();
933
934 int64_t Offset = I.getOperand(3).getImm();
935
936 // FIXME: These cases should have been illegal and unnecessary to check here.
937 if (Offset % 32 != 0 || InsSize % 32 != 0)
938 return false;
939
940 // Currently not handled by getSubRegFromChannel.
941 if (InsSize > 128)
942 return false;
943
944 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
945 if (SubReg == AMDGPU::NoSubRegister)
946 return false;
947
948 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
949 const TargetRegisterClass *DstRC =
950 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
951 if (!DstRC)
952 return false;
953
954 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
955 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
956 const TargetRegisterClass *Src0RC =
957 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
958 const TargetRegisterClass *Src1RC =
959 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
960
961 // Deal with weird cases where the class only partially supports the subreg
962 // index.
963 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
964 if (!Src0RC || !Src1RC)
965 return false;
966
967 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
968 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
969 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
970 return false;
971
972 const DebugLoc &DL = I.getDebugLoc();
973 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
974 .addReg(Src0Reg)
975 .addReg(Src1Reg)
976 .addImm(SubReg);
977
978 I.eraseFromParent();
979 return true;
980}
981
982bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
983 Register DstReg = MI.getOperand(0).getReg();
984 Register SrcReg = MI.getOperand(1).getReg();
985 Register OffsetReg = MI.getOperand(2).getReg();
986 Register WidthReg = MI.getOperand(3).getReg();
987
988 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
989 "scalar BFX instructions are expanded in regbankselect");
990 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
991 "64-bit vector BFX instructions are expanded in regbankselect");
992
993 const DebugLoc &DL = MI.getDebugLoc();
994 MachineBasicBlock *MBB = MI.getParent();
995
996 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
997 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
998 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
999 .addReg(SrcReg)
1000 .addReg(OffsetReg)
1001 .addReg(WidthReg);
1002 MI.eraseFromParent();
1003 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1004 return true;
1005}
1006
1007bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
1008 if (STI.getLDSBankCount() != 16)
1009 return selectImpl(MI, *CoverageInfo);
1010
1011 Register Dst = MI.getOperand(0).getReg();
1012 Register Src0 = MI.getOperand(2).getReg();
1013 Register M0Val = MI.getOperand(6).getReg();
1014 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
1015 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
1016 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
1017 return false;
1018
1019 // This requires 2 instructions. It is possible to write a pattern to support
1020 // this, but the generated isel emitter doesn't correctly deal with multiple
1021 // output instructions using the same physical register input. The copy to m0
1022 // is incorrectly placed before the second instruction.
1023 //
1024 // TODO: Match source modifiers.
1025
1026 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1027 const DebugLoc &DL = MI.getDebugLoc();
1028 MachineBasicBlock *MBB = MI.getParent();
1029
1030 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1031 .addReg(M0Val);
1032 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1033 .addImm(2)
1034 .addImm(MI.getOperand(4).getImm()) // $attr
1035 .addImm(MI.getOperand(3).getImm()); // $attrchan
1036
1037 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1038 .addImm(0) // $src0_modifiers
1039 .addReg(Src0) // $src0
1040 .addImm(MI.getOperand(4).getImm()) // $attr
1041 .addImm(MI.getOperand(3).getImm()) // $attrchan
1042 .addImm(0) // $src2_modifiers
1043 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1044 .addImm(MI.getOperand(5).getImm()) // $high
1045 .addImm(0) // $clamp
1046 .addImm(0); // $omod
1047
1048 MI.eraseFromParent();
1049 return true;
1050}
1051
1052// Writelane is special in that it can use an SGPR and M0 (which would normally
1053// count as using the constant bus twice); that is allowed here because the lane
1054// selector does not count as a constant bus use. However, it must still abide by
1055// the one-SGPR rule, so fix this up if we might otherwise end up with multiple
1056// SGPRs.
1057bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1058 // With a constant bus limit of at least 2, there's no issue.
1059 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1060 return selectImpl(MI, *CoverageInfo);
1061
1062 MachineBasicBlock *MBB = MI.getParent();
1063 const DebugLoc &DL = MI.getDebugLoc();
1064 Register VDst = MI.getOperand(0).getReg();
1065 Register Val = MI.getOperand(2).getReg();
1066 Register LaneSelect = MI.getOperand(3).getReg();
1067 Register VDstIn = MI.getOperand(4).getReg();
1068
1069 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1070
1071 std::optional<ValueAndVReg> ConstSelect =
1072 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1073 if (ConstSelect) {
1074 // The selector has to be an inline immediate, so we can use whatever for
1075 // the other operands.
1076 MIB.addReg(Val);
1077 MIB.addImm(ConstSelect->Value.getSExtValue() &
1078 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
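 // Only the low log2(wave size) bits of the selector are meaningful, so mask
 // it down (e.g. to the range 0-63 on wave64).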
1079 } else {
1080 std::optional<ValueAndVReg> ConstVal =
1081 getIConstantVRegValWithLookThrough(Val, *MRI);
1082
1083 // If the value written is an inline immediate, we can get away without a
1084 // copy to m0.
1085 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1086 STI.hasInv2PiInlineImm())) {
1087 MIB.addImm(ConstVal->Value.getSExtValue());
1088 MIB.addReg(LaneSelect);
1089 } else {
1090 MIB.addReg(Val);
1091
1092 // If the lane selector was originally in a VGPR and copied with
1093 // readfirstlane, there's a hazard to read the same SGPR from the
1094 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1095 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1096
1097 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1098 .addReg(LaneSelect);
1099 MIB.addReg(AMDGPU::M0);
1100 }
1101 }
1102
1103 MIB.addReg(VDstIn);
1104
1105 MI.eraseFromParent();
1106 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1107 return true;
1108}
1109
1110// We need to handle this here because tablegen doesn't support matching
1111// instructions with multiple outputs.
1112bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1113 Register Dst0 = MI.getOperand(0).getReg();
1114 Register Dst1 = MI.getOperand(1).getReg();
1115
1116 LLT Ty = MRI->getType(Dst0);
1117 unsigned Opc;
1118 if (Ty == LLT::scalar(32))
1119 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1120 else if (Ty == LLT::scalar(64))
1121 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1122 else
1123 return false;
1124
1125 // TODO: Match source modifiers.
1126
1127 const DebugLoc &DL = MI.getDebugLoc();
1128 MachineBasicBlock *MBB = MI.getParent();
1129
1130 Register Numer = MI.getOperand(3).getReg();
1131 Register Denom = MI.getOperand(4).getReg();
1132 unsigned ChooseDenom = MI.getOperand(5).getImm();
1133
1134 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1135
1136 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1137 .addDef(Dst1)
1138 .addImm(0) // $src0_modifiers
1139 .addUse(Src0) // $src0
1140 .addImm(0) // $src1_modifiers
1141 .addUse(Denom) // $src1
1142 .addImm(0) // $src2_modifiers
1143 .addUse(Numer) // $src2
1144 .addImm(0) // $clamp
1145 .addImm(0); // $omod
1146
1147 MI.eraseFromParent();
1148 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1149 return true;
1150}
1151
1152bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1153 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1154 switch (IntrinsicID) {
1155 case Intrinsic::amdgcn_if_break: {
1156 MachineBasicBlock *BB = I.getParent();
1157
1158 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1159 // SelectionDAG uses for wave32 vs wave64.
1160 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1161 .add(I.getOperand(0))
1162 .add(I.getOperand(2))
1163 .add(I.getOperand(3));
1164
1165 Register DstReg = I.getOperand(0).getReg();
1166 Register Src0Reg = I.getOperand(2).getReg();
1167 Register Src1Reg = I.getOperand(3).getReg();
1168
1169 I.eraseFromParent();
1170
1171 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1172 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1173
1174 return true;
1175 }
1176 case Intrinsic::amdgcn_interp_p1_f16:
1177 return selectInterpP1F16(I);
1178 case Intrinsic::amdgcn_wqm:
1179 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1180 case Intrinsic::amdgcn_softwqm:
1181 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1182 case Intrinsic::amdgcn_strict_wwm:
1183 case Intrinsic::amdgcn_wwm:
1184 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1185 case Intrinsic::amdgcn_strict_wqm:
1186 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1187 case Intrinsic::amdgcn_writelane:
1188 return selectWritelane(I);
1189 case Intrinsic::amdgcn_div_scale:
1190 return selectDivScale(I);
1191 case Intrinsic::amdgcn_icmp:
1192 case Intrinsic::amdgcn_fcmp:
1193 if (selectImpl(I, *CoverageInfo))
1194 return true;
1195 return selectIntrinsicCmp(I);
1196 case Intrinsic::amdgcn_ballot:
1197 return selectBallot(I);
1198 case Intrinsic::amdgcn_reloc_constant:
1199 return selectRelocConstant(I);
1200 case Intrinsic::amdgcn_groupstaticsize:
1201 return selectGroupStaticSize(I);
1202 case Intrinsic::returnaddress:
1203 return selectReturnAddress(I);
1204 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1205 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1206 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1207 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1208 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1209 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1210 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1211 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1212 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1213 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1214 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1215 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1216 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1217 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1218 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1219 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1220 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1221 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1222 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1223 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1224 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1225 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1226 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1227 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1228 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1229 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1230 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1231 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1232 return selectSMFMACIntrin(I);
1233 case Intrinsic::amdgcn_permlane16_swap:
1234 case Intrinsic::amdgcn_permlane32_swap:
1235 return selectPermlaneSwapIntrin(I, IntrinsicID);
1236 case Intrinsic::amdgcn_wave_shuffle:
1237 return selectWaveShuffleIntrin(I);
1238 default:
1239 return selectImpl(I, *CoverageInfo);
1240 }
1241}
1242
1243static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1244 const GCNSubtarget &ST) {
1245 if (Size != 16 && Size != 32 && Size != 64)
1246 return -1;
1247
1248 if (Size == 16 && !ST.has16BitInsts())
1249 return -1;
1250
1251 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1252 unsigned FakeS16Opc, unsigned S32Opc,
1253 unsigned S64Opc) {
1254 if (Size == 16)
1255 return ST.hasTrue16BitInsts()
1256 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1257 : S16Opc;
1258 if (Size == 32)
1259 return S32Opc;
1260 return S64Opc;
1261 };
1262
1263 switch (P) {
1264 default:
1265 llvm_unreachable("Unknown condition code!");
1266 case CmpInst::ICMP_NE:
1267 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1268 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1269 AMDGPU::V_CMP_NE_U64_e64);
1270 case CmpInst::ICMP_EQ:
1271 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1272 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1273 AMDGPU::V_CMP_EQ_U64_e64);
1274 case CmpInst::ICMP_SGT:
1275 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1276 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1277 AMDGPU::V_CMP_GT_I64_e64);
1278 case CmpInst::ICMP_SGE:
1279 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1280 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1281 AMDGPU::V_CMP_GE_I64_e64);
1282 case CmpInst::ICMP_SLT:
1283 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1284 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1285 AMDGPU::V_CMP_LT_I64_e64);
1286 case CmpInst::ICMP_SLE:
1287 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1288 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1289 AMDGPU::V_CMP_LE_I64_e64);
1290 case CmpInst::ICMP_UGT:
1291 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1292 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1293 AMDGPU::V_CMP_GT_U64_e64);
1294 case CmpInst::ICMP_UGE:
1295 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1296 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1297 AMDGPU::V_CMP_GE_U64_e64);
1298 case CmpInst::ICMP_ULT:
1299 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1300 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1301 AMDGPU::V_CMP_LT_U64_e64);
1302 case CmpInst::ICMP_ULE:
1303 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1304 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1305 AMDGPU::V_CMP_LE_U64_e64);
1306
1307 case CmpInst::FCMP_OEQ:
1308 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1309 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1310 AMDGPU::V_CMP_EQ_F64_e64);
1311 case CmpInst::FCMP_OGT:
1312 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1313 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1314 AMDGPU::V_CMP_GT_F64_e64);
1315 case CmpInst::FCMP_OGE:
1316 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1317 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1318 AMDGPU::V_CMP_GE_F64_e64);
1319 case CmpInst::FCMP_OLT:
1320 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1321 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1322 AMDGPU::V_CMP_LT_F64_e64);
1323 case CmpInst::FCMP_OLE:
1324 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1325 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1326 AMDGPU::V_CMP_LE_F64_e64);
1327 case CmpInst::FCMP_ONE:
1328 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1329 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1330 AMDGPU::V_CMP_NEQ_F64_e64);
1331 case CmpInst::FCMP_ORD:
1332 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1333 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1334 AMDGPU::V_CMP_O_F64_e64);
1335 case CmpInst::FCMP_UNO:
1336 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1337 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1338 AMDGPU::V_CMP_U_F64_e64);
1339 case CmpInst::FCMP_UEQ:
1340 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1341 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1342 AMDGPU::V_CMP_NLG_F64_e64);
1343 case CmpInst::FCMP_UGT:
1344 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1345 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1346 AMDGPU::V_CMP_NLE_F64_e64);
1347 case CmpInst::FCMP_UGE:
1348 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1349 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1350 AMDGPU::V_CMP_NLT_F64_e64);
1351 case CmpInst::FCMP_ULT:
1352 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1353 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1354 AMDGPU::V_CMP_NGE_F64_e64);
1355 case CmpInst::FCMP_ULE:
1356 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1357 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1358 AMDGPU::V_CMP_NGT_F64_e64);
1359 case CmpInst::FCMP_UNE:
1360 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1361 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1362 AMDGPU::V_CMP_NEQ_F64_e64);
1363 case CmpInst::FCMP_TRUE:
1364 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1365 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1366 AMDGPU::V_CMP_TRU_F64_e64);
1367 case CmpInst::FCMP_FALSE:
1368 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1369 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1370 AMDGPU::V_CMP_F_F64_e64);
1371 }
1372}
1373
1374int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1375 unsigned Size) const {
1376 if (Size == 64) {
1377 if (!STI.hasScalarCompareEq64())
1378 return -1;
1379
1380 switch (P) {
1381 case CmpInst::ICMP_NE:
1382 return AMDGPU::S_CMP_LG_U64;
1383 case CmpInst::ICMP_EQ:
1384 return AMDGPU::S_CMP_EQ_U64;
1385 default:
1386 return -1;
1387 }
1388 }
1389
1390 if (Size == 32) {
1391 switch (P) {
1392 case CmpInst::ICMP_NE:
1393 return AMDGPU::S_CMP_LG_U32;
1394 case CmpInst::ICMP_EQ:
1395 return AMDGPU::S_CMP_EQ_U32;
1396 case CmpInst::ICMP_SGT:
1397 return AMDGPU::S_CMP_GT_I32;
1398 case CmpInst::ICMP_SGE:
1399 return AMDGPU::S_CMP_GE_I32;
1400 case CmpInst::ICMP_SLT:
1401 return AMDGPU::S_CMP_LT_I32;
1402 case CmpInst::ICMP_SLE:
1403 return AMDGPU::S_CMP_LE_I32;
1404 case CmpInst::ICMP_UGT:
1405 return AMDGPU::S_CMP_GT_U32;
1406 case CmpInst::ICMP_UGE:
1407 return AMDGPU::S_CMP_GE_U32;
1408 case CmpInst::ICMP_ULT:
1409 return AMDGPU::S_CMP_LT_U32;
1410 case CmpInst::ICMP_ULE:
1411 return AMDGPU::S_CMP_LE_U32;
1412 case CmpInst::FCMP_OEQ:
1413 return AMDGPU::S_CMP_EQ_F32;
1414 case CmpInst::FCMP_OGT:
1415 return AMDGPU::S_CMP_GT_F32;
1416 case CmpInst::FCMP_OGE:
1417 return AMDGPU::S_CMP_GE_F32;
1418 case CmpInst::FCMP_OLT:
1419 return AMDGPU::S_CMP_LT_F32;
1420 case CmpInst::FCMP_OLE:
1421 return AMDGPU::S_CMP_LE_F32;
1422 case CmpInst::FCMP_ONE:
1423 return AMDGPU::S_CMP_LG_F32;
1424 case CmpInst::FCMP_ORD:
1425 return AMDGPU::S_CMP_O_F32;
1426 case CmpInst::FCMP_UNO:
1427 return AMDGPU::S_CMP_U_F32;
1428 case CmpInst::FCMP_UEQ:
1429 return AMDGPU::S_CMP_NLG_F32;
1430 case CmpInst::FCMP_UGT:
1431 return AMDGPU::S_CMP_NLE_F32;
1432 case CmpInst::FCMP_UGE:
1433 return AMDGPU::S_CMP_NLT_F32;
1434 case CmpInst::FCMP_ULT:
1435 return AMDGPU::S_CMP_NGE_F32;
1436 case CmpInst::FCMP_ULE:
1437 return AMDGPU::S_CMP_NGT_F32;
1438 case CmpInst::FCMP_UNE:
1439 return AMDGPU::S_CMP_NEQ_F32;
1440 default:
1441 llvm_unreachable("Unknown condition code!");
1442 }
1443 }
1444
1445 if (Size == 16) {
1446 if (!STI.hasSALUFloatInsts())
1447 return -1;
1448
1449 switch (P) {
1450 case CmpInst::FCMP_OEQ:
1451 return AMDGPU::S_CMP_EQ_F16;
1452 case CmpInst::FCMP_OGT:
1453 return AMDGPU::S_CMP_GT_F16;
1454 case CmpInst::FCMP_OGE:
1455 return AMDGPU::S_CMP_GE_F16;
1456 case CmpInst::FCMP_OLT:
1457 return AMDGPU::S_CMP_LT_F16;
1458 case CmpInst::FCMP_OLE:
1459 return AMDGPU::S_CMP_LE_F16;
1460 case CmpInst::FCMP_ONE:
1461 return AMDGPU::S_CMP_LG_F16;
1462 case CmpInst::FCMP_ORD:
1463 return AMDGPU::S_CMP_O_F16;
1464 case CmpInst::FCMP_UNO:
1465 return AMDGPU::S_CMP_U_F16;
1466 case CmpInst::FCMP_UEQ:
1467 return AMDGPU::S_CMP_NLG_F16;
1468 case CmpInst::FCMP_UGT:
1469 return AMDGPU::S_CMP_NLE_F16;
1470 case CmpInst::FCMP_UGE:
1471 return AMDGPU::S_CMP_NLT_F16;
1472 case CmpInst::FCMP_ULT:
1473 return AMDGPU::S_CMP_NGE_F16;
1474 case CmpInst::FCMP_ULE:
1475 return AMDGPU::S_CMP_NGT_F16;
1476 case CmpInst::FCMP_UNE:
1477 return AMDGPU::S_CMP_NEQ_F16;
1478 default:
1479 llvm_unreachable("Unknown condition code!");
1480 }
1481 }
1482
1483 return -1;
1484}
1485
1486bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1487
1488 MachineBasicBlock *BB = I.getParent();
1489 const DebugLoc &DL = I.getDebugLoc();
1490
1491 Register SrcReg = I.getOperand(2).getReg();
1492 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1493
1494 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1495
1496 Register CCReg = I.getOperand(0).getReg();
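 // A non-VCC destination means a uniform result: emit an S_CMP that writes
 // SCC and copy SCC into the 32-bit SGPR destination. Otherwise fall through
 // to the VALU compare, which defines a lane mask.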
1497 if (!isVCC(CCReg, *MRI)) {
1498 int Opcode = getS_CMPOpcode(Pred, Size);
1499 if (Opcode == -1)
1500 return false;
1501 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1502 .add(I.getOperand(2))
1503 .add(I.getOperand(3));
1504 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1505 .addReg(AMDGPU::SCC);
1506 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1507 bool Ret =
1508 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1509 I.eraseFromParent();
1510 return Ret;
1511 }
1512
1513 if (I.getOpcode() == AMDGPU::G_FCMP)
1514 return false;
1515
1516 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1517 if (Opcode == -1)
1518 return false;
1519
1520 MachineInstrBuilder ICmp;
1521 // t16 instructions
1522 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1523 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1524 .addImm(0)
1525 .add(I.getOperand(2))
1526 .addImm(0)
1527 .add(I.getOperand(3))
1528 .addImm(0); // op_sel
1529 } else {
1530 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1531 .add(I.getOperand(2))
1532 .add(I.getOperand(3));
1533 }
1534
1535 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1536 *TRI.getBoolRC(), *MRI);
1537 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1538 I.eraseFromParent();
1539 return true;
1540}
1541
1542bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1543 Register Dst = I.getOperand(0).getReg();
1544 if (isVCC(Dst, *MRI))
1545 return false;
1546
1547 LLT DstTy = MRI->getType(Dst);
1548 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1549 return false;
1550
1551 MachineBasicBlock *BB = I.getParent();
1552 const DebugLoc &DL = I.getDebugLoc();
1553 Register SrcReg = I.getOperand(2).getReg();
1554 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1555
1556 // i1 inputs are not supported in GlobalISel.
1557 if (Size == 1)
1558 return false;
1559
1560 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1561 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1562 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1563 I.eraseFromParent();
1564 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1565 }
1566
1567 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1568 if (Opcode == -1)
1569 return false;
1570
1571 MachineInstrBuilder SelectedMI;
1572 MachineOperand &LHS = I.getOperand(2);
1573 MachineOperand &RHS = I.getOperand(3);
1574 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1575 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1576 Register Src0Reg =
1577 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1578 Register Src1Reg =
1579 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1580 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1581 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1582 SelectedMI.addImm(Src0Mods);
1583 SelectedMI.addReg(Src0Reg);
1584 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1585 SelectedMI.addImm(Src1Mods);
1586 SelectedMI.addReg(Src1Reg);
1587 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1588 SelectedMI.addImm(0); // clamp
1589 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1590 SelectedMI.addImm(0); // op_sel
1591
1592 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1593 constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI);
1594
1595 I.eraseFromParent();
1596 return true;
1597}
1598
1599// Ballot has to zero the bits in the input lane mask that are zero in the
1600// current exec, normally done as an AND with exec. For inputs produced by an
1601// instruction that implicitly uses the same exec (e.g. a compare in the same
1602// basic block, or an SCC-to-VCC copy), a plain copy suffices.
1603static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1604 MachineBasicBlock *MBB) {
1605 MachineInstr *MI = MRI.getVRegDef(Reg);
1606 if (MI->getParent() != MBB)
1607 return false;
1608
1609 // Lane mask generated by SCC to VCC copy.
1610 if (MI->getOpcode() == AMDGPU::COPY) {
1611 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1612 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1613 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1614 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1615 return true;
1616 }
1617
1618 // Lane mask generated using compare with same exec.
1619 if (isa<GAnyCmp>(MI))
1620 return true;
1621
1622 Register LHS, RHS;
1623 // Look through AND.
1624 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1625 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1626 isLaneMaskFromSameBlock(RHS, MRI, MBB);
1627
1628 return false;
1629}
1630
1631bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1632 MachineBasicBlock *BB = I.getParent();
1633 const DebugLoc &DL = I.getDebugLoc();
1634 Register DstReg = I.getOperand(0).getReg();
1635 Register SrcReg = I.getOperand(2).getReg();
1636 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1637 const unsigned WaveSize = STI.getWavefrontSize();
1638
1639 // In the common case, the return type matches the wave size.
1640 // However, we also support emitting i64 ballots in wave32 mode.
1641 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1642 return false;
1643
1644 std::optional<ValueAndVReg> Arg =
1645 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1646
1647 Register Dst = DstReg;
1648 // i64 ballot on Wave32: compute the wave-size (i32) ballot into a new Dst first.
1649 if (BallotSize != WaveSize) {
1650 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1651 }
1652
1653 if (Arg) {
1654 const int64_t Value = Arg->Value.getZExtValue();
1655 if (Value == 0) {
1656 // Dst = S_MOV 0
1657 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1658 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1659 } else {
1660 // Dst = COPY EXEC
1661 assert(Value == 1);
1662 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1663 }
1664 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1665 return false;
1666 } else {
1667 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1668 // Dst = COPY SrcReg
1669 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1670 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1671 return false;
1672 } else {
1673 // Dst = S_AND SrcReg, EXEC
1674 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1675 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1676 .addReg(SrcReg)
1677 .addReg(TRI.getExec())
1678 .setOperandDead(3); // Dead scc
1679 constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
1680 }
1681 }
1682
1683 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1684 if (BallotSize != WaveSize) {
1685 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1686 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1687 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1688 .addReg(Dst)
1689 .addImm(AMDGPU::sub0)
1690 .addReg(HiReg)
1691 .addImm(AMDGPU::sub1);
1692 }
1693
1694 I.eraseFromParent();
1695 return true;
1696}
1697
1698bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1699 Register DstReg = I.getOperand(0).getReg();
1700 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1701 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1702 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1703 return false;
1704
1705 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1706
1707 Module *M = MF->getFunction().getParent();
1708 const MDNode *Metadata = I.getOperand(2).getMetadata();
1709 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1710 auto *RelocSymbol = cast<GlobalVariable>(
1711 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1712
1713 MachineBasicBlock *BB = I.getParent();
1714 BuildMI(*BB, &I, I.getDebugLoc(),
1715 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1716 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1717
1718 I.eraseFromParent();
1719 return true;
1720}
1721
1722bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1723 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1724
1725 Register DstReg = I.getOperand(0).getReg();
1726 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1727 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1728 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1729
1730 MachineBasicBlock *MBB = I.getParent();
1731 const DebugLoc &DL = I.getDebugLoc();
1732
1733 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1734
1735 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1736 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1737 MIB.addImm(MFI->getLDSSize());
1738 } else {
1739 Module *M = MF->getFunction().getParent();
1740 const GlobalValue *GV =
1741 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1742 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1743 }
1744
1745 I.eraseFromParent();
1746 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1747 return true;
1748}
1749
1750bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1751 MachineBasicBlock *MBB = I.getParent();
1752 MachineFunction &MF = *MBB->getParent();
1753 const DebugLoc &DL = I.getDebugLoc();
1754
1755 MachineOperand &Dst = I.getOperand(0);
1756 Register DstReg = Dst.getReg();
1757 unsigned Depth = I.getOperand(2).getImm();
1758
1759 const TargetRegisterClass *RC
1760 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1761 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1762 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1763 return false;
1764
1765 // Check for kernel and shader functions
1766 if (Depth != 0 ||
1767 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1768 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1769 .addImm(0);
1770 I.eraseFromParent();
1771 return true;
1772 }
1773
1774 MachineFrameInfo &MFI = MF.getFrameInfo();
1775 // There is a call to @llvm.returnaddress in this function
1776 MFI.setReturnAddressIsTaken(true);
1777
1778 // Get the return address reg and mark it as an implicit live-in
1779 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1780 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1781 AMDGPU::SReg_64RegClass, DL);
1782 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1783 .addReg(LiveIn);
1784 I.eraseFromParent();
1785 return true;
1786}
1787
1788bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1789 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1790 // SelectionDAG uses for wave32 vs wave64.
1791 MachineBasicBlock *BB = MI.getParent();
1792 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1793 .add(MI.getOperand(1));
1794
1795 Register Reg = MI.getOperand(1).getReg();
1796 MI.eraseFromParent();
1797
1798 if (!MRI->getRegClassOrNull(Reg))
1799 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1800 return true;
1801}
1802
1803bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1804 MachineInstr &MI, Intrinsic::ID IntrID) const {
1805 MachineBasicBlock *MBB = MI.getParent();
1806 MachineFunction *MF = MBB->getParent();
1807 const DebugLoc &DL = MI.getDebugLoc();
1808
1809 unsigned IndexOperand = MI.getOperand(7).getImm();
1810 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1811 bool WaveDone = MI.getOperand(9).getImm() != 0;
1812
1813 if (WaveDone && !WaveRelease) {
1814 // TODO: Move this to IR verifier
1815 const Function &Fn = MF->getFunction();
1816 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1817 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1818 }
1819
1820 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1821 IndexOperand &= ~0x3f;
1822 unsigned CountDw = 0;
1823
1824 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1825 CountDw = (IndexOperand >> 24) & 0xf;
1826 IndexOperand &= ~(0xf << 24);
1827
1828 if (CountDw < 1 || CountDw > 4) {
1829 const Function &Fn = MF->getFunction();
1830 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1831 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1832 CountDw = 1;
1833 }
1834 }
1835
1836 if (IndexOperand) {
1837 const Function &Fn = MF->getFunction();
1838 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1839 Fn, "ds_ordered_count: bad index operand", DL));
1840 }
1841
1842 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1843 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1844
1845 unsigned Offset0 = OrderedCountIndex << 2;
1846 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1847
1848 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1849 Offset1 |= (CountDw - 1) << 6;
1850
1851 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1852 Offset1 |= ShaderType << 2;
1853
1854 unsigned Offset = Offset0 | (Offset1 << 8);
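// Layout of the packed offset built above (ds_ordered_count):
//   bits [7:2]   ordered-count index * 4
//   bit  8       wave_release
//   bit  9       wave_done
//   bits [11:10] shader type (pre-GFX11 only)
//   bit  12      instruction (0 = add, 1 = swap)
//   bits [15:14] dword count - 1 (GFX10+ only)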
1855
1856 Register M0Val = MI.getOperand(2).getReg();
1857 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1858 .addReg(M0Val);
1859
1860 Register DstReg = MI.getOperand(0).getReg();
1861 Register ValReg = MI.getOperand(3).getReg();
1862 MachineInstrBuilder DS =
1863 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1864 .addReg(ValReg)
1865 .addImm(Offset)
1866 .cloneMemRefs(MI);
1867
1868 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1869 return false;
1870
1871 constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1872 MI.eraseFromParent();
1873 return true;
1874}
1875
1876static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1877 switch (IntrID) {
1878 case Intrinsic::amdgcn_ds_gws_init:
1879 return AMDGPU::DS_GWS_INIT;
1880 case Intrinsic::amdgcn_ds_gws_barrier:
1881 return AMDGPU::DS_GWS_BARRIER;
1882 case Intrinsic::amdgcn_ds_gws_sema_v:
1883 return AMDGPU::DS_GWS_SEMA_V;
1884 case Intrinsic::amdgcn_ds_gws_sema_br:
1885 return AMDGPU::DS_GWS_SEMA_BR;
1886 case Intrinsic::amdgcn_ds_gws_sema_p:
1887 return AMDGPU::DS_GWS_SEMA_P;
1888 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1889 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1890 default:
1891 llvm_unreachable("not a gws intrinsic");
1892 }
1893}
1894
1895bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1896 Intrinsic::ID IID) const {
1897 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1898 !STI.hasGWSSemaReleaseAll()))
1899 return false;
1900
1901 // intrinsic ID, vsrc, offset
1902 const bool HasVSrc = MI.getNumOperands() == 3;
1903 assert(HasVSrc || MI.getNumOperands() == 2);
1904
1905 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1906 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1907 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1908 return false;
1909
1910 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1911 unsigned ImmOffset;
1912
1913 MachineBasicBlock *MBB = MI.getParent();
1914 const DebugLoc &DL = MI.getDebugLoc();
1915
1916 MachineInstr *Readfirstlane = nullptr;
1917
1918 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1919 // incoming offset, in case there's an add of a constant. We'll have to put it
1920 // back later.
1921 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1922 Readfirstlane = OffsetDef;
1923 BaseOffset = OffsetDef->getOperand(1).getReg();
1924 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1925 }
1926
1927 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1928 // If we have a constant offset, try to use the 0 in m0 as the base.
1929 // TODO: Look into changing the default m0 initialization value. If the
1930 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1931 // the immediate offset.
1932
1933 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1934 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1935 .addImm(0);
1936 } else {
1937 std::tie(BaseOffset, ImmOffset) =
1938 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1939
1940 if (Readfirstlane) {
1941 // We have the constant offset now, so put the readfirstlane back on the
1942 // variable component.
1943 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1944 return false;
1945
1946 Readfirstlane->getOperand(1).setReg(BaseOffset);
1947 BaseOffset = Readfirstlane->getOperand(0).getReg();
1948 } else {
1949 if (!RBI.constrainGenericRegister(BaseOffset,
1950 AMDGPU::SReg_32RegClass, *MRI))
1951 return false;
1952 }
1953
1954 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1955 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1956 .addReg(BaseOffset)
1957 .addImm(16)
1958 .setOperandDead(3); // Dead scc
1959
1960 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1961 .addReg(M0Base);
1962 }
1963
1964 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1965 // offset field) % 64. Some versions of the programming guide omit the m0
1966 // part, or claim it's from offset 0.
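// Here the constant part of the offset ends up in the instruction's immediate
// offset field, while any variable part was shifted into M0[21:16] above (with
// the low 16 bits of m0, the usual LDS base, left at zero).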
1967
1968 unsigned Opc = gwsIntrinToOpcode(IID);
1969 const MCInstrDesc &InstrDesc = TII.get(Opc);
1970
1971 if (HasVSrc) {
1972 Register VSrc = MI.getOperand(1).getReg();
1973
1974 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
1975 const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
1976 const TargetRegisterClass *SubRC =
1977 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
1978
1979 if (!SubRC) {
1980 // 32-bit normal case.
1981 if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
1982 return false;
1983
1984 BuildMI(*MBB, &MI, DL, InstrDesc)
1985 .addReg(VSrc)
1986 .addImm(ImmOffset)
1987 .cloneMemRefs(MI);
1988 } else {
1989 // Requires even register alignment, so create 64-bit value and pad the
1990 // top half with undef.
1991 Register DataReg = MRI->createVirtualRegister(DataRC);
1992 if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
1993 return false;
1994
1995 Register UndefReg = MRI->createVirtualRegister(SubRC);
1996 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1997 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
1998 .addReg(VSrc)
1999 .addImm(AMDGPU::sub0)
2000 .addReg(UndefReg)
2001 .addImm(AMDGPU::sub1);
2002
2003 BuildMI(*MBB, &MI, DL, InstrDesc)
2004 .addReg(DataReg)
2005 .addImm(ImmOffset)
2006 .cloneMemRefs(MI);
2007 }
2008 } else {
2009 BuildMI(*MBB, &MI, DL, InstrDesc)
2010 .addImm(ImmOffset)
2011 .cloneMemRefs(MI);
2012 }
2013
2014 MI.eraseFromParent();
2015 return true;
2016}
2017
2018bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2019 bool IsAppend) const {
2020 Register PtrBase = MI.getOperand(2).getReg();
2021 LLT PtrTy = MRI->getType(PtrBase);
2022 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2023
2024 unsigned Offset;
2025 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
2026
2027 // TODO: Should this try to look through readfirstlane like GWS?
2028 if (!isDSOffsetLegal(PtrBase, Offset)) {
2029 PtrBase = MI.getOperand(2).getReg();
2030 Offset = 0;
2031 }
2032
2033 MachineBasicBlock *MBB = MI.getParent();
2034 const DebugLoc &DL = MI.getDebugLoc();
2035 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2036
2037 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2038 .addReg(PtrBase);
2039 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2040 return false;
2041
2042 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
2043 .addImm(Offset)
2044 .addImm(IsGDS ? -1 : 0)
2045 .cloneMemRefs(MI);
2046 MI.eraseFromParent();
2047 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2048 return true;
2049}
2050
2051bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2052 MachineFunction *MF = MI.getMF();
2053 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2054
2055 MFInfo->setInitWholeWave();
2056 return selectImpl(MI, *CoverageInfo);
2057}
2058
2059static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2060 bool &IsTexFail) {
2061 if (TexFailCtrl)
2062 IsTexFail = true;
2063
2064 TFE = TexFailCtrl & 0x1;
2065 TexFailCtrl &= ~(uint64_t)0x1;
2066 LWE = TexFailCtrl & 0x2;
2067 TexFailCtrl &= ~(uint64_t)0x2;
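// Anything still set at this point is an unsupported texfailctrl bit: e.g. a
// value of 3 enables both TFE and LWE and is accepted, while bit 2 or above
// makes the caller reject the intrinsic.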
2068
2069 return TexFailCtrl == 0;
2070}
2071
2072bool AMDGPUInstructionSelector::selectImageIntrinsic(
2073 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2074 MachineBasicBlock *MBB = MI.getParent();
2075 const DebugLoc &DL = MI.getDebugLoc();
2076 unsigned IntrOpcode = Intr->BaseOpcode;
2077
2078 // For image atomic: use no-return opcode if result is unused.
2079 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2080 Register ResultDef = MI.getOperand(0).getReg();
2081 if (MRI->use_nodbg_empty(ResultDef))
2082 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2083 }
2084
2085 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2086 AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
2087
2088 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2089 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2090 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2091 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2092
2093 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2094
2095 Register VDataIn = AMDGPU::NoRegister;
2096 Register VDataOut = AMDGPU::NoRegister;
2097 LLT VDataTy;
2098 int NumVDataDwords = -1;
2099 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2100 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2101
2102 bool Unorm;
2103 if (!BaseOpcode->Sampler)
2104 Unorm = true;
2105 else
2106 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2107
2108 bool TFE;
2109 bool LWE;
2110 bool IsTexFail = false;
2111 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2112 TFE, LWE, IsTexFail))
2113 return false;
2114
2115 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2116 const bool IsA16 = (Flags & 1) != 0;
2117 const bool IsG16 = (Flags & 2) != 0;
2118
2119 // A16 implies 16 bit gradients if subtarget doesn't support G16
2120 if (IsA16 && !STI.hasG16() && !IsG16)
2121 return false;
2122
2123 unsigned DMask = 0;
2124 unsigned DMaskLanes = 0;
2125
2126 if (BaseOpcode->Atomic) {
2127 if (!BaseOpcode->NoReturn)
2128 VDataOut = MI.getOperand(0).getReg();
2129 VDataIn = MI.getOperand(2).getReg();
2130 LLT Ty = MRI->getType(VDataIn);
2131
2132 // Be careful to allow atomic swap on 16-bit element vectors.
2133 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2134 Ty.getSizeInBits() == 128 :
2135 Ty.getSizeInBits() == 64;
2136
2137 if (BaseOpcode->AtomicX2) {
2138 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2139
2140 DMask = Is64Bit ? 0xf : 0x3;
2141 NumVDataDwords = Is64Bit ? 4 : 2;
2142 } else {
2143 DMask = Is64Bit ? 0x3 : 0x1;
2144 NumVDataDwords = Is64Bit ? 2 : 1;
2145 }
2146 } else {
2147 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2148 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2149
2150 if (BaseOpcode->Store) {
2151 VDataIn = MI.getOperand(1).getReg();
2152 VDataTy = MRI->getType(VDataIn);
2153 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2154 } else if (BaseOpcode->NoReturn) {
2155 NumVDataDwords = 0;
2156 } else {
2157 VDataOut = MI.getOperand(0).getReg();
2158 VDataTy = MRI->getType(VDataOut);
2159 NumVDataDwords = DMaskLanes;
2160
2161 if (IsD16 && !STI.hasUnpackedD16VMem())
2162 NumVDataDwords = (DMaskLanes + 1) / 2;
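// Packed D16 stores two 16-bit components per dword, so round the lane count
// up to dwords.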
2163 }
2164 }
2165
2166 // Set G16 opcode
2167 if (Subtarget->hasG16() && IsG16) {
2168 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2169 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2170 assert(G16MappingInfo);
2171 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2172 }
2173
2174 // TODO: Check this in verifier.
2175 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2176
2177 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2178 // Keep GLC only when the atomic's result is actually used.
2179 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2180 CPol |= AMDGPU::CPol::GLC;
2181 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2182 AMDGPU::CPol::VOLATILE))
2183 return false;
2184
2185 int NumVAddrRegs = 0;
2186 int NumVAddrDwords = 0;
2187 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2188 // Skip the $noregs and 0s inserted during legalization.
2189 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2190 if (!AddrOp.isReg())
2191 continue; // XXX - Break?
2192
2193 Register Addr = AddrOp.getReg();
2194 if (!Addr)
2195 break;
2196
2197 ++NumVAddrRegs;
2198 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2199 }
2200
2201 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2202 // NSA, these should have been packed into a single value in the first
2203 // address register
2204 const bool UseNSA =
2205 NumVAddrRegs != 1 &&
2206 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2207 : NumVAddrDwords == NumVAddrRegs);
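// NSA lets each address component live in an independently allocated VGPR
// instead of one contiguous tuple. With only partial NSA support the trailing
// components still have to be packed into the final vaddr operand, which is
// why having more address dwords than registers is acceptable there.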
2208 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2209 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2210 return false;
2211 }
2212
2213 if (IsTexFail)
2214 ++NumVDataDwords;
2215
2216 int Opcode = -1;
2217 if (IsGFX12Plus) {
2218 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2219 NumVDataDwords, NumVAddrDwords);
2220 } else if (IsGFX11Plus) {
2221 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2222 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2223 : AMDGPU::MIMGEncGfx11Default,
2224 NumVDataDwords, NumVAddrDwords);
2225 } else if (IsGFX10Plus) {
2226 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2227 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2228 : AMDGPU::MIMGEncGfx10Default,
2229 NumVDataDwords, NumVAddrDwords);
2230 } else {
2231 if (Subtarget->hasGFX90AInsts()) {
2232 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2233 NumVDataDwords, NumVAddrDwords);
2234 if (Opcode == -1) {
2235 LLVM_DEBUG(
2236 dbgs()
2237 << "requested image instruction is not supported on this GPU\n");
2238 return false;
2239 }
2240 }
2241 if (Opcode == -1 &&
2242 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2243 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2244 NumVDataDwords, NumVAddrDwords);
2245 if (Opcode == -1)
2246 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2247 NumVDataDwords, NumVAddrDwords);
2248 }
2249 if (Opcode == -1)
2250 return false;
2251
2252 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2253 .cloneMemRefs(MI);
2254
2255 if (VDataOut) {
2256 if (BaseOpcode->AtomicX2) {
2257 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2258
2259 Register TmpReg = MRI->createVirtualRegister(
2260 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2261 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2262
2263 MIB.addDef(TmpReg);
2264 if (!MRI->use_empty(VDataOut)) {
2265 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2266 .addReg(TmpReg, RegState::Kill, SubReg);
2267 }
2268
2269 } else {
2270 MIB.addDef(VDataOut); // vdata output
2271 }
2272 }
2273
2274 if (VDataIn)
2275 MIB.addReg(VDataIn); // vdata input
2276
2277 for (int I = 0; I != NumVAddrRegs; ++I) {
2278 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2279 if (SrcOp.isReg()) {
2280 assert(SrcOp.getReg() != 0);
2281 MIB.addReg(SrcOp.getReg());
2282 }
2283 }
2284
2285 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2286 if (BaseOpcode->Sampler)
2287 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2288
2289 MIB.addImm(DMask); // dmask
2290
2291 if (IsGFX10Plus)
2292 MIB.addImm(DimInfo->Encoding);
2293 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2294 MIB.addImm(Unorm);
2295
2296 MIB.addImm(CPol);
2297 MIB.addImm(IsA16 && // a16 or r128
2298 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2299 if (IsGFX10Plus)
2300 MIB.addImm(IsA16 ? -1 : 0);
2301
2302 if (!Subtarget->hasGFX90AInsts()) {
2303 MIB.addImm(TFE); // tfe
2304 } else if (TFE) {
2305 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2306 return false;
2307 }
2308
2309 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2310 MIB.addImm(LWE); // lwe
2311 if (!IsGFX10Plus)
2312 MIB.addImm(DimInfo->DA ? -1 : 0);
2313 if (BaseOpcode->HasD16)
2314 MIB.addImm(IsD16 ? -1 : 0);
2315
2316 MI.eraseFromParent();
2317 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2318 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2319 return true;
2320}
2321
2322// We need to handle this here because tablegen doesn't support matching
2323// instructions with multiple outputs.
2324bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2325 MachineInstr &MI) const {
2326 Register Dst0 = MI.getOperand(0).getReg();
2327 Register Dst1 = MI.getOperand(1).getReg();
2328
2329 const DebugLoc &DL = MI.getDebugLoc();
2330 MachineBasicBlock *MBB = MI.getParent();
2331
2332 Register Addr = MI.getOperand(3).getReg();
2333 Register Data0 = MI.getOperand(4).getReg();
2334 Register Data1 = MI.getOperand(5).getReg();
2335 unsigned Offset = MI.getOperand(6).getImm();
2336
2337 unsigned Opc;
2338 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2339 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2340 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2341 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2342 break;
2343 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2344 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2345 break;
2346 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2347 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2348 break;
2349 }
2350
2351 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2352 .addDef(Dst1)
2353 .addUse(Addr)
2354 .addUse(Data0)
2355 .addUse(Data1)
2356 .addImm(Offset)
2357 .cloneMemRefs(MI);
2358
2359 MI.eraseFromParent();
2360 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2361 return true;
2362}
2363
2364bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2365 MachineInstr &I) const {
2366 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2367 switch (IntrinsicID) {
2368 case Intrinsic::amdgcn_end_cf:
2369 return selectEndCfIntrinsic(I);
2370 case Intrinsic::amdgcn_ds_ordered_add:
2371 case Intrinsic::amdgcn_ds_ordered_swap:
2372 return selectDSOrderedIntrinsic(I, IntrinsicID);
2373 case Intrinsic::amdgcn_ds_gws_init:
2374 case Intrinsic::amdgcn_ds_gws_barrier:
2375 case Intrinsic::amdgcn_ds_gws_sema_v:
2376 case Intrinsic::amdgcn_ds_gws_sema_br:
2377 case Intrinsic::amdgcn_ds_gws_sema_p:
2378 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2379 return selectDSGWSIntrinsic(I, IntrinsicID);
2380 case Intrinsic::amdgcn_ds_append:
2381 return selectDSAppendConsume(I, true);
2382 case Intrinsic::amdgcn_ds_consume:
2383 return selectDSAppendConsume(I, false);
2384 case Intrinsic::amdgcn_init_whole_wave:
2385 return selectInitWholeWave(I);
2386 case Intrinsic::amdgcn_raw_buffer_load_lds:
2387 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
2388 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2389 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
2390 case Intrinsic::amdgcn_struct_buffer_load_lds:
2391 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
2392 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2393 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
2394 return selectBufferLoadLds(I);
2395 // Until we can store both the address space of the global and the LDS
2396 // arguments by having two MachineMemOperands on an intrinsic, we just trust
2397 // that the argument is a global pointer (buffer pointers have been handled by
2398 // an LLVM IR-level lowering).
2399 case Intrinsic::amdgcn_load_to_lds:
2400 case Intrinsic::amdgcn_load_async_to_lds:
2401 case Intrinsic::amdgcn_global_load_lds:
2402 case Intrinsic::amdgcn_global_load_async_lds:
2403 return selectGlobalLoadLds(I);
2404 case Intrinsic::amdgcn_tensor_load_to_lds:
2405 case Intrinsic::amdgcn_tensor_store_from_lds:
2406 return selectTensorLoadStore(I, IntrinsicID);
2407 case Intrinsic::amdgcn_asyncmark:
2408 case Intrinsic::amdgcn_wait_asyncmark:
2409 if (!Subtarget->hasAsyncMark())
2410 return false;
2411 break;
2412 case Intrinsic::amdgcn_exp_compr:
2413 if (!STI.hasCompressedExport()) {
2414 Function &F = I.getMF()->getFunction();
2415 F.getContext().diagnose(
2416 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2417 I.getDebugLoc(), DS_Error));
2418 return false;
2419 }
2420 break;
2421 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2422 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2423 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2424 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2425 return selectDSBvhStackIntrinsic(I);
2426 case Intrinsic::amdgcn_s_alloc_vgpr: {
2427 // S_ALLOC_VGPR doesn't have a destination register; it just implicitly sets
2428 // SCC. We then need to COPY it into the result vreg.
2429 MachineBasicBlock *MBB = I.getParent();
2430 const DebugLoc &DL = I.getDebugLoc();
2431
2432 Register ResReg = I.getOperand(0).getReg();
2433
2434 MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
2435 .add(I.getOperand(2));
2436 (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
2437 .addReg(AMDGPU::SCC);
2438 I.eraseFromParent();
2439 constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI);
2440 return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
2441 }
2442 case Intrinsic::amdgcn_s_barrier_init:
2443 case Intrinsic::amdgcn_s_barrier_signal_var:
2444 return selectNamedBarrierInit(I, IntrinsicID);
2445 case Intrinsic::amdgcn_s_wakeup_barrier: {
2446 if (!STI.hasSWakeupBarrier()) {
2447 Function &F = I.getMF()->getFunction();
2448 F.getContext().diagnose(
2449 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2450 I.getDebugLoc(), DS_Error));
2451 return false;
2452 }
2453 return selectNamedBarrierInst(I, IntrinsicID);
2454 }
2455 case Intrinsic::amdgcn_s_barrier_join:
2456 case Intrinsic::amdgcn_s_get_named_barrier_state:
2457 return selectNamedBarrierInst(I, IntrinsicID);
2458 case Intrinsic::amdgcn_s_get_barrier_state:
2459 return selectSGetBarrierState(I, IntrinsicID);
2460 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2461 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2462 }
2463 return selectImpl(I, *CoverageInfo);
2464}
2465
2466bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2467 if (selectImpl(I, *CoverageInfo))
2468 return true;
2469
2470 MachineBasicBlock *BB = I.getParent();
2471 const DebugLoc &DL = I.getDebugLoc();
2472
2473 Register DstReg = I.getOperand(0).getReg();
2474 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2475 assert(Size <= 32 || Size == 64);
2476 const MachineOperand &CCOp = I.getOperand(1);
2477 Register CCReg = CCOp.getReg();
2478 if (!isVCC(CCReg, *MRI)) {
2479 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2480 AMDGPU::S_CSELECT_B32;
2481 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2482 .addReg(CCReg);
2483
2484 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2485 // bank, because it does not cover the register class that we use to represent
2486 // it. So we need to set the register class manually here.
2487 if (!MRI->getRegClassOrNull(CCReg))
2488 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2489 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2490 .add(I.getOperand(2))
2491 .add(I.getOperand(3));
2492
2494 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2495 I.eraseFromParent();
2496 return true;
2497 }
2498
2499 // Wide VGPR select should have been split in RegBankSelect.
2500 if (Size > 32)
2501 return false;
2502
2503 MachineInstr *Select =
2504 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2505 .addImm(0)
2506 .add(I.getOperand(3))
2507 .addImm(0)
2508 .add(I.getOperand(2))
2509 .add(I.getOperand(1));
2510
2512 I.eraseFromParent();
2513 return true;
2514}
2515
2516bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2517 Register DstReg = I.getOperand(0).getReg();
2518 Register SrcReg = I.getOperand(1).getReg();
2519 const LLT DstTy = MRI->getType(DstReg);
2520 const LLT SrcTy = MRI->getType(SrcReg);
2521 const LLT S1 = LLT::scalar(1);
2522
2523 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2524 const RegisterBank *DstRB;
2525 if (DstTy == S1) {
2526 // This is a special case. We don't treat s1 for legalization artifacts as
2527 // vcc booleans.
2528 DstRB = SrcRB;
2529 } else {
2530 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2531 if (SrcRB != DstRB)
2532 return false;
2533 }
2534
2535 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2536
2537 unsigned DstSize = DstTy.getSizeInBits();
2538 unsigned SrcSize = SrcTy.getSizeInBits();
2539
2540 const TargetRegisterClass *SrcRC =
2541 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2542 const TargetRegisterClass *DstRC =
2543 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2544 if (!SrcRC || !DstRC)
2545 return false;
2546
2547 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2548 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2549 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2550 return false;
2551 }
2552
2553 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2554 assert(STI.useRealTrue16Insts());
2555 const DebugLoc &DL = I.getDebugLoc();
2556 MachineBasicBlock *MBB = I.getParent();
2557 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2558 .addReg(SrcReg, {}, AMDGPU::lo16);
2559 I.eraseFromParent();
2560 return true;
2561 }
2562
2563 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2564 MachineBasicBlock *MBB = I.getParent();
2565 const DebugLoc &DL = I.getDebugLoc();
2566
2567 Register LoReg = MRI->createVirtualRegister(DstRC);
2568 Register HiReg = MRI->createVirtualRegister(DstRC);
2569 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2570 .addReg(SrcReg, {}, AMDGPU::sub0);
2571 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2572 .addReg(SrcReg, {}, AMDGPU::sub1);
2573
2574 if (IsVALU && STI.hasSDWA()) {
2575 // Write the low 16-bits of the high element into the high 16-bits of the
2576 // low element.
2577 MachineInstr *MovSDWA =
2578 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2579 .addImm(0) // $src0_modifiers
2580 .addReg(HiReg) // $src0
2581 .addImm(0) // $clamp
2582 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2583 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2584 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2585 .addReg(LoReg, RegState::Implicit);
2586 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
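// The SDWA mov only writes WORD_1 of DstReg and preserves the rest; tying the
// def to the implicit LoReg use models that the preserved low half comes from
// LoReg.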
2587 } else {
2588 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2589 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2590 Register ImmReg = MRI->createVirtualRegister(DstRC);
2591 if (IsVALU) {
2592 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2593 .addImm(16)
2594 .addReg(HiReg);
2595 } else {
2596 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2597 .addReg(HiReg)
2598 .addImm(16)
2599 .setOperandDead(3); // Dead scc
2600 }
2601
2602 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2603 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2604 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2605
2606 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2607 .addImm(0xffff);
2608 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2609 .addReg(LoReg)
2610 .addReg(ImmReg);
2611 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2612 .addReg(TmpReg0)
2613 .addReg(TmpReg1);
2614
2615 if (!IsVALU) {
2616 And.setOperandDead(3); // Dead scc
2617 Or.setOperandDead(3); // Dead scc
2618 }
2619 }
2620
2621 I.eraseFromParent();
2622 return true;
2623 }
2624
2625 if (!DstTy.isScalar())
2626 return false;
2627
2628 if (SrcSize > 32) {
2629 unsigned SubRegIdx = DstSize < 32
2630 ? static_cast<unsigned>(AMDGPU::sub0)
2631 : TRI.getSubRegFromChannel(0, DstSize / 32);
2632 if (SubRegIdx == AMDGPU::NoSubRegister)
2633 return false;
2634
2635 // Deal with weird cases where the class only partially supports the subreg
2636 // index.
2637 const TargetRegisterClass *SrcWithSubRC
2638 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2639 if (!SrcWithSubRC)
2640 return false;
2641
2642 if (SrcWithSubRC != SrcRC) {
2643 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2644 return false;
2645 }
2646
2647 I.getOperand(1).setSubReg(SubRegIdx);
2648 }
2649
2650 I.setDesc(TII.get(TargetOpcode::COPY));
2651 return true;
2652}
2653
2654/// \returns true if a bitmask for \p Size bits will be an inline immediate.
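/// For example, Size == 4 gives the mask 0xf, which is an inline immediate,
/// while Size == 16 gives 0xffff, which is not.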
2655static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2656 Mask = maskTrailingOnes<unsigned>(Size);
2657 int SignedMask = static_cast<int>(Mask);
2658 return SignedMask >= -16 && SignedMask <= 64;
2659}
2660
2661// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2662const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2663 Register Reg, const MachineRegisterInfo &MRI,
2664 const TargetRegisterInfo &TRI) const {
2665 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2666 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2667 return RB;
2668
2669 // Ignore the type, since we don't use vcc in artifacts.
2670 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2671 return &RBI.getRegBankFromRegClass(*RC, LLT());
2672 return nullptr;
2673}
2674
2675bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2676 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2677 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2678 const DebugLoc &DL = I.getDebugLoc();
2679 MachineBasicBlock &MBB = *I.getParent();
2680 const Register DstReg = I.getOperand(0).getReg();
2681 const Register SrcReg = I.getOperand(1).getReg();
2682
2683 const LLT DstTy = MRI->getType(DstReg);
2684 const LLT SrcTy = MRI->getType(SrcReg);
2685 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2686 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2687 const unsigned DstSize = DstTy.getSizeInBits();
2688 if (!DstTy.isScalar())
2689 return false;
2690
2691 // Artifact casts should never use vcc.
2692 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2693
2694 // FIXME: This should probably be illegal and split earlier.
2695 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2696 if (DstSize <= 32)
2697 return selectCOPY(I);
2698
2699 const TargetRegisterClass *SrcRC =
2700 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2701 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2702 const TargetRegisterClass *DstRC =
2703 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2704
2705 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2706 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2707 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2708 .addReg(SrcReg)
2709 .addImm(AMDGPU::sub0)
2710 .addReg(UndefReg)
2711 .addImm(AMDGPU::sub1);
2712 I.eraseFromParent();
2713
2714 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2715 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2716 }
2717
2718 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2719 // 64-bit should have been split up in RegBankSelect
2720
2721 // Try to use an and with a mask if it will save code size.
2722 unsigned Mask;
2723 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2724 MachineInstr *ExtI =
2725 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2726 .addImm(Mask)
2727 .addReg(SrcReg);
2728 I.eraseFromParent();
2729 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2730 return true;
2731 }
2732
2733 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2734 MachineInstr *ExtI =
2735 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2736 .addReg(SrcReg)
2737 .addImm(0) // Offset
2738 .addImm(SrcSize); // Width
2739 I.eraseFromParent();
2740 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2741 return true;
2742 }
2743
2744 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2745 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2746 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2747 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2748 return false;
2749
2750 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2751 const unsigned SextOpc = SrcSize == 8 ?
2752 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2753 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2754 .addReg(SrcReg);
2755 I.eraseFromParent();
2756 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2757 }
2758
2759 // Using a single 32-bit SALU to calculate the high half is smaller than
2760 // S_BFE with a literal constant operand.
2761 if (DstSize > 32 && SrcSize == 32) {
2762 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2763 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2764 if (Signed) {
2765 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2766 .addReg(SrcReg, {}, SubReg)
2767 .addImm(31)
2768 .setOperandDead(3); // Dead scc
2769 } else {
2770 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2771 .addImm(0);
2772 }
2773 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2774 .addReg(SrcReg, {}, SubReg)
2775 .addImm(AMDGPU::sub0)
2776 .addReg(HiReg)
2777 .addImm(AMDGPU::sub1);
2778 I.eraseFromParent();
2779 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2780 *MRI);
2781 }
2782
2783 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2784 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2785
2786 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
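// e.g. a zero-extend of the low 8 bits becomes S_BFE_U32 dst, src, 8 << 16
// (offset 0, width 8).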
2787 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2788 // We need a 64-bit register source, but the high bits don't matter.
2789 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2790 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2791 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2792
2793 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2794 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2795 .addReg(SrcReg, {}, SubReg)
2796 .addImm(AMDGPU::sub0)
2797 .addReg(UndefReg)
2798 .addImm(AMDGPU::sub1);
2799
2800 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2801 .addReg(ExtReg)
2802 .addImm(SrcSize << 16);
2803
2804 I.eraseFromParent();
2805 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2806 }
2807
2808 unsigned Mask;
2809 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2810 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2811 .addReg(SrcReg)
2812 .addImm(Mask)
2813 .setOperandDead(3); // Dead scc
2814 } else {
2815 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2816 .addReg(SrcReg)
2817 .addImm(SrcSize << 16);
2818 }
2819
2820 I.eraseFromParent();
2821 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2822 }
2823
2824 return false;
2825}
2826
2830
2831 static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2832 Register BitcastSrc;
2833 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2834 Reg = BitcastSrc;
2835 return Reg;
2836}
2837
2838 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2839 Register &Out) {
2840 // When unmerging a register that is composed of 2 x 16-bit values, allow
2841 // using an extract-hi instruction for the upper 16 bits. We only need to check
2842 // the size of `In` as all defs are guaranteed to be the same type for
2843 // GUnmerge.
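// Besides the unmerge case, a 16-bit G_TRUNC of (lshr x, 16) and a
// two-element shuffle selecting element 1 are also recognized as reads of the
// high half.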
2844 if (auto *Unmerge = dyn_cast<GUnmerge>(MRI.getVRegDef(In))) {
2845 if (Unmerge->getNumDefs() == 2 && Unmerge->getOperand(1).getReg() == In &&
2846 MRI.getType(In).getSizeInBits() == 16) {
2847 Out = Unmerge->getSourceReg();
2848 return true;
2849 }
2850 }
2851
2852 Register Trunc;
2853 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2854 return false;
2855
2856 Register LShlSrc;
2857 Register Cst;
2858 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2859 Cst = stripCopy(Cst, MRI);
2860 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2861 Out = stripBitCast(LShlSrc, MRI);
2862 return true;
2863 }
2864 }
2865
2866 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2867 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2868 return false;
2869
2870 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2871 LLT::fixed_vector(2, 16));
2872
2873 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2874 assert(Mask.size() == 2);
2875
2876 if (Mask[0] == 1 && Mask[1] <= 1) {
2877 Out = Shuffle->getOperand(0).getReg();
2878 return true;
2879 }
2880
2881 return false;
2882}
2883
2884bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2885 if (!Subtarget->hasSALUFloatInsts())
2886 return false;
2887
2888 Register Dst = I.getOperand(0).getReg();
2889 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2890 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2891 return false;
2892
2893 Register Src = I.getOperand(1).getReg();
2894
2895 if (MRI->getType(Dst) == LLT::scalar(32) &&
2896 MRI->getType(Src) == LLT::scalar(16)) {
2897 if (isExtractHiElt(*MRI, Src, Src)) {
2898 MachineBasicBlock *BB = I.getParent();
2899 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2900 .addUse(Src);
2901 I.eraseFromParent();
2902 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2903 }
2904 }
2905
2906 return false;
2907}
2908
2909bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2910 // Only manually handle the f64 SGPR case.
2911 //
2912 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2913 // the bit ops theoretically have a second result due to the implicit def of
2914 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2915 // that is easy by disabling the check. The result works, but uses a
2916 // nonsensical sreg32orlds_and_sreg_1 regclass.
2917 //
2918 // The DAG emitter is more problematic, and incorrectly adds both results of the
2919 // S_XOR_B32 to the variadic REG_SEQUENCE operands.
2920
2921 Register Dst = MI.getOperand(0).getReg();
2922 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2923 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2924 MRI->getType(Dst) != LLT::scalar(64))
2925 return false;
2926
2927 Register Src = MI.getOperand(1).getReg();
2928 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2929 if (Fabs)
2930 Src = Fabs->getOperand(1).getReg();
2931
2932 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2933 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2934 return false;
2935
2936 MachineBasicBlock *BB = MI.getParent();
2937 const DebugLoc &DL = MI.getDebugLoc();
2938 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2939 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2940 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2941 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2942
2943 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2944 .addReg(Src, {}, AMDGPU::sub0);
2945 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2946 .addReg(Src, {}, AMDGPU::sub1);
2947 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2948 .addImm(0x80000000);
2949
2950 // Set or toggle sign bit.
2951 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2952 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2953 .addReg(HiReg)
2954 .addReg(ConstReg)
2955 .setOperandDead(3); // Dead scc
2956 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2957 .addReg(LoReg)
2958 .addImm(AMDGPU::sub0)
2959 .addReg(OpReg)
2960 .addImm(AMDGPU::sub1);
2961 MI.eraseFromParent();
2962 return true;
2963}
2964
2965// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2966bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2967 Register Dst = MI.getOperand(0).getReg();
2968 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2969 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2970 MRI->getType(Dst) != LLT::scalar(64))
2971 return false;
2972
2973 Register Src = MI.getOperand(1).getReg();
2974 MachineBasicBlock *BB = MI.getParent();
2975 const DebugLoc &DL = MI.getDebugLoc();
2976 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2977 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2978 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2979 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2980
2981 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2982 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2983 return false;
2984
2985 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2986 .addReg(Src, {}, AMDGPU::sub0);
2987 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2988 .addReg(Src, {}, AMDGPU::sub1);
2989 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2990 .addImm(0x7fffffff);
2991
2992 // Clear sign bit.
2993 // TODO: Should this use S_BITSET0_*?
2994 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2995 .addReg(HiReg)
2996 .addReg(ConstReg)
2997 .setOperandDead(3); // Dead scc
2998 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2999 .addReg(LoReg)
3000 .addImm(AMDGPU::sub0)
3001 .addReg(OpReg)
3002 .addImm(AMDGPU::sub1);
3003
3004 MI.eraseFromParent();
3005 return true;
3006}
3007
3008static bool isConstant(const MachineInstr &MI) {
3009 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
3010}
3011
3012void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
3013 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
3014
3015 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
3016 const MachineInstr *PtrMI =
3017 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
3018
3019 assert(PtrMI);
3020
3021 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
3022 return;
3023
3024 GEPInfo GEPInfo;
3025
3026 for (unsigned i = 1; i != 3; ++i) {
3027 const MachineOperand &GEPOp = PtrMI->getOperand(i);
3028 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
3029 assert(OpDef);
3030 if (i == 2 && isConstant(*OpDef)) {
3031 // TODO: Could handle constant base + variable offset, but a combine
3032 // probably should have commuted it.
3033 assert(GEPInfo.Imm == 0);
3034 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
3035 continue;
3036 }
3037 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
3038 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
3039 GEPInfo.SgprParts.push_back(GEPOp.getReg());
3040 else
3041 GEPInfo.VgprParts.push_back(GEPOp.getReg());
3042 }
3043
3044 AddrInfo.push_back(GEPInfo);
3045 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
3046}
3047
3048bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
3049 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
3050}
3051
3052bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
3053 if (!MI.hasOneMemOperand())
3054 return false;
3055
3056 const MachineMemOperand *MMO = *MI.memoperands_begin();
3057 const Value *Ptr = MMO->getValue();
3058
3059 // UndefValue means this is a load of a kernel input. These are uniform.
3060 // Sometimes LDS instructions have constant pointers.
3061 // If Ptr is null, then that means this mem operand contains a
3062 // PseudoSourceValue like GOT.
3063 if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Ptr))
3064 return true;
3065
3066 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
3067 return true;
3068
3069 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3070 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
3071 AMDGPU::SGPRRegBankID;
3072
3073 const Instruction *I = dyn_cast<Instruction>(Ptr);
3074 return I && I->getMetadata("amdgpu.uniform");
3075}
3076
3077bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3078 for (const GEPInfo &GEPInfo : AddrInfo) {
3079 if (!GEPInfo.VgprParts.empty())
3080 return true;
3081 }
3082 return false;
3083}
3084
3085void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3086 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
3087 unsigned AS = PtrTy.getAddressSpace();
3088 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
3089 STI.ldsRequiresM0Init()) {
3090 MachineBasicBlock *BB = I.getParent();
3091
3092 // If DS instructions require M0 initialization, insert it before selecting.
3093 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3094 .addImm(-1);
3095 }
3096}
3097
3098bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3099 MachineInstr &I) const {
3100 initM0(I);
3101 return selectImpl(I, *CoverageInfo);
3102}
3103
3104 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3105 if (Reg.isPhysical())
3106 return false;
3107
3108 MachineInstr &MI = *MRI.getVRegDef(Reg);
3109 const unsigned Opcode = MI.getOpcode();
3110
3111 if (Opcode == AMDGPU::COPY)
3112 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3113
3114 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3115 Opcode == AMDGPU::G_XOR)
3116 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3117 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3118
3119 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3120 return GI->is(Intrinsic::amdgcn_class);
3121
3122 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3123}
3124
3125bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3126 MachineBasicBlock *BB = I.getParent();
3127 MachineOperand &CondOp = I.getOperand(0);
3128 Register CondReg = CondOp.getReg();
3129 const DebugLoc &DL = I.getDebugLoc();
3130
3131 unsigned BrOpcode;
3132 Register CondPhysReg;
3133 const TargetRegisterClass *ConstrainRC;
3134
3135 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3136 // whether the branch is uniform when selecting the instruction. In
3137 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3138 // RegBankSelect knows what it's doing if the branch condition is scc, even
3139 // though it currently does not.
3140 if (!isVCC(CondReg, *MRI)) {
3141 if (MRI->getType(CondReg) != LLT::scalar(32))
3142 return false;
3143
3144 CondPhysReg = AMDGPU::SCC;
3145 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3146 ConstrainRC = &AMDGPU::SReg_32RegClass;
3147 } else {
3148 // FIXME: Should scc->vcc copies AND with exec?
3149
3150 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3151 // need to insert an and with exec.
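// For example, a boolean loaded from memory may hold stale bits in currently
// inactive lanes, whereas a V_CMP result already has those bits cleared.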
3152 if (!isVCmpResult(CondReg, *MRI)) {
3153 const bool Is64 = STI.isWave64();
3154 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3155 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3156
3157 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3158 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3159 .addReg(CondReg)
3160 .addReg(Exec)
3161 .setOperandDead(3); // Dead scc
3162 CondReg = TmpReg;
3163 }
3164
3165 CondPhysReg = TRI.getVCC();
3166 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3167 ConstrainRC = TRI.getBoolRC();
3168 }
3169
3170 if (!MRI->getRegClassOrNull(CondReg))
3171 MRI->setRegClass(CondReg, ConstrainRC);
3172
3173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3174 .addReg(CondReg);
3175 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3176 .addMBB(I.getOperand(1).getMBB());
3177
3178 I.eraseFromParent();
3179 return true;
3180}
3181
3182bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3183 MachineInstr &I) const {
3184 Register DstReg = I.getOperand(0).getReg();
3185 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3186 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3187 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3188 if (IsVGPR)
3189 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3190
3191 return RBI.constrainGenericRegister(
3192 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3193}
3194
3195bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3196 Register DstReg = I.getOperand(0).getReg();
3197 Register SrcReg = I.getOperand(1).getReg();
3198 Register MaskReg = I.getOperand(2).getReg();
3199 LLT Ty = MRI->getType(DstReg);
3200 LLT MaskTy = MRI->getType(MaskReg);
3201 MachineBasicBlock *BB = I.getParent();
3202 const DebugLoc &DL = I.getDebugLoc();
3203
3204 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3205 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3206 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3207 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3208 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3209 return false;
3210
3211 // Try to avoid emitting a bit operation when we only need to touch half of
3212 // the 64-bit pointer.
3213 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3214 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3215 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3216
3217 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3218 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
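 // For example, a mask whose high 32 bits are known to be all ones
 // (CanCopyHi32) only needs a 32-bit AND on the low half plus a plain copy of
 // the high half.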
3219
3220 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3221 !CanCopyLow32 && !CanCopyHi32) {
3222 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3223 .addReg(SrcReg)
3224 .addReg(MaskReg)
3225 .setOperandDead(3); // Dead scc
3226 I.eraseFromParent();
3227 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3228 return true;
3229 }
3230
3231 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3232 const TargetRegisterClass &RegRC
3233 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3234
3235 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3236 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3237 const TargetRegisterClass *MaskRC =
3238 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3239
3240 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3241 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3242 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3243 return false;
3244
3245 if (Ty.getSizeInBits() == 32) {
3246 assert(MaskTy.getSizeInBits() == 32 &&
3247 "ptrmask should have been narrowed during legalize");
3248
3249 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3250 .addReg(SrcReg)
3251 .addReg(MaskReg);
3252
3253 if (!IsVGPR)
3254 NewOp.setOperandDead(3); // Dead scc
3255 I.eraseFromParent();
3256 return true;
3257 }
3258
3259 Register HiReg = MRI->createVirtualRegister(&RegRC);
3260 Register LoReg = MRI->createVirtualRegister(&RegRC);
3261
3262 // Extract the subregisters from the source pointer.
3263 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3264 .addReg(SrcReg, {}, AMDGPU::sub0);
3265 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3266 .addReg(SrcReg, {}, AMDGPU::sub1);
3267
3268 Register MaskedLo, MaskedHi;
3269
3270 if (CanCopyLow32) {
3271 // If all the bits in the low half are 1, we only need a copy for it.
3272 MaskedLo = LoReg;
3273 } else {
3274 // Extract the mask subregister and apply the and.
3275 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3276 MaskedLo = MRI->createVirtualRegister(&RegRC);
3277
3278 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3279 .addReg(MaskReg, {}, AMDGPU::sub0);
3280 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3281 .addReg(LoReg)
3282 .addReg(MaskLo);
3283 }
3284
3285 if (CanCopyHi32) {
3286 // If all the bits in the high half are 1, we only need a copy for it.
3287 MaskedHi = HiReg;
3288 } else {
3289 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3290 MaskedHi = MRI->createVirtualRegister(&RegRC);
3291
3292 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3293 .addReg(MaskReg, {}, AMDGPU::sub1);
3294 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3295 .addReg(HiReg)
3296 .addReg(MaskHi);
3297 }
3298
3299 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3300 .addReg(MaskedLo)
3301 .addImm(AMDGPU::sub0)
3302 .addReg(MaskedHi)
3303 .addImm(AMDGPU::sub1);
3304 I.eraseFromParent();
3305 return true;
3306}
3307
3308/// Return the register to use for the index value, and the subregister to use
3309/// for the indirectly accessed register.
3310static std::pair<Register, unsigned>
3311computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3312 const TargetRegisterClass *SuperRC, Register IdxReg,
3313 unsigned EltSize, GISelValueTracking &ValueTracking) {
3314 Register IdxBaseReg;
3315 int Offset;
3316
3317 std::tie(IdxBaseReg, Offset) =
3318 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3319 if (IdxBaseReg == AMDGPU::NoRegister) {
3320 // This will happen if the index is a known constant. This should ordinarily
3321 // be legalized out, but handle it as a register just in case.
3322 assert(Offset == 0);
3323 IdxBaseReg = IdxReg;
3324 }
3325
3326 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3327
3328 // Skip out of bounds offsets, or else we would end up using an undefined
3329 // register.
3330 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3331 return std::pair(IdxReg, SubRegs[0]);
3332 return std::pair(IdxBaseReg, SubRegs[Offset]);
3333}
3334
3335bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3336 MachineInstr &MI) const {
3337 Register DstReg = MI.getOperand(0).getReg();
3338 Register SrcReg = MI.getOperand(1).getReg();
3339 Register IdxReg = MI.getOperand(2).getReg();
3340
3341 LLT DstTy = MRI->getType(DstReg);
3342 LLT SrcTy = MRI->getType(SrcReg);
3343
3344 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3345 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3346 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3347
3348 // The index must be scalar. If it wasn't, RegBankSelect should have moved
3349 // this into a waterfall loop.
3350 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3351 return false;
3352
3353 const TargetRegisterClass *SrcRC =
3354 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3355 const TargetRegisterClass *DstRC =
3356 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3357 if (!SrcRC || !DstRC)
3358 return false;
3359 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3360 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3361 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3362 return false;
3363
3364 MachineBasicBlock *BB = MI.getParent();
3365 const DebugLoc &DL = MI.getDebugLoc();
3366 const bool Is64 = DstTy.getSizeInBits() == 64;
3367
3368 unsigned SubReg;
3369 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3370 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3371
3372 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3373 if (DstTy.getSizeInBits() != 32 && !Is64)
3374 return false;
3375
3376 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3377 .addReg(IdxReg);
3378
3379 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3380 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3381 .addReg(SrcReg, {}, SubReg)
3382 .addReg(SrcReg, RegState::Implicit);
3383 MI.eraseFromParent();
3384 return true;
3385 }
3386
3387 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3388 return false;
3389
3390 if (!STI.useVGPRIndexMode()) {
3391 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3392 .addReg(IdxReg);
3393 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3394 .addReg(SrcReg, {}, SubReg)
3395 .addReg(SrcReg, RegState::Implicit);
3396 MI.eraseFromParent();
3397 return true;
3398 }
3399
3400 const MCInstrDesc &GPRIDXDesc =
3401 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3402 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3403 .addReg(SrcReg)
3404 .addReg(IdxReg)
3405 .addImm(SubReg);
3406
3407 MI.eraseFromParent();
3408 return true;
3409}
3410
3411// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3412bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3413 MachineInstr &MI) const {
3414 Register DstReg = MI.getOperand(0).getReg();
3415 Register VecReg = MI.getOperand(1).getReg();
3416 Register ValReg = MI.getOperand(2).getReg();
3417 Register IdxReg = MI.getOperand(3).getReg();
3418
3419 LLT VecTy = MRI->getType(DstReg);
3420 LLT ValTy = MRI->getType(ValReg);
3421 unsigned VecSize = VecTy.getSizeInBits();
3422 unsigned ValSize = ValTy.getSizeInBits();
3423
3424 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3425 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3426 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3427
3428 assert(VecTy.getElementType() == ValTy);
3429
3430 // The index must be scalar. If it wasn't, RegBankSelect should have moved
3431 // this into a waterfall loop.
3432 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3433 return false;
3434
3435 const TargetRegisterClass *VecRC =
3436 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3437 const TargetRegisterClass *ValRC =
3438 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3439
3440 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3441 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3442 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3443 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3444 return false;
3445
3446 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3447 return false;
3448
3449 unsigned SubReg;
3450 std::tie(IdxReg, SubReg) =
3451 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3452
3453 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3454 STI.useVGPRIndexMode();
3455
3456 MachineBasicBlock *BB = MI.getParent();
3457 const DebugLoc &DL = MI.getDebugLoc();
3458
3459 if (!IndexMode) {
3460 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3461 .addReg(IdxReg);
3462
3463 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3464 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3465 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3466 .addReg(VecReg)
3467 .addReg(ValReg)
3468 .addImm(SubReg);
3469 MI.eraseFromParent();
3470 return true;
3471 }
3472
3473 const MCInstrDesc &GPRIDXDesc =
3474 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3475 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3476 .addReg(VecReg)
3477 .addReg(ValReg)
3478 .addReg(IdxReg)
3479 .addImm(SubReg);
3480
3481 MI.eraseFromParent();
3482 return true;
3483}
3484
3485static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
3486 switch (Intr) {
3487 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3488 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
3489 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3490 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
3491 case Intrinsic::amdgcn_load_async_to_lds:
3492 case Intrinsic::amdgcn_global_load_async_lds:
3493 return true;
3494 }
3495 return false;
3496}
3497
3498bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3499 if (!Subtarget->hasVMemToLDSLoad())
3500 return false;
3501 unsigned Opc;
3502 unsigned Size = MI.getOperand(3).getImm();
3503 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3504
3505 // The struct intrinsic variants add one additional operand over raw.
3506 const bool HasVIndex = MI.getNumOperands() == 9;
3507 Register VIndex;
3508 int OpOffset = 0;
3509 if (HasVIndex) {
3510 VIndex = MI.getOperand(4).getReg();
3511 OpOffset = 1;
3512 }
3513
3514 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3515 std::optional<ValueAndVReg> MaybeVOffset =
3516      getIConstantVRegValWithLookThrough(VOffset, *MRI);
3517 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3518
3519 switch (Size) {
3520 default:
3521 return false;
3522 case 1:
3523 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3524 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3525 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3526 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3527 break;
3528 case 2:
3529 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3530 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3531 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3532 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3533 break;
3534 case 4:
3535 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3536 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3537 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3538 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3539 break;
3540 case 12:
3541 if (!Subtarget->hasLDSLoadB96_B128())
3542 return false;
3543
3544 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3545 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3546 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3547 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3548 break;
3549 case 16:
3550 if (!Subtarget->hasLDSLoadB96_B128())
3551 return false;
3552
3553 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3554 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3555 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3556 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3557 break;
3558 }
3559
3560 MachineBasicBlock *MBB = MI.getParent();
3561 const DebugLoc &DL = MI.getDebugLoc();
3562 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3563 .add(MI.getOperand(2));
3564
3565 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3566
3567 if (HasVIndex && HasVOffset) {
3568 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3569 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3570 .addReg(VIndex)
3571 .addImm(AMDGPU::sub0)
3572 .addReg(VOffset)
3573 .addImm(AMDGPU::sub1);
3574
3575 MIB.addReg(IdxReg);
3576 } else if (HasVIndex) {
3577 MIB.addReg(VIndex);
3578 } else if (HasVOffset) {
3579 MIB.addReg(VOffset);
3580 }
3581
3582 MIB.add(MI.getOperand(1)); // rsrc
3583 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3584 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3585 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3586 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3587 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3588 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3589 MIB.addImm(
3590 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3591 ? 1
3592 : 0); // swz
3593 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3594
3595 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3596 // Don't set the offset value here because the pointer points to the base of
3597 // the buffer.
3598 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3599
3600 MachinePointerInfo StorePtrI = LoadPtrI;
3601 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3602                                                  AMDGPUAS::BUFFER_RESOURCE));
3603 LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
3604 StorePtrI.V = nullptr;
3605 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3606 auto F = LoadMMO->getFlags() &
3607           ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3608 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3609 Size, LoadMMO->getBaseAlign());
3610
3611 MachineMemOperand *StoreMMO =
3612 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3613 sizeof(int32_t), LoadMMO->getBaseAlign());
3614
3615 MIB.setMemRefs({LoadMMO, StoreMMO});
3616
3617 MI.eraseFromParent();
3618 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3619 return true;
3620}
3621
3622/// Match a zero extend from a 32-bit value to 64-bits.
3623Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3624 Register ZExtSrc;
3625 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3626 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3627
3628 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3629 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3630 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3631 return Register();
3632
3633 assert(Def->getNumOperands() == 3 &&
3634 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3635 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3636 return Def->getOperand(1).getReg();
3637 }
3638
3639 return Register();
3640}
3641
3642/// Match a sign extend from a 32-bit value to 64-bits.
3643Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3644 Register SExtSrc;
3645 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3646 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3647
3648 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (s32 G_ASHR %x, 31)
3649 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3650 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3651 return Register();
3652
3653 assert(Def->getNumOperands() == 3 &&
3654 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3655 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3656 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3657 m_SpecificICst(31))))
3658 return Def->getOperand(1).getReg();
3659
3660 if (VT->signBitIsZero(Reg))
3661 return matchZeroExtendFromS32(Reg);
3662
3663 return Register();
3664}
3665
3666/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3667/// is 32-bit.
3668Register
3669AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3670 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3671 : matchZeroExtendFromS32(Reg);
3672}
3673
3674/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3675/// is 32-bit.
3676Register
3677AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3678 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3679 : matchSignExtendFromS32(Reg);
3680}
3681
3682Register
3683AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3684 bool IsSigned) const {
3685 if (IsSigned)
3686 return matchSignExtendFromS32OrS32(Reg);
3687
3688 return matchZeroExtendFromS32OrS32(Reg);
3689}
3690
3691Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3692 Register AnyExtSrc;
3693 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3694 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3695
3696 // Match legalized form %anyext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3697 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3698 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3699 return Register();
3700
3701 assert(Def->getNumOperands() == 3 &&
3702 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3703
3704 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3705 return Def->getOperand(1).getReg();
3706
3707 return Register();
3708}
3709
3710bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3711 if (!Subtarget->hasVMemToLDSLoad())
3712 return false;
3713
3714 unsigned Opc;
3715 unsigned Size = MI.getOperand(3).getImm();
3716 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3717
3718 switch (Size) {
3719 default:
3720 return false;
3721 case 1:
3722 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3723 break;
3724 case 2:
3725 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3726 break;
3727 case 4:
3728 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3729 break;
3730 case 12:
3731 if (!Subtarget->hasLDSLoadB96_B128())
3732 return false;
3733 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3734 break;
3735 case 16:
3736 if (!Subtarget->hasLDSLoadB96_B128())
3737 return false;
3738 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3739 break;
3740 }
3741
3742 MachineBasicBlock *MBB = MI.getParent();
3743 const DebugLoc &DL = MI.getDebugLoc();
3744 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3745 .add(MI.getOperand(2));
3746
3747 Register Addr = MI.getOperand(1).getReg();
3748 Register VOffset;
3749 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3750 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
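 // E.g. for %addr = G_PTR_ADD %sgpr_base, (G_ZEXT %vgpr_off) the SGPR base
 // becomes Addr and %vgpr_off becomes VOffset; if no VGPR offset is found, a
 // zero VOffset is materialized below.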
3751 if (!isSGPR(Addr)) {
3752 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3753 if (isSGPR(AddrDef->Reg)) {
3754 Addr = AddrDef->Reg;
3755 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3756 Register SAddr =
3757 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3758 if (isSGPR(SAddr)) {
3759 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3760 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3761 Addr = SAddr;
3762 VOffset = Off;
3763 }
3764 }
3765 }
3766 }
3767
3768 if (isSGPR(Addr)) {
3769    Opc = AMDGPU::getGlobalSaddrOp(Opc);
3770 if (!VOffset) {
3771 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3772 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3773 .addImm(0);
3774 }
3775 }
3776
3777 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3778 .addReg(Addr);
3779
3780 if (isSGPR(Addr))
3781 MIB.addReg(VOffset);
3782
3783 MIB.add(MI.getOperand(4)); // offset
3784
3785 unsigned Aux = MI.getOperand(5).getImm();
3786 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3787 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3788
3789 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3790 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3791 LoadPtrI.Offset = MI.getOperand(4).getImm();
3792 MachinePointerInfo StorePtrI = LoadPtrI;
3793 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3794                                                  AMDGPUAS::GLOBAL_ADDRESS));
3795 StorePtrI.V = nullptr;
3796 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3797 auto F = LoadMMO->getFlags() &
3798           ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3799 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3800 Size, LoadMMO->getBaseAlign());
3801 MachineMemOperand *StoreMMO =
3802 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3803 sizeof(int32_t), Align(4));
3804
3805 MIB.setMemRefs({LoadMMO, StoreMMO});
3806
3807 MI.eraseFromParent();
3808 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3809 return true;
3810}
3811
3812bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3813 Intrinsic::ID IID) const {
3814 bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3815 unsigned Opc =
3816 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3817 int NumGroups = 4;
3818
3819 // A lambda function to check whether an operand is a vector of all 0s.
3820 const auto isAllZeros = [&](MachineOperand &Opnd) {
3821 const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
3822 if (!DefMI)
3823 return false;
3824 return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
3825 };
3826
3827 // Use _D2 version if both group 2 and 3 are zero-initialized.
3828 if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
3829 NumGroups = 2;
3830 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3831 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3832 }
3833
3834 // TODO: Handle the fifth group: MI.getOperand(5), which is silently ignored
3835 // for now because all existing targets only support up to 4 groups.
3836 MachineBasicBlock *MBB = MI.getParent();
3837 auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
3838 .add(MI.getOperand(1)) // D# group 0
3839 .add(MI.getOperand(2)); // D# group 1
3840
3841 if (NumGroups >= 4) { // Has at least 4 groups
3842 MIB.add(MI.getOperand(3)) // D# group 2
3843 .add(MI.getOperand(4)); // D# group 3
3844 }
3845
3846 MIB.addImm(0) // r128
3847 .add(MI.getOperand(6)); // cpol
3848
3849 MI.eraseFromParent();
3850 return true;
3851}
3852
3853bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3854 MachineInstr &MI) const {
3855 unsigned OpcodeOpIdx =
3856 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3857 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3858 MI.removeOperand(OpcodeOpIdx);
3859 MI.addImplicitDefUseOperands(*MI.getMF());
3860 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3861 return true;
3862}
3863
3864// FIXME: This should be removed and let the patterns select. We just need the
3865// AGPR/VGPR combination versions.
3866bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3867 unsigned Opc;
3868 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3869 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3870 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3871 break;
3872 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3873 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3874 break;
3875 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3876 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3877 break;
3878 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3879 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3880 break;
3881 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3882 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3883 break;
3884 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3885 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3886 break;
3887 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3888 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3889 break;
3890 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3891 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3892 break;
3893 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3894 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3895 break;
3896 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3897 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3898 break;
3899 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3900 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3901 break;
3902 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3903 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3904 break;
3905 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3906 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3907 break;
3908 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3909 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3910 break;
3911 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3912 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3913 break;
3914 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3915 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3916 break;
3917 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3918 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3919 break;
3920 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3921 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3922 break;
3923 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3924 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3925 break;
3926 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3927 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3928 break;
3929 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3930 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3931 break;
3932 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3933 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3934 break;
3935 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3936 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3937 break;
3938 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3939 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3940 break;
3941 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3942 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3943 break;
3944 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3945 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3946 break;
3947 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3948 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3949 break;
3950 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3951 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3952 break;
3953 default:
3954 llvm_unreachable("unhandled smfmac intrinsic");
3955 }
3956
3957 auto VDst_In = MI.getOperand(4);
3958
3959 MI.setDesc(TII.get(Opc));
3960 MI.removeOperand(4); // VDst_In
3961 MI.removeOperand(1); // Intrinsic ID
3962 MI.addOperand(VDst_In); // Readd VDst_In to the end
3963 MI.addImplicitDefUseOperands(*MI.getMF());
3964 const MCInstrDesc &MCID = MI.getDesc();
3965 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
3966 MI.getOperand(0).setIsEarlyClobber(true);
3967 }
3968 return true;
3969}
3970
3971bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3972 MachineInstr &MI, Intrinsic::ID IntrID) const {
3973 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3974 !Subtarget->hasPermlane16Swap())
3975 return false;
3976 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3977 !Subtarget->hasPermlane32Swap())
3978 return false;
3979
3980 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3981 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3982 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3983
3984 MI.removeOperand(2);
3985 MI.setDesc(TII.get(Opcode));
3986 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3987
3988 MachineOperand &FI = MI.getOperand(4);
3990
3991 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3992 return true;
3993}
3994
3995bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3996 Register DstReg = MI.getOperand(0).getReg();
3997 Register SrcReg = MI.getOperand(1).getReg();
3998 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3999 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4000 MachineBasicBlock *MBB = MI.getParent();
4001 const DebugLoc &DL = MI.getDebugLoc();
4002
4003 if (IsVALU) {
4004 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
4005 .addImm(Subtarget->getWavefrontSizeLog2())
4006 .addReg(SrcReg);
4007 } else {
4008 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
4009 .addReg(SrcReg)
4010 .addImm(Subtarget->getWavefrontSizeLog2())
4011 .setOperandDead(3); // Dead scc
4012 }
4013
4014 const TargetRegisterClass &RC =
4015 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
4016 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
4017 return false;
4018
4019 MI.eraseFromParent();
4020 return true;
4021}
4022
4023bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
4024 MachineInstr &MI) const {
4025 assert(MI.getNumOperands() == 4);
4026 MachineBasicBlock *MBB = MI.getParent();
4027 const DebugLoc &DL = MI.getDebugLoc();
4028
4029 Register DstReg = MI.getOperand(0).getReg();
4030 Register ValReg = MI.getOperand(2).getReg();
4031 Register IdxReg = MI.getOperand(3).getReg();
4032
4033 const LLT DstTy = MRI->getType(DstReg);
4034 unsigned DstSize = DstTy.getSizeInBits();
4035 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4036 const TargetRegisterClass *DstRC =
4037 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
4038
4039 if (DstTy != LLT::scalar(32))
4040 return false;
4041
4042 if (!Subtarget->supportsBPermute())
4043 return false;
4044
4045 // If we can bpermute across the whole wave, then just do that
4046 if (Subtarget->supportsWaveWideBPermute()) {
4047 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4048 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4049 .addImm(2)
4050 .addReg(IdxReg);
4051
4052 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
4053 .addReg(ShiftIdxReg)
4054 .addReg(ValReg)
4055 .addImm(0);
4056 } else {
4057 // Otherwise, we need to make use of whole wave mode
4058 assert(Subtarget->isWave64());
4059
4060 // Set inactive lanes to poison
4061 Register UndefValReg =
4062 MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
4063 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
4064
4065 Register UndefExecReg = MRI->createVirtualRegister(
4066 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4067 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
4068
4069 Register PoisonValReg = MRI->createVirtualRegister(DstRC);
4070 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
4071 .addImm(0)
4072 .addReg(ValReg)
4073 .addImm(0)
4074 .addReg(UndefValReg)
4075 .addReg(UndefExecReg);
4076
4077 // ds_bpermute requires index to be multiplied by 4
4078 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4079 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4080 .addImm(2)
4081 .addReg(IdxReg);
4082
4083 Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
4084 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
4085 .addImm(0)
4086 .addReg(ShiftIdxReg)
4087 .addImm(0)
4088 .addReg(UndefValReg)
4089 .addReg(UndefExecReg);
4090
4091 // Get permutation of each half, then we'll select which one to use
4092 Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
4093 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
4094 .addReg(PoisonIdxReg)
4095 .addReg(PoisonValReg)
4096 .addImm(0);
4097
4098 Register SwappedValReg = MRI->createVirtualRegister(DstRC);
4099 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
4100 .addReg(PoisonValReg);
4101
4102 Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
4103 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
4104 .addReg(PoisonIdxReg)
4105 .addReg(SwappedValReg)
4106 .addImm(0);
4107
4108 Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
4109 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
4110 .addReg(OppSidePermReg);
4111
4112 // Select which side to take the permute from
4113 // We can get away with only using mbcnt_lo here since we're only
4114 // trying to detect which side of 32 each lane is on, and mbcnt_lo
4115 // returns 32 for lanes 32-63.
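 // E.g. lane 40 gets mbcnt_lo(~0, 0) = 32, while lane 7 gets 7.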
4116 Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
4117 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
4118 .addImm(-1)
4119 .addImm(0);
4120
4121 Register XORReg = MRI->createVirtualRegister(DstRC);
4122 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
4123 .addReg(ThreadIDReg)
4124 .addReg(PoisonIdxReg);
4125
4126 Register ANDReg = MRI->createVirtualRegister(DstRC);
4127 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
4128 .addReg(XORReg)
4129 .addImm(32);
4130
4131 Register CompareReg = MRI->createVirtualRegister(
4132 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4133 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
4134 .addReg(ANDReg)
4135 .addImm(0);
4136
4137 // Finally do the selection
4138 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
4139 .addImm(0)
4140 .addReg(WWMSwapPermReg)
4141 .addImm(0)
4142 .addReg(SameSidePermReg)
4143 .addReg(CompareReg);
4144 }
4145
4146 MI.eraseFromParent();
4147 return true;
4148}
4149
4150// Match BITOP3 operation and return a number of matched instructions plus
4151// truth table.
4152static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
4153                                              SmallVectorImpl<Register> &Src,
4154 const MachineRegisterInfo &MRI) {
4155 unsigned NumOpcodes = 0;
4156 uint8_t LHSBits, RHSBits;
4157
4158 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
4159 // Define truth table given Src0, Src1, Src2 bits permutations:
4160 // 0 0 0
4161 // 0 0 1
4162 // 0 1 0
4163 // 0 1 1
4164 // 1 0 0
4165 // 1 0 1
4166 // 1 1 0
4167 // 1 1 1
4168 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
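    // Worked example: with Src = {a, b, c}, (a & b) ^ c gives
    // TTbl = (0xf0 & 0xcc) ^ 0xaa = 0x6a.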
4169
4170 if (mi_match(Op, MRI, m_AllOnesInt())) {
4171 Bits = 0xff;
4172 return true;
4173 }
4174 if (mi_match(Op, MRI, m_ZeroInt())) {
4175 Bits = 0;
4176 return true;
4177 }
4178
4179 for (unsigned I = 0; I < Src.size(); ++I) {
4180 // Try to find existing reused operand
4181 if (Src[I] == Op) {
4182 Bits = SrcBits[I];
4183 return true;
4184 }
4185 // Try to replace parent operator
4186 if (Src[I] == R) {
4187 Bits = SrcBits[I];
4188 Src[I] = Op;
4189 return true;
4190 }
4191 }
4192
4193 if (Src.size() == 3) {
4194 // No room left for operands. Try one last time, there can be a 'not' of
4195 // one of our source operands. In this case we can compute the bits
4196 // without growing Src vector.
4197 Register LHS;
4198 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
4199        LHS = getSrcRegIgnoringCopies(LHS, MRI);
4200 for (unsigned I = 0; I < Src.size(); ++I) {
4201 if (Src[I] == LHS) {
4202 Bits = ~SrcBits[I];
4203 return true;
4204 }
4205 }
4206 }
4207
4208 return false;
4209 }
4210
4211 Bits = SrcBits[Src.size()];
4212 Src.push_back(Op);
4213 return true;
4214 };
4215
4216 MachineInstr *MI = MRI.getVRegDef(R);
4217 switch (MI->getOpcode()) {
4218 case TargetOpcode::G_AND:
4219 case TargetOpcode::G_OR:
4220 case TargetOpcode::G_XOR: {
4221 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
4222 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
4223
4224 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
4225 if (!getOperandBits(LHS, LHSBits) ||
4226 !getOperandBits(RHS, RHSBits)) {
4227 Src = std::move(Backup);
4228 return std::make_pair(0, 0);
4229 }
4230
4231 // Recursion is naturally limited by the size of the operand vector.
4232 auto Op = BitOp3_Op(LHS, Src, MRI);
4233 if (Op.first) {
4234 NumOpcodes += Op.first;
4235 LHSBits = Op.second;
4236 }
4237
4238 Op = BitOp3_Op(RHS, Src, MRI);
4239 if (Op.first) {
4240 NumOpcodes += Op.first;
4241 RHSBits = Op.second;
4242 }
4243 break;
4244 }
4245 default:
4246 return std::make_pair(0, 0);
4247 }
4248
4249 uint8_t TTbl;
4250 switch (MI->getOpcode()) {
4251 case TargetOpcode::G_AND:
4252 TTbl = LHSBits & RHSBits;
4253 break;
4254 case TargetOpcode::G_OR:
4255 TTbl = LHSBits | RHSBits;
4256 break;
4257 case TargetOpcode::G_XOR:
4258 TTbl = LHSBits ^ RHSBits;
4259 break;
4260 default:
4261 break;
4262 }
4263
4264 return std::make_pair(NumOpcodes + 1, TTbl);
4265}
4266
4267bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4268 if (!Subtarget->hasBitOp3Insts())
4269 return false;
4270
4271 Register DstReg = MI.getOperand(0).getReg();
4272 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4273 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4274 if (!IsVALU)
4275 return false;
4276
4277  SmallVector<Register, 3> Src;
4278 uint8_t TTbl;
4279 unsigned NumOpcodes;
4280
4281 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
4282
4283 // The Src.empty() case can happen if all operands are all zeros or all ones.
4284 // Normally it should have been optimized out before reaching this point.
4285 if (NumOpcodes < 2 || Src.empty())
4286 return false;
4287
4288 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
4289 if (NumOpcodes == 2 && IsB32) {
4290 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4291 // the asm more readable. This cannot be modeled with AddedComplexity
4292 // because the selector does not know how many operations we matched.
4293 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
4294 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4295 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4296 return false;
4297 } else if (NumOpcodes < 4) {
4298 // For a uniform case the threshold should be higher to account for moves
4299 // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two can
4300 // be in SGPRs with a readfirstlane afterwards.
4301 return false;
4302 }
4303
4304 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4305 if (!IsB32 && STI.hasTrue16BitInsts())
4306 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4307 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4308 unsigned CBL = STI.getConstantBusLimit(Opc);
4309 MachineBasicBlock *MBB = MI.getParent();
4310 const DebugLoc &DL = MI.getDebugLoc();
4311
4312 for (unsigned I = 0; I < Src.size(); ++I) {
4313 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4314 if (RB->getID() != AMDGPU::SGPRRegBankID)
4315 continue;
4316 if (CBL > 0) {
4317 --CBL;
4318 continue;
4319 }
4320 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4321 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4322 .addReg(Src[I]);
4323 Src[I] = NewReg;
4324 }
4325
4326 // Last operand can be ignored, turning a ternary operation into a binary.
4327 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4328 // 'c' with 'a' here without changing the answer. In some pathological
4329 // cases it should be possible to get an operation with a single operand
4330 // too if the optimizer does not catch it.
4331 while (Src.size() < 3)
4332 Src.push_back(Src[0]);
4333
4334 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4335 if (!IsB32)
4336 MIB.addImm(0); // src_mod0
4337 MIB.addReg(Src[0]);
4338 if (!IsB32)
4339 MIB.addImm(0); // src_mod1
4340 MIB.addReg(Src[1]);
4341 if (!IsB32)
4342 MIB.addImm(0); // src_mod2
4343 MIB.addReg(Src[2])
4344 .addImm(TTbl);
4345 if (!IsB32)
4346 MIB.addImm(0); // op_sel
4347
4348 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4349 MI.eraseFromParent();
4350
4351 return true;
4352}
4353
4354bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4355 Register SrcReg = MI.getOperand(0).getReg();
4356 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4357 return false;
4358
4359 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4360 Register SP =
4361 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4362 Register WaveAddr = getWaveAddress(DefMI);
4363 MachineBasicBlock *MBB = MI.getParent();
4364 const DebugLoc &DL = MI.getDebugLoc();
4365
4366 if (!WaveAddr) {
4367 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4368 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4369 .addReg(SrcReg)
4370 .addImm(Subtarget->getWavefrontSizeLog2())
4371 .setOperandDead(3); // Dead scc
4372 }
4373
4374 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4375 .addReg(WaveAddr);
4376
4377 MI.eraseFromParent();
4378 return true;
4379}
4380
4381bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4382
4383 if (!I.isPreISelOpcode()) {
4384 if (I.isCopy())
4385 return selectCOPY(I);
4386 return true;
4387 }
4388
4389 switch (I.getOpcode()) {
4390 case TargetOpcode::G_AND:
4391 case TargetOpcode::G_OR:
4392 case TargetOpcode::G_XOR:
4393 if (selectBITOP3(I))
4394 return true;
4395 if (selectImpl(I, *CoverageInfo))
4396 return true;
4397 return selectG_AND_OR_XOR(I);
4398 case TargetOpcode::G_ADD:
4399 case TargetOpcode::G_SUB:
4400 case TargetOpcode::G_PTR_ADD:
4401 if (selectImpl(I, *CoverageInfo))
4402 return true;
4403 return selectG_ADD_SUB(I);
4404 case TargetOpcode::G_UADDO:
4405 case TargetOpcode::G_USUBO:
4406 case TargetOpcode::G_UADDE:
4407 case TargetOpcode::G_USUBE:
4408 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4409 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4410 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4411 return selectG_AMDGPU_MAD_64_32(I);
4412 case TargetOpcode::G_INTTOPTR:
4413 case TargetOpcode::G_BITCAST:
4414 case TargetOpcode::G_PTRTOINT:
4415 case TargetOpcode::G_FREEZE:
4416 return selectCOPY(I);
4417 case TargetOpcode::G_FNEG:
4418 if (selectImpl(I, *CoverageInfo))
4419 return true;
4420 return selectG_FNEG(I);
4421 case TargetOpcode::G_FABS:
4422 if (selectImpl(I, *CoverageInfo))
4423 return true;
4424 return selectG_FABS(I);
4425 case TargetOpcode::G_EXTRACT:
4426 return selectG_EXTRACT(I);
4427 case TargetOpcode::G_MERGE_VALUES:
4428 case TargetOpcode::G_CONCAT_VECTORS:
4429 return selectG_MERGE_VALUES(I);
4430 case TargetOpcode::G_UNMERGE_VALUES:
4431 return selectG_UNMERGE_VALUES(I);
4432 case TargetOpcode::G_BUILD_VECTOR:
4433 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4434 return selectG_BUILD_VECTOR(I);
4435 case TargetOpcode::G_IMPLICIT_DEF:
4436 return selectG_IMPLICIT_DEF(I);
4437 case TargetOpcode::G_INSERT:
4438 return selectG_INSERT(I);
4439 case TargetOpcode::G_INTRINSIC:
4440 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4441 return selectG_INTRINSIC(I);
4442 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4443 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4444 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4445 case TargetOpcode::G_ICMP:
4446 case TargetOpcode::G_FCMP:
4447 if (selectG_ICMP_or_FCMP(I))
4448 return true;
4449 return selectImpl(I, *CoverageInfo);
4450 case TargetOpcode::G_LOAD:
4451 case TargetOpcode::G_ZEXTLOAD:
4452 case TargetOpcode::G_SEXTLOAD:
4453 case TargetOpcode::G_STORE:
4454 case TargetOpcode::G_ATOMIC_CMPXCHG:
4455 case TargetOpcode::G_ATOMICRMW_XCHG:
4456 case TargetOpcode::G_ATOMICRMW_ADD:
4457 case TargetOpcode::G_ATOMICRMW_SUB:
4458 case TargetOpcode::G_ATOMICRMW_AND:
4459 case TargetOpcode::G_ATOMICRMW_OR:
4460 case TargetOpcode::G_ATOMICRMW_XOR:
4461 case TargetOpcode::G_ATOMICRMW_MIN:
4462 case TargetOpcode::G_ATOMICRMW_MAX:
4463 case TargetOpcode::G_ATOMICRMW_UMIN:
4464 case TargetOpcode::G_ATOMICRMW_UMAX:
4465 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4466 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4467 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4468 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4469 case TargetOpcode::G_ATOMICRMW_FADD:
4470 case TargetOpcode::G_ATOMICRMW_FMIN:
4471 case TargetOpcode::G_ATOMICRMW_FMAX:
4472 return selectG_LOAD_STORE_ATOMICRMW(I);
4473 case TargetOpcode::G_SELECT:
4474 return selectG_SELECT(I);
4475 case TargetOpcode::G_TRUNC:
4476 return selectG_TRUNC(I);
4477 case TargetOpcode::G_SEXT:
4478 case TargetOpcode::G_ZEXT:
4479 case TargetOpcode::G_ANYEXT:
4480 case TargetOpcode::G_SEXT_INREG:
4481 // This is a workaround. For extension from type i1, `selectImpl()` uses
4482 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
4483 // type i1 can only be held in an SGPR class.
4484 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4485 selectImpl(I, *CoverageInfo))
4486 return true;
4487 return selectG_SZA_EXT(I);
4488 case TargetOpcode::G_FPEXT:
4489 if (selectG_FPEXT(I))
4490 return true;
4491 return selectImpl(I, *CoverageInfo);
4492 case TargetOpcode::G_BRCOND:
4493 return selectG_BRCOND(I);
4494 case TargetOpcode::G_GLOBAL_VALUE:
4495 return selectG_GLOBAL_VALUE(I);
4496 case TargetOpcode::G_PTRMASK:
4497 return selectG_PTRMASK(I);
4498 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4499 return selectG_EXTRACT_VECTOR_ELT(I);
4500 case TargetOpcode::G_INSERT_VECTOR_ELT:
4501 return selectG_INSERT_VECTOR_ELT(I);
4502 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4503 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4504 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4505 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4506 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4507 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4508        AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4509 assert(Intr && "not an image intrinsic with image pseudo");
4510 return selectImageIntrinsic(I, Intr);
4511 }
4512 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4513 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4514 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4515 return selectBVHIntersectRayIntrinsic(I);
4516 case AMDGPU::G_SBFX:
4517 case AMDGPU::G_UBFX:
4518 return selectG_SBFX_UBFX(I);
4519 case AMDGPU::G_SI_CALL:
4520 I.setDesc(TII.get(AMDGPU::SI_CALL));
4521 return true;
4522 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4523 return selectWaveAddress(I);
4524 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4525 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4526 return true;
4527 }
4528 case AMDGPU::G_STACKRESTORE:
4529 return selectStackRestore(I);
4530 case AMDGPU::G_PHI:
4531 return selectPHI(I);
4532 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4533 return selectCOPY_SCC_VCC(I);
4534 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4535 return selectCOPY_VCC_SCC(I);
4536 case AMDGPU::G_AMDGPU_READANYLANE:
4537 return selectReadAnyLane(I);
4538 case TargetOpcode::G_CONSTANT:
4539 case TargetOpcode::G_FCONSTANT:
4540 default:
4541 return selectImpl(I, *CoverageInfo);
4542 }
4543 return false;
4544}
4545
4546InstructionSelector::ComplexRendererFns
4547AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4548 return {{
4549 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4550 }};
4551
4552}
4553
4554std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4555 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4556 unsigned Mods = 0;
4557 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4558
4559 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4560 Src = MI->getOperand(1).getReg();
4561 Mods |= SISrcMods::NEG;
4562 MI = getDefIgnoringCopies(Src, *MRI);
4563 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4564 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4565 // denormal mode, but we're implicitly canonicalizing in a source operand.
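    // E.g. a source defined by (G_FSUB 0.0, %x) is treated as %x with
    // SISrcMods::NEG set on this operand.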
4566 const ConstantFP *LHS =
4567 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4568 if (LHS && LHS->isZero()) {
4569 Mods |= SISrcMods::NEG;
4570 Src = MI->getOperand(2).getReg();
4571 }
4572 }
4573
4574 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4575 Src = MI->getOperand(1).getReg();
4576 Mods |= SISrcMods::ABS;
4577 }
4578
4579 if (OpSel)
4580 Mods |= SISrcMods::OP_SEL_0;
4581
4582 return std::pair(Src, Mods);
4583}
4584
4585std::pair<Register, unsigned>
4586AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const {
4587 unsigned Mods;
4588 std::tie(Src, Mods) = selectVOP3ModsImpl(Src);
4589 Mods |= SISrcMods::OP_SEL_1;
4590 return std::pair(Src, Mods);
4591}
4592
4593Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4594 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4595 bool ForceVGPR) const {
4596 if ((Mods != 0 || ForceVGPR) &&
4597 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4598
4599 // If we looked through copies to find source modifiers on an SGPR operand,
4600 // we now have an SGPR register source. To avoid potentially violating the
4601 // constant bus restriction, we need to insert a copy to a VGPR.
4602 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4603 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4604 TII.get(AMDGPU::COPY), VGPRSrc)
4605 .addReg(Src);
4606 Src = VGPRSrc;
4607 }
4608
4609 return Src;
4610}
4611
4612///
4613/// This will select either an SGPR or VGPR operand and will save us from
4614/// having to write an extra tablegen pattern.
4615InstructionSelector::ComplexRendererFns
4616AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4617 return {{
4618 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4619 }};
4620}
4621
4622InstructionSelector::ComplexRendererFns
4623AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4624 Register Src;
4625 unsigned Mods;
4626 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4627
4628 return {{
4629 [=](MachineInstrBuilder &MIB) {
4630 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4631 },
4632 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4633 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4634 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4635 }};
4636}
4637
4638InstructionSelector::ComplexRendererFns
4639AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4640 Register Src;
4641 unsigned Mods;
4642 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4643 /*IsCanonicalizing=*/true,
4644 /*AllowAbs=*/false);
4645
4646 return {{
4647 [=](MachineInstrBuilder &MIB) {
4648 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4649 },
4650 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4651 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4652 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4653 }};
4654}
4655
4656InstructionSelector::ComplexRendererFns
4657AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4658 return {{
4659 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4660 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4661 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4662 }};
4663}
4664
4665InstructionSelector::ComplexRendererFns
4666AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4667 Register Src;
4668 unsigned Mods;
4669 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4670
4671 return {{
4672 [=](MachineInstrBuilder &MIB) {
4673 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4674 },
4675 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4676 }};
4677}
4678
4679InstructionSelector::ComplexRendererFns
4680AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4681 MachineOperand &Root) const {
4682 Register Src;
4683 unsigned Mods;
4684 std::tie(Src, Mods) =
4685 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4686
4687 return {{
4688 [=](MachineInstrBuilder &MIB) {
4689 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4690 },
4691 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4692 }};
4693}
4694
4695InstructionSelector::ComplexRendererFns
4696AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4697 Register Src;
4698 unsigned Mods;
4699 std::tie(Src, Mods) =
4700 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4701 /*AllowAbs=*/false);
4702
4703 return {{
4704 [=](MachineInstrBuilder &MIB) {
4705 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4706 },
4707 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4708 }};
4709}
4710
4711InstructionSelector::ComplexRendererFns
4712AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4713 Register Reg = Root.getReg();
4714 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4715 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4716 return {};
4717 return {{
4718 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4719 }};
4720}
4721
4722enum class SrcStatus {
4727 // This means current op = [op_upper, op_lower] and src = -op_lower.
4730 // This means current op = [op_upper, op_lower] and src = [op_upper,
4731 // -op_lower].
4739};
4740/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4741static bool isTruncHalf(const MachineInstr *MI,
4742 const MachineRegisterInfo &MRI) {
4743 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4744 return false;
4745
4746 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4747 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4748 return DstSize * 2 == SrcSize;
4749}
4750
4751/// Test if the MI is a logical shift right by half the bit width,
4752/// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`
4753static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4754 if (MI->getOpcode() != AMDGPU::G_LSHR)
4755 return false;
4756
4757 Register ShiftSrc;
4758 std::optional<ValueAndVReg> ShiftAmt;
4759 if (mi_match(MI->getOperand(0).getReg(), MRI,
4760 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4761 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4762 unsigned Shift = ShiftAmt->Value.getZExtValue();
4763 return Shift * 2 == SrcSize;
4764 }
4765 return false;
4766}
4767
4768/// Test if the MI is a shift left by half the bit width,
4769/// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`
4770static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4771 if (MI->getOpcode() != AMDGPU::G_SHL)
4772 return false;
4773
4774 Register ShiftSrc;
4775 std::optional<ValueAndVReg> ShiftAmt;
4776 if (mi_match(MI->getOperand(0).getReg(), MRI,
4777 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4778 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4779 unsigned Shift = ShiftAmt->Value.getZExtValue();
4780 return Shift * 2 == SrcSize;
4781 }
4782 return false;
4783}
4784
4785/// Test if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4786static bool isUnmergeHalf(const MachineInstr *MI,
4787 const MachineRegisterInfo &MRI) {
4788 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4789 return false;
4790 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4791 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4792}
4793
4795
4796static TypeClass isVectorOfTwoOrScalar(const Register Reg,
4797 const MachineRegisterInfo &MRI) {
4798 LLT OpTy = MRI.getType(Reg);
4799 if (OpTy.isScalar())
4800 return TypeClass::SCALAR;
4801 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4804}
4805
4806static SrcStatus getNegStatus(Register Reg, SrcStatus S,
4807 const MachineRegisterInfo &MRI) {
4808 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4809 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4810 return SrcStatus::INVALID;
4811
4812 switch (S) {
4813 case SrcStatus::IS_SAME:
4814 if (NegType == TypeClass::VECTOR_OF_TWO) {
4815 // Vector of 2:
4816 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4817 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4818 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4819 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4821 }
4822 if (NegType == TypeClass::SCALAR) {
4823 // Scalar:
4824 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4825 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4826 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4827 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4828 return SrcStatus::IS_HI_NEG;
4829 }
4830 break;
4832 if (NegType == TypeClass::VECTOR_OF_TWO) {
4833 // Vector of 2:
4834 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4835 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4836 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4837 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4838 return SrcStatus::IS_LO_NEG;
4839 }
4840 if (NegType == TypeClass::SCALAR) {
4841 // Scalar:
4842 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4843 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4844 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4845 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4846 return SrcStatus::IS_SAME;
4847 }
4848 break;
4850 if (NegType == TypeClass::VECTOR_OF_TWO) {
4851 // Vector of 2:
4852 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4853 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4854 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4855 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4856 return SrcStatus::IS_HI_NEG;
4857 }
4858 if (NegType == TypeClass::SCALAR) {
4859 // Scalar:
4860 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4861 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4862 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4863 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4865 }
4866 break;
4868 if (NegType == TypeClass::VECTOR_OF_TWO) {
4869 // Vector of 2:
4870 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4871 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4872 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4873 // [SrcHi, SrcLo] = [OpHi, OpLo]
4874 return SrcStatus::IS_SAME;
4875 }
4876 if (NegType == TypeClass::SCALAR) {
4877 // Scalar:
4878 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4879 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4880 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4881 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4882 return SrcStatus::IS_LO_NEG;
4883 }
4884 break;
4885   case SrcStatus::IS_UPPER_HALF:
4886     // Vector of 2:
4887 // Src = CurrUpper
4888 // Curr = [CurrUpper, CurrLower]
4889 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4890 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4891 // Src = -OpUpper
4892 //
4893 // Scalar:
4894 // Src = CurrUpper
4895 // Curr = [CurrUpper, CurrLower]
4896 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4897 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4898 // Src = -OpUpper
4899     return SrcStatus::IS_UPPER_HALF_NEG;
4900   case SrcStatus::IS_LOWER_HALF:
4901     if (NegType == TypeClass::VECTOR_OF_TWO) {
4902 // Vector of 2:
4903 // Src = CurrLower
4904 // Curr = [CurrUpper, CurrLower]
4905 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4906 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4907 // Src = -OpLower
4908       return SrcStatus::IS_LOWER_HALF_NEG;
4909     }
4910 if (NegType == TypeClass::SCALAR) {
4911 // Scalar:
4912 // Src = CurrLower
4913 // Curr = [CurrUpper, CurrLower]
4914 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4915 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4916 // Src = OpLower
4917       return SrcStatus::IS_LOWER_HALF;
4918     }
4919 break;
4920   case SrcStatus::IS_UPPER_HALF_NEG:
4921     // Vector of 2:
4922 // Src = -CurrUpper
4923 // Curr = [CurrUpper, CurrLower]
4924 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4925 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4926 // Src = -(-OpUpper) = OpUpper
4927 //
4928 // Scalar:
4929 // Src = -CurrUpper
4930 // Curr = [CurrUpper, CurrLower]
4931 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4932 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4933 // Src = -(-OpUpper) = OpUpper
4934     return SrcStatus::IS_UPPER_HALF;
4935   case SrcStatus::IS_LOWER_HALF_NEG:
4936     if (NegType == TypeClass::VECTOR_OF_TWO) {
4937 // Vector of 2:
4938 // Src = -CurrLower
4939 // Curr = [CurrUpper, CurrLower]
4940 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4941 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4942 // Src = -(-OpLower) = OpLower
4943       return SrcStatus::IS_LOWER_HALF;
4944     }
4945 if (NegType == TypeClass::SCALAR) {
4946 // Scalar:
4947 // Src = -CurrLower
4948 // Curr = [CurrUpper, CurrLower]
4949 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4950 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4951 // Src = -OpLower
4952       return SrcStatus::IS_LOWER_HALF_NEG;
4953     }
4954 break;
4955 default:
4956 break;
4957 }
4958 llvm_unreachable("unexpected SrcStatus & NegType combination");
4959}
4960
4961static std::optional<std::pair<Register, SrcStatus>>
4962calcNextStatus(std::pair<Register, SrcStatus> Curr,
4963 const MachineRegisterInfo &MRI) {
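  // Walk one def up the chain and fold its effect into the running status.
  // For example, starting from a <2 x s16> value defined as
  //   %r = G_FNEG %a
  //   %a = G_BITCAST %b
  // the walk first produces {%a, IS_BOTH_NEG} and then {%b, IS_BOTH_NEG}.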
4964 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4965
4966 unsigned Opc = MI->getOpcode();
4967
4968 // Handle general Opc cases.
4969 switch (Opc) {
4970 case AMDGPU::G_BITCAST:
4971 return std::optional<std::pair<Register, SrcStatus>>(
4972 {MI->getOperand(1).getReg(), Curr.second});
4973 case AMDGPU::COPY:
4974 if (MI->getOperand(1).getReg().isPhysical())
4975 return std::nullopt;
4976 return std::optional<std::pair<Register, SrcStatus>>(
4977 {MI->getOperand(1).getReg(), Curr.second});
4978 case AMDGPU::G_FNEG: {
4979 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4980 if (Stat == SrcStatus::INVALID)
4981 return std::nullopt;
4982 return std::optional<std::pair<Register, SrcStatus>>(
4983 {MI->getOperand(1).getReg(), Stat});
4984 }
4985 default:
4986 break;
4987 }
4988
4989 // Calc next Stat from current Stat.
4990 switch (Curr.second) {
4991 case SrcStatus::IS_SAME:
4992 if (isTruncHalf(MI, MRI))
4993 return std::optional<std::pair<Register, SrcStatus>>(
4994 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4995 else if (isUnmergeHalf(MI, MRI)) {
4996 if (Curr.first == MI->getOperand(0).getReg())
4997 return std::optional<std::pair<Register, SrcStatus>>(
4998 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4999 return std::optional<std::pair<Register, SrcStatus>>(
5000 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
5001 }
5002 break;
5003   case SrcStatus::IS_HI_NEG:
5004     if (isTruncHalf(MI, MRI)) {
5005 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5006 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
5007 // = [OpLowerHi, OpLowerLo]
5008 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5009 // = [-OpLowerHi, OpLowerLo]
5010 // = -OpLower
5011 return std::optional<std::pair<Register, SrcStatus>>(
5012 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5013 }
5014 if (isUnmergeHalf(MI, MRI)) {
5015 if (Curr.first == MI->getOperand(0).getReg())
5016 return std::optional<std::pair<Register, SrcStatus>>(
5017 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5018 return std::optional<std::pair<Register, SrcStatus>>(
5019 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5020 }
5021 break;
5022   case SrcStatus::IS_UPPER_HALF:
5023     if (isShlHalf(MI, MRI))
5024 return std::optional<std::pair<Register, SrcStatus>>(
5025 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
5026 break;
5027   case SrcStatus::IS_LOWER_HALF:
5028     if (isLshrHalf(MI, MRI))
5029 return std::optional<std::pair<Register, SrcStatus>>(
5030 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
5031 break;
5032   case SrcStatus::IS_UPPER_HALF_NEG:
5033     if (isShlHalf(MI, MRI))
5034 return std::optional<std::pair<Register, SrcStatus>>(
5035 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5036 break;
5037   case SrcStatus::IS_LOWER_HALF_NEG:
5038     if (isLshrHalf(MI, MRI))
5039 return std::optional<std::pair<Register, SrcStatus>>(
5040 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5041 break;
5042 default:
5043 break;
5044 }
5045 return std::nullopt;
5046}
5047
5048/// This is used to control valid status that current MI supports. For example,
5049/// non floating point intrinsic such as @llvm.amdgcn.sdot2 does not support NEG
5050/// bit on VOP3P.
5051/// The class can be further extended to recognize support on SEL, NEG, ABS bit
5052/// for different MI on different arch
5053 class SearchOptions {
5054 private:
5055 bool HasNeg = false;
5056   // Assume all complex patterns of VOP3P have opsel.
5057 bool HasOpsel = true;
5058
5059public:
5060   SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
5061     const MachineInstr *MI = MRI.getVRegDef(Reg);
5062 unsigned Opc = MI->getOpcode();
5063
5064 if (Opc == TargetOpcode::G_INTRINSIC) {
5065 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
5066       // Only floating-point intrinsics have neg & neg_hi bits.
5067 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
5068 HasNeg = true;
5069     } else {
5070       // Keep the default for generic ops.
5071 HasNeg = true;
5072 }
5073 }
5074 bool checkOptions(SrcStatus Stat) const {
5075 if (!HasNeg &&
5076 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
5077 return false;
5078 }
5079 if (!HasOpsel &&
5080 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
5081 return false;
5082 }
5083 return true;
5084 }
5085};
5086
5087 static SmallVector<std::pair<Register, SrcStatus>>
5088 getSrcStats(Register Reg, const MachineRegisterInfo &MRI, const SearchOptions &SO,
5089             int MaxDepth = 3) {
5090 int Depth = 0;
5091 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
5092   SmallVector<std::pair<Register, SrcStatus>> Statlist;
5093
5094 while (Depth <= MaxDepth && Curr.has_value()) {
5095 Depth++;
5096 if (SO.checkOptions(Curr.value().second))
5097 Statlist.push_back(Curr.value());
5098 Curr = calcNextStatus(Curr.value(), MRI);
5099 }
5100
5101 return Statlist;
5102}
5103
5104static std::pair<Register, SrcStatus>
5105 getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, const SearchOptions &SO,
5106                  int MaxDepth = 3) {
5107 int Depth = 0;
5108 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
5109 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
5110
5111 while (Depth <= MaxDepth && Curr.has_value()) {
5112 Depth++;
5113 SrcStatus Stat = Curr.value().second;
5114 if (SO.checkOptions(Stat)) {
5115 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
5116         Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
5117       LastSameOrNeg = Curr.value();
5118 }
5119 Curr = calcNextStatus(Curr.value(), MRI);
5120 }
5121
5122 return LastSameOrNeg;
5123}
5124
5125static bool isSameBitWidth(Register Reg1, Register Reg2,
5126 const MachineRegisterInfo &MRI) {
5127 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
5128 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
5129 return Width1 == Width2;
5130}
5131
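// Translate the matched half/negation statuses of the two packed operands into
// VOP3P source modifier bits. For example, HiStat == IS_UPPER_HALF sets
// OP_SEL_1, LoStat == IS_UPPER_HALF sets OP_SEL_0, and the *_NEG variants
// additionally toggle NEG_HI / NEG for the corresponding half.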
5132static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5133   // SrcStatus::IS_LOWER_HALF needs no modifier bits and leaves Mods unchanged.
5134 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5135 Mods ^= SISrcMods::NEG_HI;
5136 Mods |= SISrcMods::OP_SEL_1;
5137 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5138 Mods |= SISrcMods::OP_SEL_1;
5139 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5140 Mods ^= SISrcMods::NEG_HI;
5141 else if (HiStat == SrcStatus::IS_HI_NEG)
5142 Mods ^= SISrcMods::NEG_HI;
5143
5144 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5145 Mods ^= SISrcMods::NEG;
5146 Mods |= SISrcMods::OP_SEL_0;
5147 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5148 Mods |= SISrcMods::OP_SEL_0;
5149 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5150 Mods |= SISrcMods::NEG;
5151 else if (LoStat == SrcStatus::IS_HI_NEG)
5152 Mods ^= SISrcMods::NEG;
5153
5154 return Mods;
5155}
5156
5157static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5158 Register RootReg, const SIInstrInfo &TII,
5159 const MachineRegisterInfo &MRI) {
5160 auto IsHalfState = [](SrcStatus S) {
5161     return S == SrcStatus::IS_UPPER_HALF || S == SrcStatus::IS_UPPER_HALF_NEG ||
5162            S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
5163   };
5164 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
5165 IsHalfState(HiStat);
5166}
5167
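// Fold source modifiers (op_sel / neg) for a VOP3P operand by walking the
// definitions of RootReg. Illustrative (hypothetical) MIR this is intended to
// catch:
//   %lo:_(s16), %hi:_(s16) = G_UNMERGE_VALUES %x:_(<2 x s16>)
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %hi, %lo
// Selecting %v as the operand can instead use %x with OP_SEL_0 set and
// OP_SEL_1 clear, i.e. the half swap is expressed through op_sel bits.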
5168std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5169 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5170 unsigned Mods = 0;
5171   // No modifiers if the Root type is not of the form <2 x Type>.
5172 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5173 Mods |= SISrcMods::OP_SEL_1;
5174 return {RootReg, Mods};
5175 }
5176
5177 SearchOptions SO(RootReg, MRI);
5178
5179 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
5180
5181 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5182     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
5183   else if (Stat.second == SrcStatus::IS_HI_NEG)
5184 Mods ^= SISrcMods::NEG_HI;
5185 else if (Stat.second == SrcStatus::IS_LO_NEG)
5186 Mods ^= SISrcMods::NEG;
5187
5188 MachineInstr *MI = MRI.getVRegDef(Stat.first);
5189
5190 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5191 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5192 Mods |= SISrcMods::OP_SEL_1;
5193 return {Stat.first, Mods};
5194 }
5195
5196   SmallVector<std::pair<Register, SrcStatus>> StatlistHi =
5197       getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
5198
5199 if (StatlistHi.empty()) {
5200 Mods |= SISrcMods::OP_SEL_1;
5201 return {Stat.first, Mods};
5202 }
5203
5204   SmallVector<std::pair<Register, SrcStatus>> StatlistLo =
5205       getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
5206
5207 if (StatlistLo.empty()) {
5208 Mods |= SISrcMods::OP_SEL_1;
5209 return {Stat.first, Mods};
5210 }
5211
5212 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5213 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5214 if (StatlistHi[I].first == StatlistLo[J].first &&
5215 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
5216 StatlistHi[I].first, RootReg, TII, MRI))
5217 return {StatlistHi[I].first,
5218 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
5219 }
5220 }
5221 // Packed instructions do not have abs modifiers.
5222 Mods |= SISrcMods::OP_SEL_1;
5223
5224 return {Stat.first, Mods};
5225}
5226
5227// Removed unused function `getAllKindImm` to eliminate dead code.
5228
5229static bool checkRB(Register Reg, unsigned int RBNo,
5230 const AMDGPURegisterBankInfo &RBI,
5231 const MachineRegisterInfo &MRI,
5232 const TargetRegisterInfo &TRI) {
5233 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5234 return RB->getID() == RBNo;
5235}
5236
5237// This function is used to get the correct register bank for returned reg.
5238// Assume:
5239// 1. VOP3P is always legal for VGPR.
5240// 2. RootOp's regbank is legal.
5241// Thus
5242// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5243// 2. If RootOp is VGPR, then NewOp must be VGPR.
5244 static Register getLegalRegBank(Register NewReg, Register RootReg,
5245                                 const AMDGPURegisterBankInfo &RBI,
5246                                 MachineRegisterInfo &MRI,
5247                                 const TargetRegisterInfo &TRI,
5248 const SIInstrInfo &TII) {
5249   // RootOp can only be VGPR or SGPR (some hand-written cases such as
5250   // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
5251 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5252 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5253 return NewReg;
5254
5255 MachineInstr *MI = MRI.getVRegDef(RootReg);
5256 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
5257 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5258 return RootReg;
5259 }
5260
5261 MachineBasicBlock *BB = MI->getParent();
5262 Register DstReg = MRI.cloneVirtualRegister(RootReg);
5263
5264   MachineInstrBuilder MIB =
5265       BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
5266 .addReg(NewReg);
5267
5268 // Only accept VGPR.
5269 return MIB->getOperand(0).getReg();
5270}
5271
5273AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5274 bool IsDOT) const {
5275 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5276 Register Reg;
5277 unsigned Mods;
5278 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
5279
5280 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
5281 return {{
5282 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5283 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5284 }};
5285}
5286
5288AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5289
5290 return selectVOP3PRetHelper(Root);
5291}
5292
5294AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5295
5296 return selectVOP3PRetHelper(Root, true);
5297}
5298
5300AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const {
5301 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5302 Register Src;
5303 unsigned Mods;
5304 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true /*IsDOT*/);
5305 if (Mods != SISrcMods::OP_SEL_1)
5306 return {};
5307
5308 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5309}
5310
5312AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const {
5313 Register Src;
5314 unsigned Mods;
5315 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5316
5317 return {{
5318 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5319 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5320 }};
5321}
5322
5324AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const {
5325 Register Src;
5326 unsigned Mods;
5327 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5328 if (Mods != SISrcMods::OP_SEL_1)
5329 return {};
5330
5331 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5332}
5333
5335AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5336 MachineOperand &Root) const {
5337 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5338 "expected i1 value");
5339 unsigned Mods = SISrcMods::OP_SEL_1;
5340 if (Root.getImm() != 0)
5341 Mods |= SISrcMods::OP_SEL_0;
5342
5343 return {{
5344 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5345 }};
5346}
5347
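// Build a REG_SEQUENCE of 2, 4 or 8 32-bit elements in front of \p InsertPt
// and return the resulting register.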
5348 static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
5349                                  MachineInstr *InsertPt,
5350 MachineRegisterInfo &MRI) {
5351 const TargetRegisterClass *DstRegClass;
5352 switch (Elts.size()) {
5353 case 8:
5354 DstRegClass = &AMDGPU::VReg_256RegClass;
5355 break;
5356 case 4:
5357 DstRegClass = &AMDGPU::VReg_128RegClass;
5358 break;
5359 case 2:
5360 DstRegClass = &AMDGPU::VReg_64RegClass;
5361 break;
5362 default:
5363 llvm_unreachable("unhandled Reg sequence size");
5364 }
5365
5366 MachineIRBuilder B(*InsertPt);
5367 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5368 .addDef(MRI.createVirtualRegister(DstRegClass));
5369 for (unsigned i = 0; i < Elts.size(); ++i) {
5370 MIB.addReg(Elts[i]);
5371     MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
5372   }
5373 return MIB->getOperand(0).getReg();
5374}
5375
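// Fold a uniform fneg / fabs applied to every element of a WMMA operand into
// source modifiers. For example, if every element is (G_FNEG (G_FABS x_i)),
// the elements are replaced by the x_i and both NEG and NEG_HI are set
// (neg + abs); a plain G_FABS on every element sets only NEG_HI (abs).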
5376static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5377                                  SmallVectorImpl<Register> &Elts, Register &Src,
5378                                  MachineInstr *InsertPt,
5379 MachineRegisterInfo &MRI) {
5380 if (ModOpcode == TargetOpcode::G_FNEG) {
5381 Mods |= SISrcMods::NEG;
5382 // Check if all elements also have abs modifier
5383 SmallVector<Register, 8> NegAbsElts;
5384 for (auto El : Elts) {
5385 Register FabsSrc;
5386 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5387 break;
5388 NegAbsElts.push_back(FabsSrc);
5389 }
5390 if (Elts.size() != NegAbsElts.size()) {
5391 // Neg
5392 Src = buildRegSequence(Elts, InsertPt, MRI);
5393 } else {
5394 // Neg and Abs
5395 Mods |= SISrcMods::NEG_HI;
5396 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5397 }
5398 } else {
5399 assert(ModOpcode == TargetOpcode::G_FABS);
5400 // Abs
5401 Mods |= SISrcMods::NEG_HI;
5402 Src = buildRegSequence(Elts, InsertPt, MRI);
5403 }
5404}
5405
5407AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5408 Register Src = Root.getReg();
5409 unsigned Mods = SISrcMods::OP_SEL_1;
5410   SmallVector<Register, 8> EltsF32;
5411
5412 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5413 assert(BV->getNumSources() > 0);
5414 // Based on first element decide which mod we match, neg or abs
5415 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5416 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5417 ? AMDGPU::G_FNEG
5418 : AMDGPU::G_FABS;
5419 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5420 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5421 if (ElF32->getOpcode() != ModOpcode)
5422 break;
5423 EltsF32.push_back(ElF32->getOperand(1).getReg());
5424 }
5425
5426 // All elements had ModOpcode modifier
5427 if (BV->getNumSources() == EltsF32.size()) {
5428 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5429 *MRI);
5430 }
5431 }
5432
5433 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5434 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5435}
5436
5438AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5439 Register Src = Root.getReg();
5440 unsigned Mods = SISrcMods::OP_SEL_1;
5441 SmallVector<Register, 8> EltsV2F16;
5442
5443 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5444 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5445 Register FNegSrc;
5446 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5447 break;
5448 EltsV2F16.push_back(FNegSrc);
5449 }
5450
5451 // All elements had ModOpcode modifier
5452 if (CV->getNumSources() == EltsV2F16.size()) {
5453 Mods |= SISrcMods::NEG;
5454 Mods |= SISrcMods::NEG_HI;
5455 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5456 }
5457 }
5458
5459 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5460 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5461}
5462
5464AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5465 Register Src = Root.getReg();
5466 unsigned Mods = SISrcMods::OP_SEL_1;
5467 SmallVector<Register, 8> EltsV2F16;
5468
5469 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5470 assert(CV->getNumSources() > 0);
5471 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5472 // Based on first element decide which mod we match, neg or abs
5473 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5474 ? AMDGPU::G_FNEG
5475 : AMDGPU::G_FABS;
5476
5477 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5478 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5479 if (ElV2F16->getOpcode() != ModOpcode)
5480 break;
5481 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5482 }
5483
5484 // All elements had ModOpcode modifier
5485 if (CV->getNumSources() == EltsV2F16.size()) {
5486 MachineIRBuilder B(*Root.getParent());
5487 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5488 *MRI);
5489 }
5490 }
5491
5492 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5493 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5494}
5495
5497AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5498 std::optional<FPValueAndVReg> FPValReg;
5499 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5500 if (TII.isInlineConstant(FPValReg->Value)) {
5501 return {{[=](MachineInstrBuilder &MIB) {
5502 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5503 }}};
5504 }
5505 // Non-inlineable splat floats should not fall-through for integer immediate
5506 // checks.
5507 return {};
5508 }
5509
5510 APInt ICst;
5511 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5512 if (TII.isInlineConstant(ICst)) {
5513 return {
5514 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5515 }
5516 }
5517
5518 return {};
5519}
5520
5522AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5523 Register Src =
5524 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5525 unsigned Key = 0;
5526
5527 Register ShiftSrc;
5528 std::optional<ValueAndVReg> ShiftAmt;
5529 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5530 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5531 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5532 Key = ShiftAmt->Value.getZExtValue() / 8;
5533 Src = ShiftSrc;
5534 }
5535
5536 return {{
5537 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5538 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5539 }};
5540}
5541
5543AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5544
5545 Register Src =
5546 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5547 unsigned Key = 0;
5548
5549 Register ShiftSrc;
5550 std::optional<ValueAndVReg> ShiftAmt;
5551 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5552 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5553 ShiftAmt->Value.getZExtValue() == 16) {
5554 Src = ShiftSrc;
5555 Key = 1;
5556 }
5557
5558 return {{
5559 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5560 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5561 }};
5562}
5563
5565AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5566 Register Src =
5567 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5568 unsigned Key = 0;
5569
5570 Register S32 = matchZeroExtendFromS32(Src);
5571 if (!S32)
5572 S32 = matchAnyExtendFromS32(Src);
5573
5574 if (S32) {
5575 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5576 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5577 assert(Def->getNumOperands() == 3);
5578 Register DstReg1 = Def->getOperand(1).getReg();
5579 if (mi_match(S32, *MRI,
5580 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5581 Src = Def->getOperand(2).getReg();
5582 Key = 1;
5583 }
5584 }
5585 }
5586
5587 return {{
5588 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5589 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5590 }};
5591}
5592
5594AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5595 Register Src;
5596 unsigned Mods;
5597 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5598
5599 // FIXME: Handle op_sel
5600 return {{
5601 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5602 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5603 }};
5604}
5605
5606// FIXME-TRUE16 remove when fake16 is removed
5608AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5609 Register Src;
5610 unsigned Mods;
5611 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5612 /*IsCanonicalizing=*/true,
5613 /*AllowAbs=*/false,
5614 /*OpSel=*/false);
5615
5616 return {{
5617 [=](MachineInstrBuilder &MIB) {
5618 MIB.addReg(
5619 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5620 },
5621 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5622 }};
5623}
5624
5626AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5627 Register Src;
5628 unsigned Mods;
5629 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5630 /*IsCanonicalizing=*/true,
5631 /*AllowAbs=*/false,
5632 /*OpSel=*/true);
5633
5634 return {{
5635 [=](MachineInstrBuilder &MIB) {
5636 MIB.addReg(
5637 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5638 },
5639 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5640 }};
5641}
5642
5643// Given \p Offset and load specified by the \p Root operand check if \p Offset
5644// is a multiple of the load byte size. If it is update \p Offset to a
5645// pre-scaled value and return true.
5646bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5647                                                   Register &Offset,
5648                                                   bool IsSigned) const {
5649 if (!Subtarget->hasScaleOffset())
5650 return false;
5651
5652 const MachineInstr &MI = *Root.getParent();
5653 MachineMemOperand *MMO = *MI.memoperands_begin();
5654
5655 if (!MMO->getSize().hasValue())
5656 return false;
5657
5658 uint64_t Size = MMO->getSize().getValue();
5659
5660 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5661 if (!OffsetReg)
5662 OffsetReg = Offset;
5663
5664 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5665 OffsetReg = Def->Reg;
5666
5667 Register Op0;
5668 MachineInstr *Mul;
5669 bool ScaleOffset =
5670 (isPowerOf2_64(Size) &&
5671 mi_match(OffsetReg, *MRI,
5672 m_GShl(m_Reg(Op0),
5673                         m_any_of(m_SpecificICst(Log2_64(Size)),
5674                                  m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
5675       mi_match(OffsetReg, *MRI,
5676                m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
5677                                            m_Copy(m_SpecificICst(Size))))) ||
5678 mi_match(
5679 OffsetReg, *MRI,
5680 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5681 m_Reg(Op0), m_SpecificICst(Size))) ||
5682 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5683 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5684 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5685 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5686 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5687 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5688 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5689 mi_match(Mul->getOperand(3).getReg(), *MRI,
5690                 m_any_of(m_SpecificICst(Size),
5691                          m_Copy(m_SpecificICst(Size))))) &&
5692 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5693
5694 if (ScaleOffset)
5695 Offset = Op0;
5696
5697 return ScaleOffset;
5698}
5699
5700bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5701 Register &Base,
5702 Register *SOffset,
5703 int64_t *Offset,
5704 bool *ScaleOffset) const {
5705 MachineInstr *MI = Root.getParent();
5706 MachineBasicBlock *MBB = MI->getParent();
5707
5708 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5709 // then we can select all ptr + 32-bit offsets.
5710 SmallVector<GEPInfo, 4> AddrInfo;
5711 getAddrModeInfo(*MI, *MRI, AddrInfo);
5712
5713 if (AddrInfo.empty())
5714 return false;
5715
5716 const GEPInfo &GEPI = AddrInfo[0];
5717 std::optional<int64_t> EncodedImm;
5718
5719 if (ScaleOffset)
5720 *ScaleOffset = false;
5721
5722 if (SOffset && Offset) {
5723 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5724 /*HasSOffset=*/true);
5725 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5726 AddrInfo.size() > 1) {
5727 const GEPInfo &GEPI2 = AddrInfo[1];
5728 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5729 Register OffsetReg = GEPI2.SgprParts[1];
5730 if (ScaleOffset)
5731 *ScaleOffset =
5732 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5733 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5734 if (OffsetReg) {
5735 Base = GEPI2.SgprParts[0];
5736 *SOffset = OffsetReg;
5737 *Offset = *EncodedImm;
5738 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5739 return true;
5740
5741 // For unbuffered smem loads, it is illegal for the Immediate Offset
5742 // to be negative if the resulting (Offset + (M0 or SOffset or zero)
5743 // is negative. Handle the case where the Immediate Offset + SOffset
5744 // is negative.
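          // For example, an encoded immediate offset of -16 is rejected when
          // all that is known about SOffset is that it is >= 0, since the
          // summed hardware offset could then be negative.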
5745 auto SKnown = VT->getKnownBits(*SOffset);
5746 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5747 return false;
5748
5749 return true;
5750 }
5751 }
5752 }
5753 return false;
5754 }
5755
5756 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5757 /*HasSOffset=*/false);
5758 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5759 Base = GEPI.SgprParts[0];
5760 *Offset = *EncodedImm;
5761 return true;
5762 }
5763
5764 // SGPR offset is unsigned.
5765 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5766 GEPI.Imm != 0) {
5767     // If we make it this far we have a load with a 32-bit immediate offset.
5768 // It is OK to select this using a sgpr offset, because we have already
5769 // failed trying to select this load into one of the _IMM variants since
5770 // the _IMM Patterns are considered before the _SGPR patterns.
5771 Base = GEPI.SgprParts[0];
5772 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5773 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5774 .addImm(GEPI.Imm);
5775 return true;
5776 }
5777
5778   if (SOffset && GEPI.SgprParts.size() == 2 && GEPI.Imm == 0) {
5779 Register OffsetReg = GEPI.SgprParts[1];
5780 if (ScaleOffset)
5781 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5782 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5783 if (OffsetReg) {
5784 Base = GEPI.SgprParts[0];
5785 *SOffset = OffsetReg;
5786 return true;
5787 }
5788 }
5789
5790 return false;
5791}
5792
5794AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5795 Register Base;
5796 int64_t Offset;
5797 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5798 /* ScaleOffset */ nullptr))
5799 return std::nullopt;
5800
5801 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5802 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5803}
5804
5806AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5807 SmallVector<GEPInfo, 4> AddrInfo;
5808 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5809
5810 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5811 return std::nullopt;
5812
5813 const GEPInfo &GEPInfo = AddrInfo[0];
5814 Register PtrReg = GEPInfo.SgprParts[0];
5815 std::optional<int64_t> EncodedImm =
5816 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5817 if (!EncodedImm)
5818 return std::nullopt;
5819
5820 return {{
5821 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5822 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5823 }};
5824}
5825
5827AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5828 Register Base, SOffset;
5829 bool ScaleOffset;
5830 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5831 &ScaleOffset))
5832 return std::nullopt;
5833
5834 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5835 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5836 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5837 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5838}
5839
5841AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5842 Register Base, SOffset;
5843 int64_t Offset;
5844 bool ScaleOffset;
5845 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5846 return std::nullopt;
5847
5848 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5849 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5850 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5851 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5852 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5853}
5854
5855std::pair<Register, int>
5856AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5857 uint64_t FlatVariant) const {
5858 MachineInstr *MI = Root.getParent();
5859
5860 auto Default = std::pair(Root.getReg(), 0);
5861
5862 if (!STI.hasFlatInstOffsets())
5863 return Default;
5864
5865 Register PtrBase;
5866 int64_t ConstOffset;
5867 bool IsInBounds;
5868 std::tie(PtrBase, ConstOffset, IsInBounds) =
5869 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5870
5871 // Adding the offset to the base address with an immediate in a FLAT
5872 // instruction must not change the memory aperture in which the address falls.
5873 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5874 // instructions.
5875 if (ConstOffset == 0 ||
5876 (FlatVariant == SIInstrFlags::FlatScratch &&
5877 !isFlatScratchBaseLegal(Root.getReg())) ||
5878 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5879 return Default;
5880
5881 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5882 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5883 return Default;
5884
5885 return std::pair(PtrBase, ConstOffset);
5886}
5887
5889AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5890 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5891
5892 return {{
5893 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5894 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5895 }};
5896}
5897
5899AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5900 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5901
5902 return {{
5903 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5904 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5905 }};
5906}
5907
5909AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5910 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5911
5912 return {{
5913 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5914 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5915 }};
5916}
5917
5918// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5920AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5921 unsigned CPolBits,
5922 bool NeedIOffset) const {
5923 Register Addr = Root.getReg();
5924 Register PtrBase;
5925 int64_t ConstOffset;
5926 int64_t ImmOffset = 0;
5927
5928 // Match the immediate offset first, which canonically is moved as low as
5929 // possible.
5930 std::tie(PtrBase, ConstOffset, std::ignore) =
5931 getPtrBaseWithConstantOffset(Addr, *MRI);
5932
5933 if (ConstOffset != 0) {
5934 if (NeedIOffset &&
5935 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5936                             SIInstrFlags::FlatGlobal)) {
5937     Addr = PtrBase;
5938 ImmOffset = ConstOffset;
5939 } else {
5940 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5941 if (isSGPR(PtrBaseDef->Reg)) {
5942 if (ConstOffset > 0) {
5943 // Offset is too large.
5944 //
5945 // saddr + large_offset -> saddr +
5946 // (voffset = large_offset & ~MaxOffset) +
5947 // (large_offset & MaxOffset);
5948 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5949 if (NeedIOffset) {
5950 std::tie(SplitImmOffset, RemainderOffset) =
5951 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5952                                       SIInstrFlags::FlatGlobal);
5953           }
5954
5955 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5956 : isUInt<32>(RemainderOffset)) {
5957 MachineInstr *MI = Root.getParent();
5958 MachineBasicBlock *MBB = MI->getParent();
5959 Register HighBits =
5960 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5961
5962 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5963 HighBits)
5964 .addImm(RemainderOffset);
5965
5966 if (NeedIOffset)
5967 return {{
5968 [=](MachineInstrBuilder &MIB) {
5969 MIB.addReg(PtrBase);
5970 }, // saddr
5971 [=](MachineInstrBuilder &MIB) {
5972 MIB.addReg(HighBits);
5973 }, // voffset
5974 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5975 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5976 }};
5977 return {{
5978 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5979 [=](MachineInstrBuilder &MIB) {
5980 MIB.addReg(HighBits);
5981 }, // voffset
5982 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5983 }};
5984 }
5985 }
5986
5987 // We are adding a 64 bit SGPR and a constant. If constant bus limit
5988 // is 1 we would need to perform 1 or 2 extra moves for each half of
5989 // the constant and it is better to do a scalar add and then issue a
5990 // single VALU instruction to materialize zero. Otherwise it is less
5991 // instructions to perform VALU adds with immediates or inline literals.
5992 unsigned NumLiterals =
5993 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5994 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5995 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5996 return std::nullopt;
5997 }
5998 }
5999 }
6000
6001 // Match the variable offset.
6002 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6003 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6004 // Look through the SGPR->VGPR copy.
6005 Register SAddr =
6006 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
6007
6008 if (isSGPR(SAddr)) {
6009 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
6010
6011 // It's possible voffset is an SGPR here, but the copy to VGPR will be
6012 // inserted later.
6013 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
6014 Subtarget->hasSignedGVSOffset());
6015 if (Register VOffset = matchExtendFromS32OrS32(
6016 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
6017 if (NeedIOffset)
6018 return {{[=](MachineInstrBuilder &MIB) { // saddr
6019 MIB.addReg(SAddr);
6020 },
6021 [=](MachineInstrBuilder &MIB) { // voffset
6022 MIB.addReg(VOffset);
6023 },
6024 [=](MachineInstrBuilder &MIB) { // offset
6025 MIB.addImm(ImmOffset);
6026 },
6027 [=](MachineInstrBuilder &MIB) { // cpol
6028 MIB.addImm(CPolBits |
6029 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6030 }}};
6031 return {{[=](MachineInstrBuilder &MIB) { // saddr
6032 MIB.addReg(SAddr);
6033 },
6034 [=](MachineInstrBuilder &MIB) { // voffset
6035 MIB.addReg(VOffset);
6036 },
6037 [=](MachineInstrBuilder &MIB) { // cpol
6038 MIB.addImm(CPolBits |
6039 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6040 }}};
6041 }
6042 }
6043 }
6044
6045 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
6046 // drop this.
6047 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
6048 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
6049 return std::nullopt;
6050
6051 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
6052 // moves required to copy a 64-bit SGPR to VGPR.
6053 MachineInstr *MI = Root.getParent();
6054 MachineBasicBlock *MBB = MI->getParent();
6055 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6056
6057 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
6058 .addImm(0);
6059
6060 if (NeedIOffset)
6061 return {{
6062 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6063 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6064 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6065 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6066 }};
6067 return {{
6068 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6069 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6070 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6071 }};
6072}
6073
6075AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
6076 return selectGlobalSAddr(Root, 0);
6077}
6078
6080AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
6081 const MachineInstr &I = *Root.getParent();
6082
6083 // We are assuming CPol is always the last operand of the intrinsic.
6084 auto PassedCPol =
6085 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6086 return selectGlobalSAddr(Root, PassedCPol);
6087}
6088
6090AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
6091 const MachineInstr &I = *Root.getParent();
6092
6093 // We are assuming CPol is second from last operand of the intrinsic.
6094 auto PassedCPol =
6095 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6096 return selectGlobalSAddr(Root, PassedCPol);
6097}
6098
6100AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
6101 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
6102}
6103
6105AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
6106 MachineOperand &Root) const {
6107 const MachineInstr &I = *Root.getParent();
6108
6109 // We are assuming CPol is always the last operand of the intrinsic.
6110 auto PassedCPol =
6111 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6112 return selectGlobalSAddr(Root, PassedCPol, false);
6113}
6114
6116AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
6117 MachineOperand &Root) const {
6118 const MachineInstr &I = *Root.getParent();
6119
6120 // We are assuming CPol is second from last operand of the intrinsic.
6121 auto PassedCPol =
6122 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6123 return selectGlobalSAddr(Root, PassedCPol, false);
6124}
6125
6127AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
6128 Register Addr = Root.getReg();
6129 Register PtrBase;
6130 int64_t ConstOffset;
6131 int64_t ImmOffset = 0;
6132
6133 // Match the immediate offset first, which canonically is moved as low as
6134 // possible.
6135 std::tie(PtrBase, ConstOffset, std::ignore) =
6136 getPtrBaseWithConstantOffset(Addr, *MRI);
6137
6138 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
6139 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6140                             SIInstrFlags::FlatScratch)) {
6141     Addr = PtrBase;
6142 ImmOffset = ConstOffset;
6143 }
6144
6145 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6146 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6147 int FI = AddrDef->MI->getOperand(1).getIndex();
6148 return {{
6149 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6150 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6151 }};
6152 }
6153
6154 Register SAddr = AddrDef->Reg;
6155
6156 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6157 Register LHS = AddrDef->MI->getOperand(1).getReg();
6158 Register RHS = AddrDef->MI->getOperand(2).getReg();
6159 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6160 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
6161
6162 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
6163 isSGPR(RHSDef->Reg)) {
6164 int FI = LHSDef->MI->getOperand(1).getIndex();
6165 MachineInstr &I = *Root.getParent();
6166 MachineBasicBlock *BB = I.getParent();
6167 const DebugLoc &DL = I.getDebugLoc();
6168 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6169
6170 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
6171 .addFrameIndex(FI)
6172 .addReg(RHSDef->Reg)
6173 .setOperandDead(3); // Dead scc
6174 }
6175 }
6176
6177 if (!isSGPR(SAddr))
6178 return std::nullopt;
6179
6180 return {{
6181 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
6182 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6183 }};
6184}
6185
6186// Check whether the flat scratch SVS swizzle bug affects this access.
6187bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6188 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6189 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6190 return false;
6191
6192 // The bug affects the swizzling of SVS accesses if there is any carry out
6193 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6194 // voffset to (soffset + inst_offset).
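  // For example, if the low two bits of VAddr may be 3 and the low two bits of
  // (SAddr + ImmOffset) may be 1, their sum can carry into bit 2, so the access
  // is conservatively reported as affected.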
6195 auto VKnown = VT->getKnownBits(VAddr);
6196 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
6197 KnownBits::makeConstant(APInt(32, ImmOffset)));
6198 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6199 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
6200 return (VMax & 3) + (SMax & 3) >= 4;
6201}
6202
6204AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
6205 Register Addr = Root.getReg();
6206 Register PtrBase;
6207 int64_t ConstOffset;
6208 int64_t ImmOffset = 0;
6209
6210 // Match the immediate offset first, which canonically is moved as low as
6211 // possible.
6212 std::tie(PtrBase, ConstOffset, std::ignore) =
6213 getPtrBaseWithConstantOffset(Addr, *MRI);
6214
6215 Register OrigAddr = Addr;
6216 if (ConstOffset != 0 &&
6217 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6218                             SIInstrFlags::FlatScratch)) {
6219     Addr = PtrBase;
6220 ImmOffset = ConstOffset;
6221 }
6222
6223 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6224 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6225 return std::nullopt;
6226
6227 Register RHS = AddrDef->MI->getOperand(2).getReg();
6228 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6229 return std::nullopt;
6230
6231 Register LHS = AddrDef->MI->getOperand(1).getReg();
6232 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6233
6234 if (OrigAddr != Addr) {
6235 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
6236 return std::nullopt;
6237 } else {
6238 if (!isFlatScratchBaseLegalSV(OrigAddr))
6239 return std::nullopt;
6240 }
6241
6242 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
6243 return std::nullopt;
6244
6245 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
6246                       ? AMDGPU::CPol::SCAL
6247                       : 0;
6248
6249 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6250 int FI = LHSDef->MI->getOperand(1).getIndex();
6251 return {{
6252 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6253 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6254 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6255 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6256 }};
6257 }
6258
6259 if (!isSGPR(LHS))
6260 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
6261 LHS = Def->Reg;
6262
6263 if (!isSGPR(LHS))
6264 return std::nullopt;
6265
6266 return {{
6267 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6268 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
6269 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6270 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6271 }};
6272}
6273
6275AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
6276 MachineInstr *MI = Root.getParent();
6277 MachineBasicBlock *MBB = MI->getParent();
6278 MachineFunction *MF = MBB->getParent();
6279 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6280
6281 int64_t Offset = 0;
6282 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
6283       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
6284     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6285
6286 // TODO: Should this be inside the render function? The iterator seems to
6287 // move.
6288 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
6289 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6290 HighBits)
6291 .addImm(Offset & ~MaxOffset);
6292
6293 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6294 MIB.addReg(Info->getScratchRSrcReg());
6295 },
6296 [=](MachineInstrBuilder &MIB) { // vaddr
6297 MIB.addReg(HighBits);
6298 },
6299 [=](MachineInstrBuilder &MIB) { // soffset
6300 // Use constant zero for soffset and rely on eliminateFrameIndex
6301 // to choose the appropriate frame register if need be.
6302 MIB.addImm(0);
6303 },
6304 [=](MachineInstrBuilder &MIB) { // offset
6305 MIB.addImm(Offset & MaxOffset);
6306 }}};
6307 }
6308
6309 assert(Offset == 0 || Offset == -1);
6310
6311 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6312 // offsets.
6313 std::optional<int> FI;
6314 Register VAddr = Root.getReg();
6315
6316 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6317 Register PtrBase;
6318 int64_t ConstOffset;
6319 std::tie(PtrBase, ConstOffset, std::ignore) =
6320 getPtrBaseWithConstantOffset(VAddr, *MRI);
6321 if (ConstOffset != 0) {
6322 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6323 (!STI.privateMemoryResourceIsRangeChecked() ||
6324 VT->signBitIsZero(PtrBase))) {
6325 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6326 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6327 FI = PtrBaseDef->getOperand(1).getIndex();
6328 else
6329 VAddr = PtrBase;
6330 Offset = ConstOffset;
6331 }
6332 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6333 FI = RootDef->getOperand(1).getIndex();
6334 }
6335
6336 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6337 MIB.addReg(Info->getScratchRSrcReg());
6338 },
6339 [=](MachineInstrBuilder &MIB) { // vaddr
6340 if (FI)
6341 MIB.addFrameIndex(*FI);
6342 else
6343 MIB.addReg(VAddr);
6344 },
6345 [=](MachineInstrBuilder &MIB) { // soffset
6346 // Use constant zero for soffset and rely on eliminateFrameIndex
6347 // to choose the appropriate frame register if need be.
6348 MIB.addImm(0);
6349 },
6350 [=](MachineInstrBuilder &MIB) { // offset
6351 MIB.addImm(Offset);
6352 }}};
6353}
6354
6355bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6356 int64_t Offset) const {
6357 if (!isUInt<16>(Offset))
6358 return false;
6359
6360 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6361 return true;
6362
6363   // On Southern Islands, instructions with a negative base value and an
6364   // offset don't seem to work.
6365 return VT->signBitIsZero(Base);
6366}
6367
6368bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6369 int64_t Offset1,
6370 unsigned Size) const {
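  // Both offsets must be multiples of the access size and encodable as 8-bit
  // element counts. For example, with Size == 4 (a ds_read2_b32-style access),
  // byte offsets 0 and 1020 encode as offset0 = 0 and offset1 = 255.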
6371 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6372 return false;
6373 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6374 return false;
6375
6376 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6377 return true;
6378
6379   // On Southern Islands, instructions with a negative base value and an
6380   // offset don't seem to work.
6381 return VT->signBitIsZero(Base);
6382}
6383
6384// Return whether the operation has NoUnsignedWrap property.
6385static bool isNoUnsignedWrap(MachineInstr *Addr) {
6386 return Addr->getOpcode() == TargetOpcode::G_OR ||
6387 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6388           Addr->getFlag(MachineInstr::NoUWrap));
6389 }
6390
6391// Check that the base address of flat scratch load/store in the form of `base +
6392// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6393// requirement). We always treat the first operand as the base address here.
6394bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6395 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6396
6397 if (isNoUnsignedWrap(AddrMI))
6398 return true;
6399
6400 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6401 // values.
6402 if (STI.hasSignedScratchOffsets())
6403 return true;
6404
6405 Register LHS = AddrMI->getOperand(1).getReg();
6406 Register RHS = AddrMI->getOperand(2).getReg();
6407
6408 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6409 std::optional<ValueAndVReg> RhsValReg =
6410         getIConstantVRegValWithLookThrough(RHS, *MRI);
6411     // If the immediate offset is negative and within certain range, the base
6412 // address cannot also be negative. If the base is also negative, the sum
6413 // would be either negative or much larger than the valid range of scratch
6414 // memory a thread can access.
6415 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6416 RhsValReg->Value.getSExtValue() > -0x40000000)
6417 return true;
6418 }
6419
6420 return VT->signBitIsZero(LHS);
6421}
6422
6423// Check address value in SGPR/VGPR are legal for flat scratch in the form
6424// of: SGPR + VGPR.
6425bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6426 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6427
6428 if (isNoUnsignedWrap(AddrMI))
6429 return true;
6430
6431 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6432 // values.
6433 if (STI.hasSignedScratchOffsets())
6434 return true;
6435
6436 Register LHS = AddrMI->getOperand(1).getReg();
6437 Register RHS = AddrMI->getOperand(2).getReg();
6438 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6439}
6440
6441// Check address value in SGPR/VGPR are legal for flat scratch in the form
6442// of: SGPR + VGPR + Imm.
6443bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6444 Register Addr) const {
6445 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6446 // values.
6447 if (STI.hasSignedScratchOffsets())
6448 return true;
6449
6450 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6451 Register Base = AddrMI->getOperand(1).getReg();
6452 std::optional<DefinitionAndSourceRegister> BaseDef =
6454 std::optional<ValueAndVReg> RHSOffset =
6456 assert(RHSOffset);
6457
6458 // If the immediate offset is negative and within certain range, the base
6459 // address cannot also be negative. If the base is also negative, the sum
6460 // would be either negative or much larger than the valid range of scratch
6461 // memory a thread can access.
6462 if (isNoUnsignedWrap(BaseDef->MI) &&
6463 (isNoUnsignedWrap(AddrMI) ||
6464 (RHSOffset->Value.getSExtValue() < 0 &&
6465 RHSOffset->Value.getSExtValue() > -0x40000000)))
6466 return true;
6467
6468 Register LHS = BaseDef->MI->getOperand(1).getReg();
6469 Register RHS = BaseDef->MI->getOperand(2).getReg();
6470 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6471}
6472
6473bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6474 unsigned ShAmtBits) const {
6475 assert(MI.getOpcode() == TargetOpcode::G_AND);
6476
6477 std::optional<APInt> RHS =
6478 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6479 if (!RHS)
6480 return false;
6481
6482 if (RHS->countr_one() >= ShAmtBits)
6483 return true;
6484
6485 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6486 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6487}
6488
6490AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6491 MachineOperand &Root) const {
6492 Register Reg = Root.getReg();
6493 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6494
6495 std::optional<DefinitionAndSourceRegister> Def =
6496       getDefSrcRegIgnoringCopies(Reg, *MRI);
6497   assert(Def && "this shouldn't be an optional result");
6498 Reg = Def->Reg;
6499
6500 if (Register WaveBase = getWaveAddress(Def->MI)) {
6501 return {{
6502 [=](MachineInstrBuilder &MIB) { // rsrc
6503 MIB.addReg(Info->getScratchRSrcReg());
6504 },
6505 [=](MachineInstrBuilder &MIB) { // soffset
6506 MIB.addReg(WaveBase);
6507 },
6508 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6509 }};
6510 }
6511
6512 int64_t Offset = 0;
6513
6514 // FIXME: Copy check is a hack
6515   Register BasePtr;
6516   if (mi_match(Reg, *MRI,
6517 m_GPtrAdd(m_Reg(BasePtr),
6518                          m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
6519     if (!TII.isLegalMUBUFImmOffset(Offset))
6520 return {};
6521 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6522 Register WaveBase = getWaveAddress(BasePtrDef);
6523 if (!WaveBase)
6524 return {};
6525
6526 return {{
6527 [=](MachineInstrBuilder &MIB) { // rsrc
6528 MIB.addReg(Info->getScratchRSrcReg());
6529 },
6530 [=](MachineInstrBuilder &MIB) { // soffset
6531 MIB.addReg(WaveBase);
6532 },
6533 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6534 }};
6535 }
6536
6537 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6538 !TII.isLegalMUBUFImmOffset(Offset))
6539 return {};
6540
6541 return {{
6542 [=](MachineInstrBuilder &MIB) { // rsrc
6543 MIB.addReg(Info->getScratchRSrcReg());
6544 },
6545 [=](MachineInstrBuilder &MIB) { // soffset
6546 MIB.addImm(0);
6547 },
6548 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6549 }};
6550}
6551
6552std::pair<Register, unsigned>
6553AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6554 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6555 int64_t ConstAddr = 0;
6556
6557 Register PtrBase;
6558 int64_t Offset;
6559 std::tie(PtrBase, Offset, std::ignore) =
6560 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6561
6562 if (Offset) {
6563 if (isDSOffsetLegal(PtrBase, Offset)) {
6564 // (add n0, c0)
6565 return std::pair(PtrBase, Offset);
6566 }
6567 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6568 // TODO
6569
6570
6571 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6572 // TODO
6573
6574 }
6575
6576 return std::pair(Root.getReg(), 0);
6577}
6578
6580AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6581 Register Reg;
6582 unsigned Offset;
6583 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6584 return {{
6585 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6586 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6587 }};
6588}
6589
6591AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6592 return selectDSReadWrite2(Root, 4);
6593}
6594
6596AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6597 return selectDSReadWrite2(Root, 8);
6598}
6599
6601AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6602 unsigned Size) const {
6603 Register Reg;
6604 unsigned Offset;
6605 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6606 return {{
6607 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6608 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6609 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6610 }};
6611}
6612
6613std::pair<Register, unsigned>
6614AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6615 unsigned Size) const {
6616 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6617 int64_t ConstAddr = 0;
6618
6619 Register PtrBase;
6620 int64_t Offset;
6621 std::tie(PtrBase, Offset, std::ignore) =
6622 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6623
6624 if (Offset) {
6625 int64_t OffsetValue0 = Offset;
6626 int64_t OffsetValue1 = Offset + Size;
6627 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6628 // (add n0, c0)
6629 return std::pair(PtrBase, OffsetValue0 / Size);
6630 }
6631 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6632 // TODO
6633
6634 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6635 // TODO
6636
6637 }
6638
6639 return std::pair(Root.getReg(), 0);
6640}
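// Worked example (hypothetical, assuming Size == 4 as used by the 64-bit
// 4-byte-aligned DS2 path): a pointer matched as (G_PTR_ADD %base, 40) yields
// {PtrBase = %base, 40 / 4 = 10}; the renderers in selectDSReadWrite2 then add
// offset0 = 10 and offset1 = 11, i.e. byte offsets 40 and 44.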
6641
6642/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6643/// the base value, the constant offset, and whether the offset computation is
6644/// known to be inbounds. There may be intervening copies between \p Root and
6645/// the identified constant. Returns {\p Root, 0, false} if this does not match
6646/// the pattern.
6647std::tuple<Register, int64_t, bool>
6648AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6649 Register Root, const MachineRegisterInfo &MRI) const {
6650 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6651 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6652 return {Root, 0, false};
6653
6654 MachineOperand &RHS = RootI->getOperand(2);
6655 std::optional<ValueAndVReg> MaybeOffset =
6656 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
6657 if (!MaybeOffset)
6658 return {Root, 0, false};
6659 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6660 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6661 IsInBounds};
6662}
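// Illustrative MIR shape matched above (hypothetical virtual registers):
//   %off:_(s32) = G_CONSTANT i32 16
//   %ptr:_(p3)  = G_PTR_ADD %base, %off   ; possibly behind trivial COPYs
// getPtrBaseWithConstantOffset(%ptr) would return {%base, 16, InBounds flag}.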
6663
6664static void addZeroImm(MachineInstrBuilder &MIB) {
6665 MIB.addImm(0);
6666}
6667
6668/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6669/// BasePtr is not valid, a null base pointer will be used.
6670static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6671 uint32_t FormatLo, uint32_t FormatHi,
6672 Register BasePtr) {
6673 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6674 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6675 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6676 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6677
6678 B.buildInstr(AMDGPU::S_MOV_B32)
6679 .addDef(RSrc2)
6680 .addImm(FormatLo);
6681 B.buildInstr(AMDGPU::S_MOV_B32)
6682 .addDef(RSrc3)
6683 .addImm(FormatHi);
6684
6685 // Build the half of the subregister with the constants before building the
6686 // full 128-bit register. If we are building multiple resource descriptors,
6687 // this will allow CSEing of the 2-component register.
6688 B.buildInstr(AMDGPU::REG_SEQUENCE)
6689 .addDef(RSrcHi)
6690 .addReg(RSrc2)
6691 .addImm(AMDGPU::sub0)
6692 .addReg(RSrc3)
6693 .addImm(AMDGPU::sub1);
6694
6695 Register RSrcLo = BasePtr;
6696 if (!BasePtr) {
6697 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6698 B.buildInstr(AMDGPU::S_MOV_B64)
6699 .addDef(RSrcLo)
6700 .addImm(0);
6701 }
6702
6703 B.buildInstr(AMDGPU::REG_SEQUENCE)
6704 .addDef(RSrc)
6705 .addReg(RSrcLo)
6706 .addImm(AMDGPU::sub0_sub1)
6707 .addReg(RSrcHi)
6708 .addImm(AMDGPU::sub2_sub3);
6709
6710 return RSrc;
6711}
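// Resulting 128-bit SGPR descriptor layout, as assembled above:
//   sub0_sub1 = BasePtr (or an S_MOV_B64 0 when no base pointer is given)
//   sub2      = FormatLo
//   sub3      = FormatHi
// Building the {FormatLo, FormatHi} pair first is what allows it to be CSE'd
// across multiple descriptors, per the comment above.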
6712
6713static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6714 const SIInstrInfo &TII, Register BasePtr) {
6715 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6716
6717 // FIXME: Why are half the "default" bits ignored based on the addressing
6718 // mode?
6719 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6720}
6721
6722static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6723 const SIInstrInfo &TII, Register BasePtr) {
6724 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6725
6726 // FIXME: Why are half the "default" bits ignored based on the addressing
6727 // mode?
6728 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6729}
6730
6731AMDGPUInstructionSelector::MUBUFAddressData
6732AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6733 MUBUFAddressData Data;
6734 Data.N0 = Src;
6735
6736 Register PtrBase;
6737 int64_t Offset;
6738
6739 std::tie(PtrBase, Offset, std::ignore) =
6740 getPtrBaseWithConstantOffset(Src, *MRI);
6741 if (isUInt<32>(Offset)) {
6742 Data.N0 = PtrBase;
6743 Data.Offset = Offset;
6744 }
6745
6746 if (MachineInstr *InputAdd
6747 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6748 Data.N2 = InputAdd->getOperand(1).getReg();
6749 Data.N3 = InputAdd->getOperand(2).getReg();
6750
6751 // FIXME: Need to fix extra SGPR->VGPR copies inserted
6752 // FIXME: Don't know that this was defined by operand 0
6753 //
6754 // TODO: Remove this when we have copy folding optimizations after
6755 // RegBankSelect.
6756 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6757 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6758 }
6759
6760 return Data;
6761}
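// Illustrative decomposition (hypothetical registers): for
//   %sum = G_PTR_ADD %a, %b
//   %src = G_PTR_ADD %sum, 24
// parseMUBUFAddress(%src) would produce N0 = %sum, N2 = %a, N3 = %b and
// Offset = 24, assuming the offset fits in 32 bits as checked above.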
6762
6763/// Return true if the addr64 mubuf mode should be used for the given address.
6764bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6765 // (ptr_add N2, N3) -> addr64, or
6766 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6767 if (Addr.N2)
6768 return true;
6769
6770 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6771 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6772}
6773
6774/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6775/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6776/// component.
6777void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6778 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6779 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6780 return;
6781
6782 // Illegal offset, store it in soffset.
6783 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6784 B.buildInstr(AMDGPU::S_MOV_B32)
6785 .addDef(SOffset)
6786 .addImm(ImmOffset);
6787 ImmOffset = 0;
6788}
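// Sketch of the effect (hypothetical value): an ImmOffset of 0x10000 that
// fails isLegalMUBUFImmOffset is rewritten as
//   SOffset   = S_MOV_B32 0x10000
//   ImmOffset = 0
// so the instruction's immediate field stays within range.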
6789
6790bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6791 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6792 Register &SOffset, int64_t &Offset) const {
6793 // FIXME: Predicates should stop this from reaching here.
6794 // addr64 bit was removed for volcanic islands.
6795 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6796 return false;
6797
6798 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6799 if (!shouldUseAddr64(AddrData))
6800 return false;
6801
6802 Register N0 = AddrData.N0;
6803 Register N2 = AddrData.N2;
6804 Register N3 = AddrData.N3;
6805 Offset = AddrData.Offset;
6806
6807 // Base pointer for the SRD.
6808 Register SRDPtr;
6809
6810 if (N2) {
6811 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6812 assert(N3);
6813 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6814 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6815 // addr64, and construct the default resource from a 0 address.
6816 VAddr = N0;
6817 } else {
6818 SRDPtr = N3;
6819 VAddr = N2;
6820 }
6821 } else {
6822 // N2 is not divergent.
6823 SRDPtr = N2;
6824 VAddr = N3;
6825 }
6826 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6827 // Use the default null pointer in the resource
6828 VAddr = N0;
6829 } else {
6830 // N0 -> offset, or
6831 // (N0 + C1) -> offset
6832 SRDPtr = N0;
6833 }
6834
6835 MachineIRBuilder B(*Root.getParent());
6836 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6837 splitIllegalMUBUFOffset(B, SOffset, Offset);
6838 return true;
6839}
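// Summary of the operand assignment above (restating the code, no new facts):
//   N2 VGPR, N3 VGPR -> VAddr = N0 (the add result), SRD built from a 0 base
//   N2 VGPR, N3 SGPR -> VAddr = N2, SRD base = N3
//   N2 SGPR          -> VAddr = N3, SRD base = N2
//   no N2, N0 VGPR   -> VAddr = N0, SRD built from a 0 base
//   otherwise        -> N0 becomes the SRD base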
6840
6841bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6842 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6843 int64_t &Offset) const {
6844
6845 // FIXME: Pattern should not reach here.
6846 if (STI.useFlatForGlobal())
6847 return false;
6848
6849 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6850 if (shouldUseAddr64(AddrData))
6851 return false;
6852
6853 // N0 -> offset, or
6854 // (N0 + C1) -> offset
6855 Register SRDPtr = AddrData.N0;
6856 Offset = AddrData.Offset;
6857
6858 // TODO: Look through extensions for 32-bit soffset.
6859 MachineIRBuilder B(*Root.getParent());
6860
6861 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6862 splitIllegalMUBUFOffset(B, SOffset, Offset);
6863 return true;
6864}
6865
6866InstructionSelector::ComplexRendererFns
6867AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6868 Register VAddr;
6869 Register RSrcReg;
6870 Register SOffset;
6871 int64_t Offset = 0;
6872
6873 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6874 return {};
6875
6876 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6877 // pattern.
6878 return {{
6879 [=](MachineInstrBuilder &MIB) { // rsrc
6880 MIB.addReg(RSrcReg);
6881 },
6882 [=](MachineInstrBuilder &MIB) { // vaddr
6883 MIB.addReg(VAddr);
6884 },
6885 [=](MachineInstrBuilder &MIB) { // soffset
6886 if (SOffset)
6887 MIB.addReg(SOffset);
6888 else if (STI.hasRestrictedSOffset())
6889 MIB.addReg(AMDGPU::SGPR_NULL);
6890 else
6891 MIB.addImm(0);
6892 },
6893 [=](MachineInstrBuilder &MIB) { // offset
6894 MIB.addImm(Offset);
6895 },
6896 addZeroImm, // cpol
6897 addZeroImm, // tfe
6898 addZeroImm // swz
6899 }};
6900}
6901
6902InstructionSelector::ComplexRendererFns
6903AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6904 Register RSrcReg;
6905 Register SOffset;
6906 int64_t Offset = 0;
6907
6908 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6909 return {};
6910
6911 return {{
6912 [=](MachineInstrBuilder &MIB) { // rsrc
6913 MIB.addReg(RSrcReg);
6914 },
6915 [=](MachineInstrBuilder &MIB) { // soffset
6916 if (SOffset)
6917 MIB.addReg(SOffset);
6918 else if (STI.hasRestrictedSOffset())
6919 MIB.addReg(AMDGPU::SGPR_NULL);
6920 else
6921 MIB.addImm(0);
6922 },
6923 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6924 addZeroImm, // cpol
6925 addZeroImm, // tfe
6926 addZeroImm, // swz
6927 }};
6928}
6929
6930InstructionSelector::ComplexRendererFns
6931AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6932
6933 Register SOffset = Root.getReg();
6934
6935 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6936 SOffset = AMDGPU::SGPR_NULL;
6937
6938 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
6939}
6940
6941/// Get an immediate that must be 32-bits, and treated as zero extended.
6942static std::optional<uint64_t>
6943getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
6944 // getIConstantVRegVal sexts any values, so see if that matters.
6945 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
6946 if (!OffsetVal || !isInt<32>(*OffsetVal))
6947 return std::nullopt;
6948 return Lo_32(*OffsetVal);
6949}
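// Example of why the sign-extension check matters (hypothetical value): for an
// s32 G_CONSTANT of 0xFFFFFFFF, getIConstantVRegSExtVal returns -1; isInt<32>(-1)
// holds and Lo_32(-1) gives back 0xFFFFFFFF, the intended zero-extended 32-bit
// immediate. A sign-extended value outside the 32-bit range is rejected with
// std::nullopt.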
6950
6951InstructionSelector::ComplexRendererFns
6952AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6953 std::optional<uint64_t> OffsetVal =
6954 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
6955 if (!OffsetVal)
6956 return {};
6957
6958 std::optional<int64_t> EncodedImm =
6959 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
6960 if (!EncodedImm)
6961 return {};
6962
6963 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6964}
6965
6966InstructionSelector::ComplexRendererFns
6967AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6968 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6969
6970 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
6971 if (!OffsetVal)
6972 return {};
6973
6974 std::optional<int64_t> EncodedImm =
6975 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
6976 if (!EncodedImm)
6977 return {};
6978
6979 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6980}
6981
6982InstructionSelector::ComplexRendererFns
6983AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6984 // Match the (soffset + offset) pair as a 32-bit register base and
6985 // an immediate offset.
6986 Register SOffset;
6987 unsigned Offset;
6988 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
6989 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
6990 if (!SOffset)
6991 return std::nullopt;
6992
6993 std::optional<int64_t> EncodedOffset =
6994 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
6995 if (!EncodedOffset)
6996 return std::nullopt;
6997
6998 assert(MRI->getType(SOffset) == LLT::scalar(32));
6999 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
7000 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
7001}
7002
7003std::pair<Register, unsigned>
7004AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
7005 bool &Matched) const {
7006 Matched = false;
7007
7008 Register Src;
7009 unsigned Mods;
7010 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
7011
7012 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
7013 assert(MRI->getType(Src) == LLT::scalar(16));
7014
7015 // Only change Src if a source modifier could be gained. In such cases the
7016 // new Src could be an SGPR, but this does not violate the constant bus
7017 // restriction for the instruction that is being selected.
7018 Src = stripBitCast(Src, *MRI);
7019
7020 const auto CheckAbsNeg = [&]() {
7021 // Be careful about folding modifiers if we already have an abs. fneg is
7022 // applied last, so we don't want to apply an earlier fneg.
7023 if ((Mods & SISrcMods::ABS) == 0) {
7024 unsigned ModsTmp;
7025 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
7026
7027 if ((ModsTmp & SISrcMods::NEG) != 0)
7028 Mods ^= SISrcMods::NEG;
7029
7030 if ((ModsTmp & SISrcMods::ABS) != 0)
7031 Mods |= SISrcMods::ABS;
7032 }
7033 };
7034
7035 CheckAbsNeg();
7036
7037 // op_sel/op_sel_hi decide the source type and source.
7038 // If the source's op_sel_hi is set, it indicates to do a conversion from
7039 // fp16. If the source's op_sel is set, it picks the high half of the
7040 // source register.
7041
7042 Mods |= SISrcMods::OP_SEL_1;
7043
7044 if (isExtractHiElt(*MRI, Src, Src)) {
7045 Mods |= SISrcMods::OP_SEL_0;
7046 CheckAbsNeg();
7047 }
7048
7049 Matched = true;
7050 }
7051
7052 return {Src, Mods};
7053}
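// Illustrative outcome (hypothetical registers): for a mad_mix operand of the
// form (G_FPEXT (extract of the high half of a v2f16 value)), the logic above
// ends with
//   Mods = OP_SEL_1 | OP_SEL_0   ; fp16 source, taken from the high half
// plus any NEG/ABS bits folded in by CheckAbsNeg, and Matched == true.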
7054
7055InstructionSelector::ComplexRendererFns
7056AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
7057 MachineOperand &Root) const {
7058 Register Src;
7059 unsigned Mods;
7060 bool Matched;
7061 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7062 if (!Matched)
7063 return {};
7064
7065 return {{
7066 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7067 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7068 }};
7069}
7070
7071InstructionSelector::ComplexRendererFns
7072AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
7073 Register Src;
7074 unsigned Mods;
7075 bool Matched;
7076 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7077
7078 return {{
7079 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7080 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7081 }};
7082}
7083
7084bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
7085 MachineInstr &I, Intrinsic::ID IntrID) const {
7086 MachineBasicBlock *MBB = I.getParent();
7087 const DebugLoc &DL = I.getDebugLoc();
7088 Register CCReg = I.getOperand(0).getReg();
7089
7090 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
7091 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
7092
7093 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
7094 .addImm(I.getOperand(2).getImm());
7095
7096 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
7097
7098 I.eraseFromParent();
7099 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
7100 *MRI);
7101}
7102
7103bool AMDGPUInstructionSelector::selectSGetBarrierState(
7104 MachineInstr &I, Intrinsic::ID IntrID) const {
7105 MachineBasicBlock *MBB = I.getParent();
7106 const DebugLoc &DL = I.getDebugLoc();
7107 const MachineOperand &BarOp = I.getOperand(2);
7108 std::optional<int64_t> BarValImm =
7109 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7110
7111 if (!BarValImm) {
7112 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7113 .addReg(BarOp.getReg());
7114 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7115 }
7116 MachineInstrBuilder MIB;
7117 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
7118 : AMDGPU::S_GET_BARRIER_STATE_M0;
7119 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7120
7121 auto DstReg = I.getOperand(0).getReg();
7122 const TargetRegisterClass *DstRC =
7123 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7124 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7125 return false;
7126 MIB.addDef(DstReg);
7127 if (BarValImm) {
7128 MIB.addImm(*BarValImm);
7129 }
7130 I.eraseFromParent();
7131 return true;
7132}
7133
7134unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
7135 if (HasInlineConst) {
7136 switch (IntrID) {
7137 default:
7138 llvm_unreachable("not a named barrier op");
7139 case Intrinsic::amdgcn_s_barrier_join:
7140 return AMDGPU::S_BARRIER_JOIN_IMM;
7141 case Intrinsic::amdgcn_s_wakeup_barrier:
7142 return AMDGPU::S_WAKEUP_BARRIER_IMM;
7143 case Intrinsic::amdgcn_s_get_named_barrier_state:
7144 return AMDGPU::S_GET_BARRIER_STATE_IMM;
7145 };
7146 } else {
7147 switch (IntrID) {
7148 default:
7149 llvm_unreachable("not a named barrier op");
7150 case Intrinsic::amdgcn_s_barrier_join:
7151 return AMDGPU::S_BARRIER_JOIN_M0;
7152 case Intrinsic::amdgcn_s_wakeup_barrier:
7153 return AMDGPU::S_WAKEUP_BARRIER_M0;
7154 case Intrinsic::amdgcn_s_get_named_barrier_state:
7155 return AMDGPU::S_GET_BARRIER_STATE_M0;
7156 };
7157 }
7158}
7159
7160bool AMDGPUInstructionSelector::selectNamedBarrierInit(
7161 MachineInstr &I, Intrinsic::ID IntrID) const {
7162 MachineBasicBlock *MBB = I.getParent();
7163 const DebugLoc &DL = I.getDebugLoc();
7164 const MachineOperand &BarOp = I.getOperand(1);
7165 const MachineOperand &CntOp = I.getOperand(2);
7166
7167 // A member count of 0 means "keep existing member count". That plus a known
7168 // constant value for the barrier ID lets us use the immarg form.
7169 if (IntrID == Intrinsic::amdgcn_s_barrier_signal_var) {
7170 std::optional<int64_t> CntImm =
7171 getIConstantVRegSExtVal(CntOp.getReg(), *MRI);
7172 if (CntImm && *CntImm == 0) {
7173 std::optional<int64_t> BarValImm =
7174 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7175 if (BarValImm) {
7176 auto BarID = ((*BarValImm) >> 4) & 0x3F;
7177 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
7178 .addImm(BarID);
7179 I.eraseFromParent();
7180 return true;
7181 }
7182 }
7183 }
7184
7185 // BarID = (BarOp >> 4) & 0x3F
7186 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7187 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7188 .add(BarOp)
7189 .addImm(4u)
7190 .setOperandDead(3); // Dead scc
7191
7192 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7193 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7194 .addReg(TmpReg0)
7195 .addImm(0x3F)
7196 .setOperandDead(3); // Dead scc
7197
7198 // MO = ((CntOp & 0x3F) << shAmt) | BarID
7199 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7200 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
7201 .add(CntOp)
7202 .addImm(0x3F)
7203 .setOperandDead(3); // Dead scc
7204
7205 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7206 constexpr unsigned ShAmt = 16;
7207 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
7208 .addReg(TmpReg2)
7209 .addImm(ShAmt)
7210 .setOperandDead(3); // Dead scc
7211
7212 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7213 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
7214 .addReg(TmpReg1)
7215 .addReg(TmpReg3)
7216 .setOperandDead(3); // Dead scc;
7217
7218 auto CopyMIB =
7219 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
7220 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7221
7222 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7223 ? AMDGPU::S_BARRIER_INIT_M0
7224 : AMDGPU::S_BARRIER_SIGNAL_M0;
7225 MachineInstrBuilder MIB;
7226 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7227
7228 I.eraseFromParent();
7229 return true;
7230}
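// Net effect of the sequence above, written as one expression (BarOp and CntOp
// treated as plain integers for illustration):
//   M0 = ((CntOp & 0x3F) << 16) | ((BarOp >> 4) & 0x3F)
// i.e. the member count lands in bits [21:16] and the 6-bit barrier ID in
// bits [5:0].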
7231
7232bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7233 MachineInstr &I, Intrinsic::ID IntrID) const {
7234 MachineBasicBlock *MBB = I.getParent();
7235 const DebugLoc &DL = I.getDebugLoc();
7236 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7237 ? I.getOperand(2)
7238 : I.getOperand(1);
7239 std::optional<int64_t> BarValImm =
7240 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7241
7242 if (!BarValImm) {
7243 // BarID = (BarOp >> 4) & 0x3F
7244 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7245 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7246 .addReg(BarOp.getReg())
7247 .addImm(4u)
7248 .setOperandDead(3); // Dead scc;
7249
7250 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7251 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7252 .addReg(TmpReg0)
7253 .addImm(0x3F)
7254 .setOperandDead(3); // Dead scc;
7255
7256 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7257 .addReg(TmpReg1);
7258 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7259 }
7260
7261 MachineInstrBuilder MIB;
7262 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
7263 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7264
7265 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7266 auto DstReg = I.getOperand(0).getReg();
7267 const TargetRegisterClass *DstRC =
7268 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7269 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7270 return false;
7271 MIB.addDef(DstReg);
7272 }
7273
7274 if (BarValImm) {
7275 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7276 MIB.addImm(BarId);
7277 }
7278
7279 I.eraseFromParent();
7280 return true;
7281}
7282
7283void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7284 const MachineInstr &MI,
7285 int OpIdx) const {
7286 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7287 "Expected G_CONSTANT");
7288 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
7289}
7290
7291void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7292 const MachineInstr &MI,
7293 int OpIdx) const {
7294 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7295 "Expected G_CONSTANT");
7296 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
7297}
7298
7299void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7300 const MachineInstr &MI,
7301 int OpIdx) const {
7302 const MachineOperand &Op = MI.getOperand(1);
7303 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7304 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7305}
7306
7307void AMDGPUInstructionSelector::renderCountTrailingOnesImm(
7308 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7309 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7310 "Expected G_CONSTANT");
7311 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countTrailingOnes());
7312}
7313
7314/// This only really exists to satisfy DAG type checking machinery, so is a
7315/// no-op here.
7316void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7317 const MachineInstr &MI,
7318 int OpIdx) const {
7319 const MachineOperand &Op = MI.getOperand(OpIdx);
7320 int64_t Imm;
7321 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
7322 MIB.addImm(Imm);
7323 else
7324 MIB.addImm(Op.getImm());
7325}
7326
7327void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7328 const MachineInstr &MI,
7329 int OpIdx) const {
7330 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7331}
7332
7333void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7334 const MachineInstr &MI,
7335 int OpIdx) const {
7336 assert(OpIdx >= 0 && "expected to match an immediate operand");
7337 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7338}
7339
7340void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7341 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7342 assert(OpIdx >= 0 && "expected to match an immediate operand");
7343 MIB.addImm(
7344 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7345}
7346
7347void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7348 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7349 assert(OpIdx >= 0 && "expected to match an immediate operand");
7350 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
7351 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
7352 : (int64_t)SISrcMods::DST_OP_SEL);
7353}
7354
7355void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7356 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7357 assert(OpIdx >= 0 && "expected to match an immediate operand");
7358 MIB.addImm(
7359 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7360}
7361
7362void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7363 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7364 assert(OpIdx >= 0 && "expected to match an immediate operand");
7365 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7366 ? (int64_t)(SISrcMods::OP_SEL_0)
7367 : 0);
7368}
7369
7370void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7371 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7372 assert(OpIdx >= 0 && "expected to match an immediate operand");
7373 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7374 : 0);
7375}
7376
7377void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7378 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7379 assert(OpIdx >= 0 && "expected to match an immediate operand");
7380 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7381 : 0);
7382}
7383
7384void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7385 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7386 assert(OpIdx >= 0 && "expected to match an immediate operand");
7387 MIB.addImm(
7388 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7389}
7390
7391void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7392 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7393 assert(OpIdx >= 0 && "expected to match an immediate operand");
7394 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7395 ? (int64_t)SISrcMods::DST_OP_SEL
7396 : 0);
7397}
7398
7399void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7400 const MachineInstr &MI,
7401 int OpIdx) const {
7402 assert(OpIdx >= 0 && "expected to match an immediate operand");
7403 MIB.addImm(MI.getOperand(OpIdx).getImm() &
7404 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7405 : AMDGPU::CPol::ALL_pregfx12));
7406}
7407
7408void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7409 const MachineInstr &MI,
7410 int OpIdx) const {
7411 assert(OpIdx >= 0 && "expected to match an immediate operand");
7412 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7413 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
7414 : AMDGPU::CPol::SWZ_pregfx12);
7415 MIB.addImm(Swizzle);
7416}
7417
7418void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7419 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7420 assert(OpIdx >= 0 && "expected to match an immediate operand");
7421 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7422 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7423 : AMDGPU::CPol::ALL_pregfx12);
7424 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7425}
7426
7427void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7428 const MachineInstr &MI,
7429 int OpIdx) const {
7430 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7431}
7432
7433void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7434 const MachineInstr &MI,
7435 int OpIdx) const {
7436 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7437 int ExpVal = APF.getExactLog2Abs();
7438 assert(ExpVal != INT_MIN);
7439 MIB.addImm(ExpVal);
7440}
7441
7442void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7443 const MachineInstr &MI,
7444 int OpIdx) const {
7445 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7446 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7447 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7448 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7449 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7450}
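// The (Imm + 3) % 4 remapping spelled out, matching the table above:
//   0 (TowardZero)        -> 3 (FP_ROUND_ROUND_TO_ZERO)
//   1 (NearestTiesToEven) -> 0 (FP_ROUND_ROUND_TO_NEAREST)
//   2 (TowardPositive)    -> 1 (FP_ROUND_ROUND_TO_INF)
//   3 (TowardNegative)    -> 2 (FP_ROUND_ROUND_TO_NEGINF)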
7451
7452void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7453 const MachineInstr &MI,
7454 int OpIdx) const {
7455 unsigned Mods = SISrcMods::OP_SEL_1;
7456 if (MI.getOperand(OpIdx).getImm())
7457 Mods ^= SISrcMods::NEG;
7458 MIB.addImm((int64_t)Mods);
7459}
7460
7461void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7462 const MachineInstr &MI,
7463 int OpIdx) const {
7464 unsigned Mods = SISrcMods::OP_SEL_1;
7465 if (MI.getOperand(OpIdx).getImm())
7466 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7467 MIB.addImm((int64_t)Mods);
7468}
7469
7470void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7471 const MachineInstr &MI,
7472 int OpIdx) const {
7473 unsigned Val = MI.getOperand(OpIdx).getImm();
7474 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7475 if (Val == 1) // neg
7476 Mods ^= SISrcMods::NEG;
7477 if (Val == 2) // abs
7478 Mods ^= SISrcMods::ABS;
7479 if (Val == 3) // neg and abs
7480 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7481 MIB.addImm((int64_t)Mods);
7482}
7483
7484void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7485 const MachineInstr &MI,
7486 int OpIdx) const {
7487 uint32_t V = MI.getOperand(2).getImm();
7490 if (!Subtarget->hasSafeCUPrefetch())
7491 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7492 MIB.addImm(V);
7493}
7494
7495/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7496void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7497 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7498 unsigned Val = MI.getOperand(OpIdx).getImm();
7499 unsigned New = 0;
7500 if (Val & 0x1)
7501 New |= SISrcMods::OP_SEL_0;
7502 if (Val & 0x2)
7503 New |= SISrcMods::OP_SEL_1;
7504 MIB.addImm(New);
7505}
7506
7507bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7508 return TII.isInlineConstant(Imm);
7509}
7510
7511bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7512 return TII.isInlineConstant(Imm);
7513}