Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Warning: line 3337, column 5
Value stored to 'MI' is never read
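
This is the analyzer's dead-store check (deadcode.DeadStores): a value is assigned to the local variable 'MI' at line 3337 but is never read before being overwritten or going out of scope. Line 3337 lies beyond the excerpt reproduced below, so the following is only a minimal, hypothetical sketch of the pattern the checker flags (names and values are illustrative, not taken from the file):

    #include <cstdio>

    int sum(int a, int b) {
      int result = a;   // dead store: this value of 'result' is never read
      result = a + b;   // it is overwritten here before any use
      return result;
    }

    int main() { std::printf("%d\n", sum(2, 3)); }

The usual remedies are to delete the unused assignment or to stop binding the result to a variable at all (for example, not capturing the return value of a builder call); which of these applies at line 3337 depends on code outside this excerpt.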

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUInstructionSelector.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/build-llvm/lib/Target/AMDGPU -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-06-21-164211-33944-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
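
The command above is the exact cc1 invocation scan-build recorded for this translation unit. As a rough sketch of how to reproduce this class of report (build tool and directory layout are assumptions, not taken from this page), either re-run the build under scan-build or analyze the single file through the clang driver with the dead-store checker enabled:

    # whole-build analysis; the build must be configured so scan-build can intercept the compiler
    scan-build -o /tmp/analyzer-reports make

    # single-file analysis; include paths and -std must match the recorded command above
    clang --analyze -Xclang -analyzer-checker=deadcode.DeadStores \
        -I build-llvm/lib/Target/AMDGPU -I llvm/lib/Target/AMDGPU \
        -I build-llvm/include -I llvm/include -std=c++14 \
        llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
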
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
24#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
25#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26#include "llvm/IR/DiagnosticInfo.h"
27
28#define DEBUG_TYPE "amdgpu-isel"
29
30using namespace llvm;
31using namespace MIPatternMatch;
32
33static cl::opt<bool> AllowRiskySelect(
34 "amdgpu-global-isel-risky-select",
35 cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
36 cl::init(false),
37 cl::ReallyHidden);
38
39#define GET_GLOBALISEL_IMPL
40#define AMDGPUSubtarget GCNSubtarget
41#include "AMDGPUGenGlobalISel.inc"
42#undef GET_GLOBALISEL_IMPL
43#undef AMDGPUSubtarget
44
45AMDGPUInstructionSelector::AMDGPUInstructionSelector(
46 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
47 const AMDGPUTargetMachine &TM)
48 : InstructionSelector(), TII(*STI.getInstrInfo()),
49 TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
50 STI(STI),
51 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
52#define GET_GLOBALISEL_PREDICATES_INIT
53#include "AMDGPUGenGlobalISel.inc"
54#undef GET_GLOBALISEL_PREDICATES_INIT
55#define GET_GLOBALISEL_TEMPORARIES_INIT
56#include "AMDGPUGenGlobalISel.inc"
57#undef GET_GLOBALISEL_TEMPORARIES_INIT
58{
59}
60
61const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
62
63void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
64 CodeGenCoverage &CoverageInfo,
65 ProfileSummaryInfo *PSI,
66 BlockFrequencyInfo *BFI) {
67 MRI = &MF.getRegInfo();
68 Subtarget = &MF.getSubtarget<GCNSubtarget>();
69 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
70}
71
72bool AMDGPUInstructionSelector::isVCC(Register Reg,
73 const MachineRegisterInfo &MRI) const {
74 // The verifier is oblivious to s1 being a valid value for wavesize registers.
75 if (Reg.isPhysical())
76 return false;
77
78 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
79 const TargetRegisterClass *RC =
80 RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
81 if (RC) {
82 const LLT Ty = MRI.getType(Reg);
83 return RC->hasSuperClassEq(TRI.getBoolRC()) &&
84 Ty.isValid() && Ty.getSizeInBits() == 1;
85 }
86
87 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
88 return RB->getID() == AMDGPU::VCCRegBankID;
89}
90
91bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
92 unsigned NewOpc) const {
93 MI.setDesc(TII.get(NewOpc));
94 MI.RemoveOperand(1); // Remove intrinsic ID.
95 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
96
97 MachineOperand &Dst = MI.getOperand(0);
98 MachineOperand &Src = MI.getOperand(1);
99
100 // TODO: This should be legalized to s32 if needed
101 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
102 return false;
103
104 const TargetRegisterClass *DstRC
105 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
106 const TargetRegisterClass *SrcRC
107 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
108 if (!DstRC || DstRC != SrcRC)
109 return false;
110
111 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
112 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
113}
114
115bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
116 const DebugLoc &DL = I.getDebugLoc();
117 MachineBasicBlock *BB = I.getParent();
118 I.setDesc(TII.get(TargetOpcode::COPY));
119
120 const MachineOperand &Src = I.getOperand(1);
121 MachineOperand &Dst = I.getOperand(0);
122 Register DstReg = Dst.getReg();
123 Register SrcReg = Src.getReg();
124
125 if (isVCC(DstReg, *MRI)) {
126 if (SrcReg == AMDGPU::SCC) {
127 const TargetRegisterClass *RC
128 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
129 if (!RC)
130 return true;
131 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
132 }
133
134 if (!isVCC(SrcReg, *MRI)) {
135 // TODO: Should probably leave the copy and let copyPhysReg expand it.
136 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
137 return false;
138
139 const TargetRegisterClass *SrcRC
140 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
141
142 Optional<ValueAndVReg> ConstVal =
143 getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
144 if (ConstVal) {
145 unsigned MovOpc =
146 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
147 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
148 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
149 } else {
150 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
151
152 // We can't trust the high bits at this point, so clear them.
153
154 // TODO: Skip masking high bits if def is known boolean.
155
156 unsigned AndOpc =
157 TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
158 BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
159 .addImm(1)
160 .addReg(SrcReg);
161 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
162 .addImm(0)
163 .addReg(MaskedReg);
164 }
165
166 if (!MRI->getRegClassOrNull(SrcReg))
167 MRI->setRegClass(SrcReg, SrcRC);
168 I.eraseFromParent();
169 return true;
170 }
171
172 const TargetRegisterClass *RC =
173 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
174 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
175 return false;
176
177 return true;
178 }
179
180 for (const MachineOperand &MO : I.operands()) {
181 if (MO.getReg().isPhysical())
182 continue;
183
184 const TargetRegisterClass *RC =
185 TRI.getConstrainedRegClassForOperand(MO, *MRI);
186 if (!RC)
187 continue;
188 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
189 }
190 return true;
191}
192
193bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
194 const Register DefReg = I.getOperand(0).getReg();
195 const LLT DefTy = MRI->getType(DefReg);
196 if (DefTy == LLT::scalar(1)) {
197 if (!AllowRiskySelect) {
198 LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("amdgpu-isel")) { dbgs() << "Skipping risky boolean phi\n"
; } } while (false)
;
199 return false;
200 }
201
202 LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("amdgpu-isel")) { dbgs() << "Selecting risky boolean phi\n"
; } } while (false)
;
203 }
204
205 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
206
207 const RegClassOrRegBank &RegClassOrBank =
208 MRI->getRegClassOrRegBank(DefReg);
209
210 const TargetRegisterClass *DefRC
211 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
212 if (!DefRC) {
213 if (!DefTy.isValid()) {
214 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("amdgpu-isel")) { dbgs() << "PHI operand has no type, not a gvreg?\n"
; } } while (false)
;
215 return false;
216 }
217
218 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
219 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
220 if (!DefRC) {
221 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("amdgpu-isel")) { dbgs() << "PHI operand has unexpected size/bank\n"
; } } while (false)
;
222 return false;
223 }
224 }
225
226 // TODO: Verify that all registers have the same bank
227 I.setDesc(TII.get(TargetOpcode::PHI));
228 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
229}
230
231MachineOperand
232AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
233 const TargetRegisterClass &SubRC,
234 unsigned SubIdx) const {
235
236 MachineInstr *MI = MO.getParent();
237 MachineBasicBlock *BB = MO.getParent()->getParent();
238 Register DstReg = MRI->createVirtualRegister(&SubRC);
239
240 if (MO.isReg()) {
241 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
242 Register Reg = MO.getReg();
243 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
244 .addReg(Reg, 0, ComposedSubIdx);
245
246 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
247 MO.isKill(), MO.isDead(), MO.isUndef(),
248 MO.isEarlyClobber(), 0, MO.isDebug(),
249 MO.isInternalRead());
250 }
251
252 assert(MO.isImm());
253
254 APInt Imm(64, MO.getImm());
255
256 switch (SubIdx) {
257 default:
258 llvm_unreachable("do not know to split immediate with this sub index.")::llvm::llvm_unreachable_internal("do not know to split immediate with this sub index."
, "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp"
, 258)
;
259 case AMDGPU::sub0:
260 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
261 case AMDGPU::sub1:
262 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
263 }
264}
265
266static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
267 switch (Opc) {
268 case AMDGPU::G_AND:
269 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
270 case AMDGPU::G_OR:
271 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
272 case AMDGPU::G_XOR:
273 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
274 default:
275 llvm_unreachable("not a bit op")::llvm::llvm_unreachable_internal("not a bit op", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp"
, 275)
;
276 }
277}
278
279bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
280 Register DstReg = I.getOperand(0).getReg();
281 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
282
283 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
284 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
285 DstRB->getID() != AMDGPU::VCCRegBankID)
286 return false;
287
288 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
289 STI.isWave64());
290 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
291
292 // Dead implicit-def of scc
293 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
294 true, // isImp
295 false, // isKill
296 true)); // isDead
297 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
298}
299
300bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
301 MachineBasicBlock *BB = I.getParent();
302 MachineFunction *MF = BB->getParent();
303 Register DstReg = I.getOperand(0).getReg();
304 const DebugLoc &DL = I.getDebugLoc();
305 LLT Ty = MRI->getType(DstReg);
306 if (Ty.isVector())
307 return false;
308
309 unsigned Size = Ty.getSizeInBits();
310 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
311 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
312 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
313
314 if (Size == 32) {
315 if (IsSALU) {
316 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
317 MachineInstr *Add =
318 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
319 .add(I.getOperand(1))
320 .add(I.getOperand(2));
321 I.eraseFromParent();
322 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
323 }
324
325 if (STI.hasAddNoCarry()) {
326 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
327 I.setDesc(TII.get(Opc));
328 I.addOperand(*MF, MachineOperand::CreateImm(0));
329 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
330 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
331 }
332
333 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
334
335 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
336 MachineInstr *Add
337 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
338 .addDef(UnusedCarry, RegState::Dead)
339 .add(I.getOperand(1))
340 .add(I.getOperand(2))
341 .addImm(0);
342 I.eraseFromParent();
343 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
344 }
345
346 assert(!Sub && "illegal sub should not reach here")(static_cast <bool> (!Sub && "illegal sub should not reach here"
) ? void (0) : __assert_fail ("!Sub && \"illegal sub should not reach here\""
, "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp"
, 346, __extension__ __PRETTY_FUNCTION__))
;
347
348 const TargetRegisterClass &RC
349 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
350 const TargetRegisterClass &HalfRC
351 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
352
353 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
354 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
355 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
356 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
357
358 Register DstLo = MRI->createVirtualRegister(&HalfRC);
359 Register DstHi = MRI->createVirtualRegister(&HalfRC);
360
361 if (IsSALU) {
362 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
363 .add(Lo1)
364 .add(Lo2);
365 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
366 .add(Hi1)
367 .add(Hi2);
368 } else {
369 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
370 Register CarryReg = MRI->createVirtualRegister(CarryRC);
371 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
372 .addDef(CarryReg)
373 .add(Lo1)
374 .add(Lo2)
375 .addImm(0);
376 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
377 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
378 .add(Hi1)
379 .add(Hi2)
380 .addReg(CarryReg, RegState::Kill)
381 .addImm(0);
382
383 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
384 return false;
385 }
386
387 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
388 .addReg(DstLo)
389 .addImm(AMDGPU::sub0)
390 .addReg(DstHi)
391 .addImm(AMDGPU::sub1);
392
393
394 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
395 return false;
396
397 I.eraseFromParent();
398 return true;
399}
400
401bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
402 MachineInstr &I) const {
403 MachineBasicBlock *BB = I.getParent();
404 MachineFunction *MF = BB->getParent();
405 const DebugLoc &DL = I.getDebugLoc();
406 Register Dst0Reg = I.getOperand(0).getReg();
407 Register Dst1Reg = I.getOperand(1).getReg();
408 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
409 I.getOpcode() == AMDGPU::G_UADDE;
410 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
411 I.getOpcode() == AMDGPU::G_USUBE;
412
413 if (isVCC(Dst1Reg, *MRI)) {
414 unsigned NoCarryOpc =
415 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
416 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
417 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
418 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
419 I.addOperand(*MF, MachineOperand::CreateImm(0));
420 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
421 }
422
423 Register Src0Reg = I.getOperand(2).getReg();
424 Register Src1Reg = I.getOperand(3).getReg();
425
426 if (HasCarryIn) {
427 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
428 .addReg(I.getOperand(4).getReg());
429 }
430
431 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
432 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
433
434 BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
435 .add(I.getOperand(2))
436 .add(I.getOperand(3));
437 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
438 .addReg(AMDGPU::SCC);
439
440 if (!MRI->getRegClassOrNull(Dst1Reg))
441 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
442
443 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
444 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
445 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
446 return false;
447
448 if (HasCarryIn &&
449 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
450 AMDGPU::SReg_32RegClass, *MRI))
451 return false;
452
453 I.eraseFromParent();
454 return true;
455}
456
457// TODO: We should probably legalize these to only using 32-bit results.
458bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
459 MachineBasicBlock *BB = I.getParent();
460 Register DstReg = I.getOperand(0).getReg();
461 Register SrcReg = I.getOperand(1).getReg();
462 LLT DstTy = MRI->getType(DstReg);
463 LLT SrcTy = MRI->getType(SrcReg);
464 const unsigned SrcSize = SrcTy.getSizeInBits();
465 unsigned DstSize = DstTy.getSizeInBits();
466
467 // TODO: Should handle any multiple of 32 offset.
468 unsigned Offset = I.getOperand(2).getImm();
469 if (Offset % 32 != 0 || DstSize > 128)
470 return false;
471
472 // 16-bit operations really use 32-bit registers.
473 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
474 if (DstSize == 16)
475 DstSize = 32;
476
477 const TargetRegisterClass *DstRC =
478 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
479 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
480 return false;
481
482 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
483 const TargetRegisterClass *SrcRC =
484 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
485 if (!SrcRC)
486 return false;
487 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
488 DstSize / 32);
489 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
490 if (!SrcRC)
491 return false;
492
493 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
494 *SrcRC, I.getOperand(1));
495 const DebugLoc &DL = I.getDebugLoc();
496 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
497 .addReg(SrcReg, 0, SubReg);
498
499 I.eraseFromParent();
500 return true;
501}
502
503bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
504 MachineBasicBlock *BB = MI.getParent();
505 Register DstReg = MI.getOperand(0).getReg();
506 LLT DstTy = MRI->getType(DstReg);
507 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
508
509 const unsigned SrcSize = SrcTy.getSizeInBits();
510 if (SrcSize < 32)
511 return selectImpl(MI, *CoverageInfo);
512
513 const DebugLoc &DL = MI.getDebugLoc();
514 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
515 const unsigned DstSize = DstTy.getSizeInBits();
516 const TargetRegisterClass *DstRC =
517 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
518 if (!DstRC)
519 return false;
520
521 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
522 MachineInstrBuilder MIB =
523 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
524 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
525 MachineOperand &Src = MI.getOperand(I + 1);
526 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
527 MIB.addImm(SubRegs[I]);
528
529 const TargetRegisterClass *SrcRC
530 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
531 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
532 return false;
533 }
534
535 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
536 return false;
537
538 MI.eraseFromParent();
539 return true;
540}
541
542bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
543 MachineBasicBlock *BB = MI.getParent();
544 const int NumDst = MI.getNumOperands() - 1;
545
546 MachineOperand &Src = MI.getOperand(NumDst);
547
548 Register SrcReg = Src.getReg();
549 Register DstReg0 = MI.getOperand(0).getReg();
550 LLT DstTy = MRI->getType(DstReg0);
551 LLT SrcTy = MRI->getType(SrcReg);
552
553 const unsigned DstSize = DstTy.getSizeInBits();
554 const unsigned SrcSize = SrcTy.getSizeInBits();
555 const DebugLoc &DL = MI.getDebugLoc();
556 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
557
558 const TargetRegisterClass *SrcRC =
559 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
560 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
561 return false;
562
563 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
564 // source, and this relies on the fact that the same subregister indices are
565 // used for both.
566 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
567 for (int I = 0, E = NumDst; I != E; ++I) {
568 MachineOperand &Dst = MI.getOperand(I);
569 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
570 .addReg(SrcReg, 0, SubRegs[I]);
571
572 // Make sure the subregister index is valid for the source register.
573 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
574 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
575 return false;
576
577 const TargetRegisterClass *DstRC =
578 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
579 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
580 return false;
581 }
582
583 MI.eraseFromParent();
584 return true;
585}
586
587bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
588 MachineInstr &MI) const {
589 if (selectImpl(MI, *CoverageInfo))
590 return true;
591
592 const LLT S32 = LLT::scalar(32);
593 const LLT V2S16 = LLT::vector(2, 16);
594
595 Register Dst = MI.getOperand(0).getReg();
596 if (MRI->getType(Dst) != V2S16)
597 return false;
598
599 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
600 if (DstBank->getID() != AMDGPU::SGPRRegBankID)
601 return false;
602
603 Register Src0 = MI.getOperand(1).getReg();
604 Register Src1 = MI.getOperand(2).getReg();
605 if (MRI->getType(Src0) != S32)
606 return false;
607
608 const DebugLoc &DL = MI.getDebugLoc();
609 MachineBasicBlock *BB = MI.getParent();
610
611 auto ConstSrc1 =
612 getConstantVRegValWithLookThrough(Src1, *MRI, true, true, true);
613 if (ConstSrc1) {
614 auto ConstSrc0 =
615 getConstantVRegValWithLookThrough(Src0, *MRI, true, true, true);
616 if (ConstSrc0) {
617 const int64_t K0 = ConstSrc0->Value.getSExtValue();
618 const int64_t K1 = ConstSrc1->Value.getSExtValue();
619 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
620 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
621
622 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
623 .addImm(Lo16 | (Hi16 << 16));
624 MI.eraseFromParent();
625 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
626 }
627 }
628
629 // TODO: This should probably be a combine somewhere
630 // (build_vector_trunc $src0, undef -> copy $src0
631 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
632 if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
633 MI.setDesc(TII.get(AMDGPU::COPY));
634 MI.RemoveOperand(2);
635 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
636 RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
637 }
638
639 Register ShiftSrc0;
640 Register ShiftSrc1;
641
642 // With multiple uses of the shift, this will duplicate the shift and
643 // increase register pressure.
644 //
645 // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
646 // => (S_PACK_HH_B32_B16 $src0, $src1)
647 // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
648 // => (S_PACK_LH_B32_B16 $src0, $src1)
649 // (build_vector_trunc $src0, $src1)
650 // => (S_PACK_LL_B32_B16 $src0, $src1)
651
652 bool Shift0 = mi_match(
653 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
654
655 bool Shift1 = mi_match(
656 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
657
658 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
659 if (Shift0 && Shift1) {
660 Opc = AMDGPU::S_PACK_HH_B32_B16;
661 MI.getOperand(1).setReg(ShiftSrc0);
662 MI.getOperand(2).setReg(ShiftSrc1);
663 } else if (Shift1) {
664 Opc = AMDGPU::S_PACK_LH_B32_B16;
665 MI.getOperand(2).setReg(ShiftSrc1);
666 } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
667 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
668 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
669 .addReg(ShiftSrc0)
670 .addImm(16);
671
672 MI.eraseFromParent();
673 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
674 }
675
676 MI.setDesc(TII.get(Opc));
677 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
678}
679
680bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
681 return selectG_ADD_SUB(I);
682}
683
684bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
685 const MachineOperand &MO = I.getOperand(0);
686
687 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
688 // regbank check here is to know why getConstrainedRegClassForOperand failed.
689 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
690 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
691 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
692 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
693 return true;
694 }
695
696 return false;
697}
698
699bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
700 MachineBasicBlock *BB = I.getParent();
701
702 Register DstReg = I.getOperand(0).getReg();
703 Register Src0Reg = I.getOperand(1).getReg();
704 Register Src1Reg = I.getOperand(2).getReg();
705 LLT Src1Ty = MRI->getType(Src1Reg);
706
707 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
708 unsigned InsSize = Src1Ty.getSizeInBits();
709
710 int64_t Offset = I.getOperand(3).getImm();
711
712 // FIXME: These cases should have been illegal and unnecessary to check here.
713 if (Offset % 32 != 0 || InsSize % 32 != 0)
714 return false;
715
716 // Currently not handled by getSubRegFromChannel.
717 if (InsSize > 128)
718 return false;
719
720 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
721 if (SubReg == AMDGPU::NoSubRegister)
722 return false;
723
724 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
725 const TargetRegisterClass *DstRC =
726 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
727 if (!DstRC)
728 return false;
729
730 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
731 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
732 const TargetRegisterClass *Src0RC =
733 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
734 const TargetRegisterClass *Src1RC =
735 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
736
737 // Deal with weird cases where the class only partially supports the subreg
738 // index.
739 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
740 if (!Src0RC || !Src1RC)
741 return false;
742
743 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
744 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
745 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
746 return false;
747
748 const DebugLoc &DL = I.getDebugLoc();
749 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
750 .addReg(Src0Reg)
751 .addReg(Src1Reg)
752 .addImm(SubReg);
753
754 I.eraseFromParent();
755 return true;
756}
757
758bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
759 if (STI.getLDSBankCount() != 16)
760 return selectImpl(MI, *CoverageInfo);
761
762 Register Dst = MI.getOperand(0).getReg();
763 Register Src0 = MI.getOperand(2).getReg();
764 Register M0Val = MI.getOperand(6).getReg();
765 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
766 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
767 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
768 return false;
769
770 // This requires 2 instructions. It is possible to write a pattern to support
771 // this, but the generated isel emitter doesn't correctly deal with multiple
772 // output instructions using the same physical register input. The copy to m0
773 // is incorrectly placed before the second instruction.
774 //
775 // TODO: Match source modifiers.
776
777 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
778 const DebugLoc &DL = MI.getDebugLoc();
779 MachineBasicBlock *MBB = MI.getParent();
780
781 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
782 .addReg(M0Val);
783 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
784 .addImm(2)
785 .addImm(MI.getOperand(4).getImm()) // $attr
786 .addImm(MI.getOperand(3).getImm()); // $attrchan
787
788 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
789 .addImm(0) // $src0_modifiers
790 .addReg(Src0) // $src0
791 .addImm(MI.getOperand(4).getImm()) // $attr
792 .addImm(MI.getOperand(3).getImm()) // $attrchan
793 .addImm(0) // $src2_modifiers
794 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
795 .addImm(MI.getOperand(5).getImm()) // $high
796 .addImm(0) // $clamp
797 .addImm(0); // $omod
798
799 MI.eraseFromParent();
800 return true;
801}
802
803// Writelane is special in that it can use SGPR and M0 (which would normally
804// count as using the constant bus twice - but in this case it is allowed since
805// the lane selector doesn't count as a use of the constant bus). However, it is
806// still required to abide by the 1 SGPR rule. Fix this up if we might have
807// multiple SGPRs.
808bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
809 // With a constant bus limit of at least 2, there's no issue.
810 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
811 return selectImpl(MI, *CoverageInfo);
812
813 MachineBasicBlock *MBB = MI.getParent();
814 const DebugLoc &DL = MI.getDebugLoc();
815 Register VDst = MI.getOperand(0).getReg();
816 Register Val = MI.getOperand(2).getReg();
817 Register LaneSelect = MI.getOperand(3).getReg();
818 Register VDstIn = MI.getOperand(4).getReg();
819
820 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
821
822 Optional<ValueAndVReg> ConstSelect =
823 getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
824 if (ConstSelect) {
825 // The selector has to be an inline immediate, so we can use whatever for
826 // the other operands.
827 MIB.addReg(Val);
828 MIB.addImm(ConstSelect->Value.getSExtValue() &
829 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
830 } else {
831 Optional<ValueAndVReg> ConstVal =
832 getConstantVRegValWithLookThrough(Val, *MRI, true, true);
833
834 // If the value written is an inline immediate, we can get away without a
835 // copy to m0.
836 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
837 STI.hasInv2PiInlineImm())) {
838 MIB.addImm(ConstVal->Value.getSExtValue());
839 MIB.addReg(LaneSelect);
840 } else {
841 MIB.addReg(Val);
842
843 // If the lane selector was originally in a VGPR and copied with
844 // readfirstlane, there's a hazard to read the same SGPR from the
845 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
846 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
847
848 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
849 .addReg(LaneSelect);
850 MIB.addReg(AMDGPU::M0);
851 }
852 }
853
854 MIB.addReg(VDstIn);
855
856 MI.eraseFromParent();
857 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
858}
859
860// We need to handle this here because tablegen doesn't support matching
861// instructions with multiple outputs.
862bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
863 Register Dst0 = MI.getOperand(0).getReg();
864 Register Dst1 = MI.getOperand(1).getReg();
865
866 LLT Ty = MRI->getType(Dst0);
867 unsigned Opc;
868 if (Ty == LLT::scalar(32))
869 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
870 else if (Ty == LLT::scalar(64))
871 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
872 else
873 return false;
874
875 // TODO: Match source modifiers.
876
877 const DebugLoc &DL = MI.getDebugLoc();
878 MachineBasicBlock *MBB = MI.getParent();
879
880 Register Numer = MI.getOperand(3).getReg();
881 Register Denom = MI.getOperand(4).getReg();
882 unsigned ChooseDenom = MI.getOperand(5).getImm();
883
884 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
885
886 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
887 .addDef(Dst1)
888 .addImm(0) // $src0_modifiers
889 .addUse(Src0) // $src0
890 .addImm(0) // $src1_modifiers
891 .addUse(Denom) // $src1
892 .addImm(0) // $src2_modifiers
893 .addUse(Numer) // $src2
894 .addImm(0) // $clamp
895 .addImm(0); // $omod
896
897 MI.eraseFromParent();
898 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
899}
900
901bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
902 unsigned IntrinsicID = I.getIntrinsicID();
903 switch (IntrinsicID) {
904 case Intrinsic::amdgcn_if_break: {
905 MachineBasicBlock *BB = I.getParent();
906
907 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
908 // SelectionDAG uses for wave32 vs wave64.
909 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
910 .add(I.getOperand(0))
911 .add(I.getOperand(2))
912 .add(I.getOperand(3));
913
914 Register DstReg = I.getOperand(0).getReg();
915 Register Src0Reg = I.getOperand(2).getReg();
916 Register Src1Reg = I.getOperand(3).getReg();
917
918 I.eraseFromParent();
919
920 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
921 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
922
923 return true;
924 }
925 case Intrinsic::amdgcn_interp_p1_f16:
926 return selectInterpP1F16(I);
927 case Intrinsic::amdgcn_wqm:
928 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
929 case Intrinsic::amdgcn_softwqm:
930 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
931 case Intrinsic::amdgcn_strict_wwm:
932 case Intrinsic::amdgcn_wwm:
933 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
934 case Intrinsic::amdgcn_strict_wqm:
935 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
936 case Intrinsic::amdgcn_writelane:
937 return selectWritelane(I);
938 case Intrinsic::amdgcn_div_scale:
939 return selectDivScale(I);
940 case Intrinsic::amdgcn_icmp:
941 return selectIntrinsicIcmp(I);
942 case Intrinsic::amdgcn_ballot:
943 return selectBallot(I);
944 case Intrinsic::amdgcn_reloc_constant:
945 return selectRelocConstant(I);
946 case Intrinsic::amdgcn_groupstaticsize:
947 return selectGroupStaticSize(I);
948 case Intrinsic::returnaddress:
949 return selectReturnAddress(I);
950 default:
951 return selectImpl(I, *CoverageInfo);
952 }
953}
954
955static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
956 if (Size != 32 && Size != 64)
957 return -1;
958 switch (P) {
959 default:
960 llvm_unreachable("Unknown condition code!")::llvm::llvm_unreachable_internal("Unknown condition code!", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp"
, 960)
;
961 case CmpInst::ICMP_NE:
962 return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
963 case CmpInst::ICMP_EQ:
964 return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
965 case CmpInst::ICMP_SGT:
966 return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
967 case CmpInst::ICMP_SGE:
968 return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
969 case CmpInst::ICMP_SLT:
970 return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
971 case CmpInst::ICMP_SLE:
972 return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
973 case CmpInst::ICMP_UGT:
974 return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
975 case CmpInst::ICMP_UGE:
976 return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
977 case CmpInst::ICMP_ULT:
978 return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
979 case CmpInst::ICMP_ULE:
980 return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
981 }
982}
983
984int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
985 unsigned Size) const {
986 if (Size == 64) {
987 if (!STI.hasScalarCompareEq64())
988 return -1;
989
990 switch (P) {
991 case CmpInst::ICMP_NE:
992 return AMDGPU::S_CMP_LG_U64;
993 case CmpInst::ICMP_EQ:
994 return AMDGPU::S_CMP_EQ_U64;
995 default:
996 return -1;
997 }
998 }
999
1000 if (Size != 32)
1001 return -1;
1002
1003 switch (P) {
1004 case CmpInst::ICMP_NE:
1005 return AMDGPU::S_CMP_LG_U32;
1006 case CmpInst::ICMP_EQ:
1007 return AMDGPU::S_CMP_EQ_U32;
1008 case CmpInst::ICMP_SGT:
1009 return AMDGPU::S_CMP_GT_I32;
1010 case CmpInst::ICMP_SGE:
1011 return AMDGPU::S_CMP_GE_I32;
1012 case CmpInst::ICMP_SLT:
1013 return AMDGPU::S_CMP_LT_I32;
1014 case CmpInst::ICMP_SLE:
1015 return AMDGPU::S_CMP_LE_I32;
1016 case CmpInst::ICMP_UGT:
1017 return AMDGPU::S_CMP_GT_U32;
1018 case CmpInst::ICMP_UGE:
1019 return AMDGPU::S_CMP_GE_U32;
1020 case CmpInst::ICMP_ULT:
1021 return AMDGPU::S_CMP_LT_U32;
1022 case CmpInst::ICMP_ULE:
1023 return AMDGPU::S_CMP_LE_U32;
1024 default:
1025 llvm_unreachable("Unknown condition code!")::llvm::llvm_unreachable_internal("Unknown condition code!", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp"
, 1025)
;
1026 }
1027}
1028
1029bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1030 MachineBasicBlock *BB = I.getParent();
1031 const DebugLoc &DL = I.getDebugLoc();
1032
1033 Register SrcReg = I.getOperand(2).getReg();
1034 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1035
1036 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1037
1038 Register CCReg = I.getOperand(0).getReg();
1039 if (!isVCC(CCReg, *MRI)) {
1040 int Opcode = getS_CMPOpcode(Pred, Size);
1041 if (Opcode == -1)
1042 return false;
1043 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1044 .add(I.getOperand(2))
1045 .add(I.getOperand(3));
1046 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1047 .addReg(AMDGPU::SCC);
1048 bool Ret =
1049 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1050 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1051 I.eraseFromParent();
1052 return Ret;
1053 }
1054
1055 int Opcode = getV_CMPOpcode(Pred, Size);
1056 if (Opcode == -1)
1057 return false;
1058
1059 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1060 I.getOperand(0).getReg())
1061 .add(I.getOperand(2))
1062 .add(I.getOperand(3));
1063 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1064 *TRI.getBoolRC(), *MRI);
1065 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1066 I.eraseFromParent();
1067 return Ret;
1068}
1069
1070bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
1071 Register Dst = I.getOperand(0).getReg();
1072 if (isVCC(Dst, *MRI))
1073 return false;
1074
1075 if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1076 return false;
1077
1078 MachineBasicBlock *BB = I.getParent();
1079 const DebugLoc &DL = I.getDebugLoc();
1080 Register SrcReg = I.getOperand(2).getReg();
1081 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1082 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1083
1084 int Opcode = getV_CMPOpcode(Pred, Size);
1085 if (Opcode == -1)
1086 return false;
1087
1088 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1089 .add(I.getOperand(2))
1090 .add(I.getOperand(3));
1091 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1092 *MRI);
1093 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1094 I.eraseFromParent();
1095 return Ret;
1096}
1097
1098bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1099 MachineBasicBlock *BB = I.getParent();
1100 const DebugLoc &DL = I.getDebugLoc();
1101 Register DstReg = I.getOperand(0).getReg();
1102 const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1103 const bool Is64 = Size == 64;
1104
1105 if (Size != STI.getWavefrontSize())
1106 return false;
1107
1108 Optional<ValueAndVReg> Arg =
1109 getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1110
1111 if (Arg.hasValue()) {
1112 const int64_t Value = Arg.getValue().Value.getSExtValue();
1113 if (Value == 0) {
1114 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1115 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1116 } else if (Value == -1) { // all ones
1117 Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1118 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1119 } else
1120 return false;
1121 } else {
1122 Register SrcReg = I.getOperand(2).getReg();
1123 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1124 }
1125
1126 I.eraseFromParent();
1127 return true;
1128}
1129
1130bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1131 Register DstReg = I.getOperand(0).getReg();
1132 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1133 const TargetRegisterClass *DstRC =
1134 TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1135 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1136 return false;
1137
1138 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1139
1140 Module *M = MF->getFunction().getParent();
1141 const MDNode *Metadata = I.getOperand(2).getMetadata();
1142 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1143 auto RelocSymbol = cast<GlobalVariable>(
1144 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1145
1146 MachineBasicBlock *BB = I.getParent();
1147 BuildMI(*BB, &I, I.getDebugLoc(),
1148 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1149 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1150
1151 I.eraseFromParent();
1152 return true;
1153}
1154
1155bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1156 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1157
1158 Register DstReg = I.getOperand(0).getReg();
1159 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1160 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1161 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1162
1163 MachineBasicBlock *MBB = I.getParent();
1164 const DebugLoc &DL = I.getDebugLoc();
1165
1166 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1167
1168 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1169 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1170 MIB.addImm(MFI->getLDSSize());
1171 } else {
1172 Module *M = MF->getFunction().getParent();
1173 const GlobalValue *GV
1174 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1175 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1176 }
1177
1178 I.eraseFromParent();
1179 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1180}
1181
1182bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1183 MachineBasicBlock *MBB = I.getParent();
1184 MachineFunction &MF = *MBB->getParent();
1185 const DebugLoc &DL = I.getDebugLoc();
1186
1187 MachineOperand &Dst = I.getOperand(0);
1188 Register DstReg = Dst.getReg();
1189 unsigned Depth = I.getOperand(2).getImm();
1190
1191 const TargetRegisterClass *RC
1192 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1193 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1194 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1195 return false;
1196
1197 // Check for kernel and shader functions
1198 if (Depth != 0 ||
1199 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1200 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1201 .addImm(0);
1202 I.eraseFromParent();
1203 return true;
1204 }
1205
1206 MachineFrameInfo &MFI = MF.getFrameInfo();
1207 // There is a call to @llvm.returnaddress in this function
1208 MFI.setReturnAddressIsTaken(true);
1209
1210 // Get the return address reg and mark it as an implicit live-in
1211 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1212 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1213 AMDGPU::SReg_64RegClass);
1214 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1215 .addReg(LiveIn);
1216 I.eraseFromParent();
1217 return true;
1218}
1219
1220bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1221 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1222 // SelectionDAG uses for wave32 vs wave64.
1223 MachineBasicBlock *BB = MI.getParent();
1224 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1225 .add(MI.getOperand(1));
1226
1227 Register Reg = MI.getOperand(1).getReg();
1228 MI.eraseFromParent();
1229
1230 if (!MRI->getRegClassOrNull(Reg))
1231 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1232 return true;
1233}
1234
1235bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1236 MachineInstr &MI, Intrinsic::ID IntrID) const {
1237 MachineBasicBlock *MBB = MI.getParent();
1238 MachineFunction *MF = MBB->getParent();
1239 const DebugLoc &DL = MI.getDebugLoc();
1240
1241 unsigned IndexOperand = MI.getOperand(7).getImm();
1242 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1243 bool WaveDone = MI.getOperand(9).getImm() != 0;
1244
1245 if (WaveDone && !WaveRelease)
1246 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1247
1248 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1249 IndexOperand &= ~0x3f;
1250 unsigned CountDw = 0;
1251
1252 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1253 CountDw = (IndexOperand >> 24) & 0xf;
1254 IndexOperand &= ~(0xf << 24);
1255
1256 if (CountDw < 1 || CountDw > 4) {
1257 report_fatal_error(
1258 "ds_ordered_count: dword count must be between 1 and 4");
1259 }
1260 }
1261
1262 if (IndexOperand)
1263 report_fatal_error("ds_ordered_count: bad index operand");
1264
1265 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1266 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1267
1268 unsigned Offset0 = OrderedCountIndex << 2;
1269 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1270 (Instruction << 4);
1271
1272 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1273 Offset1 |= (CountDw - 1) << 6;
1274
1275 unsigned Offset = Offset0 | (Offset1 << 8);
1276
1277 Register M0Val = MI.getOperand(2).getReg();
1278 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1279 .addReg(M0Val);
1280
1281 Register DstReg = MI.getOperand(0).getReg();
1282 Register ValReg = MI.getOperand(3).getReg();
1283 MachineInstrBuilder DS =
1284 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1285 .addReg(ValReg)
1286 .addImm(Offset)
1287 .cloneMemRefs(MI);
1288
1289 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1290 return false;
1291
1292 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1293 MI.eraseFromParent();
1294 return Ret;
1295}
1296
1297static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1298 switch (IntrID) {
1299 case Intrinsic::amdgcn_ds_gws_init:
1300 return AMDGPU::DS_GWS_INIT;
1301 case Intrinsic::amdgcn_ds_gws_barrier:
1302 return AMDGPU::DS_GWS_BARRIER;
1303 case Intrinsic::amdgcn_ds_gws_sema_v:
1304 return AMDGPU::DS_GWS_SEMA_V;
1305 case Intrinsic::amdgcn_ds_gws_sema_br:
1306 return AMDGPU::DS_GWS_SEMA_BR;
1307 case Intrinsic::amdgcn_ds_gws_sema_p:
1308 return AMDGPU::DS_GWS_SEMA_P;
1309 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1310 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1311 default:
1312 llvm_unreachable("not a gws intrinsic")::llvm::llvm_unreachable_internal("not a gws intrinsic", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp"
, 1312)
;
1313 }
1314}
1315
1316bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1317 Intrinsic::ID IID) const {
1318 if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1319 !STI.hasGWSSemaReleaseAll())
1320 return false;
1321
1322 // intrinsic ID, vsrc, offset
1323 const bool HasVSrc = MI.getNumOperands() == 3;
1324 assert(HasVSrc || MI.getNumOperands() == 2);
1325
1326 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1327 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1328 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1329 return false;
1330
1331 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1332 assert(OffsetDef);
1333
1334 unsigned ImmOffset;
1335
1336 MachineBasicBlock *MBB = MI.getParent();
1337 const DebugLoc &DL = MI.getDebugLoc();
1338
1339 MachineInstr *Readfirstlane = nullptr;
1340
1341 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1342 // incoming offset, in case there's an add of a constant. We'll have to put it
1343 // back later.
1344 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1345 Readfirstlane = OffsetDef;
1346 BaseOffset = OffsetDef->getOperand(1).getReg();
1347 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1348 }
1349
1350 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1351 // If we have a constant offset, try to use the 0 in m0 as the base.
1352 // TODO: Look into changing the default m0 initialization value. If the
1353 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1354 // the immediate offset.
1355
1356 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1357 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1358 .addImm(0);
1359 } else {
1360 std::tie(BaseOffset, ImmOffset) =
1361 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1362
1363 if (Readfirstlane) {
1364 // We have the constant offset now, so put the readfirstlane back on the
1365 // variable component.
1366 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1367 return false;
1368
1369 Readfirstlane->getOperand(1).setReg(BaseOffset);
1370 BaseOffset = Readfirstlane->getOperand(0).getReg();
1371 } else {
1372 if (!RBI.constrainGenericRegister(BaseOffset,
1373 AMDGPU::SReg_32RegClass, *MRI))
1374 return false;
1375 }
1376
1377 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1378 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1379 .addReg(BaseOffset)
1380 .addImm(16);
1381
1382 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1383 .addReg(M0Base);
1384 }
1385
1386 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1387 // offset field) % 64. Some versions of the programming guide omit the m0
1388 // part, or claim it's from offset 0.
1389 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1390
1391 if (HasVSrc) {
1392 Register VSrc = MI.getOperand(1).getReg();
1393
1394 if (STI.needsAlignedVGPRs()) {
1395 // Add implicit aligned super-reg to force alignment on the data operand.
1396 Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1397 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
1398 Register NewVR =
1399 MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
1400 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
1401 .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
1402 .addImm(AMDGPU::sub0)
1403 .addReg(Undef)
1404 .addImm(AMDGPU::sub1);
1405 MIB.addReg(NewVR, 0, AMDGPU::sub0);
1406 MIB.addReg(NewVR, RegState::Implicit);
1407 } else {
1408 MIB.addReg(VSrc);
1409 }
1410
1411 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1412 return false;
1413 }
1414
1415 MIB.addImm(ImmOffset)
1416 .cloneMemRefs(MI);
1417
1418 MI.eraseFromParent();
1419 return true;
1420}
1421
1422bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1423 bool IsAppend) const {
1424 Register PtrBase = MI.getOperand(2).getReg();
1425 LLT PtrTy = MRI->getType(PtrBase);
1426 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1427
1428 unsigned Offset;
1429 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1430
1431 // TODO: Should this try to look through readfirstlane like GWS?
1432 if (!isDSOffsetLegal(PtrBase, Offset)) {
1433 PtrBase = MI.getOperand(2).getReg();
1434 Offset = 0;
1435 }
1436
1437 MachineBasicBlock *MBB = MI.getParent();
1438 const DebugLoc &DL = MI.getDebugLoc();
1439 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1440
1441 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1442 .addReg(PtrBase);
1443 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1444 return false;
1445
1446 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1447 .addImm(Offset)
1448 .addImm(IsGDS ? -1 : 0)
1449 .cloneMemRefs(MI);
1450 MI.eraseFromParent();
1451 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1452}
1453
1454bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1455 if (TM.getOptLevel() > CodeGenOpt::None) {
1456 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1457 if (WGSize <= STI.getWavefrontSize()) {
1458 MachineBasicBlock *MBB = MI.getParent();
1459 const DebugLoc &DL = MI.getDebugLoc();
1460 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1461 MI.eraseFromParent();
1462 return true;
1463 }
1464 }
1465 return selectImpl(MI, *CoverageInfo);
1466}
1467
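// Decode the texfailctrl immediate: bit 0 is TFE, bit 1 is LWE, and any other
// set bit is rejected. For example, a value of 3 gives TFE=1, LWE=1 and
// returns true, while a value of 4 returns false (and still marks IsTexFail).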
1468static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1469 bool &IsTexFail) {
1470 if (TexFailCtrl)
1471 IsTexFail = true;
1472
1473 TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1474 TexFailCtrl &= ~(uint64_t)0x1;
1475 LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1476 TexFailCtrl &= ~(uint64_t)0x2;
1477
1478 return TexFailCtrl == 0;
1479}
1480
1481bool AMDGPUInstructionSelector::selectImageIntrinsic(
1482 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1483 MachineBasicBlock *MBB = MI.getParent();
1484 const DebugLoc &DL = MI.getDebugLoc();
1485
1486 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1487 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1488
1489 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1490 const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1491 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1492 const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1493 AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1494 unsigned IntrOpcode = Intr->BaseOpcode;
1495 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1496
1497 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1498
1499 Register VDataIn, VDataOut;
1500 LLT VDataTy;
1501 int NumVDataDwords = -1;
1502 bool IsD16 = false;
1503
1504 bool Unorm;
1505 if (!BaseOpcode->Sampler)
1506 Unorm = true;
1507 else
1508 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1509
1510 bool TFE;
1511 bool LWE;
1512 bool IsTexFail = false;
1513 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1514 TFE, LWE, IsTexFail))
1515 return false;
1516
1517 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1518 const bool IsA16 = (Flags & 1) != 0;
1519 const bool IsG16 = (Flags & 2) != 0;
1520
1521 // A16 implies 16 bit gradients if subtarget doesn't support G16
1522 if (IsA16 && !STI.hasG16() && !IsG16)
1523 return false;
1524
1525 unsigned DMask = 0;
1526 unsigned DMaskLanes = 0;
1527
1528 if (BaseOpcode->Atomic) {
1529 VDataOut = MI.getOperand(0).getReg();
1530 VDataIn = MI.getOperand(2).getReg();
1531 LLT Ty = MRI->getType(VDataIn);
1532
1533 // Be careful to allow atomic swap on 16-bit element vectors.
1534 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1535 Ty.getSizeInBits() == 128 :
1536 Ty.getSizeInBits() == 64;
1537
1538 if (BaseOpcode->AtomicX2) {
1539 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1540
1541 DMask = Is64Bit ? 0xf : 0x3;
1542 NumVDataDwords = Is64Bit ? 4 : 2;
1543 } else {
1544 DMask = Is64Bit ? 0x3 : 0x1;
1545 NumVDataDwords = Is64Bit ? 2 : 1;
1546 }
1547 } else {
1548 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1549 DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1550
1551 // One memoperand is mandatory, except for getresinfo.
1552 // FIXME: Check this in verifier.
1553 if (!MI.memoperands_empty()) {
1554 const MachineMemOperand *MMO = *MI.memoperands_begin();
1555
1556 // Infer d16 from the memory size, as the register type will be mangled by
1557 // unpacked subtargets, or by TFE.
1558 IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1559 }
1560
1561 if (BaseOpcode->Store) {
1562 VDataIn = MI.getOperand(1).getReg();
1563 VDataTy = MRI->getType(VDataIn);
1564 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1565 } else {
1566 VDataOut = MI.getOperand(0).getReg();
1567 VDataTy = MRI->getType(VDataOut);
1568 NumVDataDwords = DMaskLanes;
1569
1570 if (IsD16 && !STI.hasUnpackedD16VMem())
1571 NumVDataDwords = (DMaskLanes + 1) / 2;
1572 }
1573 }
1574
1575 // Optimize _L to _LZ when _L is zero
1576 if (LZMappingInfo) {
1577 // The legalizer replaced the register with an immediate 0 if we need to
1578 // change the opcode.
1579 const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
1580 if (Lod.isImm()) {
1581 assert(Lod.getImm() == 0);
1582 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
1583 }
1584 }
1585
1586 // Optimize _mip away when 'lod' is zero
1587 if (MIPMappingInfo) {
1588 const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
1589 if (Lod.isImm()) {
1590 assert(Lod.getImm() == 0);
1591 IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
1592 }
1593 }
1594
1595 // Set G16 opcode
1596 if (IsG16 && !IsA16) {
1597 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1598 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1599 assert(G16MappingInfo);
1600 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1601 }
1602
1603 // TODO: Check this in verifier.
1604 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1605
1606 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1607 if (BaseOpcode->Atomic)
1608 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1609 if (CPol & ~AMDGPU::CPol::ALL)
1610 return false;
1611
1612 int NumVAddrRegs = 0;
1613 int NumVAddrDwords = 0;
1614 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1615 // Skip the $noregs and 0s inserted during legalization.
1616 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1617 if (!AddrOp.isReg())
1618 continue; // XXX - Break?
1619
1620 Register Addr = AddrOp.getReg();
1621 if (!Addr)
1622 break;
1623
1624 ++NumVAddrRegs;
1625 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1626 }
1627
1628 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1629 // NSA, these should have been packed into a single value in the first
1630 // address register
1631 const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1632 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1633 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1634 return false;
1635 }
1636
1637 if (IsTexFail)
1638 ++NumVDataDwords;
1639
1640 int Opcode = -1;
1641 if (IsGFX10Plus) {
1642 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1643 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1644 : AMDGPU::MIMGEncGfx10Default,
1645 NumVDataDwords, NumVAddrDwords);
1646 } else {
1647 if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1648 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1649 NumVDataDwords, NumVAddrDwords);
1650 if (Opcode == -1)
1651 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1652 NumVDataDwords, NumVAddrDwords);
1653 }
1654 assert(Opcode != -1);
1655
1656 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1657 .cloneMemRefs(MI);
1658
1659 if (VDataOut) {
1660 if (BaseOpcode->AtomicX2) {
1661 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1662
1663 Register TmpReg = MRI->createVirtualRegister(
1664 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1665 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1666
1667 MIB.addDef(TmpReg);
1668 if (!MRI->use_empty(VDataOut)) {
1669 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1670 .addReg(TmpReg, RegState::Kill, SubReg);
1671 }
1672
1673 } else {
1674 MIB.addDef(VDataOut); // vdata output
1675 }
1676 }
1677
1678 if (VDataIn)
1679 MIB.addReg(VDataIn); // vdata input
1680
1681 for (int I = 0; I != NumVAddrRegs; ++I) {
1682 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1683 if (SrcOp.isReg()) {
1684 assert(SrcOp.getReg() != 0);
1685 MIB.addReg(SrcOp.getReg());
1686 }
1687 }
1688
1689 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
1690 if (BaseOpcode->Sampler)
1691 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
1692
1693 MIB.addImm(DMask); // dmask
1694
1695 if (IsGFX10Plus)
1696 MIB.addImm(DimInfo->Encoding);
1697 MIB.addImm(Unorm);
1698
1699 MIB.addImm(CPol);
1700 MIB.addImm(IsA16 && // a16 or r128
1701 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1702 if (IsGFX10Plus)
1703 MIB.addImm(IsA16 ? -1 : 0);
1704
1705 MIB.addImm(TFE); // tfe
1706 MIB.addImm(LWE); // lwe
1707 if (!IsGFX10Plus)
1708 MIB.addImm(DimInfo->DA ? -1 : 0);
1709 if (BaseOpcode->HasD16)
1710 MIB.addImm(IsD16 ? -1 : 0);
1711
1712 if (IsTexFail) {
1713 // An image load instruction with TFE/LWE only conditionally writes to its
1714 // result registers. Initialize them to zero so that we always get well
1715 // defined result values.
1716 assert(VDataOut && !VDataIn);
1717 Register Tied = MRI->cloneVirtualRegister(VDataOut);
1718 Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1719 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
1720 .addImm(0);
1721 auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
1722 if (STI.usePRTStrictNull()) {
1723 // With enable-prt-strict-null enabled, initialize all result registers to
1724 // zero.
1725 auto RegSeq =
1726 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1727 for (auto Sub : Parts)
1728 RegSeq.addReg(Zero).addImm(Sub);
1729 } else {
1730 // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
1731 // result register.
1732 Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1733 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
1734 auto RegSeq =
1735 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1736 for (auto Sub : Parts.drop_back(1))
1737 RegSeq.addReg(Undef).addImm(Sub);
1738 RegSeq.addReg(Zero).addImm(Parts.back());
1739 }
1740 MIB.addReg(Tied, RegState::Implicit);
1741 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1742 }
1743
1744 MI.eraseFromParent();
1745 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1746}
1747
1748bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1749 MachineInstr &I) const {
1750 unsigned IntrinsicID = I.getIntrinsicID();
1751 switch (IntrinsicID) {
1752 case Intrinsic::amdgcn_end_cf:
1753 return selectEndCfIntrinsic(I);
1754 case Intrinsic::amdgcn_ds_ordered_add:
1755 case Intrinsic::amdgcn_ds_ordered_swap:
1756 return selectDSOrderedIntrinsic(I, IntrinsicID);
1757 case Intrinsic::amdgcn_ds_gws_init:
1758 case Intrinsic::amdgcn_ds_gws_barrier:
1759 case Intrinsic::amdgcn_ds_gws_sema_v:
1760 case Intrinsic::amdgcn_ds_gws_sema_br:
1761 case Intrinsic::amdgcn_ds_gws_sema_p:
1762 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1763 return selectDSGWSIntrinsic(I, IntrinsicID);
1764 case Intrinsic::amdgcn_ds_append:
1765 return selectDSAppendConsume(I, true);
1766 case Intrinsic::amdgcn_ds_consume:
1767 return selectDSAppendConsume(I, false);
1768 case Intrinsic::amdgcn_s_barrier:
1769 return selectSBarrier(I);
1770 case Intrinsic::amdgcn_global_atomic_fadd:
1771 return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
1772 default: {
1773 return selectImpl(I, *CoverageInfo);
1774 }
1775 }
1776}
1777
1778bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1779 if (selectImpl(I, *CoverageInfo))
1780 return true;
1781
1782 MachineBasicBlock *BB = I.getParent();
1783 const DebugLoc &DL = I.getDebugLoc();
1784
1785 Register DstReg = I.getOperand(0).getReg();
1786 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1787 assert(Size <= 32 || Size == 64);
1788 const MachineOperand &CCOp = I.getOperand(1);
1789 Register CCReg = CCOp.getReg();
1790 if (!isVCC(CCReg, *MRI)) {
1791 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1792 AMDGPU::S_CSELECT_B32;
1793 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1794 .addReg(CCReg);
1795
1796 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
1797 // bank, because it does not cover the register class that we use to represent
1798 // it. So we need to manually set the register class here.
1799 if (!MRI->getRegClassOrNull(CCReg))
1800 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1801 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1802 .add(I.getOperand(2))
1803 .add(I.getOperand(3));
1804
1805 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1806 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1807 I.eraseFromParent();
1808 return Ret;
1809 }
1810
1811 // Wide VGPR select should have been split in RegBankSelect.
1812 if (Size > 32)
1813 return false;
1814
1815 MachineInstr *Select =
1816 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1817 .addImm(0)
1818 .add(I.getOperand(3))
1819 .addImm(0)
1820 .add(I.getOperand(2))
1821 .add(I.getOperand(1));
1822
1823 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1824 I.eraseFromParent();
1825 return Ret;
1826}
1827
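// Map a value size in bits to the subregister index covering its low bits.
// For example, 64 maps to sub0_sub1; non-power-of-2 sizes round up (48 also
// maps to sub0_sub1), and sizes above 256 return -1.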
1828static int sizeToSubRegIndex(unsigned Size) {
1829 switch (Size) {
1830 case 32:
1831 return AMDGPU::sub0;
1832 case 64:
1833 return AMDGPU::sub0_sub1;
1834 case 96:
1835 return AMDGPU::sub0_sub1_sub2;
1836 case 128:
1837 return AMDGPU::sub0_sub1_sub2_sub3;
1838 case 256:
1839 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1840 default:
1841 if (Size < 32)
1842 return AMDGPU::sub0;
1843 if (Size > 256)
1844 return -1;
1845 return sizeToSubRegIndex(PowerOf2Ceil(Size));
1846 }
1847}
1848
1849bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1850 Register DstReg = I.getOperand(0).getReg();
1851 Register SrcReg = I.getOperand(1).getReg();
1852 const LLT DstTy = MRI->getType(DstReg);
1853 const LLT SrcTy = MRI->getType(SrcReg);
1854 const LLT S1 = LLT::scalar(1);
1855
1856 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1857 const RegisterBank *DstRB;
1858 if (DstTy == S1) {
1859 // This is a special case. We don't treat s1 for legalization artifacts as
1860 // vcc booleans.
1861 DstRB = SrcRB;
1862 } else {
1863 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1864 if (SrcRB != DstRB)
1865 return false;
1866 }
1867
1868 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1869
1870 unsigned DstSize = DstTy.getSizeInBits();
1871 unsigned SrcSize = SrcTy.getSizeInBits();
1872
1873 const TargetRegisterClass *SrcRC
1874 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1875 const TargetRegisterClass *DstRC
1876 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1877 if (!SrcRC || !DstRC)
1878 return false;
1879
1880 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1881 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1882 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1883 return false;
1884 }
1885
1886 if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1887 MachineBasicBlock *MBB = I.getParent();
1888 const DebugLoc &DL = I.getDebugLoc();
1889
1890 Register LoReg = MRI->createVirtualRegister(DstRC);
1891 Register HiReg = MRI->createVirtualRegister(DstRC);
1892 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1893 .addReg(SrcReg, 0, AMDGPU::sub0);
1894 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1895 .addReg(SrcReg, 0, AMDGPU::sub1);
1896
1897 if (IsVALU && STI.hasSDWA()) {
1898 // Write the low 16-bits of the high element into the high 16-bits of the
1899 // low element.
1900 MachineInstr *MovSDWA =
1901 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1902 .addImm(0) // $src0_modifiers
1903 .addReg(HiReg) // $src0
1904 .addImm(0) // $clamp
1905 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
1906 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1907 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
1908 .addReg(LoReg, RegState::Implicit);
1909 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1910 } else {
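// Fall back to a manual pack: Dst = (Hi << 16) | (Lo & 0xffff).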
1911 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1912 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1913 Register ImmReg = MRI->createVirtualRegister(DstRC);
1914 if (IsVALU) {
1915 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1916 .addImm(16)
1917 .addReg(HiReg);
1918 } else {
1919 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1920 .addReg(HiReg)
1921 .addImm(16);
1922 }
1923
1924 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1925 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1926 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1927
1928 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1929 .addImm(0xffff);
1930 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1931 .addReg(LoReg)
1932 .addReg(ImmReg);
1933 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1934 .addReg(TmpReg0)
1935 .addReg(TmpReg1);
1936 }
1937
1938 I.eraseFromParent();
1939 return true;
1940 }
1941
1942 if (!DstTy.isScalar())
1943 return false;
1944
1945 if (SrcSize > 32) {
1946 int SubRegIdx = sizeToSubRegIndex(DstSize);
1947 if (SubRegIdx == -1)
1948 return false;
1949
1950 // Deal with weird cases where the class only partially supports the subreg
1951 // index.
1952 const TargetRegisterClass *SrcWithSubRC
1953 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1954 if (!SrcWithSubRC)
1955 return false;
1956
1957 if (SrcWithSubRC != SrcRC) {
1958 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1959 return false;
1960 }
1961
1962 I.getOperand(1).setSubReg(SubRegIdx);
1963 }
1964
1965 I.setDesc(TII.get(TargetOpcode::COPY));
1966 return true;
1967}
1968
1969/// \returns true if a bitmask for \p Size bits will be an inline immediate.
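/// For example, Size = 4 gives Mask = 0xf (15, an inline immediate), while
/// Size = 8 gives Mask = 0xff (255, which is not).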
1970static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1971 Mask = maskTrailingOnes<unsigned>(Size);
1972 int SignedMask = static_cast<int>(Mask);
1973 return SignedMask >= -16 && SignedMask <= 64;
1974}
1975
1976// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1977const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1978 Register Reg, const MachineRegisterInfo &MRI,
1979 const TargetRegisterInfo &TRI) const {
1980 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1981 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1982 return RB;
1983
1984 // Ignore the type, since we don't use vcc in artifacts.
1985 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1986 return &RBI.getRegBankFromRegClass(*RC, LLT());
1987 return nullptr;
1988}
1989
1990bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1991 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1992 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1993 const DebugLoc &DL = I.getDebugLoc();
1994 MachineBasicBlock &MBB = *I.getParent();
1995 const Register DstReg = I.getOperand(0).getReg();
1996 const Register SrcReg = I.getOperand(1).getReg();
1997
1998 const LLT DstTy = MRI->getType(DstReg);
1999 const LLT SrcTy = MRI->getType(SrcReg);
2000 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2001 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2002 const unsigned DstSize = DstTy.getSizeInBits();
2003 if (!DstTy.isScalar())
2004 return false;
2005
2006 // Artifact casts should never use vcc.
2007 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2008
2009 // FIXME: This should probably be illegal and split earlier.
2010 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2011 if (DstSize <= 32)
2012 return selectCOPY(I);
2013
2014 const TargetRegisterClass *SrcRC =
2015 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
2016 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2017 const TargetRegisterClass *DstRC =
2018 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
2019
2020 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2021 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2022 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2023 .addReg(SrcReg)
2024 .addImm(AMDGPU::sub0)
2025 .addReg(UndefReg)
2026 .addImm(AMDGPU::sub1);
2027 I.eraseFromParent();
2028
2029 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2030 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2031 }
2032
2033 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2034 // 64-bit should have been split up in RegBankSelect
2035
2036 // Try to use an and with a mask if it will save code size.
2037 unsigned Mask;
2038 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2039 MachineInstr *ExtI =
2040 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2041 .addImm(Mask)
2042 .addReg(SrcReg);
2043 I.eraseFromParent();
2044 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2045 }
2046
2047 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2048 MachineInstr *ExtI =
2049 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2050 .addReg(SrcReg)
2051 .addImm(0) // Offset
2052 .addImm(SrcSize); // Width
2053 I.eraseFromParent();
2054 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2055 }
2056
2057 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2058 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2059 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2060 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2061 return false;
2062
2063 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2064 const unsigned SextOpc = SrcSize == 8 ?
2065 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2066 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2067 .addReg(SrcReg);
2068 I.eraseFromParent();
2069 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2070 }
2071
2072 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2073 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2074
2075 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
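// For example, with SrcSize = 16 the immediate below is (16 << 16), i.e.
// offset 0 and width 16.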
2076 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2077 // We need a 64-bit register source, but the high bits don't matter.
2078 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2079 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2080 unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
2081
2082 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2083 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2084 .addReg(SrcReg, 0, SubReg)
2085 .addImm(AMDGPU::sub0)
2086 .addReg(UndefReg)
2087 .addImm(AMDGPU::sub1);
2088
2089 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2090 .addReg(ExtReg)
2091 .addImm(SrcSize << 16);
2092
2093 I.eraseFromParent();
2094 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2095 }
2096
2097 unsigned Mask;
2098 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2099 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2100 .addReg(SrcReg)
2101 .addImm(Mask);
2102 } else {
2103 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2104 .addReg(SrcReg)
2105 .addImm(SrcSize << 16);
2106 }
2107
2108 I.eraseFromParent();
2109 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2110 }
2111
2112 return false;
2113}
2114
2115bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2116 MachineBasicBlock *BB = I.getParent();
2117 MachineOperand &ImmOp = I.getOperand(1);
2118 Register DstReg = I.getOperand(0).getReg();
2119 unsigned Size = MRI->getType(DstReg).getSizeInBits();
2120
2121 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2122 if (ImmOp.isFPImm()) {
2123 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2124 ImmOp.ChangeToImmediate(Imm.getZExtValue());
2125 } else if (ImmOp.isCImm()) {
2126 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2127 } else {
2128 llvm_unreachable("Not supported by g_constants");
2129 }
2130
2131 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2132 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2133
2134 unsigned Opcode;
2135 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2136 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2137 } else {
2138 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2139
2140 // We should never produce s1 values on banks other than VCC. If the user of
2141 // this already constrained the register, we may incorrectly think it's VCC
2142 // if it wasn't originally.
2143 if (Size == 1)
2144 return false;
2145 }
2146
2147 if (Size != 64) {
2148 I.setDesc(TII.get(Opcode));
2149 I.addImplicitDefUseOperands(*MF);
2150 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2151 }
2152
2153 const DebugLoc &DL = I.getDebugLoc();
2154
2155 APInt Imm(Size, I.getOperand(1).getImm());
2156
2157 MachineInstr *ResInst;
2158 if (IsSgpr && TII.isInlineConstant(Imm)) {
2159 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2160 .addImm(I.getOperand(1).getImm());
2161 } else {
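// Materialize the 64-bit value in two halves and recombine with a
// REG_SEQUENCE; e.g. 0x123456789 becomes Lo = 0x23456789 and Hi = 0x1.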
2162 const TargetRegisterClass *RC = IsSgpr ?
2163 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2164 Register LoReg = MRI->createVirtualRegister(RC);
2165 Register HiReg = MRI->createVirtualRegister(RC);
2166
2167 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2168 .addImm(Imm.trunc(32).getZExtValue());
2169
2170 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2171 .addImm(Imm.ashr(32).getZExtValue());
2172
2173 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2174 .addReg(LoReg)
2175 .addImm(AMDGPU::sub0)
2176 .addReg(HiReg)
2177 .addImm(AMDGPU::sub1);
2178 }
2179
2180 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2181 // work for target independent opcodes
2182 I.eraseFromParent();
2183 const TargetRegisterClass *DstRC =
2184 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2185 if (!DstRC)
2186 return true;
2187 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2188}
2189
2190bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2191 // Only manually handle the f64 SGPR case.
2192 //
2193 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2194 // the bit ops theoretically have a second result due to the implicit def of
2195 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2196 // that is easy by disabling the check. The result works, but uses a
2197 // nonsensical sreg32orlds_and_sreg_1 regclass.
2198 //
2199 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 results to
2200 // the variadic REG_SEQUENCE operands.
2201
2202 Register Dst = MI.getOperand(0).getReg();
2203 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2204 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2205 MRI->getType(Dst) != LLT::scalar(64))
2206 return false;
2207
2208 Register Src = MI.getOperand(1).getReg();
2209 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2210 if (Fabs)
2211 Src = Fabs->getOperand(1).getReg();
2212
2213 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2214 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2215 return false;
2216
2217 MachineBasicBlock *BB = MI.getParent();
2218 const DebugLoc &DL = MI.getDebugLoc();
2219 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2220 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2221 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2222 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2223
2224 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2225 .addReg(Src, 0, AMDGPU::sub0);
2226 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2227 .addReg(Src, 0, AMDGPU::sub1);
2228 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2229 .addImm(0x80000000);
2230
2231 // Set or toggle sign bit.
2232 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2233 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2234 .addReg(HiReg)
2235 .addReg(ConstReg);
2236 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2237 .addReg(LoReg)
2238 .addImm(AMDGPU::sub0)
2239 .addReg(OpReg)
2240 .addImm(AMDGPU::sub1);
2241 MI.eraseFromParent();
2242 return true;
2243}
2244
2245// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2246bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2247 Register Dst = MI.getOperand(0).getReg();
2248 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2249 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2250 MRI->getType(Dst) != LLT::scalar(64))
2251 return false;
2252
2253 Register Src = MI.getOperand(1).getReg();
2254 MachineBasicBlock *BB = MI.getParent();
2255 const DebugLoc &DL = MI.getDebugLoc();
2256 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2257 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2258 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2259 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2260
2261 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2262 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2263 return false;
2264
2265 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2266 .addReg(Src, 0, AMDGPU::sub0);
2267 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2268 .addReg(Src, 0, AMDGPU::sub1);
2269 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2270 .addImm(0x7fffffff);
2271
2272 // Clear sign bit.
2273 // TODO: Should this use S_BITSET0_*?
2274 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2275 .addReg(HiReg)
2276 .addReg(ConstReg);
2277 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2278 .addReg(LoReg)
2279 .addImm(AMDGPU::sub0)
2280 .addReg(OpReg)
2281 .addImm(AMDGPU::sub1);
2282
2283 MI.eraseFromParent();
2284 return true;
2285}
2286
2287static bool isConstant(const MachineInstr &MI) {
2288 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2289}
2290
2291void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2292 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2293
2294 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2295
2296 assert(PtrMI);
2297
2298 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2299 return;
2300
2301 GEPInfo GEPInfo(*PtrMI);
2302
2303 for (unsigned i = 1; i != 3; ++i) {
2304 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2305 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2306 assert(OpDef);
2307 if (i == 2 && isConstant(*OpDef)) {
2308 // TODO: Could handle constant base + variable offset, but a combine
2309 // probably should have commuted it.
2310 assert(GEPInfo.Imm == 0);
2311 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2312 continue;
2313 }
2314 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2315 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2316 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2317 else
2318 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2319 }
2320
2321 AddrInfo.push_back(GEPInfo);
2322 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2323}
2324
2325bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2326 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2327}
2328
2329bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2330 if (!MI.hasOneMemOperand())
2331 return false;
2332
2333 const MachineMemOperand *MMO = *MI.memoperands_begin();
2334 const Value *Ptr = MMO->getValue();
2335
2336 // UndefValue means this is a load of a kernel input. These are uniform.
2337 // Sometimes LDS instructions have constant pointers.
2338 // If Ptr is null, then that means this mem operand contains a
2339 // PseudoSourceValue like GOT.
2340 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2341 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2342 return true;
2343
2344 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2345 return true;
2346
2347 const Instruction *I = dyn_cast<Instruction>(Ptr);
2348 return I && I->getMetadata("amdgpu.uniform");
2349}
2350
2351bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2352 for (const GEPInfo &GEPInfo : AddrInfo) {
2353 if (!GEPInfo.VgprParts.empty())
2354 return true;
2355 }
2356 return false;
2357}
2358
2359void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2360 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2361 unsigned AS = PtrTy.getAddressSpace();
2362 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2363 STI.ldsRequiresM0Init()) {
2364 MachineBasicBlock *BB = I.getParent();
2365
2366 // If DS instructions require M0 initialization, insert it before selecting.
2367 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2368 .addImm(-1);
2369 }
2370}
2371
2372bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2373 MachineInstr &I) const {
2374 if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) {
2375 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2376 unsigned AS = PtrTy.getAddressSpace();
2377 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
2378 return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2));
2379 }
2380
2381 initM0(I);
2382 return selectImpl(I, *CoverageInfo);
2383}
2384
2385// TODO: No rtn optimization.
2386bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2387 MachineInstr &MI) const {
2388 Register PtrReg = MI.getOperand(1).getReg();
2389 const LLT PtrTy = MRI->getType(PtrReg);
2390 if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2391 STI.useFlatForGlobal())
2392 return selectImpl(MI, *CoverageInfo);
2393
2394 Register DstReg = MI.getOperand(0).getReg();
2395 const LLT Ty = MRI->getType(DstReg);
2396 const bool Is64 = Ty.getSizeInBits() == 64;
2397 const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2398 Register TmpReg = MRI->createVirtualRegister(
2399 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2400
2401 const DebugLoc &DL = MI.getDebugLoc();
2402 MachineBasicBlock *BB = MI.getParent();
2403
2404 Register VAddr, RSrcReg, SOffset;
2405 int64_t Offset = 0;
2406
2407 unsigned Opcode;
2408 if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2409 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2410 AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2411 } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2412 RSrcReg, SOffset, Offset)) {
2413 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2414 AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2415 } else
2416 return selectImpl(MI, *CoverageInfo);
2417
2418 auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2419 .addReg(MI.getOperand(2).getReg());
2420
2421 if (VAddr)
2422 MIB.addReg(VAddr);
2423
2424 MIB.addReg(RSrcReg);
2425 if (SOffset)
2426 MIB.addReg(SOffset);
2427 else
2428 MIB.addImm(0);
2429
2430 MIB.addImm(Offset);
2431 MIB.addImm(AMDGPU::CPol::GLC);
2432 MIB.cloneMemRefs(MI);
2433
2434 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2435 .addReg(TmpReg, RegState::Kill, SubReg);
2436
2437 MI.eraseFromParent();
2438
2439 MRI->setRegClass(
2440 DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2441 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2442}
2443
2444bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2445 MachineBasicBlock *BB = I.getParent();
2446 MachineOperand &CondOp = I.getOperand(0);
2447 Register CondReg = CondOp.getReg();
2448 const DebugLoc &DL = I.getDebugLoc();
2449
2450 unsigned BrOpcode;
2451 Register CondPhysReg;
2452 const TargetRegisterClass *ConstrainRC;
2453
2454 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2455 // whether the branch is uniform when selecting the instruction. In
2456 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2457 // RegBankSelect knows what it's doing if the branch condition is scc, even
2458 // though it currently does not.
2459 if (!isVCC(CondReg, *MRI)) {
2460 if (MRI->getType(CondReg) != LLT::scalar(32))
2461 return false;
2462
2463 CondPhysReg = AMDGPU::SCC;
2464 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2465 ConstrainRC = &AMDGPU::SReg_32RegClass;
2466 } else {
2467 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
2468 // Based on the register bank, we sort of know that a VCC producer ands
2469 // inactive lanes with 0. What if there was a logical operation with vcc
2470 // producers in different blocks/with different exec masks?
2471 // FIXME: Should scc->vcc copies and with exec?
2472 CondPhysReg = TRI.getVCC();
2473 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2474 ConstrainRC = TRI.getBoolRC();
2475 }
2476
2477 if (!MRI->getRegClassOrNull(CondReg))
2478 MRI->setRegClass(CondReg, ConstrainRC);
2479
2480 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2481 .addReg(CondReg);
2482 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2483 .addMBB(I.getOperand(1).getMBB());
2484
2485 I.eraseFromParent();
2486 return true;
2487}
2488
2489bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2490 MachineInstr &I) const {
2491 Register DstReg = I.getOperand(0).getReg();
2492 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2493 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2494 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2495 if (IsVGPR)
2496 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2497
2498 return RBI.constrainGenericRegister(
2499 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2500}
2501
2502bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2503 Register DstReg = I.getOperand(0).getReg();
2504 Register SrcReg = I.getOperand(1).getReg();
2505 Register MaskReg = I.getOperand(2).getReg();
2506 LLT Ty = MRI->getType(DstReg);
2507 LLT MaskTy = MRI->getType(MaskReg);
2508
2509 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2510 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2511 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2512 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2513 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2514 return false;
2515
2516 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2517 const TargetRegisterClass &RegRC
2518 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2519
2520 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2521 *MRI);
2522 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2523 *MRI);
2524 const TargetRegisterClass *MaskRC =
2525 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2526
2527 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2528 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2529 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2530 return false;
2531
2532 MachineBasicBlock *BB = I.getParent();
2533 const DebugLoc &DL = I.getDebugLoc();
2534 if (Ty.getSizeInBits() == 32) {
2535 assert(MaskTy.getSizeInBits() == 32 &&
2536 "ptrmask should have been narrowed during legalize");
2537
2538 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2539 .addReg(SrcReg)
2540 .addReg(MaskReg);
2541 I.eraseFromParent();
2542 return true;
2543 }
2544
2545 Register HiReg = MRI->createVirtualRegister(&RegRC);
2546 Register LoReg = MRI->createVirtualRegister(&RegRC);
2547
2548 // Extract the subregisters from the source pointer.
2549 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2550 .addReg(SrcReg, 0, AMDGPU::sub0);
2551 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2552 .addReg(SrcReg, 0, AMDGPU::sub1);
2553
2554 Register MaskedLo, MaskedHi;
2555
2556 // Try to avoid emitting a bit operation when we only need to touch half of
2557 // the 64-bit pointer.
2558 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2559
2560 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2561 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2562 if ((MaskOnes & MaskLo32) == MaskLo32) {
2563 // If all the bits in the low half are 1, we only need a copy for it.
2564 MaskedLo = LoReg;
2565 } else {
2566 // Extract the mask subregister and apply the and.
2567 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2568 MaskedLo = MRI->createVirtualRegister(&RegRC);
2569
2570 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2571 .addReg(MaskReg, 0, AMDGPU::sub0);
2572 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2573 .addReg(LoReg)
2574 .addReg(MaskLo);
2575 }
2576
2577 if ((MaskOnes & MaskHi32) == MaskHi32) {
2578 // If all the bits in the high half are 1, we only need a copy for it.
2579 MaskedHi = HiReg;
2580 } else {
2581 Register MaskHi = MRI->createVirtualRegister(&RegRC);
2582 MaskedHi = MRI->createVirtualRegister(&RegRC);
2583
2584 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2585 .addReg(MaskReg, 0, AMDGPU::sub1);
2586 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2587 .addReg(HiReg)
2588 .addReg(MaskHi);
2589 }
2590
2591 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2592 .addReg(MaskedLo)
2593 .addImm(AMDGPU::sub0)
2594 .addReg(MaskedHi)
2595 .addImm(AMDGPU::sub1);
2596 I.eraseFromParent();
2597 return true;
2598}
2599
2600/// Return the register to use for the index value, and the subregister to use
2601/// for the indirectly accessed register.
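/// For example (illustrative values), indexing 32-bit elements (EltSize = 4)
/// of a 256-bit register with IdxReg = %base + 3 yields {%base, sub3}; an
/// out-of-range constant offset falls back to {IdxReg, sub0}.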
2602static std::pair<Register, unsigned>
2603computeIndirectRegIndex(MachineRegisterInfo &MRI,
2604 const SIRegisterInfo &TRI,
2605 const TargetRegisterClass *SuperRC,
2606 Register IdxReg,
2607 unsigned EltSize) {
2608 Register IdxBaseReg;
2609 int Offset;
2610
2611 std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2612 if (IdxBaseReg == AMDGPU::NoRegister) {
2613 // This will happen if the index is a known constant. This should ordinarily
2614 // be legalized out, but handle it as a register just in case.
2615 assert(Offset == 0);
2616 IdxBaseReg = IdxReg;
2617 }
2618
2619 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2620
2621 // Skip out of bounds offsets, or else we would end up using an undefined
2622 // register.
2623 if (static_cast<unsigned>(Offset) >= SubRegs.size())
2624 return std::make_pair(IdxReg, SubRegs[0]);
2625 return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2626}
2627
2628bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2629 MachineInstr &MI) const {
2630 Register DstReg = MI.getOperand(0).getReg();
2631 Register SrcReg = MI.getOperand(1).getReg();
2632 Register IdxReg = MI.getOperand(2).getReg();
2633
2634 LLT DstTy = MRI->getType(DstReg);
2635 LLT SrcTy = MRI->getType(SrcReg);
2636
2637 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2638 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2639 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2640
2641 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
2642 // into a waterfall loop.
2643 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2644 return false;
2645
2646 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2647 *MRI);
2648 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2649 *MRI);
2650 if (!SrcRC || !DstRC)
2651 return false;
2652 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2653 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2654 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2655 return false;
2656
2657 MachineBasicBlock *BB = MI.getParent();
2658 const DebugLoc &DL = MI.getDebugLoc();
2659 const bool Is64 = DstTy.getSizeInBits() == 64;
2660
2661 unsigned SubReg;
2662 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2663 DstTy.getSizeInBits() / 8);
2664
2665 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2666 if (DstTy.getSizeInBits() != 32 && !Is64)
2667 return false;
2668
2669 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2670 .addReg(IdxReg);
2671
2672 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2673 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2674 .addReg(SrcReg, 0, SubReg)
2675 .addReg(SrcReg, RegState::Implicit);
2676 MI.eraseFromParent();
2677 return true;
2678 }
2679
2680 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2681 return false;
2682
2683 if (!STI.useVGPRIndexMode()) {
2684 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2685 .addReg(IdxReg);
2686 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2687 .addReg(SrcReg, 0, SubReg)
2688 .addReg(SrcReg, RegState::Implicit);
2689 MI.eraseFromParent();
2690 return true;
2691 }
2692
2693 const MCInstrDesc &GPRIDXDesc =
2694 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
2695 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2696 .addReg(SrcReg)
2697 .addReg(IdxReg)
2698 .addImm(SubReg);
2699
2700 MI.eraseFromParent();
2701 return true;
2702}
2703
2704// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2705bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2706 MachineInstr &MI) const {
2707 Register DstReg = MI.getOperand(0).getReg();
2708 Register VecReg = MI.getOperand(1).getReg();
2709 Register ValReg = MI.getOperand(2).getReg();
2710 Register IdxReg = MI.getOperand(3).getReg();
2711
2712 LLT VecTy = MRI->getType(DstReg);
2713 LLT ValTy = MRI->getType(ValReg);
2714 unsigned VecSize = VecTy.getSizeInBits();
2715 unsigned ValSize = ValTy.getSizeInBits();
2716
2717 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2718 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2719 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2720
2721 assert(VecTy.getElementType() == ValTy);
2722
2723 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
2724 // into a waterfall loop.
2725 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2726 return false;
2727
2728 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2729 *MRI);
2730 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2731 *MRI);
2732
2733 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2734 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2735 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2736 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2737 return false;
2738
2739 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2740 return false;
2741
2742 unsigned SubReg;
2743 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2744 ValSize / 8);
2745
2746 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2747 STI.useVGPRIndexMode();
2748
2749 MachineBasicBlock *BB = MI.getParent();
2750 const DebugLoc &DL = MI.getDebugLoc();
2751
2752 if (!IndexMode) {
2753 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2754 .addReg(IdxReg);
2755
2756 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
2757 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
2758 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2759 .addReg(VecReg)
2760 .addReg(ValReg)
2761 .addImm(SubReg);
2762 MI.eraseFromParent();
2763 return true;
2764 }
2765
2766 const MCInstrDesc &GPRIDXDesc =
2767 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
2768 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2769 .addReg(VecReg)
2770 .addReg(ValReg)
2771 .addReg(IdxReg)
2772 .addImm(SubReg);
2773
2774 MI.eraseFromParent();
2775 return true;
2776}
2777
2778static bool isZeroOrUndef(int X) {
2779 return X == 0 || X == -1;
2780}
2781
2782static bool isOneOrUndef(int X) {
2783 return X == 1 || X == -1;
2784}
2785
2786static bool isZeroOrOneOrUndef(int X) {
2787 return X == 0 || X == 1 || X == -1;
2788}
2789
2790// Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2791// 32-bit register.
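// For example, mask <2, 3> reads only Src1 and is rewritten to <0, 1>, while
// mask <0, 1> (or any mix of 0, 1, and undef) is returned unchanged and reads
// Src0.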
2792static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2793 ArrayRef<int> Mask) {
2794 NewMask[0] = Mask[0];
2795 NewMask[1] = Mask[1];
2796 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2797 return Src0;
2798
2799 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2800 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2801
2802 // Shift the mask inputs to be 0/1;
2803 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2804 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2805 return Src1;
2806}
2807
2808// This is only legal with VOP3P instructions as an aid to op_sel matching.
2809bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2810 MachineInstr &MI) const {
2811 Register DstReg = MI.getOperand(0).getReg();
2812 Register Src0Reg = MI.getOperand(1).getReg();
2813 Register Src1Reg = MI.getOperand(2).getReg();
2814 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2815
2816 const LLT V2S16 = LLT::vector(2, 16);
2817 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2818 return false;
2819
2820 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2821 return false;
2822
2823 assert(ShufMask.size() == 2);
2824 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2825
2826 MachineBasicBlock *MBB = MI.getParent();
2827 const DebugLoc &DL = MI.getDebugLoc();
2828
2829 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2830 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2831 const TargetRegisterClass &RC = IsVALU ?
2832 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2833
2834 // Handle the degenerate case which should have folded out.
2835 if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2836 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2837
2838 MI.eraseFromParent();
2839 return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2840 }
2841
2842 // A legal VOP3P mask only reads one of the sources.
2843 int Mask[2];
2844 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2845
2846 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2847 !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2848 return false;
2849
2850 // TODO: This also should have been folded out
2851 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2852 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2853 .addReg(SrcVec);
2854
2855 MI.eraseFromParent();
2856 return true;
2857 }
2858
2859 if (Mask[0] == 1 && Mask[1] == -1) {
2860 if (IsVALU) {
2861 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2862 .addImm(16)
2863 .addReg(SrcVec);
2864 } else {
2865 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2866 .addReg(SrcVec)
2867 .addImm(16);
2868 }
2869 } else if (Mask[0] == -1 && Mask[1] == 0) {
2870 if (IsVALU) {
2871 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2872 .addImm(16)
2873 .addReg(SrcVec);
2874 } else {
2875 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2876 .addReg(SrcVec)
2877 .addImm(16);
2878 }
2879 } else if (Mask[0] == 0 && Mask[1] == 0) {
2880 if (IsVALU) {
2881 // Write low half of the register into the high half.
2882 MachineInstr *MovSDWA =
2883 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2884 .addImm(0) // $src0_modifiers
2885 .addReg(SrcVec) // $src0
2886 .addImm(0) // $clamp
2887 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2888 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2889 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2890 .addReg(SrcVec, RegState::Implicit);
2891 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2892 } else {
2893 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2894 .addReg(SrcVec)
2895 .addReg(SrcVec);
2896 }
2897 } else if (Mask[0] == 1 && Mask[1] == 1) {
2898 if (IsVALU) {
2899 // Write high half of the register into the low half.
2900 MachineInstr *MovSDWA =
2901 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2902 .addImm(0) // $src0_modifiers
2903 .addReg(SrcVec) // $src0
2904 .addImm(0) // $clamp
2905 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
2906 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2907 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
2908 .addReg(SrcVec, RegState::Implicit);
2909 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2910 } else {
2911 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2912 .addReg(SrcVec)
2913 .addReg(SrcVec);
2914 }
2915 } else if (Mask[0] == 1 && Mask[1] == 0) {
2916 if (IsVALU) {
2917 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg)
2918 .addReg(SrcVec)
2919 .addReg(SrcVec)
2920 .addImm(16);
2921 } else {
2922 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2923 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2924 .addReg(SrcVec)
2925 .addImm(16);
2926 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2927 .addReg(TmpReg)
2928 .addReg(SrcVec);
2929 }
2930 } else
2931    llvm_unreachable("all shuffle masks should be handled");
2932
2933 MI.eraseFromParent();
2934 return true;
2935}
2936
2937bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
2938 MachineInstr &MI) const {
2939 if (STI.hasGFX90AInsts())
2940 return selectImpl(MI, *CoverageInfo);
2941
2942 MachineBasicBlock *MBB = MI.getParent();
2943 const DebugLoc &DL = MI.getDebugLoc();
2944
2945 if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
2946 Function &F = MBB->getParent()->getFunction();
2947 DiagnosticInfoUnsupported
2948 NoFpRet(F, "return versions of fp atomics not supported",
2949 MI.getDebugLoc(), DS_Error);
2950 F.getContext().diagnose(NoFpRet);
2951 return false;
2952 }
2953
2954  // FIXME: This is only needed because tablegen requires the number of dst
2955  // operands in the match and replace patterns to be the same. Otherwise the
2956  // patterns could be exported from the SDag path.
2957 MachineOperand &VDataIn = MI.getOperand(1);
2958 MachineOperand &VIndex = MI.getOperand(3);
2959 MachineOperand &VOffset = MI.getOperand(4);
2960 MachineOperand &SOffset = MI.getOperand(5);
2961 int16_t Offset = MI.getOperand(6).getImm();
2962
2963 bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
2964 bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);
2965
2966 unsigned Opcode;
2967 if (HasVOffset) {
2968 Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
2969 : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
2970 } else {
2971 Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
2972 : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
2973 }
2974
2975 if (MRI->getType(VDataIn.getReg()).isVector()) {
2976 switch (Opcode) {
2977 case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
2978 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
2979 break;
2980 case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
2981 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
2982 break;
2983 case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
2984 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
2985 break;
2986 case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
2987 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
2988 break;
2989 }
2990 }
2991
2992 auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
2993 I.add(VDataIn);
2994
2995 if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
2996 Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
2997 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
2998 BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
2999 .addReg(VIndex.getReg())
3000 .addImm(AMDGPU::sub0)
3001 .addReg(VOffset.getReg())
3002 .addImm(AMDGPU::sub1);
3003
3004 I.addReg(IdxReg);
3005 } else if (HasVIndex) {
3006 I.add(VIndex);
3007 } else if (HasVOffset) {
3008 I.add(VOffset);
3009 }
3010
3011 I.add(MI.getOperand(2)); // rsrc
3012 I.add(SOffset);
3013 I.addImm(Offset);
3014 I.addImm(MI.getOperand(7).getImm()); // cpol
3015 I.cloneMemRefs(MI);
3016
3017 MI.eraseFromParent();
3018
3019 return true;
3020}
3021
3022bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
3023 MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const {
3024
3025 if (STI.hasGFX90AInsts()) {
3026    // gfx90a adds return versions of the global atomic fadd instructions, so
3027    // no special handling is required.
3028 return selectImpl(MI, *CoverageInfo);
3029 }
3030
3031 MachineBasicBlock *MBB = MI.getParent();
3032 const DebugLoc &DL = MI.getDebugLoc();
3033
3034 if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
3035 Function &F = MBB->getParent()->getFunction();
3036 DiagnosticInfoUnsupported
3037 NoFpRet(F, "return versions of fp atomics not supported",
3038 MI.getDebugLoc(), DS_Error);
3039 F.getContext().diagnose(NoFpRet);
3040 return false;
3041 }
3042
3043  // FIXME: This is only needed because tablegen requires the number of dst
3044  // operands in the match and replace patterns to be the same. Otherwise the
3045  // patterns could be exported from the SDag path.
3046 auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal);
3047
3048 Register Data = DataOp.getReg();
3049 const unsigned Opc = MRI->getType(Data).isVector() ?
3050 AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
3051 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3052 .addReg(Addr.first)
3053 .addReg(Data)
3054 .addImm(Addr.second)
3055 .addImm(0) // cpol
3056 .cloneMemRefs(MI);
3057
3058 MI.eraseFromParent();
3059 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3060}
3061
3062bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3063 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3064 MI.RemoveOperand(1);
3065 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3066 return true;
3067}
3068
3069bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3070 if (I.isPHI())
3071 return selectPHI(I);
3072
3073 if (!I.isPreISelOpcode()) {
3074 if (I.isCopy())
3075 return selectCOPY(I);
3076 return true;
3077 }
3078
3079 switch (I.getOpcode()) {
3080 case TargetOpcode::G_AND:
3081 case TargetOpcode::G_OR:
3082 case TargetOpcode::G_XOR:
3083 if (selectImpl(I, *CoverageInfo))
3084 return true;
3085 return selectG_AND_OR_XOR(I);
3086 case TargetOpcode::G_ADD:
3087 case TargetOpcode::G_SUB:
3088 if (selectImpl(I, *CoverageInfo))
3089 return true;
3090 return selectG_ADD_SUB(I);
3091 case TargetOpcode::G_UADDO:
3092 case TargetOpcode::G_USUBO:
3093 case TargetOpcode::G_UADDE:
3094 case TargetOpcode::G_USUBE:
3095 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3096 case TargetOpcode::G_INTTOPTR:
3097 case TargetOpcode::G_BITCAST:
3098 case TargetOpcode::G_PTRTOINT:
3099 return selectCOPY(I);
3100 case TargetOpcode::G_CONSTANT:
3101 case TargetOpcode::G_FCONSTANT:
3102 return selectG_CONSTANT(I);
3103 case TargetOpcode::G_FNEG:
3104 if (selectImpl(I, *CoverageInfo))
3105 return true;
3106 return selectG_FNEG(I);
3107 case TargetOpcode::G_FABS:
3108 if (selectImpl(I, *CoverageInfo))
3109 return true;
3110 return selectG_FABS(I);
3111 case TargetOpcode::G_EXTRACT:
3112 return selectG_EXTRACT(I);
3113 case TargetOpcode::G_MERGE_VALUES:
3114 case TargetOpcode::G_BUILD_VECTOR:
3115 case TargetOpcode::G_CONCAT_VECTORS:
3116 return selectG_MERGE_VALUES(I);
3117 case TargetOpcode::G_UNMERGE_VALUES:
3118 return selectG_UNMERGE_VALUES(I);
3119 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3120 return selectG_BUILD_VECTOR_TRUNC(I);
3121 case TargetOpcode::G_PTR_ADD:
3122 return selectG_PTR_ADD(I);
3123 case TargetOpcode::G_IMPLICIT_DEF:
3124 return selectG_IMPLICIT_DEF(I);
3125 case TargetOpcode::G_FREEZE:
3126 return selectCOPY(I);
3127 case TargetOpcode::G_INSERT:
3128 return selectG_INSERT(I);
3129 case TargetOpcode::G_INTRINSIC:
3130 return selectG_INTRINSIC(I);
3131 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3132 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3133 case TargetOpcode::G_ICMP:
3134 if (selectG_ICMP(I))
3135 return true;
3136 return selectImpl(I, *CoverageInfo);
3137 case TargetOpcode::G_LOAD:
3138 case TargetOpcode::G_STORE:
3139 case TargetOpcode::G_ATOMIC_CMPXCHG:
3140 case TargetOpcode::G_ATOMICRMW_XCHG:
3141 case TargetOpcode::G_ATOMICRMW_ADD:
3142 case TargetOpcode::G_ATOMICRMW_SUB:
3143 case TargetOpcode::G_ATOMICRMW_AND:
3144 case TargetOpcode::G_ATOMICRMW_OR:
3145 case TargetOpcode::G_ATOMICRMW_XOR:
3146 case TargetOpcode::G_ATOMICRMW_MIN:
3147 case TargetOpcode::G_ATOMICRMW_MAX:
3148 case TargetOpcode::G_ATOMICRMW_UMIN:
3149 case TargetOpcode::G_ATOMICRMW_UMAX:
3150 case TargetOpcode::G_ATOMICRMW_FADD:
3151 case AMDGPU::G_AMDGPU_ATOMIC_INC:
3152 case AMDGPU::G_AMDGPU_ATOMIC_DEC:
3153 case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3154 case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3155 return selectG_LOAD_STORE_ATOMICRMW(I);
3156 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
3157 return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
3158 case TargetOpcode::G_SELECT:
3159 return selectG_SELECT(I);
3160 case TargetOpcode::G_TRUNC:
3161 return selectG_TRUNC(I);
3162 case TargetOpcode::G_SEXT:
3163 case TargetOpcode::G_ZEXT:
3164 case TargetOpcode::G_ANYEXT:
3165 case TargetOpcode::G_SEXT_INREG:
3166 if (selectImpl(I, *CoverageInfo))
3167 return true;
3168 return selectG_SZA_EXT(I);
3169 case TargetOpcode::G_BRCOND:
3170 return selectG_BRCOND(I);
3171 case TargetOpcode::G_GLOBAL_VALUE:
3172 return selectG_GLOBAL_VALUE(I);
3173 case TargetOpcode::G_PTRMASK:
3174 return selectG_PTRMASK(I);
3175 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3176 return selectG_EXTRACT_VECTOR_ELT(I);
3177 case TargetOpcode::G_INSERT_VECTOR_ELT:
3178 return selectG_INSERT_VECTOR_ELT(I);
3179 case TargetOpcode::G_SHUFFLE_VECTOR:
3180 return selectG_SHUFFLE_VECTOR(I);
3181 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3182 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3183 const AMDGPU::ImageDimIntrinsicInfo *Intr
3184 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3185    assert(Intr && "not an image intrinsic with image pseudo");
3186 return selectImageIntrinsic(I, Intr);
3187 }
3188 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3189 return selectBVHIntrinsic(I);
3190 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3191 return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
3192 default:
3193 return selectImpl(I, *CoverageInfo);
3194 }
3195 return false;
3196}
3197
3198InstructionSelector::ComplexRendererFns
3199AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3200 return {{
3201 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3202 }};
3203
3204}
3205
3206std::pair<Register, unsigned>
3207AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3208 bool AllowAbs) const {
3209 Register Src = Root.getReg();
3210 Register OrigSrc = Src;
3211 unsigned Mods = 0;
3212 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3213
3214 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
3215 Src = MI->getOperand(1).getReg();
3216 Mods |= SISrcMods::NEG;
3217 MI = getDefIgnoringCopies(Src, *MRI);
3218 }
3219
3220 if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) {
3221 Src = MI->getOperand(1).getReg();
3222 Mods |= SISrcMods::ABS;
3223 }
3224
3225 if (Mods != 0 &&
3226 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3227 MachineInstr *UseMI = Root.getParent();
3228
3229 // If we looked through copies to find source modifiers on an SGPR operand,
3230 // we now have an SGPR register source. To avoid potentially violating the
3231 // constant bus restriction, we need to insert a copy to a VGPR.
3232 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
3233 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
3234 TII.get(AMDGPU::COPY), VGPRSrc)
3235 .addReg(Src);
3236 Src = VGPRSrc;
3237 }
3238
3239 return std::make_pair(Src, Mods);
3240}
3241
3242///
3243/// This will select either an SGPR or VGPR operand and will save us from
3244/// having to write an extra tablegen pattern.
3245InstructionSelector::ComplexRendererFns
3246AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3247 return {{
3248 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3249 }};
3250}
3251
3252InstructionSelector::ComplexRendererFns
3253AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3254 Register Src;
3255 unsigned Mods;
3256 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3257
3258 return {{
3259 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3260 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3261 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3262 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3263 }};
3264}
3265
3266InstructionSelector::ComplexRendererFns
3267AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3268 Register Src;
3269 unsigned Mods;
3270 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3271
3272 return {{
3273 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3274 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3275 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3276 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3277 }};
3278}
3279
3280InstructionSelector::ComplexRendererFns
3281AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3282 return {{
3283 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3284 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3285 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3286 }};
3287}
3288
3289InstructionSelector::ComplexRendererFns
3290AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3291 Register Src;
3292 unsigned Mods;
3293 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3294
3295 return {{
3296 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3297 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3298 }};
3299}
3300
3301InstructionSelector::ComplexRendererFns
3302AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3303 Register Src;
3304 unsigned Mods;
3305 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3306
3307 return {{
3308 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3309 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3310 }};
3311}
3312
3313InstructionSelector::ComplexRendererFns
3314AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3315 Register Reg = Root.getReg();
3316 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3317 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3318 Def->getOpcode() == AMDGPU::G_FABS))
3319 return {};
3320 return {{
3321 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3322 }};
3323}
3324
3325std::pair<Register, unsigned>
3326AMDGPUInstructionSelector::selectVOP3PModsImpl(
3327 Register Src, const MachineRegisterInfo &MRI) const {
3328 unsigned Mods = 0;
3329 MachineInstr *MI = MRI.getVRegDef(Src);
3330
3331 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3332 // It's possible to see an f32 fneg here, but unlikely.
3333 // TODO: Treat f32 fneg as only high bit.
3334 MRI.getType(Src) == LLT::vector(2, 16)) {
3335 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3336 Src = MI->getOperand(1).getReg();
3337 MI = MRI.getVRegDef(Src);
Value stored to 'MI' is never read
3338 }
3339
3340 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3341
3342 // Packed instructions do not have abs modifiers.
3343 Mods |= SISrcMods::OP_SEL_1;
3344
3345 return std::make_pair(Src, Mods);
3346}
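
The analyzer warning above points at line 3337: after the G_FNEG is peeled off, the updated def stored to 'MI' is never read again before the function returns. One possible cleanup, shown here only as a hedged sketch and assuming the reassignment is not being kept around for the op_sel TODO, is simply to drop it:

    // Sketch only, not the file's contents: the fneg-peeling block from
    // selectVOP3PModsImpl with the unread reassignment of 'MI' removed.
    if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
        MRI.getType(Src) == LLT::vector(2, 16)) {
      Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
      Src = MI->getOperand(1).getReg();
      // Re-querying the def of the new Src is only useful once the op_sel
      // matching TODO is implemented; until then the store is dead.
    }
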
3347
3348InstructionSelector::ComplexRendererFns
3349AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3350 MachineRegisterInfo &MRI
3351 = Root.getParent()->getParent()->getParent()->getRegInfo();
3352
3353 Register Src;
3354 unsigned Mods;
3355 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3356
3357 return {{
3358 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3359 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3360 }};
3361}
3362
3363InstructionSelector::ComplexRendererFns
3364AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3365 Register Src;
3366 unsigned Mods;
3367 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3368 if (!isKnownNeverNaN(Src, *MRI))
3369 return None;
3370
3371 return {{
3372 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3373 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3374 }};
3375}
3376
3377InstructionSelector::ComplexRendererFns
3378AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3379 // FIXME: Handle op_sel
3380 return {{
3381 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3382 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3383 }};
3384}
3385
3386InstructionSelector::ComplexRendererFns
3387AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3388 SmallVector<GEPInfo, 4> AddrInfo;
3389 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3390
3391 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3392 return None;
3393
3394 const GEPInfo &GEPInfo = AddrInfo[0];
3395 Optional<int64_t> EncodedImm =
3396 AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3397 if (!EncodedImm)
3398 return None;
3399
3400 unsigned PtrReg = GEPInfo.SgprParts[0];
3401 return {{
3402 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3403 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3404 }};
3405}
3406
3407InstructionSelector::ComplexRendererFns
3408AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3409 SmallVector<GEPInfo, 4> AddrInfo;
3410 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3411
3412 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3413 return None;
3414
3415 const GEPInfo &GEPInfo = AddrInfo[0];
3416 Register PtrReg = GEPInfo.SgprParts[0];
3417 Optional<int64_t> EncodedImm =
3418 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3419 if (!EncodedImm)
3420 return None;
3421
3422 return {{
3423 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3424 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3425 }};
3426}
3427
3428InstructionSelector::ComplexRendererFns
3429AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3430 MachineInstr *MI = Root.getParent();
3431 MachineBasicBlock *MBB = MI->getParent();
3432
3433 SmallVector<GEPInfo, 4> AddrInfo;
3434 getAddrModeInfo(*MI, *MRI, AddrInfo);
3435
3436  // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits,
3437  // so that we can select all ptr + 32-bit offsets, not just immediate offsets.
3438 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3439 return None;
3440
3441 const GEPInfo &GEPInfo = AddrInfo[0];
3442 // SGPR offset is unsigned.
3443 if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3444 return None;
3445
3446  // If we make it this far we have a load with a 32-bit immediate offset.
3447  // It is OK to select this using an sgpr offset, because we have already
3448  // failed trying to select this load into one of the _IMM variants since
3449  // the _IMM patterns are considered before the _SGPR patterns.
3450 Register PtrReg = GEPInfo.SgprParts[0];
3451 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3452 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3453 .addImm(GEPInfo.Imm);
3454 return {{
3455 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3456 [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3457 }};
3458}
3459
3460std::pair<Register, int>
3461AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
3462 uint64_t FlatVariant) const {
3463 MachineInstr *MI = Root.getParent();
3464
3465 auto Default = std::make_pair(Root.getReg(), 0);
3466
3467 if (!STI.hasFlatInstOffsets())
3468 return Default;
3469
3470 Register PtrBase;
3471 int64_t ConstOffset;
3472 std::tie(PtrBase, ConstOffset) =
3473 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3474 if (ConstOffset == 0)
3475 return Default;
3476
3477 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3478 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
3479 return Default;
3480
3481 return std::make_pair(PtrBase, ConstOffset);
3482}
3483
3484InstructionSelector::ComplexRendererFns
3485AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3486 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
3487
3488 return {{
3489 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3490 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3491 }};
3492}
3493
3494InstructionSelector::ComplexRendererFns
3495AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
3496 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
3497
3498 return {{
3499 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3500 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3501 }};
3502}
3503
3504InstructionSelector::ComplexRendererFns
3505AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
3506 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
3507
3508 return {{
3509 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3510 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3511 }};
3512}
3513
3514/// Match a zero extend from a 32-bit value to 64 bits.
3515static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3516 Register ZExtSrc;
3517 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3518 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3519
3520 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3521 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3522 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3523    return Register();
3524
3525 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3526 return Def->getOperand(1).getReg();
3527 }
3528
3529 return Register();
3530}
3531
3532// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
3533InstructionSelector::ComplexRendererFns
3534AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3535 Register Addr = Root.getReg();
3536 Register PtrBase;
3537 int64_t ConstOffset;
3538 int64_t ImmOffset = 0;
3539
3540 // Match the immediate offset first, which canonically is moved as low as
3541 // possible.
3542 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3543
3544 if (ConstOffset != 0) {
3545 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
3546 SIInstrFlags::FlatGlobal)) {
3547 Addr = PtrBase;
3548 ImmOffset = ConstOffset;
3549 } else {
3550 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
3551 if (!PtrBaseDef)
3552 return None;
3553
3554 if (isSGPR(PtrBaseDef->Reg)) {
3555 if (ConstOffset > 0) {
3556 // Offset is too large.
3557 //
3558 // saddr + large_offset -> saddr +
3559 // (voffset = large_offset & ~MaxOffset) +
3560 // (large_offset & MaxOffset);
3561 int64_t SplitImmOffset, RemainderOffset;
3562 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
3563 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
3564
3565 if (isUInt<32>(RemainderOffset)) {
3566 MachineInstr *MI = Root.getParent();
3567 MachineBasicBlock *MBB = MI->getParent();
3568 Register HighBits =
3569 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3570
3571 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3572 HighBits)
3573 .addImm(RemainderOffset);
3574
3575 return {{
3576 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
3577 [=](MachineInstrBuilder &MIB) {
3578 MIB.addReg(HighBits);
3579 }, // voffset
3580 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
3581 }};
3582 }
3583 }
3584
3585        // We are adding a 64-bit SGPR and a constant. If the constant bus limit
3586        // is 1, we would need to perform 1 or 2 extra moves for each half of
3587        // the constant, and it is better to do a scalar add and then issue a
3588        // single VALU instruction to materialize zero. Otherwise it takes fewer
3589        // instructions to perform VALU adds with immediates or inline literals.
3590 unsigned NumLiterals =
3591 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
3592 !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
3593 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
3594 return None;
3595 }
3596 }
3597 }
3598
3599 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3600 if (!AddrDef)
3601 return None;
3602
3603 // Match the variable offset.
3604 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3605 // Look through the SGPR->VGPR copy.
3606 Register SAddr =
3607 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3608
3609 if (SAddr && isSGPR(SAddr)) {
3610 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3611
3612 // It's possible voffset is an SGPR here, but the copy to VGPR will be
3613 // inserted later.
3614 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3615 return {{[=](MachineInstrBuilder &MIB) { // saddr
3616 MIB.addReg(SAddr);
3617 },
3618 [=](MachineInstrBuilder &MIB) { // voffset
3619 MIB.addReg(VOffset);
3620 },
3621 [=](MachineInstrBuilder &MIB) { // offset
3622 MIB.addImm(ImmOffset);
3623 }}};
3624 }
3625 }
3626 }
3627
3628 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
3629 // drop this.
3630 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
3631 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
3632 return None;
3633
3634 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
3635 // moves required to copy a 64-bit SGPR to VGPR.
3636 MachineInstr *MI = Root.getParent();
3637 MachineBasicBlock *MBB = MI->getParent();
3638 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3639
3640 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3641 .addImm(0);
3642
3643 return {{
3644 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
3645 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
3646 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3647 }};
3648}
3649
3650InstructionSelector::ComplexRendererFns
3651AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
3652 Register Addr = Root.getReg();
3653 Register PtrBase;
3654 int64_t ConstOffset;
3655 int64_t ImmOffset = 0;
3656
3657 // Match the immediate offset first, which canonically is moved as low as
3658 // possible.
3659 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3660
3661 if (ConstOffset != 0 &&
3662 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
3663 SIInstrFlags::FlatScratch)) {
3664 Addr = PtrBase;
3665 ImmOffset = ConstOffset;
3666 }
3667
3668 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3669 if (!AddrDef)
3670 return None;
3671
3672 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3673 int FI = AddrDef->MI->getOperand(1).getIndex();
3674 return {{
3675 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
3676 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3677 }};
3678 }
3679
3680 Register SAddr = AddrDef->Reg;
3681
3682 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3683 Register LHS = AddrDef->MI->getOperand(1).getReg();
3684 Register RHS = AddrDef->MI->getOperand(2).getReg();
3685 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
3686 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
3687
3688 if (LHSDef && RHSDef &&
3689 LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
3690 isSGPR(RHSDef->Reg)) {
3691 int FI = LHSDef->MI->getOperand(1).getIndex();
3692 MachineInstr &I = *Root.getParent();
3693 MachineBasicBlock *BB = I.getParent();
3694 const DebugLoc &DL = I.getDebugLoc();
3695 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3696
3697 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
3698 .addFrameIndex(FI)
3699 .addReg(RHSDef->Reg);
3700 }
3701 }
3702
3703 if (!isSGPR(SAddr))
3704 return None;
3705
3706 return {{
3707 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
3708 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3709 }};
3710}
3711
3712InstructionSelector::ComplexRendererFns
3713AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3714 MachineInstr *MI = Root.getParent();
3715 MachineBasicBlock *MBB = MI->getParent();
3716 MachineFunction *MF = MBB->getParent();
3717 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3718
3719 int64_t Offset = 0;
3720 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3721 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3722 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3723
3724 // TODO: Should this be inside the render function? The iterator seems to
3725 // move.
3726 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3727 HighBits)
3728 .addImm(Offset & ~4095);
3729
3730 return {{[=](MachineInstrBuilder &MIB) { // rsrc
3731 MIB.addReg(Info->getScratchRSrcReg());
3732 },
3733 [=](MachineInstrBuilder &MIB) { // vaddr
3734 MIB.addReg(HighBits);
3735 },
3736 [=](MachineInstrBuilder &MIB) { // soffset
3737 // Use constant zero for soffset and rely on eliminateFrameIndex
3738 // to choose the appropriate frame register if need be.
3739 MIB.addImm(0);
3740 },
3741 [=](MachineInstrBuilder &MIB) { // offset
3742 MIB.addImm(Offset & 4095);
3743 }}};
3744 }
3745
3746  assert(Offset == 0 || Offset == -1);
3747
3748 // Try to fold a frame index directly into the MUBUF vaddr field, and any
3749 // offsets.
3750 Optional<int> FI;
3751 Register VAddr = Root.getReg();
3752 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3753 Register PtrBase;
3754 int64_t ConstOffset;
3755 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
3756 if (ConstOffset != 0) {
3757 if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
3758 (!STI.privateMemoryResourceIsRangeChecked() ||
3759 KnownBits->signBitIsZero(PtrBase))) {
3760 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
3761 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3762 FI = PtrBaseDef->getOperand(1).getIndex();
3763 else
3764 VAddr = PtrBase;
3765 Offset = ConstOffset;
3766 }
3767 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3768 FI = RootDef->getOperand(1).getIndex();
3769 }
3770 }
3771
3772 return {{[=](MachineInstrBuilder &MIB) { // rsrc
3773 MIB.addReg(Info->getScratchRSrcReg());
3774 },
3775 [=](MachineInstrBuilder &MIB) { // vaddr
3776 if (FI.hasValue())
3777 MIB.addFrameIndex(FI.getValue());
3778 else
3779 MIB.addReg(VAddr);
3780 },
3781 [=](MachineInstrBuilder &MIB) { // soffset
3782 // Use constant zero for soffset and rely on eliminateFrameIndex
3783 // to choose the appropriate frame register if need be.
3784 MIB.addImm(0);
3785 },
3786 [=](MachineInstrBuilder &MIB) { // offset
3787 MIB.addImm(Offset);
3788 }}};
3789}
3790
3791bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3792 int64_t Offset) const {
3793 if (!isUInt<16>(Offset))
3794 return false;
3795
3796 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3797 return true;
3798
3799  // On Southern Islands, instructions with a negative base value and an offset
3800  // don't seem to work.
3801 return KnownBits->signBitIsZero(Base);
3802}
3803
3804bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
3805 int64_t Offset1,
3806 unsigned Size) const {
3807 if (Offset0 % Size != 0 || Offset1 % Size != 0)
3808 return false;
3809 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
3810 return false;
3811
3812 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3813 return true;
3814
3815  // On Southern Islands, instructions with a negative base value and an offset
3816  // don't seem to work.
3817 return KnownBits->signBitIsZero(Base);
3818}
3819
3820InstructionSelector::ComplexRendererFns
3821AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3822 MachineOperand &Root) const {
3823 MachineInstr *MI = Root.getParent();
3824 MachineBasicBlock *MBB = MI->getParent();
3825
3826 int64_t Offset = 0;
3827 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3828 !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3829 return {};
3830
3831 const MachineFunction *MF = MBB->getParent();
3832 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3833
3834 return {{
3835 [=](MachineInstrBuilder &MIB) { // rsrc
3836 MIB.addReg(Info->getScratchRSrcReg());
3837 },
3838 [=](MachineInstrBuilder &MIB) { // soffset
3839 MIB.addImm(0);
3840 },
3841 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3842 }};
3843}
3844
3845std::pair<Register, unsigned>
3846AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3847 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3848 if (!RootDef)
3849 return std::make_pair(Root.getReg(), 0);
3850
3851 int64_t ConstAddr = 0;
3852
3853 Register PtrBase;
3854 int64_t Offset;
3855 std::tie(PtrBase, Offset) =
3856 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3857
3858 if (Offset) {
3859 if (isDSOffsetLegal(PtrBase, Offset)) {
3860 // (add n0, c0)
3861 return std::make_pair(PtrBase, Offset);
3862 }
3863 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3864 // TODO
3865
3866
3867 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3868 // TODO
3869
3870 }
3871
3872 return std::make_pair(Root.getReg(), 0);
3873}
3874
3875InstructionSelector::ComplexRendererFns
3876AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3877 Register Reg;
3878 unsigned Offset;
3879 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3880 return {{
3881 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3882 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3883 }};
3884}
3885
3886InstructionSelector::ComplexRendererFns
3887AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3888 return selectDSReadWrite2(Root, 4);
3889}
3890
3891InstructionSelector::ComplexRendererFns
3892AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
3893 return selectDSReadWrite2(Root, 8);
3894}
3895
3896InstructionSelector::ComplexRendererFns
3897AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
3898 unsigned Size) const {
3899 Register Reg;
3900 unsigned Offset;
3901 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
3902 return {{
3903 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3904 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3905 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3906 }};
3907}
3908
3909std::pair<Register, unsigned>
3910AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
3911 unsigned Size) const {
3912 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3913 if (!RootDef)
3914 return std::make_pair(Root.getReg(), 0);
3915
3916 int64_t ConstAddr = 0;
3917
3918 Register PtrBase;
3919 int64_t Offset;
3920 std::tie(PtrBase, Offset) =
3921 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3922
3923 if (Offset) {
3924 int64_t OffsetValue0 = Offset;
3925 int64_t OffsetValue1 = Offset + Size;
3926 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
3927 // (add n0, c0)
3928 return std::make_pair(PtrBase, OffsetValue0 / Size);
3929 }
3930 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3931 // TODO
3932
3933 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3934 // TODO
3935
3936 }
3937
3938 return std::make_pair(Root.getReg(), 0);
3939}
3940
3941/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3942/// the base value with the constant offset. There may be intervening copies
3943/// between \p Root and the identified constant. Returns \p Root, 0 if this does
3944/// not match the pattern.
3945std::pair<Register, int64_t>
3946AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3947 Register Root, const MachineRegisterInfo &MRI) const {
3948 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
3949 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3950 return {Root, 0};
3951
3952 MachineOperand &RHS = RootI->getOperand(2);
3953 Optional<ValueAndVReg> MaybeOffset
3954 = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3955 if (!MaybeOffset)
3956 return {Root, 0};
3957 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
3958}
3959
3960static void addZeroImm(MachineInstrBuilder &MIB) {
3961 MIB.addImm(0);
3962}
3963
3964/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3965/// BasePtr is not valid, a null base pointer will be used.
3966static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3967 uint32_t FormatLo, uint32_t FormatHi,
3968 Register BasePtr) {
3969 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3970 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3971 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3972 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3973
3974 B.buildInstr(AMDGPU::S_MOV_B32)
3975 .addDef(RSrc2)
3976 .addImm(FormatLo);
3977 B.buildInstr(AMDGPU::S_MOV_B32)
3978 .addDef(RSrc3)
3979 .addImm(FormatHi);
3980
3981  // Build the half of the register holding the constants before building the
3982  // full 128-bit register. If we are building multiple resource descriptors,
3983  // this will allow CSEing of the 2-component register.
3984 B.buildInstr(AMDGPU::REG_SEQUENCE)
3985 .addDef(RSrcHi)
3986 .addReg(RSrc2)
3987 .addImm(AMDGPU::sub0)
3988 .addReg(RSrc3)
3989 .addImm(AMDGPU::sub1);
3990
3991 Register RSrcLo = BasePtr;
3992 if (!BasePtr) {
3993 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3994 B.buildInstr(AMDGPU::S_MOV_B64)
3995 .addDef(RSrcLo)
3996 .addImm(0);
3997 }
3998
3999 B.buildInstr(AMDGPU::REG_SEQUENCE)
4000 .addDef(RSrc)
4001 .addReg(RSrcLo)
4002 .addImm(AMDGPU::sub0_sub1)
4003 .addReg(RSrcHi)
4004 .addImm(AMDGPU::sub2_sub3);
4005
4006 return RSrc;
4007}
4008
4009static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4010 const SIInstrInfo &TII, Register BasePtr) {
4011 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4012
4013 // FIXME: Why are half the "default" bits ignored based on the addressing
4014 // mode?
4015 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
4016}
4017
4018static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4019 const SIInstrInfo &TII, Register BasePtr) {
4020 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4021
4022 // FIXME: Why are half the "default" bits ignored based on the addressing
4023 // mode?
4024 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
4025}
4026
4027AMDGPUInstructionSelector::MUBUFAddressData
4028AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
4029 MUBUFAddressData Data;
4030 Data.N0 = Src;
4031
4032 Register PtrBase;
4033 int64_t Offset;
4034
4035 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
4036 if (isUInt<32>(Offset)) {
4037 Data.N0 = PtrBase;
4038 Data.Offset = Offset;
4039 }
4040
4041 if (MachineInstr *InputAdd
4042 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
4043 Data.N2 = InputAdd->getOperand(1).getReg();
4044 Data.N3 = InputAdd->getOperand(2).getReg();
4045
4046    // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted.
4047    // FIXME: Don't know that this was defined by operand 0.
4048 //
4049 // TODO: Remove this when we have copy folding optimizations after
4050 // RegBankSelect.
4051 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
4052 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
4053 }
4054
4055 return Data;
4056}
4057
4058/// Return whether the addr64 mubuf mode should be used for the given address.
4059bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
4060 // (ptr_add N2, N3) -> addr64, or
4061 // (ptr_add (ptr_add N2, N3), C1) -> addr64
4062 if (Addr.N2)
4063 return true;
4064
4065 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
4066 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
4067}
4068
4069/// Split an immediate offset \p ImmOffset depending on whether it fits in the
4070/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
4071/// component.
4072void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
4073 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
4074 if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
4075 return;
4076
4077 // Illegal offset, store it in soffset.
4078 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4079 B.buildInstr(AMDGPU::S_MOV_B32)
4080 .addDef(SOffset)
4081 .addImm(ImmOffset);
4082 ImmOffset = 0;
4083}
4084
4085bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
4086 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
4087 Register &SOffset, int64_t &Offset) const {
4088 // FIXME: Predicates should stop this from reaching here.
4089 // addr64 bit was removed for volcanic islands.
4090 if (!STI.hasAddr64() || STI.useFlatForGlobal())
4091 return false;
4092
4093 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4094 if (!shouldUseAddr64(AddrData))
4095 return false;
4096
4097 Register N0 = AddrData.N0;
4098 Register N2 = AddrData.N2;
4099 Register N3 = AddrData.N3;
4100 Offset = AddrData.Offset;
4101
4102 // Base pointer for the SRD.
4103 Register SRDPtr;
4104
4105 if (N2) {
4106 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4107      assert(N3);
4108 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4109 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
4110 // addr64, and construct the default resource from a 0 address.
4111 VAddr = N0;
4112 } else {
4113 SRDPtr = N3;
4114 VAddr = N2;
4115 }
4116 } else {
4117 // N2 is not divergent.
4118 SRDPtr = N2;
4119 VAddr = N3;
4120 }
4121 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4122 // Use the default null pointer in the resource
4123 VAddr = N0;
4124 } else {
4125 // N0 -> offset, or
4126 // (N0 + C1) -> offset
4127 SRDPtr = N0;
4128 }
4129
4130 MachineIRBuilder B(*Root.getParent());
4131 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
4132 splitIllegalMUBUFOffset(B, SOffset, Offset);
4133 return true;
4134}
4135
4136bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
4137 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
4138 int64_t &Offset) const {
4139
4140 // FIXME: Pattern should not reach here.
4141 if (STI.useFlatForGlobal())
4142 return false;
4143
4144 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4145 if (shouldUseAddr64(AddrData))
4146 return false;
4147
4148 // N0 -> offset, or
4149 // (N0 + C1) -> offset
4150 Register SRDPtr = AddrData.N0;
4151 Offset = AddrData.Offset;
4152
4153 // TODO: Look through extensions for 32-bit soffset.
4154 MachineIRBuilder B(*Root.getParent());
4155
4156 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
4157 splitIllegalMUBUFOffset(B, SOffset, Offset);
4158 return true;
4159}
4160
4161InstructionSelector::ComplexRendererFns
4162AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
4163 Register VAddr;
4164 Register RSrcReg;
4165 Register SOffset;
4166 int64_t Offset = 0;
4167
4168 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4169 return {};
4170
4171 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4172 // pattern.
4173 return {{
4174 [=](MachineInstrBuilder &MIB) { // rsrc
4175 MIB.addReg(RSrcReg);
4176 },
4177 [=](MachineInstrBuilder &MIB) { // vaddr
4178 MIB.addReg(VAddr);
4179 },
4180 [=](MachineInstrBuilder &MIB) { // soffset
4181 if (SOffset)
4182 MIB.addReg(SOffset);
4183 else
4184 MIB.addImm(0);
4185 },
4186 [=](MachineInstrBuilder &MIB) { // offset
4187 MIB.addImm(Offset);
4188 },
4189 addZeroImm, // cpol
4190 addZeroImm, // tfe
4191 addZeroImm // swz
4192 }};
4193}
4194
4195InstructionSelector::ComplexRendererFns
4196AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
4197 Register RSrcReg;
4198 Register SOffset;
4199 int64_t Offset = 0;
4200
4201 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4202 return {};
4203
4204 return {{
4205 [=](MachineInstrBuilder &MIB) { // rsrc
4206 MIB.addReg(RSrcReg);
4207 },
4208 [=](MachineInstrBuilder &MIB) { // soffset
4209 if (SOffset)
4210 MIB.addReg(SOffset);
4211 else
4212 MIB.addImm(0);
4213 },
4214 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4215 addZeroImm, // cpol
4216 addZeroImm, // tfe
4217 addZeroImm, // swz
4218 }};
4219}
4220
4221InstructionSelector::ComplexRendererFns
4222AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
4223 Register VAddr;
4224 Register RSrcReg;
4225 Register SOffset;
4226 int64_t Offset = 0;
4227
4228 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4229 return {};
4230
4231 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4232 // pattern.
4233 return {{
4234 [=](MachineInstrBuilder &MIB) { // rsrc
4235 MIB.addReg(RSrcReg);
4236 },
4237 [=](MachineInstrBuilder &MIB) { // vaddr
4238 MIB.addReg(VAddr);
4239 },
4240 [=](MachineInstrBuilder &MIB) { // soffset
4241 if (SOffset)
4242 MIB.addReg(SOffset);
4243 else
4244 MIB.addImm(0);
4245 },
4246 [=](MachineInstrBuilder &MIB) { // offset
4247 MIB.addImm(Offset);
4248 },
4249 [=](MachineInstrBuilder &MIB) {
4250 MIB.addImm(AMDGPU::CPol::GLC); // cpol
4251 }
4252 }};
4253}
4254
4255InstructionSelector::ComplexRendererFns
4256AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
4257 Register RSrcReg;
4258 Register SOffset;
4259 int64_t Offset = 0;
4260
4261 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4262 return {};
4263
4264 return {{
4265 [=](MachineInstrBuilder &MIB) { // rsrc
4266 MIB.addReg(RSrcReg);
4267 },
4268 [=](MachineInstrBuilder &MIB) { // soffset
4269 if (SOffset)
4270 MIB.addReg(SOffset);
4271 else
4272 MIB.addImm(0);
4273 },
4274 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4275 [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
4276 }};
4277}
4278
4279/// Get an immediate that must be 32 bits and is treated as zero extended.
4280static Optional<uint64_t> getConstantZext32Val(Register Reg,
4281 const MachineRegisterInfo &MRI) {
4282 // getConstantVRegVal sexts any values, so see if that matters.
4283 Optional<int64_t> OffsetVal = getConstantVRegSExtVal(Reg, MRI);
4284 if (!OffsetVal || !isInt<32>(*OffsetVal))
4285 return None;
4286 return Lo_32(*OffsetVal);
4287}
4288
4289InstructionSelector::ComplexRendererFns
4290AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
4291 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4292 if (!OffsetVal)
4293 return {};
4294
4295 Optional<int64_t> EncodedImm =
4296 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
4297 if (!EncodedImm)
4298 return {};
4299
4300 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4301}
4302
4303InstructionSelector::ComplexRendererFns
4304AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
4305  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
4306
4307 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4308 if (!OffsetVal)
4309 return {};
4310
4311 Optional<int64_t> EncodedImm
4312 = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
4313 if (!EncodedImm)
4314 return {};
4315
4316 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4317}
4318
4319void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
4320 const MachineInstr &MI,
4321 int OpIdx) const {
4322  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4323         "Expected G_CONSTANT");
4324 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
4325}
4326
4327void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
4328 const MachineInstr &MI,
4329 int OpIdx) const {
4330  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4331         "Expected G_CONSTANT");
4332 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
4333}
4334
4335void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
4336 const MachineInstr &MI,
4337 int OpIdx) const {
4338  assert(OpIdx == -1);
4339
4340 const MachineOperand &Op = MI.getOperand(1);
4341 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
4342 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
4343 else {
4344    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
4345 MIB.addImm(Op.getCImm()->getSExtValue());
4346 }
4347}
4348
4349void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
4350 const MachineInstr &MI,
4351 int OpIdx) const {
4352  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4353         "Expected G_CONSTANT");
4354 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
4355}
4356
4357/// This only really exists to satisfy DAG type checking machinery, so it is a
4358/// no-op here.
4359void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
4360 const MachineInstr &MI,
4361 int OpIdx) const {
4362 MIB.addImm(MI.getOperand(OpIdx).getImm());
4363}
4364
4365void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
4366 const MachineInstr &MI,
4367 int OpIdx) const {
4368  assert(OpIdx >= 0 && "expected to match an immediate operand");
4369 MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL);
4370}
4371
4372void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
4373 const MachineInstr &MI,
4374 int OpIdx) const {
4375  assert(OpIdx >= 0 && "expected to match an immediate operand");
4376 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
4377}
4378
4379void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
4380 const MachineInstr &MI,
4381 int OpIdx) const {
4382  assert(OpIdx >= 0 && "expected to match an immediate operand");
4383 MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
4384}
4385
4386void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
4387 const MachineInstr &MI,
4388 int OpIdx) const {
4389 MIB.addFrameIndex((MI.getOperand(1).getIndex()));
4390}
4391
4392bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
4393 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
4394}
4395
4396bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
4397 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
4398}
4399
4400bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
4401 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
4402}
4403
4404bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
4405 return TII.isInlineConstant(Imm);
4406}