Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Warning: line 3392, column 5
Value stored to 'MI' is never read
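This diagnostic comes from the analyzer's deadcode.DeadStores check (enabled via -analyzer-checker=deadcode in the invocation below): a value is assigned to 'MI' and then never read before being overwritten or going out of scope. The flagged statement at line 3392 lies beyond the excerpt reproduced below; the following is a minimal, hypothetical sketch of the pattern the checker reports, not the flagged source itself:

int compute();

bool example() {
  int result = compute(); // "Value stored to 'result' is never read":
  result = 0;             // the first value is overwritten without ever being used
  return result == 0;
}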

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUInstructionSelector.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/llvm/lib/Target/AMDGPU -I include -I /build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/= -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig 
-D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-01-16-232930-107970-1 -x c++ /build/llvm-toolchain-snapshot-14~++20220116100644+5f782d25a742/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
24#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
25#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26#include "llvm/IR/DiagnosticInfo.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
28
29#define DEBUG_TYPE "amdgpu-isel"
30
31using namespace llvm;
32using namespace MIPatternMatch;
33
34static cl::opt<bool> AllowRiskySelect(
35 "amdgpu-global-isel-risky-select",
36 cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
37 cl::init(false),
38 cl::ReallyHidden);
39
40#define GET_GLOBALISEL_IMPL
41#define AMDGPUSubtarget GCNSubtarget
42#include "AMDGPUGenGlobalISel.inc"
43#undef GET_GLOBALISEL_IMPL
44#undef AMDGPUSubtarget
45
46AMDGPUInstructionSelector::AMDGPUInstructionSelector(
47 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
48 const AMDGPUTargetMachine &TM)
49 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
50 STI(STI),
51 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
52#define GET_GLOBALISEL_PREDICATES_INIT
53#include "AMDGPUGenGlobalISel.inc"
54#undef GET_GLOBALISEL_PREDICATES_INIT
55#define GET_GLOBALISEL_TEMPORARIES_INIT
56#include "AMDGPUGenGlobalISel.inc"
57#undef GET_GLOBALISEL_TEMPORARIES_INIT
58{
59}
60
61const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
62
63void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
64 CodeGenCoverage &CoverageInfo,
65 ProfileSummaryInfo *PSI,
66 BlockFrequencyInfo *BFI) {
67 MRI = &MF.getRegInfo();
68 Subtarget = &MF.getSubtarget<GCNSubtarget>();
69 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
70}
71
72bool AMDGPUInstructionSelector::isVCC(Register Reg,
73 const MachineRegisterInfo &MRI) const {
74 // The verifier is oblivious to s1 being a valid value for wavesize registers.
75 if (Reg.isPhysical())
76 return false;
77
78 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
79 const TargetRegisterClass *RC =
80 RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
81 if (RC) {
82 const LLT Ty = MRI.getType(Reg);
83 return RC->hasSuperClassEq(TRI.getBoolRC()) &&
84 Ty.isValid() && Ty.getSizeInBits() == 1;
85 }
86
87 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
88 return RB->getID() == AMDGPU::VCCRegBankID;
89}
90
91bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
92 unsigned NewOpc) const {
93 MI.setDesc(TII.get(NewOpc));
94 MI.RemoveOperand(1); // Remove intrinsic ID.
95 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
96
97 MachineOperand &Dst = MI.getOperand(0);
98 MachineOperand &Src = MI.getOperand(1);
99
100 // TODO: This should be legalized to s32 if needed
101 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
102 return false;
103
104 const TargetRegisterClass *DstRC
105 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
106 const TargetRegisterClass *SrcRC
107 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
108 if (!DstRC || DstRC != SrcRC)
109 return false;
110
111 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
112 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
113}
114
115bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
116 const DebugLoc &DL = I.getDebugLoc();
117 MachineBasicBlock *BB = I.getParent();
118 I.setDesc(TII.get(TargetOpcode::COPY));
119
120 const MachineOperand &Src = I.getOperand(1);
121 MachineOperand &Dst = I.getOperand(0);
122 Register DstReg = Dst.getReg();
123 Register SrcReg = Src.getReg();
124
125 if (isVCC(DstReg, *MRI)) {
126 if (SrcReg == AMDGPU::SCC) {
127 const TargetRegisterClass *RC
128 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
129 if (!RC)
130 return true;
131 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
132 }
133
134 if (!isVCC(SrcReg, *MRI)) {
135 // TODO: Should probably leave the copy and let copyPhysReg expand it.
136 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
137 return false;
138
139 const TargetRegisterClass *SrcRC
140 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
141
142 Optional<ValueAndVReg> ConstVal =
143 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
144 if (ConstVal) {
145 unsigned MovOpc =
146 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
147 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
148 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
149 } else {
150 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
151
152 // We can't trust the high bits at this point, so clear them.
153
154 // TODO: Skip masking high bits if def is known boolean.
155
156 unsigned AndOpc =
157 TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
158 BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
159 .addImm(1)
160 .addReg(SrcReg);
161 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
162 .addImm(0)
163 .addReg(MaskedReg);
164 }
165
166 if (!MRI->getRegClassOrNull(SrcReg))
167 MRI->setRegClass(SrcReg, SrcRC);
168 I.eraseFromParent();
169 return true;
170 }
171
172 const TargetRegisterClass *RC =
173 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
174 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
175 return false;
176
177 return true;
178 }
179
180 for (const MachineOperand &MO : I.operands()) {
181 if (MO.getReg().isPhysical())
182 continue;
183
184 const TargetRegisterClass *RC =
185 TRI.getConstrainedRegClassForOperand(MO, *MRI);
186 if (!RC)
187 continue;
188 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
189 }
190 return true;
191}
192
193bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
194 const Register DefReg = I.getOperand(0).getReg();
195 const LLT DefTy = MRI->getType(DefReg);
196 if (DefTy == LLT::scalar(1)) {
197 if (!AllowRiskySelect) {
198 LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
199 return false;
200 }
201
202 LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
203 }
204
205 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
206
207 const RegClassOrRegBank &RegClassOrBank =
208 MRI->getRegClassOrRegBank(DefReg);
209
210 const TargetRegisterClass *DefRC
211 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
212 if (!DefRC) {
213 if (!DefTy.isValid()) {
214 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
215 return false;
216 }
217
218 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
219 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
220 if (!DefRC) {
221 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
222 return false;
223 }
224 }
225
226 // TODO: Verify that all registers have the same bank
227 I.setDesc(TII.get(TargetOpcode::PHI));
228 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
229}
230
231MachineOperand
232AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
233 const TargetRegisterClass &SubRC,
234 unsigned SubIdx) const {
235
236 MachineInstr *MI = MO.getParent();
237 MachineBasicBlock *BB = MO.getParent()->getParent();
238 Register DstReg = MRI->createVirtualRegister(&SubRC);
239
240 if (MO.isReg()) {
241 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
242 Register Reg = MO.getReg();
243 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
244 .addReg(Reg, 0, ComposedSubIdx);
245
246 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
247 MO.isKill(), MO.isDead(), MO.isUndef(),
248 MO.isEarlyClobber(), 0, MO.isDebug(),
249 MO.isInternalRead());
250 }
251
252 assert(MO.isImm());
253
254 APInt Imm(64, MO.getImm());
255
256 switch (SubIdx) {
257 default:
258 llvm_unreachable("do not know to split immediate with this sub index.")::llvm::llvm_unreachable_internal("do not know to split immediate with this sub index."
, "llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp", 258
)
;
259 case AMDGPU::sub0:
260 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
261 case AMDGPU::sub1:
262 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
263 }
264}
265
266static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
267 switch (Opc) {
268 case AMDGPU::G_AND:
269 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
270 case AMDGPU::G_OR:
271 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
272 case AMDGPU::G_XOR:
273 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
274 default:
275 llvm_unreachable("not a bit op")::llvm::llvm_unreachable_internal("not a bit op", "llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp"
, 275)
;
276 }
277}
278
279bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
280 Register DstReg = I.getOperand(0).getReg();
281 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
282
283 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
284 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
285 DstRB->getID() != AMDGPU::VCCRegBankID)
286 return false;
287
288 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
289 STI.isWave64());
290 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
291
292 // Dead implicit-def of scc
293 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
294 true, // isImp
295 false, // isKill
296 true)); // isDead
297 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
298}
299
300bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
301 MachineBasicBlock *BB = I.getParent();
302 MachineFunction *MF = BB->getParent();
303 Register DstReg = I.getOperand(0).getReg();
304 const DebugLoc &DL = I.getDebugLoc();
305 LLT Ty = MRI->getType(DstReg);
306 if (Ty.isVector())
307 return false;
308
309 unsigned Size = Ty.getSizeInBits();
310 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
311 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
312 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
313
314 if (Size == 32) {
315 if (IsSALU) {
316 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
317 MachineInstr *Add =
318 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
319 .add(I.getOperand(1))
320 .add(I.getOperand(2));
321 I.eraseFromParent();
322 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
323 }
324
325 if (STI.hasAddNoCarry()) {
326 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
327 I.setDesc(TII.get(Opc));
328 I.addOperand(*MF, MachineOperand::CreateImm(0));
329 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
330 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
331 }
332
333 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
334
335 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
336 MachineInstr *Add
337 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
338 .addDef(UnusedCarry, RegState::Dead)
339 .add(I.getOperand(1))
340 .add(I.getOperand(2))
341 .addImm(0);
342 I.eraseFromParent();
343 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
344 }
345
346 assert(!Sub && "illegal sub should not reach here")(static_cast <bool> (!Sub && "illegal sub should not reach here"
) ? void (0) : __assert_fail ("!Sub && \"illegal sub should not reach here\""
, "llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp", 346
, __extension__ __PRETTY_FUNCTION__))
;
347
348 const TargetRegisterClass &RC
349 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
350 const TargetRegisterClass &HalfRC
351 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
352
353 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
354 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
355 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
356 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
357
358 Register DstLo = MRI->createVirtualRegister(&HalfRC);
359 Register DstHi = MRI->createVirtualRegister(&HalfRC);
360
361 if (IsSALU) {
362 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
363 .add(Lo1)
364 .add(Lo2);
365 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
366 .add(Hi1)
367 .add(Hi2);
368 } else {
369 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
370 Register CarryReg = MRI->createVirtualRegister(CarryRC);
371 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
372 .addDef(CarryReg)
373 .add(Lo1)
374 .add(Lo2)
375 .addImm(0);
376 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
377 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
378 .add(Hi1)
379 .add(Hi2)
380 .addReg(CarryReg, RegState::Kill)
381 .addImm(0);
382
383 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
384 return false;
385 }
386
387 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
388 .addReg(DstLo)
389 .addImm(AMDGPU::sub0)
390 .addReg(DstHi)
391 .addImm(AMDGPU::sub1);
392
393
394 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
395 return false;
396
397 I.eraseFromParent();
398 return true;
399}
400
401bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
402 MachineInstr &I) const {
403 MachineBasicBlock *BB = I.getParent();
404 MachineFunction *MF = BB->getParent();
405 const DebugLoc &DL = I.getDebugLoc();
406 Register Dst0Reg = I.getOperand(0).getReg();
407 Register Dst1Reg = I.getOperand(1).getReg();
408 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
409 I.getOpcode() == AMDGPU::G_UADDE;
410 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
411 I.getOpcode() == AMDGPU::G_USUBE;
412
413 if (isVCC(Dst1Reg, *MRI)) {
414 unsigned NoCarryOpc =
415 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
416 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
417 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
418 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
419 I.addOperand(*MF, MachineOperand::CreateImm(0));
420 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
421 }
422
423 Register Src0Reg = I.getOperand(2).getReg();
424 Register Src1Reg = I.getOperand(3).getReg();
425
426 if (HasCarryIn) {
427 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
428 .addReg(I.getOperand(4).getReg());
429 }
430
431 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
432 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
433
434 BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
435 .add(I.getOperand(2))
436 .add(I.getOperand(3));
437 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
438 .addReg(AMDGPU::SCC);
439
440 if (!MRI->getRegClassOrNull(Dst1Reg))
441 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
442
443 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
444 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
445 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
446 return false;
447
448 if (HasCarryIn &&
449 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
450 AMDGPU::SReg_32RegClass, *MRI))
451 return false;
452
453 I.eraseFromParent();
454 return true;
455}
456
457// TODO: We should probably legalize these to only using 32-bit results.
458bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
459 MachineBasicBlock *BB = I.getParent();
460 Register DstReg = I.getOperand(0).getReg();
461 Register SrcReg = I.getOperand(1).getReg();
462 LLT DstTy = MRI->getType(DstReg);
463 LLT SrcTy = MRI->getType(SrcReg);
464 const unsigned SrcSize = SrcTy.getSizeInBits();
465 unsigned DstSize = DstTy.getSizeInBits();
466
467 // TODO: Should handle any multiple of 32 offset.
468 unsigned Offset = I.getOperand(2).getImm();
469 if (Offset % 32 != 0 || DstSize > 128)
470 return false;
471
472 // 16-bit operations really use 32-bit registers.
473 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
474 if (DstSize == 16)
475 DstSize = 32;
476
477 const TargetRegisterClass *DstRC =
478 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
479 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
480 return false;
481
482 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
483 const TargetRegisterClass *SrcRC =
484 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
485 if (!SrcRC)
486 return false;
487 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
488 DstSize / 32);
489 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
490 if (!SrcRC)
491 return false;
492
493 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
494 *SrcRC, I.getOperand(1));
495 const DebugLoc &DL = I.getDebugLoc();
496 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
497 .addReg(SrcReg, 0, SubReg);
498
499 I.eraseFromParent();
500 return true;
501}
502
503bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
504 MachineBasicBlock *BB = MI.getParent();
505 Register DstReg = MI.getOperand(0).getReg();
506 LLT DstTy = MRI->getType(DstReg);
507 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
508
509 const unsigned SrcSize = SrcTy.getSizeInBits();
510 if (SrcSize < 32)
511 return selectImpl(MI, *CoverageInfo);
512
513 const DebugLoc &DL = MI.getDebugLoc();
514 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
515 const unsigned DstSize = DstTy.getSizeInBits();
516 const TargetRegisterClass *DstRC =
517 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
518 if (!DstRC)
519 return false;
520
521 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
522 MachineInstrBuilder MIB =
523 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
524 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
525 MachineOperand &Src = MI.getOperand(I + 1);
526 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
527 MIB.addImm(SubRegs[I]);
528
529 const TargetRegisterClass *SrcRC
530 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
531 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
532 return false;
533 }
534
535 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
536 return false;
537
538 MI.eraseFromParent();
539 return true;
540}
541
542bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
543 MachineBasicBlock *BB = MI.getParent();
544 const int NumDst = MI.getNumOperands() - 1;
545
546 MachineOperand &Src = MI.getOperand(NumDst);
547
548 Register SrcReg = Src.getReg();
549 Register DstReg0 = MI.getOperand(0).getReg();
550 LLT DstTy = MRI->getType(DstReg0);
551 LLT SrcTy = MRI->getType(SrcReg);
552
553 const unsigned DstSize = DstTy.getSizeInBits();
554 const unsigned SrcSize = SrcTy.getSizeInBits();
555 const DebugLoc &DL = MI.getDebugLoc();
556 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
557
558 const TargetRegisterClass *SrcRC =
559 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
560 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
561 return false;
562
563 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
564 // source, and this relies on the fact that the same subregister indices are
565 // used for both.
566 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
567 for (int I = 0, E = NumDst; I != E; ++I) {
568 MachineOperand &Dst = MI.getOperand(I);
569 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
570 .addReg(SrcReg, 0, SubRegs[I]);
571
572 // Make sure the subregister index is valid for the source register.
573 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
574 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
575 return false;
576
577 const TargetRegisterClass *DstRC =
578 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
579 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
580 return false;
581 }
582
583 MI.eraseFromParent();
584 return true;
585}
586
587bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
588 MachineInstr &MI) const {
589 if (selectImpl(MI, *CoverageInfo))
590 return true;
591
592 const LLT S32 = LLT::scalar(32);
593 const LLT V2S16 = LLT::fixed_vector(2, 16);
594
595 Register Dst = MI.getOperand(0).getReg();
596 if (MRI->getType(Dst) != V2S16)
597 return false;
598
599 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
600 if (DstBank->getID() != AMDGPU::SGPRRegBankID)
601 return false;
602
603 Register Src0 = MI.getOperand(1).getReg();
604 Register Src1 = MI.getOperand(2).getReg();
605 if (MRI->getType(Src0) != S32)
606 return false;
607
608 const DebugLoc &DL = MI.getDebugLoc();
609 MachineBasicBlock *BB = MI.getParent();
610
611 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
612 if (ConstSrc1) {
613 auto ConstSrc0 =
614 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
615 if (ConstSrc0) {
616 const int64_t K0 = ConstSrc0->Value.getSExtValue();
617 const int64_t K1 = ConstSrc1->Value.getSExtValue();
618 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
619 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
620
621 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
622 .addImm(Lo16 | (Hi16 << 16));
623 MI.eraseFromParent();
624 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
625 }
626 }
627
628 // TODO: This should probably be a combine somewhere
629 // (build_vector_trunc $src0, undef -> copy $src0
630 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
631 if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
632 MI.setDesc(TII.get(AMDGPU::COPY));
633 MI.RemoveOperand(2);
634 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
635 RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
636 }
637
638 Register ShiftSrc0;
639 Register ShiftSrc1;
640
641 // With multiple uses of the shift, this will duplicate the shift and
642 // increase register pressure.
643 //
644 // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
645 // => (S_PACK_HH_B32_B16 $src0, $src1)
646 // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
647 // => (S_PACK_LH_B32_B16 $src0, $src1)
648 // (build_vector_trunc $src0, $src1)
649 // => (S_PACK_LL_B32_B16 $src0, $src1)
650
651 bool Shift0 = mi_match(
652 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
653
654 bool Shift1 = mi_match(
655 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
656
657 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
658 if (Shift0 && Shift1) {
659 Opc = AMDGPU::S_PACK_HH_B32_B16;
660 MI.getOperand(1).setReg(ShiftSrc0);
661 MI.getOperand(2).setReg(ShiftSrc1);
662 } else if (Shift1) {
663 Opc = AMDGPU::S_PACK_LH_B32_B16;
664 MI.getOperand(2).setReg(ShiftSrc1);
665 } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
666 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
667 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
668 .addReg(ShiftSrc0)
669 .addImm(16);
670
671 MI.eraseFromParent();
672 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
673 }
674
675 MI.setDesc(TII.get(Opc));
676 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
677}
678
679bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
680 return selectG_ADD_SUB(I);
681}
682
683bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
684 const MachineOperand &MO = I.getOperand(0);
685
686 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
687 // regbank check here is to know why getConstrainedRegClassForOperand failed.
688 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
689 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
690 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
691 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
692 return true;
693 }
694
695 return false;
696}
697
698bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
699 MachineBasicBlock *BB = I.getParent();
700
701 Register DstReg = I.getOperand(0).getReg();
702 Register Src0Reg = I.getOperand(1).getReg();
703 Register Src1Reg = I.getOperand(2).getReg();
704 LLT Src1Ty = MRI->getType(Src1Reg);
705
706 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
707 unsigned InsSize = Src1Ty.getSizeInBits();
708
709 int64_t Offset = I.getOperand(3).getImm();
710
711 // FIXME: These cases should have been illegal and unnecessary to check here.
712 if (Offset % 32 != 0 || InsSize % 32 != 0)
713 return false;
714
715 // Currently not handled by getSubRegFromChannel.
716 if (InsSize > 128)
717 return false;
718
719 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
720 if (SubReg == AMDGPU::NoSubRegister)
721 return false;
722
723 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
724 const TargetRegisterClass *DstRC =
725 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
726 if (!DstRC)
727 return false;
728
729 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
730 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
731 const TargetRegisterClass *Src0RC =
732 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
733 const TargetRegisterClass *Src1RC =
734 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
735
736 // Deal with weird cases where the class only partially supports the subreg
737 // index.
738 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
739 if (!Src0RC || !Src1RC)
740 return false;
741
742 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
743 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
744 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
745 return false;
746
747 const DebugLoc &DL = I.getDebugLoc();
748 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
749 .addReg(Src0Reg)
750 .addReg(Src1Reg)
751 .addImm(SubReg);
752
753 I.eraseFromParent();
754 return true;
755}
756
757bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
758 Register DstReg = MI.getOperand(0).getReg();
759 Register SrcReg = MI.getOperand(1).getReg();
760 Register OffsetReg = MI.getOperand(2).getReg();
761 Register WidthReg = MI.getOperand(3).getReg();
762
763 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
764        "scalar BFX instructions are expanded in regbankselect");
765 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
766        "64-bit vector BFX instructions are expanded in regbankselect");
767
768 const DebugLoc &DL = MI.getDebugLoc();
769 MachineBasicBlock *MBB = MI.getParent();
770
771 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
772 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
773 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
774 .addReg(SrcReg)
775 .addReg(OffsetReg)
776 .addReg(WidthReg);
777 MI.eraseFromParent();
778 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
779}
780
781bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
782 if (STI.getLDSBankCount() != 16)
783 return selectImpl(MI, *CoverageInfo);
784
785 Register Dst = MI.getOperand(0).getReg();
786 Register Src0 = MI.getOperand(2).getReg();
787 Register M0Val = MI.getOperand(6).getReg();
788 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
789 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
790 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
791 return false;
792
793 // This requires 2 instructions. It is possible to write a pattern to support
794 // this, but the generated isel emitter doesn't correctly deal with multiple
795 // output instructions using the same physical register input. The copy to m0
796 // is incorrectly placed before the second instruction.
797 //
798 // TODO: Match source modifiers.
799
800 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
801 const DebugLoc &DL = MI.getDebugLoc();
802 MachineBasicBlock *MBB = MI.getParent();
803
804 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
805 .addReg(M0Val);
806 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
807 .addImm(2)
808 .addImm(MI.getOperand(4).getImm()) // $attr
809 .addImm(MI.getOperand(3).getImm()); // $attrchan
810
811 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
812 .addImm(0) // $src0_modifiers
813 .addReg(Src0) // $src0
814 .addImm(MI.getOperand(4).getImm()) // $attr
815 .addImm(MI.getOperand(3).getImm()) // $attrchan
816 .addImm(0) // $src2_modifiers
817 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
818 .addImm(MI.getOperand(5).getImm()) // $high
819 .addImm(0) // $clamp
820 .addImm(0); // $omod
821
822 MI.eraseFromParent();
823 return true;
824}
825
826// Writelane is special in that it can use SGPR and M0 (which would normally
827// count as using the constant bus twice - but in this case it is allowed since
828// the lane selector doesn't count as a use of the constant bus). However, it is
829// still required to abide by the 1 SGPR rule. Fix this up if we might have
830// multiple SGPRs.
831bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
832 // With a constant bus limit of at least 2, there's no issue.
833 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
834 return selectImpl(MI, *CoverageInfo);
835
836 MachineBasicBlock *MBB = MI.getParent();
837 const DebugLoc &DL = MI.getDebugLoc();
838 Register VDst = MI.getOperand(0).getReg();
839 Register Val = MI.getOperand(2).getReg();
840 Register LaneSelect = MI.getOperand(3).getReg();
841 Register VDstIn = MI.getOperand(4).getReg();
842
843 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
844
845 Optional<ValueAndVReg> ConstSelect =
846 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
847 if (ConstSelect) {
848 // The selector has to be an inline immediate, so we can use whatever for
849 // the other operands.
850 MIB.addReg(Val);
851 MIB.addImm(ConstSelect->Value.getSExtValue() &
852 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
853 } else {
854 Optional<ValueAndVReg> ConstVal =
855 getIConstantVRegValWithLookThrough(Val, *MRI);
856
857 // If the value written is an inline immediate, we can get away without a
858 // copy to m0.
859 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
860 STI.hasInv2PiInlineImm())) {
861 MIB.addImm(ConstVal->Value.getSExtValue());
862 MIB.addReg(LaneSelect);
863 } else {
864 MIB.addReg(Val);
865
866 // If the lane selector was originally in a VGPR and copied with
867 // readfirstlane, there's a hazard to read the same SGPR from the
868 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
869 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
870
871 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
872 .addReg(LaneSelect);
873 MIB.addReg(AMDGPU::M0);
874 }
875 }
876
877 MIB.addReg(VDstIn);
878
879 MI.eraseFromParent();
880 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
881}
882
883// We need to handle this here because tablegen doesn't support matching
884// instructions with multiple outputs.
885bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
886 Register Dst0 = MI.getOperand(0).getReg();
887 Register Dst1 = MI.getOperand(1).getReg();
888
889 LLT Ty = MRI->getType(Dst0);
890 unsigned Opc;
891 if (Ty == LLT::scalar(32))
892 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
893 else if (Ty == LLT::scalar(64))
894 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
895 else
896 return false;
897
898 // TODO: Match source modifiers.
899
900 const DebugLoc &DL = MI.getDebugLoc();
901 MachineBasicBlock *MBB = MI.getParent();
902
903 Register Numer = MI.getOperand(3).getReg();
904 Register Denom = MI.getOperand(4).getReg();
905 unsigned ChooseDenom = MI.getOperand(5).getImm();
906
907 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
908
909 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
910 .addDef(Dst1)
911 .addImm(0) // $src0_modifiers
912 .addUse(Src0) // $src0
913 .addImm(0) // $src1_modifiers
914 .addUse(Denom) // $src1
915 .addImm(0) // $src2_modifiers
916 .addUse(Numer) // $src2
917 .addImm(0) // $clamp
918 .addImm(0); // $omod
919
920 MI.eraseFromParent();
921 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
922}
923
924bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
925 unsigned IntrinsicID = I.getIntrinsicID();
926 switch (IntrinsicID) {
927 case Intrinsic::amdgcn_if_break: {
928 MachineBasicBlock *BB = I.getParent();
929
930 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
931 // SelectionDAG uses for wave32 vs wave64.
932 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
933 .add(I.getOperand(0))
934 .add(I.getOperand(2))
935 .add(I.getOperand(3));
936
937 Register DstReg = I.getOperand(0).getReg();
938 Register Src0Reg = I.getOperand(2).getReg();
939 Register Src1Reg = I.getOperand(3).getReg();
940
941 I.eraseFromParent();
942
943 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
944 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
945
946 return true;
947 }
948 case Intrinsic::amdgcn_interp_p1_f16:
949 return selectInterpP1F16(I);
950 case Intrinsic::amdgcn_wqm:
951 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
952 case Intrinsic::amdgcn_softwqm:
953 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
954 case Intrinsic::amdgcn_strict_wwm:
955 case Intrinsic::amdgcn_wwm:
956 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
957 case Intrinsic::amdgcn_strict_wqm:
958 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
959 case Intrinsic::amdgcn_writelane:
960 return selectWritelane(I);
961 case Intrinsic::amdgcn_div_scale:
962 return selectDivScale(I);
963 case Intrinsic::amdgcn_icmp:
964 return selectIntrinsicIcmp(I);
965 case Intrinsic::amdgcn_ballot:
966 return selectBallot(I);
967 case Intrinsic::amdgcn_reloc_constant:
968 return selectRelocConstant(I);
969 case Intrinsic::amdgcn_groupstaticsize:
970 return selectGroupStaticSize(I);
971 case Intrinsic::returnaddress:
972 return selectReturnAddress(I);
973 default:
974 return selectImpl(I, *CoverageInfo);
975 }
976}
977
978static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
979 if (Size != 32 && Size != 64)
980 return -1;
981 switch (P) {
982 default:
983 llvm_unreachable("Unknown condition code!")::llvm::llvm_unreachable_internal("Unknown condition code!", "llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp"
, 983)
;
984 case CmpInst::ICMP_NE:
985 return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
986 case CmpInst::ICMP_EQ:
987 return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
988 case CmpInst::ICMP_SGT:
989 return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
990 case CmpInst::ICMP_SGE:
991 return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
992 case CmpInst::ICMP_SLT:
993 return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
994 case CmpInst::ICMP_SLE:
995 return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
996 case CmpInst::ICMP_UGT:
997 return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
998 case CmpInst::ICMP_UGE:
999 return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
1000 case CmpInst::ICMP_ULT:
1001 return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
1002 case CmpInst::ICMP_ULE:
1003 return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
1004 }
1005}
1006
1007int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1008 unsigned Size) const {
1009 if (Size == 64) {
1010 if (!STI.hasScalarCompareEq64())
1011 return -1;
1012
1013 switch (P) {
1014 case CmpInst::ICMP_NE:
1015 return AMDGPU::S_CMP_LG_U64;
1016 case CmpInst::ICMP_EQ:
1017 return AMDGPU::S_CMP_EQ_U64;
1018 default:
1019 return -1;
1020 }
1021 }
1022
1023 if (Size != 32)
1024 return -1;
1025
1026 switch (P) {
1027 case CmpInst::ICMP_NE:
1028 return AMDGPU::S_CMP_LG_U32;
1029 case CmpInst::ICMP_EQ:
1030 return AMDGPU::S_CMP_EQ_U32;
1031 case CmpInst::ICMP_SGT:
1032 return AMDGPU::S_CMP_GT_I32;
1033 case CmpInst::ICMP_SGE:
1034 return AMDGPU::S_CMP_GE_I32;
1035 case CmpInst::ICMP_SLT:
1036 return AMDGPU::S_CMP_LT_I32;
1037 case CmpInst::ICMP_SLE:
1038 return AMDGPU::S_CMP_LE_I32;
1039 case CmpInst::ICMP_UGT:
1040 return AMDGPU::S_CMP_GT_U32;
1041 case CmpInst::ICMP_UGE:
1042 return AMDGPU::S_CMP_GE_U32;
1043 case CmpInst::ICMP_ULT:
1044 return AMDGPU::S_CMP_LT_U32;
1045 case CmpInst::ICMP_ULE:
1046 return AMDGPU::S_CMP_LE_U32;
1047 default:
1048 llvm_unreachable("Unknown condition code!");
1049 }
1050}
1051
1052bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1053 MachineBasicBlock *BB = I.getParent();
1054 const DebugLoc &DL = I.getDebugLoc();
1055
1056 Register SrcReg = I.getOperand(2).getReg();
1057 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1058
1059 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1060
1061 Register CCReg = I.getOperand(0).getReg();
1062 if (!isVCC(CCReg, *MRI)) {
1063 int Opcode = getS_CMPOpcode(Pred, Size);
1064 if (Opcode == -1)
1065 return false;
1066 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1067 .add(I.getOperand(2))
1068 .add(I.getOperand(3));
1069 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1070 .addReg(AMDGPU::SCC);
1071 bool Ret =
1072 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1073 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1074 I.eraseFromParent();
1075 return Ret;
1076 }
1077
1078 int Opcode = getV_CMPOpcode(Pred, Size);
1079 if (Opcode == -1)
1080 return false;
1081
1082 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1083 I.getOperand(0).getReg())
1084 .add(I.getOperand(2))
1085 .add(I.getOperand(3));
1086 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1087 *TRI.getBoolRC(), *MRI);
1088 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1089 I.eraseFromParent();
1090 return Ret;
1091}
1092
1093bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
1094 Register Dst = I.getOperand(0).getReg();
1095 if (isVCC(Dst, *MRI))
1096 return false;
1097
1098 if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1099 return false;
1100
1101 MachineBasicBlock *BB = I.getParent();
1102 const DebugLoc &DL = I.getDebugLoc();
1103 Register SrcReg = I.getOperand(2).getReg();
1104 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1105 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1106
1107 int Opcode = getV_CMPOpcode(Pred, Size);
1108 if (Opcode == -1)
1109 return false;
1110
1111 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1112 .add(I.getOperand(2))
1113 .add(I.getOperand(3));
1114 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1115 *MRI);
1116 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1117 I.eraseFromParent();
1118 return Ret;
1119}
1120
1121bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1122 MachineBasicBlock *BB = I.getParent();
1123 const DebugLoc &DL = I.getDebugLoc();
1124 Register DstReg = I.getOperand(0).getReg();
1125 const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1126 const bool Is64 = Size == 64;
1127
1128 if (Size != STI.getWavefrontSize())
1129 return false;
1130
1131 Optional<ValueAndVReg> Arg =
1132 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1133
1134 if (Arg.hasValue()) {
1135 const int64_t Value = Arg.getValue().Value.getSExtValue();
1136 if (Value == 0) {
1137 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1138 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1139 } else if (Value == -1) { // all ones
1140 Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1141 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1142 } else
1143 return false;
1144 } else {
1145 Register SrcReg = I.getOperand(2).getReg();
1146 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1147 }
1148
1149 I.eraseFromParent();
1150 return true;
1151}
1152
1153bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1154 Register DstReg = I.getOperand(0).getReg();
1155 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1156 const TargetRegisterClass *DstRC =
1157 TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1158 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1159 return false;
1160
1161 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1162
1163 Module *M = MF->getFunction().getParent();
1164 const MDNode *Metadata = I.getOperand(2).getMetadata();
1165 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1166 auto RelocSymbol = cast<GlobalVariable>(
1167 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1168
1169 MachineBasicBlock *BB = I.getParent();
1170 BuildMI(*BB, &I, I.getDebugLoc(),
1171 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1172 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1173
1174 I.eraseFromParent();
1175 return true;
1176}
1177
1178bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1179 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1180
1181 Register DstReg = I.getOperand(0).getReg();
1182 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1183 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1184 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1185
1186 MachineBasicBlock *MBB = I.getParent();
1187 const DebugLoc &DL = I.getDebugLoc();
1188
1189 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1190
1191 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1192 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1193 MIB.addImm(MFI->getLDSSize());
1194 } else {
1195 Module *M = MF->getFunction().getParent();
1196 const GlobalValue *GV
1197 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1198 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1199 }
1200
1201 I.eraseFromParent();
1202 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1203}
1204
1205bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1206 MachineBasicBlock *MBB = I.getParent();
1207 MachineFunction &MF = *MBB->getParent();
1208 const DebugLoc &DL = I.getDebugLoc();
1209
1210 MachineOperand &Dst = I.getOperand(0);
1211 Register DstReg = Dst.getReg();
1212 unsigned Depth = I.getOperand(2).getImm();
1213
1214 const TargetRegisterClass *RC
1215 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1216 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1217 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1218 return false;
1219
1220 // Check for kernel and shader functions
1221 if (Depth != 0 ||
1222 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1223 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1224 .addImm(0);
1225 I.eraseFromParent();
1226 return true;
1227 }
1228
1229 MachineFrameInfo &MFI = MF.getFrameInfo();
1230 // There is a call to @llvm.returnaddress in this function
1231 MFI.setReturnAddressIsTaken(true);
1232
1233 // Get the return address reg and mark it as an implicit live-in
1234 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1235 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1236 AMDGPU::SReg_64RegClass, DL);
1237 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1238 .addReg(LiveIn);
1239 I.eraseFromParent();
1240 return true;
1241}
1242
1243bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1244 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1245 // SelectionDAG uses for wave32 vs wave64.
1246 MachineBasicBlock *BB = MI.getParent();
1247 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1248 .add(MI.getOperand(1));
1249
1250 Register Reg = MI.getOperand(1).getReg();
1251 MI.eraseFromParent();
1252
1253 if (!MRI->getRegClassOrNull(Reg))
1254 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1255 return true;
1256}
1257
1258bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1259 MachineInstr &MI, Intrinsic::ID IntrID) const {
1260 MachineBasicBlock *MBB = MI.getParent();
1261 MachineFunction *MF = MBB->getParent();
1262 const DebugLoc &DL = MI.getDebugLoc();
1263
1264 unsigned IndexOperand = MI.getOperand(7).getImm();
1265 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1266 bool WaveDone = MI.getOperand(9).getImm() != 0;
1267
1268 if (WaveDone && !WaveRelease)
1269 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1270
1271 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1272 IndexOperand &= ~0x3f;
1273 unsigned CountDw = 0;
1274
1275 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1276 CountDw = (IndexOperand >> 24) & 0xf;
1277 IndexOperand &= ~(0xf << 24);
1278
1279 if (CountDw < 1 || CountDw > 4) {
1280 report_fatal_error(
1281 "ds_ordered_count: dword count must be between 1 and 4");
1282 }
1283 }
1284
1285 if (IndexOperand)
1286 report_fatal_error("ds_ordered_count: bad index operand");
1287
1288 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1289 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1290
1291 unsigned Offset0 = OrderedCountIndex << 2;
1292 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1293 (Instruction << 4);
1294
1295 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1296 Offset1 |= (CountDw - 1) << 6;
1297
1298 unsigned Offset = Offset0 | (Offset1 << 8);
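// Annotation (not part of the original source): a worked example of the
// packing above. For ds_ordered_add (Instruction = 0) with
// OrderedCountIndex = 1, WaveRelease = 1, WaveDone = 0, ShaderType = 0 and
// CountDw = 1 on GFX10:
//   Offset0 = 1 << 2                                             = 0x004
//   Offset1 = 1 | (0 << 1) | (0 << 2) | (0 << 4) | ((1 - 1) << 6) = 0x001
//   Offset  = Offset0 | (Offset1 << 8)                            = 0x104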
1299
1300 Register M0Val = MI.getOperand(2).getReg();
1301 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1302 .addReg(M0Val);
1303
1304 Register DstReg = MI.getOperand(0).getReg();
1305 Register ValReg = MI.getOperand(3).getReg();
1306 MachineInstrBuilder DS =
1307 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1308 .addReg(ValReg)
1309 .addImm(Offset)
1310 .cloneMemRefs(MI);
1311
1312 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1313 return false;
1314
1315 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1316 MI.eraseFromParent();
1317 return Ret;
1318}
1319
1320static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1321 switch (IntrID) {
1322 case Intrinsic::amdgcn_ds_gws_init:
1323 return AMDGPU::DS_GWS_INIT;
1324 case Intrinsic::amdgcn_ds_gws_barrier:
1325 return AMDGPU::DS_GWS_BARRIER;
1326 case Intrinsic::amdgcn_ds_gws_sema_v:
1327 return AMDGPU::DS_GWS_SEMA_V;
1328 case Intrinsic::amdgcn_ds_gws_sema_br:
1329 return AMDGPU::DS_GWS_SEMA_BR;
1330 case Intrinsic::amdgcn_ds_gws_sema_p:
1331 return AMDGPU::DS_GWS_SEMA_P;
1332 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1333 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1334 default:
1335 llvm_unreachable("not a gws intrinsic");
1336 }
1337}
1338
1339bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1340 Intrinsic::ID IID) const {
1341 if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1342 !STI.hasGWSSemaReleaseAll())
1343 return false;
1344
1345 // intrinsic ID, vsrc, offset
1346 const bool HasVSrc = MI.getNumOperands() == 3;
1347 assert(HasVSrc || MI.getNumOperands() == 2);
1348
1349 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1350 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1351 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1352 return false;
1353
1354 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1355 assert(OffsetDef);
1356
1357 unsigned ImmOffset;
1358
1359 MachineBasicBlock *MBB = MI.getParent();
1360 const DebugLoc &DL = MI.getDebugLoc();
1361
1362 MachineInstr *Readfirstlane = nullptr;
1363
1364 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1365 // incoming offset, in case there's an add of a constant. We'll have to put it
1366 // back later.
1367 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1368 Readfirstlane = OffsetDef;
1369 BaseOffset = OffsetDef->getOperand(1).getReg();
1370 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1371 }
1372
1373 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1374 // If we have a constant offset, try to use the 0 in m0 as the base.
1375 // TODO: Look into changing the default m0 initialization value. If the
1376 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1377 // the immediate offset.
1378
1379 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1380 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1381 .addImm(0);
1382 } else {
1383 std::tie(BaseOffset, ImmOffset) =
1384 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1385
1386 if (Readfirstlane) {
1387 // We have the constant offset now, so put the readfirstlane back on the
1388 // variable component.
1389 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1390 return false;
1391
1392 Readfirstlane->getOperand(1).setReg(BaseOffset);
1393 BaseOffset = Readfirstlane->getOperand(0).getReg();
1394 } else {
1395 if (!RBI.constrainGenericRegister(BaseOffset,
1396 AMDGPU::SReg_32RegClass, *MRI))
1397 return false;
1398 }
1399
1400 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1401 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1402 .addReg(BaseOffset)
1403 .addImm(16);
1404
1405 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1406 .addReg(M0Base);
1407 }
1408
1409 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1410 // offset field) % 64. Some versions of the programming guide omit the m0
1411 // part, or claim it's from offset 0.
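// Illustration: the S_LSHL_B32 by 16 above places the variable base offset
// into the upper half of M0, of which the hardware consumes M0[21:16]; the
// immediate offset field is then added on top of that, per the formula in the
// comment above.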
1412 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1413
1414 if (HasVSrc) {
1415 Register VSrc = MI.getOperand(1).getReg();
1416
1417 if (STI.needsAlignedVGPRs()) {
1418 // Add implicit aligned super-reg to force alignment on the data operand.
1419 Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1420 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
1421 Register NewVR =
1422 MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
1423 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
1424 .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
1425 .addImm(AMDGPU::sub0)
1426 .addReg(Undef)
1427 .addImm(AMDGPU::sub1);
1428 MIB.addReg(NewVR, 0, AMDGPU::sub0);
1429 MIB.addReg(NewVR, RegState::Implicit);
1430 } else {
1431 MIB.addReg(VSrc);
1432 }
1433
1434 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1435 return false;
1436 }
1437
1438 MIB.addImm(ImmOffset)
1439 .cloneMemRefs(MI);
1440
1441 MI.eraseFromParent();
1442 return true;
1443}
1444
1445bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1446 bool IsAppend) const {
1447 Register PtrBase = MI.getOperand(2).getReg();
1448 LLT PtrTy = MRI->getType(PtrBase);
1449 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1450
1451 unsigned Offset;
1452 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1453
1454 // TODO: Should this try to look through readfirstlane like GWS?
1455 if (!isDSOffsetLegal(PtrBase, Offset)) {
1456 PtrBase = MI.getOperand(2).getReg();
1457 Offset = 0;
1458 }
1459
1460 MachineBasicBlock *MBB = MI.getParent();
1461 const DebugLoc &DL = MI.getDebugLoc();
1462 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1463
1464 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1465 .addReg(PtrBase);
1466 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1467 return false;
1468
1469 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1470 .addImm(Offset)
1471 .addImm(IsGDS ? -1 : 0)
1472 .cloneMemRefs(MI);
1473 MI.eraseFromParent();
1474 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1475}
1476
1477bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1478 if (TM.getOptLevel() > CodeGenOpt::None) {
1479 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1480 if (WGSize <= STI.getWavefrontSize()) {
1481 MachineBasicBlock *MBB = MI.getParent();
1482 const DebugLoc &DL = MI.getDebugLoc();
1483 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1484 MI.eraseFromParent();
1485 return true;
1486 }
1487 }
1488 return selectImpl(MI, *CoverageInfo);
1489}
1490
1491static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1492 bool &IsTexFail) {
1493 if (TexFailCtrl)
1494 IsTexFail = true;
1495
1496 TFE = (TexFailCtrl & 0x1) ? true : false;
1497 TexFailCtrl &= ~(uint64_t)0x1;
1498 LWE = (TexFailCtrl & 0x2) ? true : false;
1499 TexFailCtrl &= ~(uint64_t)0x2;
1500
1501 return TexFailCtrl == 0;
1502}
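// Worked example (illustration): TexFailCtrl == 0x3 sets both TFE and LWE and
// leaves no unknown bits, so the function returns true with IsTexFail set;
// TexFailCtrl == 0x4 leaves an unrecognized bit behind, the function returns
// false, and the caller rejects the intrinsic.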
1503
1504bool AMDGPUInstructionSelector::selectImageIntrinsic(
1505 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1506 MachineBasicBlock *MBB = MI.getParent();
1507 const DebugLoc &DL = MI.getDebugLoc();
1508
1509 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1510 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1511
1512 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1513 const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1514 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1515 const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1516 AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1517 unsigned IntrOpcode = Intr->BaseOpcode;
1518 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1519
1520 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1521
1522 Register VDataIn, VDataOut;
1523 LLT VDataTy;
1524 int NumVDataDwords = -1;
1525 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1526 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1527
1528 bool Unorm;
1529 if (!BaseOpcode->Sampler)
1530 Unorm = true;
1531 else
1532 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1533
1534 bool TFE;
1535 bool LWE;
1536 bool IsTexFail = false;
1537 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1538 TFE, LWE, IsTexFail))
1539 return false;
1540
1541 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1542 const bool IsA16 = (Flags & 1) != 0;
1543 const bool IsG16 = (Flags & 2) != 0;
1544
1545 // A16 implies 16 bit gradients if subtarget doesn't support G16
1546 if (IsA16 && !STI.hasG16() && !IsG16)
1547 return false;
1548
1549 unsigned DMask = 0;
1550 unsigned DMaskLanes = 0;
1551
1552 if (BaseOpcode->Atomic) {
1553 VDataOut = MI.getOperand(0).getReg();
1554 VDataIn = MI.getOperand(2).getReg();
1555 LLT Ty = MRI->getType(VDataIn);
1556
1557 // Be careful to allow atomic swap on 16-bit element vectors.
1558 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1559 Ty.getSizeInBits() == 128 :
1560 Ty.getSizeInBits() == 64;
1561
1562 if (BaseOpcode->AtomicX2) {
1563 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1564
1565 DMask = Is64Bit ? 0xf : 0x3;
1566 NumVDataDwords = Is64Bit ? 4 : 2;
1567 } else {
1568 DMask = Is64Bit ? 0x3 : 0x1;
1569 NumVDataDwords = Is64Bit ? 2 : 1;
1570 }
1571 } else {
1572 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1573 DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1574
1575 if (BaseOpcode->Store) {
1576 VDataIn = MI.getOperand(1).getReg();
1577 VDataTy = MRI->getType(VDataIn);
1578 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1579 } else {
1580 VDataOut = MI.getOperand(0).getReg();
1581 VDataTy = MRI->getType(VDataOut);
1582 NumVDataDwords = DMaskLanes;
1583
1584 if (IsD16 && !STI.hasUnpackedD16VMem())
1585 NumVDataDwords = (DMaskLanes + 1) / 2;
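// Illustration: with packed D16, two 16-bit components share one dword, so a
// dmask covering 3 lanes needs (3 + 1) / 2 == 2 data dwords.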
1586 }
1587 }
1588
1589 // Optimize _L to _LZ when _L is zero
1590 if (LZMappingInfo) {
1591 // The legalizer replaced the register with an immediate 0 if we need to
1592 // change the opcode.
1593 const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
1594 if (Lod.isImm()) {
1595 assert(Lod.getImm() == 0);
1596 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
1597 }
1598 }
1599
1600 // Optimize _mip away when 'lod' is zero
1601 if (MIPMappingInfo) {
1602 const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
1603 if (Lod.isImm()) {
1604 assert(Lod.getImm() == 0);
1605 IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
1606 }
1607 }
1608
1609 // Set G16 opcode
1610 if (IsG16 && !IsA16) {
1611 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1612 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1613 assert(G16MappingInfo);
1614 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1615 }
1616
1617 // TODO: Check this in verifier.
1618 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1619
1620 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1621 if (BaseOpcode->Atomic)
1622 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1623 if (CPol & ~AMDGPU::CPol::ALL)
1624 return false;
1625
1626 int NumVAddrRegs = 0;
1627 int NumVAddrDwords = 0;
1628 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1629 // Skip the $noregs and 0s inserted during legalization.
1630 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1631 if (!AddrOp.isReg())
1632 continue; // XXX - Break?
1633
1634 Register Addr = AddrOp.getReg();
1635 if (!Addr)
1636 break;
1637
1638 ++NumVAddrRegs;
1639 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1640 }
1641
1642 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1643 // NSA, these should have been packed into a single value in the first
1644 // address register.
1645 const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
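// Illustration: three separate 32-bit address registers give NumVAddrRegs ==
// NumVAddrDwords == 3 and select NSA, while a single packed address vector
// gives NumVAddrRegs == 1 and keeps the default encoding.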
1646 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1647 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1648 return false;
1649 }
1650
1651 if (IsTexFail)
1652 ++NumVDataDwords;
1653
1654 int Opcode = -1;
1655 if (IsGFX10Plus) {
1656 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1657 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1658 : AMDGPU::MIMGEncGfx10Default,
1659 NumVDataDwords, NumVAddrDwords);
1660 } else {
1661 if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1662 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1663 NumVDataDwords, NumVAddrDwords);
1664 if (Opcode == -1)
1665 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1666 NumVDataDwords, NumVAddrDwords);
1667 }
1668 assert(Opcode != -1);
1669
1670 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1671 .cloneMemRefs(MI);
1672
1673 if (VDataOut) {
1674 if (BaseOpcode->AtomicX2) {
1675 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1676
1677 Register TmpReg = MRI->createVirtualRegister(
1678 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1679 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1680
1681 MIB.addDef(TmpReg);
1682 if (!MRI->use_empty(VDataOut)) {
1683 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1684 .addReg(TmpReg, RegState::Kill, SubReg);
1685 }
1686
1687 } else {
1688 MIB.addDef(VDataOut); // vdata output
1689 }
1690 }
1691
1692 if (VDataIn)
1693 MIB.addReg(VDataIn); // vdata input
1694
1695 for (int I = 0; I != NumVAddrRegs; ++I) {
1696 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1697 if (SrcOp.isReg()) {
1698 assert(SrcOp.getReg() != 0);
1699 MIB.addReg(SrcOp.getReg());
1700 }
1701 }
1702
1703 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
1704 if (BaseOpcode->Sampler)
1705 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
1706
1707 MIB.addImm(DMask); // dmask
1708
1709 if (IsGFX10Plus)
1710 MIB.addImm(DimInfo->Encoding);
1711 MIB.addImm(Unorm);
1712
1713 MIB.addImm(CPol);
1714 MIB.addImm(IsA16 && // a16 or r128
1715 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1716 if (IsGFX10Plus)
1717 MIB.addImm(IsA16 ? -1 : 0);
1718
1719 MIB.addImm(TFE); // tfe
1720 MIB.addImm(LWE); // lwe
1721 if (!IsGFX10Plus)
1722 MIB.addImm(DimInfo->DA ? -1 : 0);
1723 if (BaseOpcode->HasD16)
1724 MIB.addImm(IsD16 ? -1 : 0);
1725
1726 if (IsTexFail) {
1727 // An image load instruction with TFE/LWE only conditionally writes to its
1728 // result registers. Initialize them to zero so that we always get well
1729 // defined result values.
1730 assert(VDataOut && !VDataIn);
1731 Register Tied = MRI->cloneVirtualRegister(VDataOut);
1732 Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1733 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
1734 .addImm(0);
1735 auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
1736 if (STI.usePRTStrictNull()) {
1737 // With enable-prt-strict-null enabled, initialize all result registers to
1738 // zero.
1739 auto RegSeq =
1740 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1741 for (auto Sub : Parts)
1742 RegSeq.addReg(Zero).addImm(Sub);
1743 } else {
1744 // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
1745 // result register.
1746 Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1747 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
1748 auto RegSeq =
1749 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1750 for (auto Sub : Parts.drop_back(1))
1751 RegSeq.addReg(Undef).addImm(Sub);
1752 RegSeq.addReg(Zero).addImm(Parts.back());
1753 }
1754 MIB.addReg(Tied, RegState::Implicit);
1755 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1756 }
1757
1758 MI.eraseFromParent();
1759 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1760}
1761
1762bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1763 MachineInstr &I) const {
1764 unsigned IntrinsicID = I.getIntrinsicID();
1765 switch (IntrinsicID) {
1766 case Intrinsic::amdgcn_end_cf:
1767 return selectEndCfIntrinsic(I);
1768 case Intrinsic::amdgcn_ds_ordered_add:
1769 case Intrinsic::amdgcn_ds_ordered_swap:
1770 return selectDSOrderedIntrinsic(I, IntrinsicID);
1771 case Intrinsic::amdgcn_ds_gws_init:
1772 case Intrinsic::amdgcn_ds_gws_barrier:
1773 case Intrinsic::amdgcn_ds_gws_sema_v:
1774 case Intrinsic::amdgcn_ds_gws_sema_br:
1775 case Intrinsic::amdgcn_ds_gws_sema_p:
1776 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1777 return selectDSGWSIntrinsic(I, IntrinsicID);
1778 case Intrinsic::amdgcn_ds_append:
1779 return selectDSAppendConsume(I, true);
1780 case Intrinsic::amdgcn_ds_consume:
1781 return selectDSAppendConsume(I, false);
1782 case Intrinsic::amdgcn_s_barrier:
1783 return selectSBarrier(I);
1784 case Intrinsic::amdgcn_global_atomic_fadd:
1785 return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
1786 default: {
1787 return selectImpl(I, *CoverageInfo);
1788 }
1789 }
1790}
1791
1792bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1793 if (selectImpl(I, *CoverageInfo))
1794 return true;
1795
1796 MachineBasicBlock *BB = I.getParent();
1797 const DebugLoc &DL = I.getDebugLoc();
1798
1799 Register DstReg = I.getOperand(0).getReg();
1800 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1801 assert(Size <= 32 || Size == 64);
1802 const MachineOperand &CCOp = I.getOperand(1);
1803 Register CCReg = CCOp.getReg();
1804 if (!isVCC(CCReg, *MRI)) {
1805 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1806 AMDGPU::S_CSELECT_B32;
1807 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1808 .addReg(CCReg);
1809
1810 // The generic constrainSelectedInstRegOperands doesn't work for the scc
1811 // register bank, because it does not cover the register class that we use to
1812 // represent it. So we need to manually set the register class here.
1813 if (!MRI->getRegClassOrNull(CCReg))
1814 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1815 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1816 .add(I.getOperand(2))
1817 .add(I.getOperand(3));
1818
1819 bool Ret = false;
1820 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1821 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1822 I.eraseFromParent();
1823 return Ret;
1824 }
1825
1826 // Wide VGPR select should have been split in RegBankSelect.
1827 if (Size > 32)
1828 return false;
1829
1830 MachineInstr *Select =
1831 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1832 .addImm(0)
1833 .add(I.getOperand(3))
1834 .addImm(0)
1835 .add(I.getOperand(2))
1836 .add(I.getOperand(1));
1837
1838 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1839 I.eraseFromParent();
1840 return Ret;
1841}
1842
1843static int sizeToSubRegIndex(unsigned Size) {
1844 switch (Size) {
1845 case 32:
1846 return AMDGPU::sub0;
1847 case 64:
1848 return AMDGPU::sub0_sub1;
1849 case 96:
1850 return AMDGPU::sub0_sub1_sub2;
1851 case 128:
1852 return AMDGPU::sub0_sub1_sub2_sub3;
1853 case 256:
1854 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1855 default:
1856 if (Size < 32)
1857 return AMDGPU::sub0;
1858 if (Size > 256)
1859 return -1;
1860 return sizeToSubRegIndex(PowerOf2Ceil(Size));
1861 }
1862}
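// Illustration of the default case: Size == 16 maps to sub0, Size == 48
// rounds up to 64 and maps to sub0_sub1, and any Size above 256 returns -1 so
// the caller bails out.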
1863
1864bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1865 Register DstReg = I.getOperand(0).getReg();
1866 Register SrcReg = I.getOperand(1).getReg();
1867 const LLT DstTy = MRI->getType(DstReg);
1868 const LLT SrcTy = MRI->getType(SrcReg);
1869 const LLT S1 = LLT::scalar(1);
1870
1871 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1872 const RegisterBank *DstRB;
1873 if (DstTy == S1) {
1874 // This is a special case. We don't treat s1 for legalization artifacts as
1875 // vcc booleans.
1876 DstRB = SrcRB;
1877 } else {
1878 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1879 if (SrcRB != DstRB)
1880 return false;
1881 }
1882
1883 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1884
1885 unsigned DstSize = DstTy.getSizeInBits();
1886 unsigned SrcSize = SrcTy.getSizeInBits();
1887
1888 const TargetRegisterClass *SrcRC
1889 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1890 const TargetRegisterClass *DstRC
1891 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1892 if (!SrcRC || !DstRC)
1893 return false;
1894
1895 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1896 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1897 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1898 return false;
1899 }
1900
1901 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
1902 MachineBasicBlock *MBB = I.getParent();
1903 const DebugLoc &DL = I.getDebugLoc();
1904
1905 Register LoReg = MRI->createVirtualRegister(DstRC);
1906 Register HiReg = MRI->createVirtualRegister(DstRC);
1907 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1908 .addReg(SrcReg, 0, AMDGPU::sub0);
1909 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1910 .addReg(SrcReg, 0, AMDGPU::sub1);
1911
1912 if (IsVALU && STI.hasSDWA()) {
1913 // Write the low 16-bits of the high element into the high 16-bits of the
1914 // low element.
1915 MachineInstr *MovSDWA =
1916 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1917 .addImm(0) // $src0_modifiers
1918 .addReg(HiReg) // $src0
1919 .addImm(0) // $clamp
1920 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
1921 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1922 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
1923 .addReg(LoReg, RegState::Implicit);
1924 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1925 } else {
1926 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1927 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1928 Register ImmReg = MRI->createVirtualRegister(DstRC);
1929 if (IsVALU) {
1930 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1931 .addImm(16)
1932 .addReg(HiReg);
1933 } else {
1934 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1935 .addReg(HiReg)
1936 .addImm(16);
1937 }
1938
1939 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1940 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1941 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1942
1943 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1944 .addImm(0xffff);
1945 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1946 .addReg(LoReg)
1947 .addReg(ImmReg);
1948 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1949 .addReg(TmpReg0)
1950 .addReg(TmpReg1);
1951 }
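// In either path (illustrative summary), the result is equivalent to
// DstReg = (HiReg << 16) | (LoReg & 0xffff), i.e. the low 16 bits of the two
// elements packed side by side.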
1952
1953 I.eraseFromParent();
1954 return true;
1955 }
1956
1957 if (!DstTy.isScalar())
1958 return false;
1959
1960 if (SrcSize > 32) {
1961 int SubRegIdx = sizeToSubRegIndex(DstSize);
1962 if (SubRegIdx == -1)
1963 return false;
1964
1965 // Deal with weird cases where the class only partially supports the subreg
1966 // index.
1967 const TargetRegisterClass *SrcWithSubRC
1968 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1969 if (!SrcWithSubRC)
1970 return false;
1971
1972 if (SrcWithSubRC != SrcRC) {
1973 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1974 return false;
1975 }
1976
1977 I.getOperand(1).setSubReg(SubRegIdx);
1978 }
1979
1980 I.setDesc(TII.get(TargetOpcode::COPY));
1981 return true;
1982}
1983
1984/// \returns true if a bitmask for \p Size bits will be an inline immediate.
1985static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1986 Mask = maskTrailingOnes<unsigned>(Size);
1987 int SignedMask = static_cast<int>(Mask);
1988 return SignedMask >= -16 && SignedMask <= 64;
1989}
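// Worked example (illustration): Size == 6 gives Mask == 0x3f == 63, which
// fits the inline-immediate range [-16, 64], so an AND is profitable; Size ==
// 16 gives Mask == 0xffff == 65535, which does not, so the BFE form is used
// instead.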
1990
1991// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1992const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1993 Register Reg, const MachineRegisterInfo &MRI,
1994 const TargetRegisterInfo &TRI) const {
1995 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1996 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1997 return RB;
1998
1999 // Ignore the type, since we don't use vcc in artifacts.
2000 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2001 return &RBI.getRegBankFromRegClass(*RC, LLT());
2002 return nullptr;
2003}
2004
2005bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2006 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2007 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2008 const DebugLoc &DL = I.getDebugLoc();
2009 MachineBasicBlock &MBB = *I.getParent();
2010 const Register DstReg = I.getOperand(0).getReg();
2011 const Register SrcReg = I.getOperand(1).getReg();
2012
2013 const LLT DstTy = MRI->getType(DstReg);
2014 const LLT SrcTy = MRI->getType(SrcReg);
2015 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2016 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2017 const unsigned DstSize = DstTy.getSizeInBits();
2018 if (!DstTy.isScalar())
2019 return false;
2020
2021 // Artifact casts should never use vcc.
2022 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2023
2024 // FIXME: This should probably be illegal and split earlier.
2025 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2026 if (DstSize <= 32)
2027 return selectCOPY(I);
2028
2029 const TargetRegisterClass *SrcRC =
2030 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
2031 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2032 const TargetRegisterClass *DstRC =
2033 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
2034
2035 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2036 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2037 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2038 .addReg(SrcReg)
2039 .addImm(AMDGPU::sub0)
2040 .addReg(UndefReg)
2041 .addImm(AMDGPU::sub1);
2042 I.eraseFromParent();
2043
2044 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2045 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2046 }
2047
2048 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2049 // 64-bit should have been split up in RegBankSelect
2050
2051 // Try to use an and with a mask if it will save code size.
2052 unsigned Mask;
2053 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2054 MachineInstr *ExtI =
2055 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2056 .addImm(Mask)
2057 .addReg(SrcReg);
2058 I.eraseFromParent();
2059 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2060 }
2061
2062 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2063 MachineInstr *ExtI =
2064 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2065 .addReg(SrcReg)
2066 .addImm(0) // Offset
2067 .addImm(SrcSize); // Width
2068 I.eraseFromParent();
2069 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2070 }
2071
2072 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2073 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2074 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2075 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2076 return false;
2077
2078 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2079 const unsigned SextOpc = SrcSize == 8 ?
2080 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2081 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2082 .addReg(SrcReg);
2083 I.eraseFromParent();
2084 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2085 }
2086
2087 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2088 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2089
2090 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
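// For example (illustration), SrcSize == 8 yields the immediate 8 << 16 ==
// 0x80000, i.e. width 8 at offset 0.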
2091 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2092 // We need a 64-bit register source, but the high bits don't matter.
2093 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2094 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2095 unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
2096
2097 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2098 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2099 .addReg(SrcReg, 0, SubReg)
2100 .addImm(AMDGPU::sub0)
2101 .addReg(UndefReg)
2102 .addImm(AMDGPU::sub1);
2103
2104 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2105 .addReg(ExtReg)
2106 .addImm(SrcSize << 16);
2107
2108 I.eraseFromParent();
2109 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2110 }
2111
2112 unsigned Mask;
2113 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2114 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2115 .addReg(SrcReg)
2116 .addImm(Mask);
2117 } else {
2118 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2119 .addReg(SrcReg)
2120 .addImm(SrcSize << 16);
2121 }
2122
2123 I.eraseFromParent();
2124 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2125 }
2126
2127 return false;
2128}
2129
2130bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2131 MachineBasicBlock *BB = I.getParent();
2132 MachineOperand &ImmOp = I.getOperand(1);
2133 Register DstReg = I.getOperand(0).getReg();
2134 unsigned Size = MRI->getType(DstReg).getSizeInBits();
2135
2136 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2137 if (ImmOp.isFPImm()) {
2138 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2139 ImmOp.ChangeToImmediate(Imm.getZExtValue());
2140 } else if (ImmOp.isCImm()) {
2141 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2142 } else {
2143 llvm_unreachable("Not supported by g_constants");
2144 }
2145
2146 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2147 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2148
2149 unsigned Opcode;
2150 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2151 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2152 } else {
2153 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2154
2155 // We should never produce s1 values on banks other than VCC. If the user of
2156 // this already constrained the register, we may incorrectly think it's VCC
2157 // if it wasn't originally.
2158 if (Size == 1)
2159 return false;
2160 }
2161
2162 if (Size != 64) {
2163 I.setDesc(TII.get(Opcode));
2164 I.addImplicitDefUseOperands(*MF);
2165 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2166 }
2167
2168 const DebugLoc &DL = I.getDebugLoc();
2169
2170 APInt Imm(Size, I.getOperand(1).getImm());
2171
2172 MachineInstr *ResInst;
2173 if (IsSgpr && TII.isInlineConstant(Imm)) {
2174 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2175 .addImm(I.getOperand(1).getImm());
2176 } else {
2177 const TargetRegisterClass *RC = IsSgpr ?
2178 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2179 Register LoReg = MRI->createVirtualRegister(RC);
2180 Register HiReg = MRI->createVirtualRegister(RC);
2181
2182 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2183 .addImm(Imm.trunc(32).getZExtValue());
2184
2185 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2186 .addImm(Imm.ashr(32).getZExtValue());
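// Illustration: a 64-bit immediate such as 0x123456789abcdef0 splits into
// LoReg == 0x9abcdef0 and HiReg == 0x12345678 before being recombined by the
// REG_SEQUENCE below.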
2187
2188 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2189 .addReg(LoReg)
2190 .addImm(AMDGPU::sub0)
2191 .addReg(HiReg)
2192 .addImm(AMDGPU::sub1);
2193 }
2194
2195 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2196 // work for target independent opcodes
2197 I.eraseFromParent();
2198 const TargetRegisterClass *DstRC =
2199 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2200 if (!DstRC)
2201 return true;
2202 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2203}
2204
2205bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2206 // Only manually handle the f64 SGPR case.
2207 //
2208 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2209 // the bit ops theoretically have a second result due to the implicit def of
2210 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2211 // that is easy by disabling the check. The result works, but uses a
2212 // nonsensical sreg32orlds_and_sreg_1 regclass.
2213 //
2214 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2215 // the variadic REG_SEQUENCE operands.
2216
2217 Register Dst = MI.getOperand(0).getReg();
2218 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2219 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2220 MRI->getType(Dst) != LLT::scalar(64))
2221 return false;
2222
2223 Register Src = MI.getOperand(1).getReg();
2224 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2225 if (Fabs)
2226 Src = Fabs->getOperand(1).getReg();
2227
2228 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2229 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2230 return false;
2231
2232 MachineBasicBlock *BB = MI.getParent();
2233 const DebugLoc &DL = MI.getDebugLoc();
2234 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2235 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2236 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2237 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2238
2239 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2240 .addReg(Src, 0, AMDGPU::sub0);
2241 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2242 .addReg(Src, 0, AMDGPU::sub1);
2243 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2244 .addImm(0x80000000);
2245
2246 // Set or toggle sign bit.
2247 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2248 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2249 .addReg(HiReg)
2250 .addReg(ConstReg);
2251 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2252 .addReg(LoReg)
2253 .addImm(AMDGPU::sub0)
2254 .addReg(OpReg)
2255 .addImm(AMDGPU::sub1);
2256 MI.eraseFromParent();
2257 return true;
2258}
2259
2260// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2261bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2262 Register Dst = MI.getOperand(0).getReg();
2263 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2264 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2265 MRI->getType(Dst) != LLT::scalar(64))
2266 return false;
2267
2268 Register Src = MI.getOperand(1).getReg();
2269 MachineBasicBlock *BB = MI.getParent();
2270 const DebugLoc &DL = MI.getDebugLoc();
2271 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2272 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2273 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2274 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2275
2276 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2277 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2278 return false;
2279
2280 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2281 .addReg(Src, 0, AMDGPU::sub0);
2282 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2283 .addReg(Src, 0, AMDGPU::sub1);
2284 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2285 .addImm(0x7fffffff);
2286
2287 // Clear sign bit.
2288 // TODO: Should this use S_BITSET0_*?
2289 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2290 .addReg(HiReg)
2291 .addReg(ConstReg);
2292 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2293 .addReg(LoReg)
2294 .addImm(AMDGPU::sub0)
2295 .addReg(OpReg)
2296 .addImm(AMDGPU::sub1);
2297
2298 MI.eraseFromParent();
2299 return true;
2300}
2301
2302static bool isConstant(const MachineInstr &MI) {
2303 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2304}
2305
2306void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2307 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2308
2309 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2310
2311 assert(PtrMI);
2312
2313 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2314 return;
2315
2316 GEPInfo GEPInfo(*PtrMI);
2317
2318 for (unsigned i = 1; i != 3; ++i) {
2319 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2320 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2321 assert(OpDef);
2322 if (i == 2 && isConstant(*OpDef)) {
2323 // TODO: Could handle constant base + variable offset, but a combine
2324 // probably should have commuted it.
2325 assert(GEPInfo.Imm == 0);
2326 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2327 continue;
2328 }
2329 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2330 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2331 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2332 else
2333 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2334 }
2335
2336 AddrInfo.push_back(GEPInfo);
2337 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2338}
2339
2340bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2341 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2342}
2343
2344bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2345 if (!MI.hasOneMemOperand())
2346 return false;
2347
2348 const MachineMemOperand *MMO = *MI.memoperands_begin();
2349 const Value *Ptr = MMO->getValue();
2350
2351 // UndefValue means this is a load of a kernel input. These are uniform.
2352 // Sometimes LDS instructions have constant pointers.
2353 // If Ptr is null, then that means this mem operand contains a
2354 // PseudoSourceValue like GOT.
2355 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2356 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2357 return true;
2358
2359 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2360 return true;
2361
2362 const Instruction *I = dyn_cast<Instruction>(Ptr);
2363 return I && I->getMetadata("amdgpu.uniform");
2364}
2365
2366bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2367 for (const GEPInfo &GEPInfo : AddrInfo) {
2368 if (!GEPInfo.VgprParts.empty())
2369 return true;
2370 }
2371 return false;
2372}
2373
2374void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2375 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2376 unsigned AS = PtrTy.getAddressSpace();
2377 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2378 STI.ldsRequiresM0Init()) {
2379 MachineBasicBlock *BB = I.getParent();
2380
2381 // If DS instructions require M0 initialization, insert it before selecting.
2382 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2383 .addImm(-1);
2384 }
2385}
2386
2387bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2388 MachineInstr &I) const {
2389 if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) {
2390 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2391 unsigned AS = PtrTy.getAddressSpace();
2392 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
2393 return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2));
2394 }
2395
2396 initM0(I);
2397 return selectImpl(I, *CoverageInfo);
2398}
2399
2400// TODO: No rtn optimization.
2401bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2402 MachineInstr &MI) const {
2403 Register PtrReg = MI.getOperand(1).getReg();
2404 const LLT PtrTy = MRI->getType(PtrReg);
2405 if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2406 STI.useFlatForGlobal())
2407 return selectImpl(MI, *CoverageInfo);
2408
2409 Register DstReg = MI.getOperand(0).getReg();
2410 const LLT Ty = MRI->getType(DstReg);
2411 const bool Is64 = Ty.getSizeInBits() == 64;
2412 const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2413 Register TmpReg = MRI->createVirtualRegister(
2414 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2415
2416 const DebugLoc &DL = MI.getDebugLoc();
2417 MachineBasicBlock *BB = MI.getParent();
2418
2419 Register VAddr, RSrcReg, SOffset;
2420 int64_t Offset = 0;
2421
2422 unsigned Opcode;
2423 if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2424 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2425 AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2426 } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2427 RSrcReg, SOffset, Offset)) {
2428 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2429 AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2430 } else
2431 return selectImpl(MI, *CoverageInfo);
2432
2433 auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2434 .addReg(MI.getOperand(2).getReg());
2435
2436 if (VAddr)
2437 MIB.addReg(VAddr);
2438
2439 MIB.addReg(RSrcReg);
2440 if (SOffset)
2441 MIB.addReg(SOffset);
2442 else
2443 MIB.addImm(0);
2444
2445 MIB.addImm(Offset);
2446 MIB.addImm(AMDGPU::CPol::GLC);
2447 MIB.cloneMemRefs(MI);
2448
2449 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2450 .addReg(TmpReg, RegState::Kill, SubReg);
2451
2452 MI.eraseFromParent();
2453
2454 MRI->setRegClass(
2455 DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2456 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2457}
2458
2459static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2460 if (Reg.isPhysical())
2461 return false;
2462
2463 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2464 const unsigned Opcode = MI.getOpcode();
2465
2466 if (Opcode == AMDGPU::COPY)
2467 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2468
2469 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2470 Opcode == AMDGPU::G_XOR)
2471 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2472 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2473
2474 if (Opcode == TargetOpcode::G_INTRINSIC)
2475 return MI.getIntrinsicID() == Intrinsic::amdgcn_class;
2476
2477 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2478}
2479
2480bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2481 MachineBasicBlock *BB = I.getParent();
2482 MachineOperand &CondOp = I.getOperand(0);
2483 Register CondReg = CondOp.getReg();
2484 const DebugLoc &DL = I.getDebugLoc();
2485
2486 unsigned BrOpcode;
2487 Register CondPhysReg;
2488 const TargetRegisterClass *ConstrainRC;
2489
2490 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2491 // whether the branch is uniform when selecting the instruction. In
2492 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2493 // RegBankSelect knows what it's doing if the branch condition is scc, even
2494 // though it currently does not.
2495 if (!isVCC(CondReg, *MRI)) {
2496 if (MRI->getType(CondReg) != LLT::scalar(32))
2497 return false;
2498
2499 CondPhysReg = AMDGPU::SCC;
2500 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2501 ConstrainRC = &AMDGPU::SReg_32RegClass;
2502 } else {
2503 // FIXME: Should scc->vcc copies and with exec?
2504
2505 // Unless the value of CondReg is a result of a V_CMP* instruction then we
2506 // need to insert an and with exec.
2507 if (!isVCmpResult(CondReg, *MRI)) {
2508 const bool Is64 = STI.isWave64();
2509 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2510 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2511
2512 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2513 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2514 .addReg(CondReg)
2515 .addReg(Exec);
2516 CondReg = TmpReg;
2517 }
2518
2519 CondPhysReg = TRI.getVCC();
2520 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2521 ConstrainRC = TRI.getBoolRC();
2522 }
2523
2524 if (!MRI->getRegClassOrNull(CondReg))
2525 MRI->setRegClass(CondReg, ConstrainRC);
2526
2527 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2528 .addReg(CondReg);
2529 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2530 .addMBB(I.getOperand(1).getMBB());
2531
2532 I.eraseFromParent();
2533 return true;
2534}
2535
2536bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2537 MachineInstr &I) const {
2538 Register DstReg = I.getOperand(0).getReg();
2539 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2540 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2541 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2542 if (IsVGPR)
2543 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2544
2545 return RBI.constrainGenericRegister(
2546 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2547}
2548
2549bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2550 Register DstReg = I.getOperand(0).getReg();
2551 Register SrcReg = I.getOperand(1).getReg();
2552 Register MaskReg = I.getOperand(2).getReg();
2553 LLT Ty = MRI->getType(DstReg);
2554 LLT MaskTy = MRI->getType(MaskReg);
2555
2556 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2557 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2558 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2559 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2560 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2561 return false;
2562
2563 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2564 const TargetRegisterClass &RegRC
2565 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2566
2567 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2568 *MRI);
2569 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2570 *MRI);
2571 const TargetRegisterClass *MaskRC =
2572 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2573
2574 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2575 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2576 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2577 return false;
2578
2579 MachineBasicBlock *BB = I.getParent();
2580 const DebugLoc &DL = I.getDebugLoc();
2581 if (Ty.getSizeInBits() == 32) {
2582 assert(MaskTy.getSizeInBits() == 32 &&
2583 "ptrmask should have been narrowed during legalize");
2584
2585 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2586 .addReg(SrcReg)
2587 .addReg(MaskReg);
2588 I.eraseFromParent();
2589 return true;
2590 }
2591
2592 Register HiReg = MRI->createVirtualRegister(&RegRC);
2593 Register LoReg = MRI->createVirtualRegister(&RegRC);
2594
2595 // Extract the subregisters from the source pointer.
2596 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2597 .addReg(SrcReg, 0, AMDGPU::sub0);
2598 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2599 .addReg(SrcReg, 0, AMDGPU::sub1);
2600
2601 Register MaskedLo, MaskedHi;
2602
2603 // Try to avoid emitting a bit operation when we only need to touch half of
2604 // the 64-bit pointer.
2605 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2606
2607 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2608 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2609 if ((MaskOnes & MaskLo32) == MaskLo32) {
2610 // If all the bits in the low half are 1, we only need a copy for it.
2611 MaskedLo = LoReg;
2612 } else {
2613 // Extract the mask subregister and apply the and.
2614 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2615 MaskedLo = MRI->createVirtualRegister(&RegRC);
2616
2617 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2618 .addReg(MaskReg, 0, AMDGPU::sub0);
2619 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2620 .addReg(LoReg)
2621 .addReg(MaskLo);
2622 }
2623
2624 if ((MaskOnes & MaskHi32) == MaskHi32) {
2625 // If all the bits in the high half are 1, we only need a copy for it.
2626 MaskedHi = HiReg;
2627 } else {
2628 Register MaskHi = MRI->createVirtualRegister(&RegRC);
2629 MaskedHi = MRI->createVirtualRegister(&RegRC);
2630
2631 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2632 .addReg(MaskReg, 0, AMDGPU::sub1);
2633 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2634 .addReg(HiReg)
2635 .addReg(MaskHi);
2636 }
2637
2638 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2639 .addReg(MaskedLo)
2640 .addImm(AMDGPU::sub0)
2641 .addReg(MaskedHi)
2642 .addImm(AMDGPU::sub1);
2643 I.eraseFromParent();
2644 return true;
2645}
2646
2647/// Return the register to use for the index value, and the subregister to use
2648/// for the indirectly accessed register.
2649static std::pair<Register, unsigned>
2650computeIndirectRegIndex(MachineRegisterInfo &MRI,
2651 const SIRegisterInfo &TRI,
2652 const TargetRegisterClass *SuperRC,
2653 Register IdxReg,
2654 unsigned EltSize) {
2655 Register IdxBaseReg;
2656 int Offset;
2657
2658 std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2659 if (IdxBaseReg == AMDGPU::NoRegister) {
2660 // This will happen if the index is a known constant. This should ordinarily
2661 // be legalized out, but handle it as a register just in case.
2662 assert(Offset == 0);
2663 IdxBaseReg = IdxReg;
2664 }
2665
2666 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2667
2668 // Skip out of bounds offsets, or else we would end up using an undefined
2669 // register.
2670 if (static_cast<unsigned>(Offset) >= SubRegs.size())
2671 return std::make_pair(IdxReg, SubRegs[0]);
2672 return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2673}
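// Illustration: for a 128-bit register class with EltSize == 4 the split
// parts are sub0..sub3, so a constant offset of 2 yields (IdxBaseReg, sub2),
// while an offset of 4 or more is out of bounds and falls back to
// (IdxReg, sub0).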
2674
2675bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2676 MachineInstr &MI) const {
2677 Register DstReg = MI.getOperand(0).getReg();
2678 Register SrcReg = MI.getOperand(1).getReg();
2679 Register IdxReg = MI.getOperand(2).getReg();
2680
2681 LLT DstTy = MRI->getType(DstReg);
2682 LLT SrcTy = MRI->getType(SrcReg);
2683
2684 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2685 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2686 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2687
2688 // The index must be scalar. If it wasn't RegBankSelect should have moved this
2689 // into a waterfall loop.
2690 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2691 return false;
2692
2693 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2694 *MRI);
2695 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2696 *MRI);
2697 if (!SrcRC || !DstRC)
2698 return false;
2699 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2700 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2701 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2702 return false;
2703
2704 MachineBasicBlock *BB = MI.getParent();
2705 const DebugLoc &DL = MI.getDebugLoc();
2706 const bool Is64 = DstTy.getSizeInBits() == 64;
2707
2708 unsigned SubReg;
2709 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2710 DstTy.getSizeInBits() / 8);
2711
2712 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2713 if (DstTy.getSizeInBits() != 32 && !Is64)
2714 return false;
2715
2716 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2717 .addReg(IdxReg);
2718
2719 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2720 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2721 .addReg(SrcReg, 0, SubReg)
2722 .addReg(SrcReg, RegState::Implicit);
2723 MI.eraseFromParent();
2724 return true;
2725 }
2726
2727 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2728 return false;
2729
2730 if (!STI.useVGPRIndexMode()) {
2731 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2732 .addReg(IdxReg);
2733 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2734 .addReg(SrcReg, 0, SubReg)
2735 .addReg(SrcReg, RegState::Implicit);
2736 MI.eraseFromParent();
2737 return true;
2738 }
2739
2740 const MCInstrDesc &GPRIDXDesc =
2741 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
2742 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2743 .addReg(SrcReg)
2744 .addReg(IdxReg)
2745 .addImm(SubReg);
2746
2747 MI.eraseFromParent();
2748 return true;
2749}
2750
2751// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2752bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2753 MachineInstr &MI) const {
2754 Register DstReg = MI.getOperand(0).getReg();
2755 Register VecReg = MI.getOperand(1).getReg();
2756 Register ValReg = MI.getOperand(2).getReg();
2757 Register IdxReg = MI.getOperand(3).getReg();
2758
2759 LLT VecTy = MRI->getType(DstReg);
2760 LLT ValTy = MRI->getType(ValReg);
2761 unsigned VecSize = VecTy.getSizeInBits();
2762 unsigned ValSize = ValTy.getSizeInBits();
2763
2764 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2765 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2766 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2767
2768 assert(VecTy.getElementType() == ValTy);
2769
2770 // The index must be scalar. If it wasn't RegBankSelect should have moved this
2771 // into a waterfall loop.
2772 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2773 return false;
2774
2775 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2776 *MRI);
2777 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2778 *MRI);
2779
2780 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2781 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2782 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2783 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2784 return false;
2785
2786 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2787 return false;
2788
2789 unsigned SubReg;
2790 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2791 ValSize / 8);
2792
2793 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2794 STI.useVGPRIndexMode();
2795
2796 MachineBasicBlock *BB = MI.getParent();
2797 const DebugLoc &DL = MI.getDebugLoc();
2798
2799 if (!IndexMode) {
2800 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2801 .addReg(IdxReg);
2802
2803 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
2804 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
2805 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2806 .addReg(VecReg)
2807 .addReg(ValReg)
2808 .addImm(SubReg);
2809 MI.eraseFromParent();
2810 return true;
2811 }
2812
2813 const MCInstrDesc &GPRIDXDesc =
2814 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
2815 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2816 .addReg(VecReg)
2817 .addReg(ValReg)
2818 .addReg(IdxReg)
2819 .addImm(SubReg);
2820
2821 MI.eraseFromParent();
2822 return true;
2823}
2824
2825static bool isZeroOrUndef(int X) {
2826 return X == 0 || X == -1;
2827}
2828
2829static bool isOneOrUndef(int X) {
2830 return X == 1 || X == -1;
2831}
2832
2833static bool isZeroOrOneOrUndef(int X) {
2834 return X == 0 || X == 1 || X == -1;
2835}
2836
2837// Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2838// 32-bit register.
2839static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2840 ArrayRef<int> Mask) {
2841 NewMask[0] = Mask[0];
2842 NewMask[1] = Mask[1];
2843 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2844 return Src0;
2845
2846 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2847 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2848
2849 // Shift the mask inputs to be 0/1.
2850 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2851 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2852 return Src1;
2853}
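A short worked example of the normalization above (illustrative only, not part of the listed file): mask indices 0/1 select the low/high half of Src0, indices 2/3 select the low/high half of Src1, and -1 means undef.

  // Mask = {0, -1} -> NewMask = {0, -1}, returns Src0 (only Src0 is read)
  // Mask = {2,  3} -> NewMask = {0,  1}, returns Src1 (indices rebased by -2)
  // Mask = {3,  2} -> NewMask = {1,  0}, returns Src1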
2854
2855// This is only legal with VOP3P instructions as an aid to op_sel matching.
2856bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2857 MachineInstr &MI) const {
2858 Register DstReg = MI.getOperand(0).getReg();
2859 Register Src0Reg = MI.getOperand(1).getReg();
2860 Register Src1Reg = MI.getOperand(2).getReg();
2861 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2862
2863 const LLT V2S16 = LLT::fixed_vector(2, 16);
2864 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2865 return false;
2866
2867 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2868 return false;
2869
2870 assert(ShufMask.size() == 2);
2871 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2872
2873 MachineBasicBlock *MBB = MI.getParent();
2874 const DebugLoc &DL = MI.getDebugLoc();
2875
2876 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2877 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2878 const TargetRegisterClass &RC = IsVALU ?
2879 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2880
2881 // Handle the degenerate case which should have folded out.
2882 if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2883 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2884
2885 MI.eraseFromParent();
2886 return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2887 }
2888
2889 // A legal VOP3P mask only reads one of the sources.
2890 int Mask[2];
2891 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2892
2893 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2894 !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2895 return false;
2896
2897 // TODO: This also should have been folded out
2898 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2899 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2900 .addReg(SrcVec);
2901
2902 MI.eraseFromParent();
2903 return true;
2904 }
2905
2906 if (Mask[0] == 1 && Mask[1] == -1) {
2907 if (IsVALU) {
2908 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2909 .addImm(16)
2910 .addReg(SrcVec);
2911 } else {
2912 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2913 .addReg(SrcVec)
2914 .addImm(16);
2915 }
2916 } else if (Mask[0] == -1 && Mask[1] == 0) {
2917 if (IsVALU) {
2918 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2919 .addImm(16)
2920 .addReg(SrcVec);
2921 } else {
2922 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2923 .addReg(SrcVec)
2924 .addImm(16);
2925 }
2926 } else if (Mask[0] == 0 && Mask[1] == 0) {
2927 if (IsVALU) {
2928 // Write low half of the register into the high half.
2929 MachineInstr *MovSDWA =
2930 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2931 .addImm(0) // $src0_modifiers
2932 .addReg(SrcVec) // $src0
2933 .addImm(0) // $clamp
2934 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2935 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2936 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2937 .addReg(SrcVec, RegState::Implicit);
2938 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2939 } else {
2940 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2941 .addReg(SrcVec)
2942 .addReg(SrcVec);
2943 }
2944 } else if (Mask[0] == 1 && Mask[1] == 1) {
2945 if (IsVALU) {
2946 // Write high half of the register into the low half.
2947 MachineInstr *MovSDWA =
2948 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2949 .addImm(0) // $src0_modifiers
2950 .addReg(SrcVec) // $src0
2951 .addImm(0) // $clamp
2952 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
2953 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2954 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
2955 .addReg(SrcVec, RegState::Implicit);
2956 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2957 } else {
2958 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2959 .addReg(SrcVec)
2960 .addReg(SrcVec);
2961 }
2962 } else if (Mask[0] == 1 && Mask[1] == 0) {
2963 if (IsVALU) {
2964 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg)
2965 .addReg(SrcVec)
2966 .addReg(SrcVec)
2967 .addImm(16);
2968 } else {
2969 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2970 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2971 .addReg(SrcVec)
2972 .addImm(16);
2973 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2974 .addReg(TmpReg)
2975 .addReg(SrcVec);
2976 }
2977 } else
2978 llvm_unreachable("all shuffle masks should be handled");
2979
2980 MI.eraseFromParent();
2981 return true;
2982}
2983
2984bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
2985 MachineInstr &MI) const {
2986 if (STI.hasGFX90AInsts())
2987 return selectImpl(MI, *CoverageInfo);
2988
2989 MachineBasicBlock *MBB = MI.getParent();
2990 const DebugLoc &DL = MI.getDebugLoc();
2991
2992 if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
2993 Function &F = MBB->getParent()->getFunction();
2994 DiagnosticInfoUnsupported
2995 NoFpRet(F, "return versions of fp atomics not supported",
2996 MI.getDebugLoc(), DS_Error);
2997 F.getContext().diagnose(NoFpRet);
2998 return false;
2999 }
3000
3001 // FIXME: This is only needed because tablegen requires the number of dst
3002 // operands in the match and replace patterns to be the same. Otherwise the
3003 // patterns can be exported from the SDag path.
3004 MachineOperand &VDataIn = MI.getOperand(1);
3005 MachineOperand &VIndex = MI.getOperand(3);
3006 MachineOperand &VOffset = MI.getOperand(4);
3007 MachineOperand &SOffset = MI.getOperand(5);
3008 int16_t Offset = MI.getOperand(6).getImm();
3009
3010 bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
3011 bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);
3012
3013 unsigned Opcode;
3014 if (HasVOffset) {
3015 Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
3016 : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
3017 } else {
3018 Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
3019 : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
3020 }
3021
3022 if (MRI->getType(VDataIn.getReg()).isVector()) {
3023 switch (Opcode) {
3024 case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
3025 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
3026 break;
3027 case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
3028 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
3029 break;
3030 case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
3031 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
3032 break;
3033 case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
3034 Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
3035 break;
3036 }
3037 }
3038
3039 auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
3040 I.add(VDataIn);
3041
3042 if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
3043 Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
3044 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3045 BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3046 .addReg(VIndex.getReg())
3047 .addImm(AMDGPU::sub0)
3048 .addReg(VOffset.getReg())
3049 .addImm(AMDGPU::sub1);
3050
3051 I.addReg(IdxReg);
3052 } else if (HasVIndex) {
3053 I.add(VIndex);
3054 } else if (HasVOffset) {
3055 I.add(VOffset);
3056 }
3057
3058 I.add(MI.getOperand(2)); // rsrc
3059 I.add(SOffset);
3060 I.addImm(Offset);
3061 I.addImm(MI.getOperand(7).getImm()); // cpol
3062 I.cloneMemRefs(MI);
3063
3064 MI.eraseFromParent();
3065
3066 return true;
3067}
3068
3069bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
3070 MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const {
3071
3072 if (STI.hasGFX90AInsts()) {
3073 // gfx90a adds return versions of the global atomic fadd instructions, so no
3074 // special handling is required.
3075 return selectImpl(MI, *CoverageInfo);
3076 }
3077
3078 MachineBasicBlock *MBB = MI.getParent();
3079 const DebugLoc &DL = MI.getDebugLoc();
3080
3081 if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
3082 Function &F = MBB->getParent()->getFunction();
3083 DiagnosticInfoUnsupported
3084 NoFpRet(F, "return versions of fp atomics not supported",
3085 MI.getDebugLoc(), DS_Error);
3086 F.getContext().diagnose(NoFpRet);
3087 return false;
3088 }
3089
3090 // FIXME: This is only needed because tablegen requires the number of dst
3091 // operands in the match and replace patterns to be the same. Otherwise the
3092 // patterns can be exported from the SDag path.
3093 auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal);
3094
3095 Register Data = DataOp.getReg();
3096 const unsigned Opc = MRI->getType(Data).isVector() ?
3097 AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
3098 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3099 .addReg(Addr.first)
3100 .addReg(Data)
3101 .addImm(Addr.second)
3102 .addImm(0) // cpol
3103 .cloneMemRefs(MI);
3104
3105 MI.eraseFromParent();
3106 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3107}
3108
3109bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
3110 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3111 MI.RemoveOperand(1);
3112 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3113 return true;
3114}
3115
3116bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3117 if (I.isPHI())
3118 return selectPHI(I);
3119
3120 if (!I.isPreISelOpcode()) {
3121 if (I.isCopy())
3122 return selectCOPY(I);
3123 return true;
3124 }
3125
3126 switch (I.getOpcode()) {
3127 case TargetOpcode::G_AND:
3128 case TargetOpcode::G_OR:
3129 case TargetOpcode::G_XOR:
3130 if (selectImpl(I, *CoverageInfo))
3131 return true;
3132 return selectG_AND_OR_XOR(I);
3133 case TargetOpcode::G_ADD:
3134 case TargetOpcode::G_SUB:
3135 if (selectImpl(I, *CoverageInfo))
3136 return true;
3137 return selectG_ADD_SUB(I);
3138 case TargetOpcode::G_UADDO:
3139 case TargetOpcode::G_USUBO:
3140 case TargetOpcode::G_UADDE:
3141 case TargetOpcode::G_USUBE:
3142 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3143 case TargetOpcode::G_INTTOPTR:
3144 case TargetOpcode::G_BITCAST:
3145 case TargetOpcode::G_PTRTOINT:
3146 return selectCOPY(I);
3147 case TargetOpcode::G_CONSTANT:
3148 case TargetOpcode::G_FCONSTANT:
3149 return selectG_CONSTANT(I);
3150 case TargetOpcode::G_FNEG:
3151 if (selectImpl(I, *CoverageInfo))
3152 return true;
3153 return selectG_FNEG(I);
3154 case TargetOpcode::G_FABS:
3155 if (selectImpl(I, *CoverageInfo))
3156 return true;
3157 return selectG_FABS(I);
3158 case TargetOpcode::G_EXTRACT:
3159 return selectG_EXTRACT(I);
3160 case TargetOpcode::G_MERGE_VALUES:
3161 case TargetOpcode::G_BUILD_VECTOR:
3162 case TargetOpcode::G_CONCAT_VECTORS:
3163 return selectG_MERGE_VALUES(I);
3164 case TargetOpcode::G_UNMERGE_VALUES:
3165 return selectG_UNMERGE_VALUES(I);
3166 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3167 return selectG_BUILD_VECTOR_TRUNC(I);
3168 case TargetOpcode::G_PTR_ADD:
3169 return selectG_PTR_ADD(I);
3170 case TargetOpcode::G_IMPLICIT_DEF:
3171 return selectG_IMPLICIT_DEF(I);
3172 case TargetOpcode::G_FREEZE:
3173 return selectCOPY(I);
3174 case TargetOpcode::G_INSERT:
3175 return selectG_INSERT(I);
3176 case TargetOpcode::G_INTRINSIC:
3177 return selectG_INTRINSIC(I);
3178 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3179 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3180 case TargetOpcode::G_ICMP:
3181 if (selectG_ICMP(I))
3182 return true;
3183 return selectImpl(I, *CoverageInfo);
3184 case TargetOpcode::G_LOAD:
3185 case TargetOpcode::G_STORE:
3186 case TargetOpcode::G_ATOMIC_CMPXCHG:
3187 case TargetOpcode::G_ATOMICRMW_XCHG:
3188 case TargetOpcode::G_ATOMICRMW_ADD:
3189 case TargetOpcode::G_ATOMICRMW_SUB:
3190 case TargetOpcode::G_ATOMICRMW_AND:
3191 case TargetOpcode::G_ATOMICRMW_OR:
3192 case TargetOpcode::G_ATOMICRMW_XOR:
3193 case TargetOpcode::G_ATOMICRMW_MIN:
3194 case TargetOpcode::G_ATOMICRMW_MAX:
3195 case TargetOpcode::G_ATOMICRMW_UMIN:
3196 case TargetOpcode::G_ATOMICRMW_UMAX:
3197 case TargetOpcode::G_ATOMICRMW_FADD:
3198 case AMDGPU::G_AMDGPU_ATOMIC_INC:
3199 case AMDGPU::G_AMDGPU_ATOMIC_DEC:
3200 case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3201 case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3202 return selectG_LOAD_STORE_ATOMICRMW(I);
3203 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
3204 return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
3205 case TargetOpcode::G_SELECT:
3206 return selectG_SELECT(I);
3207 case TargetOpcode::G_TRUNC:
3208 return selectG_TRUNC(I);
3209 case TargetOpcode::G_SEXT:
3210 case TargetOpcode::G_ZEXT:
3211 case TargetOpcode::G_ANYEXT:
3212 case TargetOpcode::G_SEXT_INREG:
3213 if (selectImpl(I, *CoverageInfo))
3214 return true;
3215 return selectG_SZA_EXT(I);
3216 case TargetOpcode::G_BRCOND:
3217 return selectG_BRCOND(I);
3218 case TargetOpcode::G_GLOBAL_VALUE:
3219 return selectG_GLOBAL_VALUE(I);
3220 case TargetOpcode::G_PTRMASK:
3221 return selectG_PTRMASK(I);
3222 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3223 return selectG_EXTRACT_VECTOR_ELT(I);
3224 case TargetOpcode::G_INSERT_VECTOR_ELT:
3225 return selectG_INSERT_VECTOR_ELT(I);
3226 case TargetOpcode::G_SHUFFLE_VECTOR:
3227 return selectG_SHUFFLE_VECTOR(I);
3228 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3229 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3230 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3231 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3232 const AMDGPU::ImageDimIntrinsicInfo *Intr
3233 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3234 assert(Intr && "not an image intrinsic with image pseudo");
3235 return selectImageIntrinsic(I, Intr);
3236 }
3237 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3238 return selectBVHIntrinsic(I);
3239 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3240 return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
3241 case AMDGPU::G_SBFX:
3242 case AMDGPU::G_UBFX:
3243 return selectG_SBFX_UBFX(I);
3244 case AMDGPU::G_SI_CALL:
3245 I.setDesc(TII.get(AMDGPU::SI_CALL));
3246 return true;
3247 default:
3248 return selectImpl(I, *CoverageInfo);
3249 }
3250 return false;
3251}
3252
3253InstructionSelector::ComplexRendererFns
3254AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3255 return {{
3256 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3257 }};
3258
3259}
3260
3261std::pair<Register, unsigned>
3262AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3263 bool AllowAbs) const {
3264 Register Src = Root.getReg();
3265 Register OrigSrc = Src;
3266 unsigned Mods = 0;
3267 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3268
3269 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
3270 Src = MI->getOperand(1).getReg();
3271 Mods |= SISrcMods::NEG;
3272 MI = getDefIgnoringCopies(Src, *MRI);
3273 }
3274
3275 if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) {
3276 Src = MI->getOperand(1).getReg();
3277 Mods |= SISrcMods::ABS;
3278 }
3279
3280 if (Mods != 0 &&
3281 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3282 MachineInstr *UseMI = Root.getParent();
3283
3284 // If we looked through copies to find source modifiers on an SGPR operand,
3285 // we now have an SGPR register source. To avoid potentially violating the
3286 // constant bus restriction, we need to insert a copy to a VGPR.
3287 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
3288 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
3289 TII.get(AMDGPU::COPY), VGPRSrc)
3290 .addReg(Src);
3291 Src = VGPRSrc;
3292 }
3293
3294 return std::make_pair(Src, Mods);
3295}
3296
3297///
3298/// This will select either an SGPR or VGPR operand and will save us from
3299/// having to write an extra tablegen pattern.
3300InstructionSelector::ComplexRendererFns
3301AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3302 return {{
3303 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3304 }};
3305}
3306
3307InstructionSelector::ComplexRendererFns
3308AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3309 Register Src;
3310 unsigned Mods;
3311 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3312
3313 return {{
3314 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3315 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3316 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3317 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3318 }};
3319}
3320
3321InstructionSelector::ComplexRendererFns
3322AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3323 Register Src;
3324 unsigned Mods;
3325 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3326
3327 return {{
3328 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3329 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3330 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3331 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3332 }};
3333}
3334
3335InstructionSelector::ComplexRendererFns
3336AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3337 return {{
3338 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3339 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3340 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3341 }};
3342}
3343
3344InstructionSelector::ComplexRendererFns
3345AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3346 Register Src;
3347 unsigned Mods;
3348 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3349
3350 return {{
3351 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3352 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3353 }};
3354}
3355
3356InstructionSelector::ComplexRendererFns
3357AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3358 Register Src;
3359 unsigned Mods;
3360 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3361
3362 return {{
3363 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3364 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3365 }};
3366}
3367
3368InstructionSelector::ComplexRendererFns
3369AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3370 Register Reg = Root.getReg();
3371 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3372 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3373 Def->getOpcode() == AMDGPU::G_FABS))
3374 return {};
3375 return {{
3376 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3377 }};
3378}
3379
3380std::pair<Register, unsigned>
3381AMDGPUInstructionSelector::selectVOP3PModsImpl(
3382 Register Src, const MachineRegisterInfo &MRI) const {
3383 unsigned Mods = 0;
3384 MachineInstr *MI = MRI.getVRegDef(Src);
3385
3386 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3387 // It's possible to see an f32 fneg here, but unlikely.
3388 // TODO: Treat f32 fneg as only high bit.
3389 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3390 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3391 Src = MI->getOperand(1).getReg();
3392 MI = MRI.getVRegDef(Src);
Value stored to 'MI' is never read
3393 }
3394
3395 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3396
3397 // Packed instructions do not have abs modifiers.
3398 Mods |= SISrcMods::OP_SEL_1;
3399
3400 return std::make_pair(Src, Mods);
3401}
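The dead store flagged above is the reassignment of MI after Src is rewritten; MI is never read again before the function returns. A minimal sketch of how the tail of this function could look without the dead store (illustrative only, not necessarily how it was fixed upstream):

  if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
      // It's possible to see an f32 fneg here, but unlikely.
      // TODO: Treat f32 fneg as only high bit.
      MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = MI->getOperand(1).getReg();
    // No 'MI = MRI.getVRegDef(Src);' here: the updated MI would never be read.
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;
  return std::make_pair(Src, Mods);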
3402
3403InstructionSelector::ComplexRendererFns
3404AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3405 MachineRegisterInfo &MRI
3406 = Root.getParent()->getParent()->getParent()->getRegInfo();
3407
3408 Register Src;
3409 unsigned Mods;
3410 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3411
3412 return {{
3413 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3414 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3415 }};
3416}
3417
3418InstructionSelector::ComplexRendererFns
3419AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3420 Register Src;
3421 unsigned Mods;
3422 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3423 if (!isKnownNeverNaN(Src, *MRI))
3424 return None;
3425
3426 return {{
3427 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3428 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3429 }};
3430}
3431
3432InstructionSelector::ComplexRendererFns
3433AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3434 // FIXME: Handle op_sel
3435 return {{
3436 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3437 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3438 }};
3439}
3440
3441InstructionSelector::ComplexRendererFns
3442AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3443 SmallVector<GEPInfo, 4> AddrInfo;
3444 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3445
3446 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3447 return None;
3448
3449 const GEPInfo &GEPInfo = AddrInfo[0];
3450 Optional<int64_t> EncodedImm =
3451 AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3452 if (!EncodedImm)
3453 return None;
3454
3455 unsigned PtrReg = GEPInfo.SgprParts[0];
3456 return {{
3457 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3458 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3459 }};
3460}
3461
3462InstructionSelector::ComplexRendererFns
3463AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3464 SmallVector<GEPInfo, 4> AddrInfo;
3465 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3466
3467 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3468 return None;
3469
3470 const GEPInfo &GEPInfo = AddrInfo[0];
3471 Register PtrReg = GEPInfo.SgprParts[0];
3472 Optional<int64_t> EncodedImm =
3473 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3474 if (!EncodedImm)
3475 return None;
3476
3477 return {{
3478 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3479 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3480 }};
3481}
3482
3483InstructionSelector::ComplexRendererFns
3484AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3485 MachineInstr *MI = Root.getParent();
3486 MachineBasicBlock *MBB = MI->getParent();
3487
3488 SmallVector<GEPInfo, 4> AddrInfo;
3489 getAddrModeInfo(*MI, *MRI, AddrInfo);
3490
3491 // FIXME: We should shrink the GEP if the offset is known to fit in 32 bits;
3492 // then we can select all ptr + 32-bit offsets, not just immediate offsets.
3493 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3494 return None;
3495
3496 const GEPInfo &GEPInfo = AddrInfo[0];
3497 // SGPR offset is unsigned.
3498 if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3499 return None;
3500
3501 // If we make it this far we have a load with a 32-bit immediate offset.
3502 // It is OK to select this using an SGPR offset, because we have already
3503 // failed trying to select this load into one of the _IMM variants since
3504 // the _IMM patterns are considered before the _SGPR patterns.
3505 Register PtrReg = GEPInfo.SgprParts[0];
3506 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3507 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3508 .addImm(GEPInfo.Imm);
3509 return {{
3510 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3511 [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3512 }};
3513}
3514
3515std::pair<Register, int>
3516AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
3517 uint64_t FlatVariant) const {
3518 MachineInstr *MI = Root.getParent();
3519
3520 auto Default = std::make_pair(Root.getReg(), 0);
3521
3522 if (!STI.hasFlatInstOffsets())
3523 return Default;
3524
3525 Register PtrBase;
3526 int64_t ConstOffset;
3527 std::tie(PtrBase, ConstOffset) =
3528 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3529 if (ConstOffset == 0)
3530 return Default;
3531
3532 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3533 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
3534 return Default;
3535
3536 return std::make_pair(PtrBase, ConstOffset);
3537}
3538
3539InstructionSelector::ComplexRendererFns
3540AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3541 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
3542
3543 return {{
3544 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3545 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3546 }};
3547}
3548
3549InstructionSelector::ComplexRendererFns
3550AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
3551 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
3552
3553 return {{
3554 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3555 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3556 }};
3557}
3558
3559InstructionSelector::ComplexRendererFns
3560AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
3561 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
3562
3563 return {{
3564 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3565 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3566 }};
3567}
3568
3569/// Match a zero extend from a 32-bit value to 64-bits.
3570static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3571 Register ZExtSrc;
3572 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3573 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3574
3575 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3576 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3577 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3578 return false;
3579
3580 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3581 return Def->getOperand(1).getReg();
3582 }
3583
3584 return Register();
3585}
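Two shapes the matcher above accepts (illustrative MIR, not taken from the file):

  // %off:_(s64) = G_ZEXT %x:_(s32)                 -> returns %x
  // %off:_(s64) = G_MERGE_VALUES %x:_(s32), %zero  -> returns %x when %zero is constant 0
  // Anything else yields an invalid Register().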
3586
3587// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
3588InstructionSelector::ComplexRendererFns
3589AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3590 Register Addr = Root.getReg();
3591 Register PtrBase;
3592 int64_t ConstOffset;
3593 int64_t ImmOffset = 0;
3594
3595 // Match the immediate offset first, which canonically is moved as low as
3596 // possible.
3597 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3598
3599 if (ConstOffset != 0) {
3600 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
3601 SIInstrFlags::FlatGlobal)) {
3602 Addr = PtrBase;
3603 ImmOffset = ConstOffset;
3604 } else {
3605 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
3606 if (!PtrBaseDef)
3607 return None;
3608
3609 if (isSGPR(PtrBaseDef->Reg)) {
3610 if (ConstOffset > 0) {
3611 // Offset is too large.
3612 //
3613 // saddr + large_offset -> saddr +
3614 // (voffset = large_offset & ~MaxOffset) +
3615 // (large_offset & MaxOffset);
3616 int64_t SplitImmOffset, RemainderOffset;
3617 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
3618 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
3619
3620 if (isUInt<32>(RemainderOffset)) {
3621 MachineInstr *MI = Root.getParent();
3622 MachineBasicBlock *MBB = MI->getParent();
3623 Register HighBits =
3624 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3625
3626 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3627 HighBits)
3628 .addImm(RemainderOffset);
3629
3630 return {{
3631 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
3632 [=](MachineInstrBuilder &MIB) {
3633 MIB.addReg(HighBits);
3634 }, // voffset
3635 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
3636 }};
3637 }
3638 }
3639
3640 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
3641 // is 1, we would need to perform 1 or 2 extra moves for each half of
3642 // the constant, so it is better to do a scalar add and then issue a
3643 // single VALU instruction to materialize zero. Otherwise it takes fewer
3644 // instructions to perform VALU adds with immediates or inline literals.
3645 unsigned NumLiterals =
3646 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
3647 !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
3648 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
3649 return None;
3650 }
3651 }
3652 }
3653
3654 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3655 if (!AddrDef)
3656 return None;
3657
3658 // Match the variable offset.
3659 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3660 // Look through the SGPR->VGPR copy.
3661 Register SAddr =
3662 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3663
3664 if (SAddr && isSGPR(SAddr)) {
3665 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3666
3667 // It's possible voffset is an SGPR here, but the copy to VGPR will be
3668 // inserted later.
3669 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3670 return {{[=](MachineInstrBuilder &MIB) { // saddr
3671 MIB.addReg(SAddr);
3672 },
3673 [=](MachineInstrBuilder &MIB) { // voffset
3674 MIB.addReg(VOffset);
3675 },
3676 [=](MachineInstrBuilder &MIB) { // offset
3677 MIB.addImm(ImmOffset);
3678 }}};
3679 }
3680 }
3681 }
3682
3683 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
3684 // drop this.
3685 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
3686 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
3687 return None;
3688
3689 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
3690 // moves required to copy a 64-bit SGPR to VGPR.
3691 MachineInstr *MI = Root.getParent();
3692 MachineBasicBlock *MBB = MI->getParent();
3693 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3694
3695 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3696 .addImm(0);
3697
3698 return {{
3699 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
3700 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
3701 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3702 }};
3703}
3704
3705InstructionSelector::ComplexRendererFns
3706AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
3707 Register Addr = Root.getReg();
3708 Register PtrBase;
3709 int64_t ConstOffset;
3710 int64_t ImmOffset = 0;
3711
3712 // Match the immediate offset first, which canonically is moved as low as
3713 // possible.
3714 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3715
3716 if (ConstOffset != 0 &&
3717 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
3718 SIInstrFlags::FlatScratch)) {
3719 Addr = PtrBase;
3720 ImmOffset = ConstOffset;
3721 }
3722
3723 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3724 if (!AddrDef)
3725 return None;
3726
3727 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3728 int FI = AddrDef->MI->getOperand(1).getIndex();
3729 return {{
3730 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
3731 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3732 }};
3733 }
3734
3735 Register SAddr = AddrDef->Reg;
3736
3737 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3738 Register LHS = AddrDef->MI->getOperand(1).getReg();
3739 Register RHS = AddrDef->MI->getOperand(2).getReg();
3740 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
3741 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
3742
3743 if (LHSDef && RHSDef &&
3744 LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
3745 isSGPR(RHSDef->Reg)) {
3746 int FI = LHSDef->MI->getOperand(1).getIndex();
3747 MachineInstr &I = *Root.getParent();
3748 MachineBasicBlock *BB = I.getParent();
3749 const DebugLoc &DL = I.getDebugLoc();
3750 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3751
3752 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
3753 .addFrameIndex(FI)
3754 .addReg(RHSDef->Reg);
3755 }
3756 }
3757
3758 if (!isSGPR(SAddr))
3759 return None;
3760
3761 return {{
3762 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
3763 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3764 }};
3765}
3766
3767InstructionSelector::ComplexRendererFns
3768AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3769 MachineInstr *MI = Root.getParent();
3770 MachineBasicBlock *MBB = MI->getParent();
3771 MachineFunction *MF = MBB->getParent();
3772 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3773
3774 int64_t Offset = 0;
3775 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3776 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3777 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3778
3779 // TODO: Should this be inside the render function? The iterator seems to
3780 // move.
3781 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3782 HighBits)
3783 .addImm(Offset & ~4095);
3784
3785 return {{[=](MachineInstrBuilder &MIB) { // rsrc
3786 MIB.addReg(Info->getScratchRSrcReg());
3787 },
3788 [=](MachineInstrBuilder &MIB) { // vaddr
3789 MIB.addReg(HighBits);
3790 },
3791 [=](MachineInstrBuilder &MIB) { // soffset
3792 // Use constant zero for soffset and rely on eliminateFrameIndex
3793 // to choose the appropriate frame register if need be.
3794 MIB.addImm(0);
3795 },
3796 [=](MachineInstrBuilder &MIB) { // offset
3797 MIB.addImm(Offset & 4095);
3798 }}};
3799 }
3800
3801 assert(Offset == 0 || Offset == -1);
3802
3803 // Try to fold a frame index directly into the MUBUF vaddr field, and any
3804 // offsets.
3805 Optional<int> FI;
3806 Register VAddr = Root.getReg();
3807 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3808 Register PtrBase;
3809 int64_t ConstOffset;
3810 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
3811 if (ConstOffset != 0) {
3812 if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
3813 (!STI.privateMemoryResourceIsRangeChecked() ||
3814 KnownBits->signBitIsZero(PtrBase))) {
3815 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
3816 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3817 FI = PtrBaseDef->getOperand(1).getIndex();
3818 else
3819 VAddr = PtrBase;
3820 Offset = ConstOffset;
3821 }
3822 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3823 FI = RootDef->getOperand(1).getIndex();
3824 }
3825 }
3826
3827 return {{[=](MachineInstrBuilder &MIB) { // rsrc
3828 MIB.addReg(Info->getScratchRSrcReg());
3829 },
3830 [=](MachineInstrBuilder &MIB) { // vaddr
3831 if (FI.hasValue())
3832 MIB.addFrameIndex(FI.getValue());
3833 else
3834 MIB.addReg(VAddr);
3835 },
3836 [=](MachineInstrBuilder &MIB) { // soffset
3837 // Use constant zero for soffset and rely on eliminateFrameIndex
3838 // to choose the appropriate frame register if need be.
3839 MIB.addImm(0);
3840 },
3841 [=](MachineInstrBuilder &MIB) { // offset
3842 MIB.addImm(Offset);
3843 }}};
3844}
3845
3846bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3847 int64_t Offset) const {
3848 if (!isUInt<16>(Offset))
3849 return false;
3850
3851 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3852 return true;
3853
3854 // On Southern Islands, instructions with a negative base value and an offset
3855 // don't seem to work.
3856 return KnownBits->signBitIsZero(Base);
3857}
3858
3859bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
3860 int64_t Offset1,
3861 unsigned Size) const {
3862 if (Offset0 % Size != 0 || Offset1 % Size != 0)
3863 return false;
3864 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
3865 return false;
3866
3867 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3868 return true;
3869
3870 // On Southern Islands, instructions with a negative base value and an offset
3871 // don't seem to work.
3872 return KnownBits->signBitIsZero(Base);
3873}
3874
3875bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
3876 unsigned ShAmtBits) const {
3877 assert(MI.getOpcode() == TargetOpcode::G_AND);
3878
3879 Optional<APInt> RHS = getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
3880 if (!RHS)
3881 return false;
3882
3883 if (RHS->countTrailingOnes() >= ShAmtBits)
3884 return true;
3885
3886 const APInt &LHSKnownZeros =
3887 KnownBits->getKnownZeroes(MI.getOperand(1).getReg());
3888 return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits;
3889}
3890
3891InstructionSelector::ComplexRendererFns
3892AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3893 MachineOperand &Root) const {
3894 MachineInstr *MI = Root.getParent();
3895 MachineBasicBlock *MBB = MI->getParent();
3896
3897 int64_t Offset = 0;
3898 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3899 !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3900 return {};
3901
3902 const MachineFunction *MF = MBB->getParent();
3903 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3904
3905 return {{
3906 [=](MachineInstrBuilder &MIB) { // rsrc
3907 MIB.addReg(Info->getScratchRSrcReg());
3908 },
3909 [=](MachineInstrBuilder &MIB) { // soffset
3910 MIB.addImm(0);
3911 },
3912 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3913 }};
3914}
3915
3916std::pair<Register, unsigned>
3917AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3918 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3919 if (!RootDef)
3920 return std::make_pair(Root.getReg(), 0);
3921
3922 int64_t ConstAddr = 0;
3923
3924 Register PtrBase;
3925 int64_t Offset;
3926 std::tie(PtrBase, Offset) =
3927 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3928
3929 if (Offset) {
3930 if (isDSOffsetLegal(PtrBase, Offset)) {
3931 // (add n0, c0)
3932 return std::make_pair(PtrBase, Offset);
3933 }
3934 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3935 // TODO
3936
3937
3938 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3939 // TODO
3940
3941 }
3942
3943 return std::make_pair(Root.getReg(), 0);
3944}
3945
3946InstructionSelector::ComplexRendererFns
3947AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3948 Register Reg;
3949 unsigned Offset;
3950 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3951 return {{
3952 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3953 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3954 }};
3955}
3956
3957InstructionSelector::ComplexRendererFns
3958AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3959 return selectDSReadWrite2(Root, 4);
3960}
3961
3962InstructionSelector::ComplexRendererFns
3963AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
3964 return selectDSReadWrite2(Root, 8);
3965}
3966
3967InstructionSelector::ComplexRendererFns
3968AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
3969 unsigned Size) const {
3970 Register Reg;
3971 unsigned Offset;
3972 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
3973 return {{
3974 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3975 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3976 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3977 }};
3978}
3979
3980std::pair<Register, unsigned>
3981AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
3982 unsigned Size) const {
3983 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3984 if (!RootDef)
3985 return std::make_pair(Root.getReg(), 0);
3986
3987 int64_t ConstAddr = 0;
3988
3989 Register PtrBase;
3990 int64_t Offset;
3991 std::tie(PtrBase, Offset) =
3992 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3993
3994 if (Offset) {
3995 int64_t OffsetValue0 = Offset;
3996 int64_t OffsetValue1 = Offset + Size;
3997 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
3998 // (add n0, c0)
3999 return std::make_pair(PtrBase, OffsetValue0 / Size);
4000 }
4001 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4002 // TODO
4003
4004 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4005 // TODO
4006
4007 }
4008
4009 return std::make_pair(Root.getReg(), 0);
4010}
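Offset arithmetic sketch for the read2/write2 path above (illustrative numbers, not from the file): with Size = 4 and a matched constant offset of 8, OffsetValue0/OffsetValue1 are 8 and 12. isDSOffset2Legal requires both to be multiples of 4 whose quotients fit in 8 bits, so the pair {PtrBase, 8 / 4 = 2} is returned and selectDSReadWrite2 renders offset0 = 2 and offset1 = 3.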
4011
4012/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4013/// the base value with the constant offset. There may be intervening copies
4014/// between \p Root and the identified constant. Returns \p Root, 0 if this does
4015/// not match the pattern.
4016std::pair<Register, int64_t>
4017AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4018 Register Root, const MachineRegisterInfo &MRI) const {
4019 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4020 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4021 return {Root, 0};
4022
4023 MachineOperand &RHS = RootI->getOperand(2);
4024 Optional<ValueAndVReg> MaybeOffset =
4025 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4026 if (!MaybeOffset)
4027 return {Root, 0};
4028 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
4029}
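Example of what the helper above returns (illustrative MIR, not from the file):

  // %c:_(s64)  = G_CONSTANT i64 16
  // %ptr:_(p1) = G_PTR_ADD %base, %c
  // getPtrBaseWithConstantOffset(%ptr, MRI) == {%base, 16}
  // Any input that is not a G_PTR_ADD with a constant RHS comes back as {%ptr, 0}.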
4030
4031static void addZeroImm(MachineInstrBuilder &MIB) {
4032 MIB.addImm(0);
4033}
4034
4035/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
4036/// BasePtr is not valid, a null base pointer will be used.
4037static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4038 uint32_t FormatLo, uint32_t FormatHi,
4039 Register BasePtr) {
4040 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4041 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4042 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4043 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
4044
4045 B.buildInstr(AMDGPU::S_MOV_B32)
4046 .addDef(RSrc2)
4047 .addImm(FormatLo);
4048 B.buildInstr(AMDGPU::S_MOV_B32)
4049 .addDef(RSrc3)
4050 .addImm(FormatHi);
4051
4052 // Build the half of the register holding the constants before building the
4053 // full 128-bit register. If we are building multiple resource descriptors,
4054 // this will allow CSEing of the 2-component register.
4055 B.buildInstr(AMDGPU::REG_SEQUENCE)
4056 .addDef(RSrcHi)
4057 .addReg(RSrc2)
4058 .addImm(AMDGPU::sub0)
4059 .addReg(RSrc3)
4060 .addImm(AMDGPU::sub1);
4061
4062 Register RSrcLo = BasePtr;
4063 if (!BasePtr) {
4064 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4065 B.buildInstr(AMDGPU::S_MOV_B64)
4066 .addDef(RSrcLo)
4067 .addImm(0);
4068 }
4069
4070 B.buildInstr(AMDGPU::REG_SEQUENCE)
4071 .addDef(RSrc)
4072 .addReg(RSrcLo)
4073 .addImm(AMDGPU::sub0_sub1)
4074 .addReg(RSrcHi)
4075 .addImm(AMDGPU::sub2_sub3);
4076
4077 return RSrc;
4078}
4079
4080static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4081 const SIInstrInfo &TII, Register BasePtr) {
4082 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4083
4084 // FIXME: Why are half the "default" bits ignored based on the addressing
4085 // mode?
4086 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
4087}
4088
4089static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4090 const SIInstrInfo &TII, Register BasePtr) {
4091 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4092
4093 // FIXME: Why are half the "default" bits ignored based on the addressing
4094 // mode?
4095 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
4096}
4097
4098AMDGPUInstructionSelector::MUBUFAddressData
4099AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
4100 MUBUFAddressData Data;
4101 Data.N0 = Src;
4102
4103 Register PtrBase;
4104 int64_t Offset;
4105
4106 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
4107 if (isUInt<32>(Offset)) {
4108 Data.N0 = PtrBase;
4109 Data.Offset = Offset;
4110 }
4111
4112 if (MachineInstr *InputAdd
4113 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
4114 Data.N2 = InputAdd->getOperand(1).getReg();
4115 Data.N3 = InputAdd->getOperand(2).getReg();
4116
4117 // FIXME: Need to fix extra SGPR->VGPR copies inserted
4118 // FIXME: We don't know that this was defined by operand 0
4119 //
4120 // TODO: Remove this when we have copy folding optimizations after
4121 // RegBankSelect.
4122 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
4123 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
4124 }
4125
4126 return Data;
4127}
4128
4129 /// Return whether the addr64 MUBUF mode should be used for the given address.
4130bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
4131 // (ptr_add N2, N3) -> addr64, or
4132 // (ptr_add (ptr_add N2, N3), C1) -> addr64
4133 if (Addr.N2)
4134 return true;
4135
4136 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
4137 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
4138}
4139
4140/// Split an immediate offset \p ImmOffset depending on whether it fits in the
4141/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
4142/// component.
4143void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
4144 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
4145 if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
4146 return;
4147
4148 // Illegal offset, store it in soffset.
4149 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4150 B.buildInstr(AMDGPU::S_MOV_B32)
4151 .addDef(SOffset)
4152 .addImm(ImmOffset);
4153 ImmOffset = 0;
4154}
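Behavior sketch for the helper above (illustrative; assumes the usual 12-bit unsigned MUBUF immediate range checked by isLegalMUBUFImmOffset):

  // ImmOffset = 4095 -> legal, left in the immediate field, SOffset untouched
  // ImmOffset = 8192 -> illegal; an S_MOV_B32 materializes 8192 into SOffset
  //                     and ImmOffset is reset to 0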
4155
4156bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
4157 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
4158 Register &SOffset, int64_t &Offset) const {
4159 // FIXME: Predicates should stop this from reaching here.
4160 // The addr64 bit was removed for Volcanic Islands.
4161 if (!STI.hasAddr64() || STI.useFlatForGlobal())
4162 return false;
4163
4164 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4165 if (!shouldUseAddr64(AddrData))
4166 return false;
4167
4168 Register N0 = AddrData.N0;
4169 Register N2 = AddrData.N2;
4170 Register N3 = AddrData.N3;
4171 Offset = AddrData.Offset;
4172
4173 // Base pointer for the SRD.
4174 Register SRDPtr;
4175
4176 if (N2) {
4177 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4178 assert(N3);
4179 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4180 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
4181 // addr64, and construct the default resource from a 0 address.
4182 VAddr = N0;
4183 } else {
4184 SRDPtr = N3;
4185 VAddr = N2;
4186 }
4187 } else {
4188 // N2 is not divergent.
4189 SRDPtr = N2;
4190 VAddr = N3;
4191 }
4192 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4193 // Use the default null pointer in the resource
4194 VAddr = N0;
4195 } else {
4196 // N0 -> offset, or
4197 // (N0 + C1) -> offset
4198 SRDPtr = N0;
4199 }
4200
4201 MachineIRBuilder B(*Root.getParent());
4202 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
4203 splitIllegalMUBUFOffset(B, SOffset, Offset);
4204 return true;
4205}
4206
4207bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
4208 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
4209 int64_t &Offset) const {
4210
4211 // FIXME: Pattern should not reach here.
4212 if (STI.useFlatForGlobal())
4213 return false;
4214
4215 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4216 if (shouldUseAddr64(AddrData))
4217 return false;
4218
4219 // N0 -> offset, or
4220 // (N0 + C1) -> offset
4221 Register SRDPtr = AddrData.N0;
4222 Offset = AddrData.Offset;
4223
4224 // TODO: Look through extensions for 32-bit soffset.
4225 MachineIRBuilder B(*Root.getParent());
4226
4227 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
4228 splitIllegalMUBUFOffset(B, SOffset, Offset);
4229 return true;
4230}
4231
4232InstructionSelector::ComplexRendererFns
4233AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
4234 Register VAddr;
4235 Register RSrcReg;
4236 Register SOffset;
4237 int64_t Offset = 0;
4238
4239 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4240 return {};
4241
4242 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4243 // pattern.
4244 return {{
4245 [=](MachineInstrBuilder &MIB) { // rsrc
4246 MIB.addReg(RSrcReg);
4247 },
4248 [=](MachineInstrBuilder &MIB) { // vaddr
4249 MIB.addReg(VAddr);
4250 },
4251 [=](MachineInstrBuilder &MIB) { // soffset
4252 if (SOffset)
4253 MIB.addReg(SOffset);
4254 else
4255 MIB.addImm(0);
4256 },
4257 [=](MachineInstrBuilder &MIB) { // offset
4258 MIB.addImm(Offset);
4259 },
4260 addZeroImm, // cpol
4261 addZeroImm, // tfe
4262 addZeroImm // swz
4263 }};
4264}
4265
4266InstructionSelector::ComplexRendererFns
4267AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
4268 Register RSrcReg;
4269 Register SOffset;
4270 int64_t Offset = 0;
4271
4272 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4273 return {};
4274
4275 return {{
4276 [=](MachineInstrBuilder &MIB) { // rsrc
4277 MIB.addReg(RSrcReg);
4278 },
4279 [=](MachineInstrBuilder &MIB) { // soffset
4280 if (SOffset)
4281 MIB.addReg(SOffset);
4282 else
4283 MIB.addImm(0);
4284 },
4285 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4286 addZeroImm, // cpol
4287 addZeroImm, // tfe
4288 addZeroImm, // swz
4289 }};
4290}
4291
4292InstructionSelector::ComplexRendererFns
4293AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
4294 Register VAddr;
4295 Register RSrcReg;
4296 Register SOffset;
4297 int64_t Offset = 0;
4298
4299 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4300 return {};
4301
4302 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4303 // pattern.
4304 return {{
4305 [=](MachineInstrBuilder &MIB) { // rsrc
4306 MIB.addReg(RSrcReg);
4307 },
4308 [=](MachineInstrBuilder &MIB) { // vaddr
4309 MIB.addReg(VAddr);
4310 },
4311 [=](MachineInstrBuilder &MIB) { // soffset
4312 if (SOffset)
4313 MIB.addReg(SOffset);
4314 else
4315 MIB.addImm(0);
4316 },
4317 [=](MachineInstrBuilder &MIB) { // offset
4318 MIB.addImm(Offset);
4319 },
4320 [=](MachineInstrBuilder &MIB) {
4321 MIB.addImm(AMDGPU::CPol::GLC); // cpol
4322 }
4323 }};
4324}
4325
4326InstructionSelector::ComplexRendererFns
4327AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
4328 Register RSrcReg;
4329 Register SOffset;
4330 int64_t Offset = 0;
4331
4332 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4333 return {};
4334
4335 return {{
4336 [=](MachineInstrBuilder &MIB) { // rsrc
4337 MIB.addReg(RSrcReg);
4338 },
4339 [=](MachineInstrBuilder &MIB) { // soffset
4340 if (SOffset)
4341 MIB.addReg(SOffset);
4342 else
4343 MIB.addImm(0);
4344 },
4345 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4346 [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
4347 }};
4348}
4349
4350/// Get an immediate that must be 32-bits, and treated as zero extended.
4351static Optional<uint64_t> getConstantZext32Val(Register Reg,
4352 const MachineRegisterInfo &MRI) {
4353 // getIConstantVRegVal sexts any values, so see if that matters.
4354 Optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
4355 if (!OffsetVal || !isInt<32>(*OffsetVal))
4356 return None;
4357 return Lo_32(*OffsetVal);
4358}
4359
4360InstructionSelector::ComplexRendererFns
4361AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
4362 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4363 if (!OffsetVal)
4364 return {};
4365
4366 Optional<int64_t> EncodedImm =
4367 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
4368 if (!EncodedImm)
4369 return {};
4370
4371 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4372}
4373
4374InstructionSelector::ComplexRendererFns
4375AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
4376 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
4377
4378 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4379 if (!OffsetVal)
4380 return {};
4381
4382 Optional<int64_t> EncodedImm
4383 = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
4384 if (!EncodedImm)
4385 return {};
4386
4387 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4388}
4389
4390void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
4391 const MachineInstr &MI,
4392 int OpIdx) const {
4393 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4394 "Expected G_CONSTANT");
4395 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
4396}
4397
4398void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
4399 const MachineInstr &MI,
4400 int OpIdx) const {
4401 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4402 "Expected G_CONSTANT");
4403 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
4404}
4405
4406void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
4407 const MachineInstr &MI,
4408 int OpIdx) const {
4409 assert(OpIdx == -1);
4410
4411 const MachineOperand &Op = MI.getOperand(1);
4412 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
4413 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
4414 else {
4415 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
4416 MIB.addImm(Op.getCImm()->getSExtValue());
4417 }
4418}
4419
4420void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
4421 const MachineInstr &MI,
4422 int OpIdx) const {
4423 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4424 "Expected G_CONSTANT");
4425 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
4426}
4427
4428/// This only really exists to satisfy DAG type checking machinery, so is a
4429/// no-op here.
4430void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
4431 const MachineInstr &MI,
4432 int OpIdx) const {
4433 MIB.addImm(MI.getOperand(OpIdx).getImm());
4434}
4435
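// The next three renderers adjust cache-policy (CPol) immediates:
// renderExtractCPol masks the operand down to the known CPol bits,
// renderExtractSWZ extracts bit 3 (presumably the SWZ/swizzle bit), and
// renderSetGLC forces the GLC bit on.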
4436void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
4437 const MachineInstr &MI,
4438 int OpIdx) const {
4439 assert(OpIdx >= 0 && "expected to match an immediate operand");
4440 MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL);
4441}
4442
4443void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
4444 const MachineInstr &MI,
4445 int OpIdx) const {
4446 assert(OpIdx >= 0 && "expected to match an immediate operand");
4447 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
4448}
4449
4450void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
4451 const MachineInstr &MI,
4452 int OpIdx) const {
4453 assert(OpIdx >= 0 && "expected to match an immediate operand");
4454 MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
4455}
4456
4457void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
4458 const MachineInstr &MI,
4459 int OpIdx) const {
4460 MIB.addFrameIndex((MI.getOperand(1).getIndex()));
4461}
4462
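// Thin wrappers over the AMDGPU inline-constant checks; the second argument
// tells them whether this subtarget can encode 1/(2*pi) as an inline
// immediate.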
4463bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
4464 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
4465}
4466
4467bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
4468 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
4469}
4470
4471bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
4472 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
4473}
4474
4475bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
4476 return TII.isInlineConstant(Imm);
4477}