Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Warning: line 3058, column 5
Value stored to 'MI' is never read
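
For context, the deadcode.DeadStores checker reports this when a value assigned to a local variable is never read before it is overwritten or goes out of scope. The flagged statement at line 3058 lies outside the excerpt shown below, so the following is only a minimal, self-contained C++ sketch of the pattern; the function and variable names are hypothetical and not taken from the LLVM source:

    // Hypothetical illustration of a deadcode.DeadStores finding.
    // The first value stored to 'MI' is never read: it is unconditionally
    // overwritten on the next line, so the analyzer reports
    // "Value stored to 'MI' is never read".
    int pickOpcode(const int *Opcodes, int Count) {
      int MI = Opcodes[0];      // dead store: never read before ...
      MI = Opcodes[Count - 1];  // ... this assignment replaces it
      return MI;
    }

The usual fix is to drop the unused initial assignment, or to initialize the variable directly with the value that is actually used.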

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUInstructionSelector.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-12/lib/clang/12.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-12~++20200806111125+5446ec85070/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-12~++20200806111125+5446ec85070/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-12~++20200806111125+5446ec85070/build-llvm/include -I /build/llvm-toolchain-snapshot-12~++20200806111125+5446ec85070/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-12/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-12~++20200806111125+5446ec85070/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-12~++20200806111125+5446ec85070=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-08-06-171148-17323-1 -x c++ /build/llvm-toolchain-snapshot-12~++20200806111125+5446ec85070/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPUInstrInfo.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPURegisterBankInfo.h"
18#include "AMDGPUSubtarget.h"
19#include "AMDGPUTargetMachine.h"
20#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21#include "SIMachineFunctionInfo.h"
22#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
24#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
25#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
27#include "llvm/CodeGen/GlobalISel/Utils.h"
28#include "llvm/CodeGen/MachineBasicBlock.h"
29#include "llvm/CodeGen/MachineFunction.h"
30#include "llvm/CodeGen/MachineInstr.h"
31#include "llvm/CodeGen/MachineInstrBuilder.h"
32#include "llvm/CodeGen/MachineRegisterInfo.h"
33#include "llvm/IR/Type.h"
34#include "llvm/Support/Debug.h"
35#include "llvm/Support/raw_ostream.h"
36
37#define DEBUG_TYPE "amdgpu-isel"
38
39using namespace llvm;
40using namespace MIPatternMatch;
41
42static cl::opt<bool> AllowRiskySelect(
43 "amdgpu-global-isel-risky-select",
44 cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
45 cl::init(false),
46 cl::ReallyHidden);
47
48#define GET_GLOBALISEL_IMPL
49#define AMDGPUSubtarget GCNSubtarget
50#include "AMDGPUGenGlobalISel.inc"
51#undef GET_GLOBALISEL_IMPL
52#undef AMDGPUSubtarget
53
54AMDGPUInstructionSelector::AMDGPUInstructionSelector(
55 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
56 const AMDGPUTargetMachine &TM)
57 : InstructionSelector(), TII(*STI.getInstrInfo()),
58 TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
59 STI(STI),
60 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
61#define GET_GLOBALISEL_PREDICATES_INIT
62#include "AMDGPUGenGlobalISel.inc"
63#undef GET_GLOBALISEL_PREDICATES_INIT
64#define GET_GLOBALISEL_TEMPORARIES_INIT
65#include "AMDGPUGenGlobalISel.inc"
66#undef GET_GLOBALISEL_TEMPORARIES_INIT
67{
68}
69
70const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
71
72void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
73 CodeGenCoverage &CoverageInfo) {
74 MRI = &MF.getRegInfo();
75 InstructionSelector::setupMF(MF, KB, CoverageInfo);
76}
77
78bool AMDGPUInstructionSelector::isVCC(Register Reg,
79 const MachineRegisterInfo &MRI) const {
80 if (Register::isPhysicalRegister(Reg))
81 return Reg == TRI.getVCC();
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 return RC->hasSuperClassEq(TRI.getBoolRC()) &&
89 Ty.isValid() && Ty.getSizeInBits() == 1;
90 }
91
92 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
93 return RB->getID() == AMDGPU::VCCRegBankID;
94}
95
96bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
97 unsigned NewOpc) const {
98 MI.setDesc(TII.get(NewOpc));
99 MI.RemoveOperand(1); // Remove intrinsic ID.
100 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
101
102 MachineOperand &Dst = MI.getOperand(0);
103 MachineOperand &Src = MI.getOperand(1);
104
105 // TODO: This should be legalized to s32 if needed
106 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
107 return false;
108
109 const TargetRegisterClass *DstRC
110 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
111 const TargetRegisterClass *SrcRC
112 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
113 if (!DstRC || DstRC != SrcRC)
114 return false;
115
116 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
117 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
118}
119
120bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
121 const DebugLoc &DL = I.getDebugLoc();
122 MachineBasicBlock *BB = I.getParent();
123 I.setDesc(TII.get(TargetOpcode::COPY));
124
125 const MachineOperand &Src = I.getOperand(1);
126 MachineOperand &Dst = I.getOperand(0);
127 Register DstReg = Dst.getReg();
128 Register SrcReg = Src.getReg();
129
130 if (isVCC(DstReg, *MRI)) {
131 if (SrcReg == AMDGPU::SCC) {
132 const TargetRegisterClass *RC
133 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
134 if (!RC)
135 return true;
136 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
137 }
138
139 if (!isVCC(SrcReg, *MRI)) {
140 // TODO: Should probably leave the copy and let copyPhysReg expand it.
141 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
142 return false;
143
144 const TargetRegisterClass *SrcRC
145 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
146
147 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
148
149 // We can't trust the high bits at this point, so clear them.
150
151 // TODO: Skip masking high bits if def is known boolean.
152
153 unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
154 AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
155 BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
156 .addImm(1)
157 .addReg(SrcReg);
158 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
159 .addImm(0)
160 .addReg(MaskedReg);
161
162 if (!MRI->getRegClassOrNull(SrcReg))
163 MRI->setRegClass(SrcReg, SrcRC);
164 I.eraseFromParent();
165 return true;
166 }
167
168 const TargetRegisterClass *RC =
169 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
170 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
171 return false;
172
173 return true;
174 }
175
176 for (const MachineOperand &MO : I.operands()) {
177 if (Register::isPhysicalRegister(MO.getReg()))
178 continue;
179
180 const TargetRegisterClass *RC =
181 TRI.getConstrainedRegClassForOperand(MO, *MRI);
182 if (!RC)
183 continue;
184 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
185 }
186 return true;
187}
188
189bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
190 const Register DefReg = I.getOperand(0).getReg();
191 const LLT DefTy = MRI->getType(DefReg);
192 if (DefTy == LLT::scalar(1)) {
193 if (!AllowRiskySelect) {
194 LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
195 return false;
196 }
197
198 LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
199 }
200
201 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
202
203 const RegClassOrRegBank &RegClassOrBank =
204 MRI->getRegClassOrRegBank(DefReg);
205
206 const TargetRegisterClass *DefRC
207 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
208 if (!DefRC) {
209 if (!DefTy.isValid()) {
210 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
211 return false;
212 }
213
214 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
215 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
216 if (!DefRC) {
217 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
218 return false;
219 }
220 }
221
222 // TODO: Verify that all registers have the same bank
223 I.setDesc(TII.get(TargetOpcode::PHI));
224 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
225}
226
227MachineOperand
228AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
229 const TargetRegisterClass &SubRC,
230 unsigned SubIdx) const {
231
232 MachineInstr *MI = MO.getParent();
233 MachineBasicBlock *BB = MO.getParent()->getParent();
234 Register DstReg = MRI->createVirtualRegister(&SubRC);
235
236 if (MO.isReg()) {
237 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
238 Register Reg = MO.getReg();
239 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
240 .addReg(Reg, 0, ComposedSubIdx);
241
242 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
243 MO.isKill(), MO.isDead(), MO.isUndef(),
244 MO.isEarlyClobber(), 0, MO.isDebug(),
245 MO.isInternalRead());
246 }
247
248 assert(MO.isImm());
249
250 APInt Imm(64, MO.getImm());
251
252 switch (SubIdx) {
253 default:
254 llvm_unreachable("do not know to split immediate with this sub index.");
255 case AMDGPU::sub0:
256 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
257 case AMDGPU::sub1:
258 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
259 }
260}
261
262static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
263 switch (Opc) {
264 case AMDGPU::G_AND:
265 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
266 case AMDGPU::G_OR:
267 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
268 case AMDGPU::G_XOR:
269 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
270 default:
271 llvm_unreachable("not a bit op");
272 }
273}
274
275bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
276 Register DstReg = I.getOperand(0).getReg();
277 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
278
279 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
280 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
281 DstRB->getID() != AMDGPU::VCCRegBankID)
282 return false;
283
284 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
285 STI.isWave64());
286 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
287
288 // Dead implicit-def of scc
289 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
290 true, // isImp
291 false, // isKill
292 true)); // isDead
293 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
294}
295
296bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
297 MachineBasicBlock *BB = I.getParent();
298 MachineFunction *MF = BB->getParent();
299 Register DstReg = I.getOperand(0).getReg();
300 const DebugLoc &DL = I.getDebugLoc();
301 LLT Ty = MRI->getType(DstReg);
302 if (Ty.isVector())
303 return false;
304
305 unsigned Size = Ty.getSizeInBits();
306 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
307 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
308 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
309
310 if (Size == 32) {
311 if (IsSALU) {
312 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
313 MachineInstr *Add =
314 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
315 .add(I.getOperand(1))
316 .add(I.getOperand(2));
317 I.eraseFromParent();
318 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
319 }
320
321 if (STI.hasAddNoCarry()) {
322 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
323 I.setDesc(TII.get(Opc));
324 I.addOperand(*MF, MachineOperand::CreateImm(0));
325 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
326 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
327 }
328
329 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
330
331 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
332 MachineInstr *Add
333 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
334 .addDef(UnusedCarry, RegState::Dead)
335 .add(I.getOperand(1))
336 .add(I.getOperand(2))
337 .addImm(0);
338 I.eraseFromParent();
339 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
340 }
341
342 assert(!Sub && "illegal sub should not reach here");
343
344 const TargetRegisterClass &RC
345 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
346 const TargetRegisterClass &HalfRC
347 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
348
349 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
350 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
351 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
352 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
353
354 Register DstLo = MRI->createVirtualRegister(&HalfRC);
355 Register DstHi = MRI->createVirtualRegister(&HalfRC);
356
357 if (IsSALU) {
358 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
359 .add(Lo1)
360 .add(Lo2);
361 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
362 .add(Hi1)
363 .add(Hi2);
364 } else {
365 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
366 Register CarryReg = MRI->createVirtualRegister(CarryRC);
367 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
368 .addDef(CarryReg)
369 .add(Lo1)
370 .add(Lo2)
371 .addImm(0);
372 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
373 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
374 .add(Hi1)
375 .add(Hi2)
376 .addReg(CarryReg, RegState::Kill)
377 .addImm(0);
378
379 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
380 return false;
381 }
382
383 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
384 .addReg(DstLo)
385 .addImm(AMDGPU::sub0)
386 .addReg(DstHi)
387 .addImm(AMDGPU::sub1);
388
389
390 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
391 return false;
392
393 I.eraseFromParent();
394 return true;
395}
396
397bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
398 MachineInstr &I) const {
399 MachineBasicBlock *BB = I.getParent();
400 MachineFunction *MF = BB->getParent();
401 const DebugLoc &DL = I.getDebugLoc();
402 Register Dst0Reg = I.getOperand(0).getReg();
403 Register Dst1Reg = I.getOperand(1).getReg();
404 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
405 I.getOpcode() == AMDGPU::G_UADDE;
406 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
407 I.getOpcode() == AMDGPU::G_USUBE;
408
409 if (isVCC(Dst1Reg, *MRI)) {
410 unsigned NoCarryOpc =
411 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
412 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
413 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
414 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
415 I.addOperand(*MF, MachineOperand::CreateImm(0));
416 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
417 }
418
419 Register Src0Reg = I.getOperand(2).getReg();
420 Register Src1Reg = I.getOperand(3).getReg();
421
422 if (HasCarryIn) {
423 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
424 .addReg(I.getOperand(4).getReg());
425 }
426
427 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
428 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
429
430 BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
431 .add(I.getOperand(2))
432 .add(I.getOperand(3));
433 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
434 .addReg(AMDGPU::SCC);
435
436 if (!MRI->getRegClassOrNull(Dst1Reg))
437 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
438
439 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
440 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
441 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
442 return false;
443
444 if (HasCarryIn &&
445 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
446 AMDGPU::SReg_32RegClass, *MRI))
447 return false;
448
449 I.eraseFromParent();
450 return true;
451}
452
453// TODO: We should probably legalize these to only using 32-bit results.
454bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
455 MachineBasicBlock *BB = I.getParent();
456 Register DstReg = I.getOperand(0).getReg();
457 Register SrcReg = I.getOperand(1).getReg();
458 LLT DstTy = MRI->getType(DstReg);
459 LLT SrcTy = MRI->getType(SrcReg);
460 const unsigned SrcSize = SrcTy.getSizeInBits();
461 unsigned DstSize = DstTy.getSizeInBits();
462
463 // TODO: Should handle any multiple of 32 offset.
464 unsigned Offset = I.getOperand(2).getImm();
465 if (Offset % 32 != 0 || DstSize > 128)
466 return false;
467
468 // 16-bit operations really use 32-bit registers.
469 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
470 if (DstSize == 16)
471 DstSize = 32;
472
473 const TargetRegisterClass *DstRC =
474 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
475 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
476 return false;
477
478 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
479 const TargetRegisterClass *SrcRC =
480 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
481 if (!SrcRC)
482 return false;
483 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
484 DstSize / 32);
485 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
486 if (!SrcRC)
487 return false;
488
489 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
490 *SrcRC, I.getOperand(1));
491 const DebugLoc &DL = I.getDebugLoc();
492 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
493 .addReg(SrcReg, 0, SubReg);
494
495 I.eraseFromParent();
496 return true;
497}
498
499bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
500 MachineBasicBlock *BB = MI.getParent();
501 Register DstReg = MI.getOperand(0).getReg();
502 LLT DstTy = MRI->getType(DstReg);
503 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
504
505 const unsigned SrcSize = SrcTy.getSizeInBits();
506 if (SrcSize < 32)
507 return selectImpl(MI, *CoverageInfo);
508
509 const DebugLoc &DL = MI.getDebugLoc();
510 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
511 const unsigned DstSize = DstTy.getSizeInBits();
512 const TargetRegisterClass *DstRC =
513 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
514 if (!DstRC)
515 return false;
516
517 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
518 MachineInstrBuilder MIB =
519 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
520 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
521 MachineOperand &Src = MI.getOperand(I + 1);
522 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
523 MIB.addImm(SubRegs[I]);
524
525 const TargetRegisterClass *SrcRC
526 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
527 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
528 return false;
529 }
530
531 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
532 return false;
533
534 MI.eraseFromParent();
535 return true;
536}
537
538bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
539 MachineBasicBlock *BB = MI.getParent();
540 const int NumDst = MI.getNumOperands() - 1;
541
542 MachineOperand &Src = MI.getOperand(NumDst);
543
544 Register SrcReg = Src.getReg();
545 Register DstReg0 = MI.getOperand(0).getReg();
546 LLT DstTy = MRI->getType(DstReg0);
547 LLT SrcTy = MRI->getType(SrcReg);
548
549 const unsigned DstSize = DstTy.getSizeInBits();
550 const unsigned SrcSize = SrcTy.getSizeInBits();
551 const DebugLoc &DL = MI.getDebugLoc();
552 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
553
554 const TargetRegisterClass *SrcRC =
555 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
556 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
557 return false;
558
559 const unsigned SrcFlags = getUndefRegState(Src.isUndef());
560
561 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
562 // source, and this relies on the fact that the same subregister indices are
563 // used for both.
564 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
565 for (int I = 0, E = NumDst; I != E; ++I) {
566 MachineOperand &Dst = MI.getOperand(I);
567 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
568 .addReg(SrcReg, SrcFlags, SubRegs[I]);
569
570 // Make sure the subregister index is valid for the source register.
571 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
572 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
573 return false;
574
575 const TargetRegisterClass *DstRC =
576 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
577 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
578 return false;
579 }
580
581 MI.eraseFromParent();
582 return true;
583}
584
585bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
586 MachineInstr &MI) const {
587 if (selectImpl(MI, *CoverageInfo))
588 return true;
589
590 const LLT S32 = LLT::scalar(32);
591 const LLT V2S16 = LLT::vector(2, 16);
592
593 Register Dst = MI.getOperand(0).getReg();
594 if (MRI->getType(Dst) != V2S16)
595 return false;
596
597 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
598 if (DstBank->getID() != AMDGPU::SGPRRegBankID)
599 return false;
600
601 Register Src0 = MI.getOperand(1).getReg();
602 Register Src1 = MI.getOperand(2).getReg();
603 if (MRI->getType(Src0) != S32)
604 return false;
605
606 const DebugLoc &DL = MI.getDebugLoc();
607 MachineBasicBlock *BB = MI.getParent();
608
609 auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
610 if (ConstSrc1) {
611 auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
612 if (ConstSrc0) {
613 uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
614 uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;
615
616 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
617 .addImm(Lo16 | (Hi16 << 16));
618 MI.eraseFromParent();
619 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
620 }
621 }
622
623 // TODO: This should probably be a combine somewhere
624 // (build_vector_trunc $src0, undef -> copy $src0
625 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
626 if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
627 MI.setDesc(TII.get(AMDGPU::COPY));
628 MI.RemoveOperand(2);
629 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
630 RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
631 }
632
633 Register ShiftSrc0;
634 Register ShiftSrc1;
635 int64_t ShiftAmt;
636
637 // With multiple uses of the shift, this will duplicate the shift and
638 // increase register pressure.
639 //
640 // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
641 // => (S_PACK_HH_B32_B16 $src0, $src1)
642 // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
643 // => (S_PACK_LH_B32_B16 $src0, $src1)
644 // (build_vector_trunc $src0, $src1)
645 // => (S_PACK_LL_B32_B16 $src0, $src1)
646
647 // FIXME: This is an inconvenient way to check a specific value
648 bool Shift0 = mi_match(
649 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
650 ShiftAmt == 16;
651
652 bool Shift1 = mi_match(
653 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
654 ShiftAmt == 16;
655
656 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
657 if (Shift0 && Shift1) {
658 Opc = AMDGPU::S_PACK_HH_B32_B16;
659 MI.getOperand(1).setReg(ShiftSrc0);
660 MI.getOperand(2).setReg(ShiftSrc1);
661 } else if (Shift1) {
662 Opc = AMDGPU::S_PACK_LH_B32_B16;
663 MI.getOperand(2).setReg(ShiftSrc1);
664 } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
665 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
666 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
667 .addReg(ShiftSrc0)
668 .addImm(16);
669
670 MI.eraseFromParent();
671 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
672 }
673
674 MI.setDesc(TII.get(Opc));
675 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
676}
677
678bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
679 return selectG_ADD_SUB(I);
680}
681
682bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
683 const MachineOperand &MO = I.getOperand(0);
684
685 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
686 // regbank check here is to know why getConstrainedRegClassForOperand failed.
687 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
688 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
689 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
690 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
691 return true;
692 }
693
694 return false;
695}
696
697bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
698 MachineBasicBlock *BB = I.getParent();
699
700 Register DstReg = I.getOperand(0).getReg();
701 Register Src0Reg = I.getOperand(1).getReg();
702 Register Src1Reg = I.getOperand(2).getReg();
703 LLT Src1Ty = MRI->getType(Src1Reg);
704
705 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
706 unsigned InsSize = Src1Ty.getSizeInBits();
707
708 int64_t Offset = I.getOperand(3).getImm();
709
710 // FIXME: These cases should have been illegal and unnecessary to check here.
711 if (Offset % 32 != 0 || InsSize % 32 != 0)
712 return false;
713
714 // Currently not handled by getSubRegFromChannel.
715 if (InsSize > 128)
716 return false;
717
718 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
719 if (SubReg == AMDGPU::NoSubRegister)
720 return false;
721
722 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
723 const TargetRegisterClass *DstRC =
724 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
725 if (!DstRC)
726 return false;
727
728 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
729 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
730 const TargetRegisterClass *Src0RC =
731 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
732 const TargetRegisterClass *Src1RC =
733 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
734
735 // Deal with weird cases where the class only partially supports the subreg
736 // index.
737 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
738 if (!Src0RC || !Src1RC)
739 return false;
740
741 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
742 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
743 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
744 return false;
745
746 const DebugLoc &DL = I.getDebugLoc();
747 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
748 .addReg(Src0Reg)
749 .addReg(Src1Reg)
750 .addImm(SubReg);
751
752 I.eraseFromParent();
753 return true;
754}
755
756bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
757 if (STI.getLDSBankCount() != 16)
758 return selectImpl(MI, *CoverageInfo);
759
760 Register Dst = MI.getOperand(0).getReg();
761 Register Src0 = MI.getOperand(2).getReg();
762 Register M0Val = MI.getOperand(6).getReg();
763 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
764 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
765 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
766 return false;
767
768 // This requires 2 instructions. It is possible to write a pattern to support
769 // this, but the generated isel emitter doesn't correctly deal with multiple
770 // output instructions using the same physical register input. The copy to m0
771 // is incorrectly placed before the second instruction.
772 //
773 // TODO: Match source modifiers.
774
775 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
776 const DebugLoc &DL = MI.getDebugLoc();
777 MachineBasicBlock *MBB = MI.getParent();
778
779 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
780 .addReg(M0Val);
781 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
782 .addImm(2)
783 .addImm(MI.getOperand(4).getImm()) // $attr
784 .addImm(MI.getOperand(3).getImm()); // $attrchan
785
786 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
787 .addImm(0) // $src0_modifiers
788 .addReg(Src0) // $src0
789 .addImm(MI.getOperand(4).getImm()) // $attr
790 .addImm(MI.getOperand(3).getImm()) // $attrchan
791 .addImm(0) // $src2_modifiers
792 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
793 .addImm(MI.getOperand(5).getImm()) // $high
794 .addImm(0) // $clamp
795 .addImm(0); // $omod
796
797 MI.eraseFromParent();
798 return true;
799}
800
801// We need to handle this here because tablegen doesn't support matching
802// instructions with multiple outputs.
803bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
804 Register Dst0 = MI.getOperand(0).getReg();
805 Register Dst1 = MI.getOperand(1).getReg();
806
807 LLT Ty = MRI->getType(Dst0);
808 unsigned Opc;
809 if (Ty == LLT::scalar(32))
810 Opc = AMDGPU::V_DIV_SCALE_F32;
811 else if (Ty == LLT::scalar(64))
812 Opc = AMDGPU::V_DIV_SCALE_F64;
813 else
814 return false;
815
816 const DebugLoc &DL = MI.getDebugLoc();
817 MachineBasicBlock *MBB = MI.getParent();
818
819 Register Numer = MI.getOperand(3).getReg();
820 Register Denom = MI.getOperand(4).getReg();
821 unsigned ChooseDenom = MI.getOperand(5).getImm();
822
823 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
824
825 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
826 .addDef(Dst1)
827 .addUse(Src0)
828 .addUse(Denom)
829 .addUse(Numer);
830
831 MI.eraseFromParent();
832 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
833}
834
835bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
836 unsigned IntrinsicID = I.getIntrinsicID();
837 switch (IntrinsicID) {
838 case Intrinsic::amdgcn_if_break: {
839 MachineBasicBlock *BB = I.getParent();
840
841 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
842 // SelectionDAG uses for wave32 vs wave64.
843 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
844 .add(I.getOperand(0))
845 .add(I.getOperand(2))
846 .add(I.getOperand(3));
847
848 Register DstReg = I.getOperand(0).getReg();
849 Register Src0Reg = I.getOperand(2).getReg();
850 Register Src1Reg = I.getOperand(3).getReg();
851
852 I.eraseFromParent();
853
854 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
855 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
856
857 return true;
858 }
859 case Intrinsic::amdgcn_interp_p1_f16:
860 return selectInterpP1F16(I);
861 case Intrinsic::amdgcn_wqm:
862 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
863 case Intrinsic::amdgcn_softwqm:
864 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
865 case Intrinsic::amdgcn_wwm:
866 return constrainCopyLikeIntrin(I, AMDGPU::WWM);
867 case Intrinsic::amdgcn_div_scale:
868 return selectDivScale(I);
869 case Intrinsic::amdgcn_icmp:
870 return selectIntrinsicIcmp(I);
871 case Intrinsic::amdgcn_ballot:
872 return selectBallot(I);
873 case Intrinsic::amdgcn_reloc_constant:
874 return selectRelocConstant(I);
875 case Intrinsic::returnaddress:
876 return selectReturnAddress(I);
877 default:
878 return selectImpl(I, *CoverageInfo);
879 }
880}
881
882static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
883 if (Size != 32 && Size != 64)
884 return -1;
885 switch (P) {
886 default:
887 llvm_unreachable("Unknown condition code!");
888 case CmpInst::ICMP_NE:
889 return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
890 case CmpInst::ICMP_EQ:
891 return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
892 case CmpInst::ICMP_SGT:
893 return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
894 case CmpInst::ICMP_SGE:
895 return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
896 case CmpInst::ICMP_SLT:
897 return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
898 case CmpInst::ICMP_SLE:
899 return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
900 case CmpInst::ICMP_UGT:
901 return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
902 case CmpInst::ICMP_UGE:
903 return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
904 case CmpInst::ICMP_ULT:
905 return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
906 case CmpInst::ICMP_ULE:
907 return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
908 }
909}
910
911int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
912 unsigned Size) const {
913 if (Size == 64) {
914 if (!STI.hasScalarCompareEq64())
915 return -1;
916
917 switch (P) {
918 case CmpInst::ICMP_NE:
919 return AMDGPU::S_CMP_LG_U64;
920 case CmpInst::ICMP_EQ:
921 return AMDGPU::S_CMP_EQ_U64;
922 default:
923 return -1;
924 }
925 }
926
927 if (Size != 32)
928 return -1;
929
930 switch (P) {
931 case CmpInst::ICMP_NE:
932 return AMDGPU::S_CMP_LG_U32;
933 case CmpInst::ICMP_EQ:
934 return AMDGPU::S_CMP_EQ_U32;
935 case CmpInst::ICMP_SGT:
936 return AMDGPU::S_CMP_GT_I32;
937 case CmpInst::ICMP_SGE:
938 return AMDGPU::S_CMP_GE_I32;
939 case CmpInst::ICMP_SLT:
940 return AMDGPU::S_CMP_LT_I32;
941 case CmpInst::ICMP_SLE:
942 return AMDGPU::S_CMP_LE_I32;
943 case CmpInst::ICMP_UGT:
944 return AMDGPU::S_CMP_GT_U32;
945 case CmpInst::ICMP_UGE:
946 return AMDGPU::S_CMP_GE_U32;
947 case CmpInst::ICMP_ULT:
948 return AMDGPU::S_CMP_LT_U32;
949 case CmpInst::ICMP_ULE:
950 return AMDGPU::S_CMP_LE_U32;
951 default:
952 llvm_unreachable("Unknown condition code!");
953 }
954}
955
956bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
957 MachineBasicBlock *BB = I.getParent();
958 const DebugLoc &DL = I.getDebugLoc();
959
960 Register SrcReg = I.getOperand(2).getReg();
961 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
962
963 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
964
965 Register CCReg = I.getOperand(0).getReg();
966 if (!isVCC(CCReg, *MRI)) {
967 int Opcode = getS_CMPOpcode(Pred, Size);
968 if (Opcode == -1)
969 return false;
970 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
971 .add(I.getOperand(2))
972 .add(I.getOperand(3));
973 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
974 .addReg(AMDGPU::SCC);
975 bool Ret =
976 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
977 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
978 I.eraseFromParent();
979 return Ret;
980 }
981
982 int Opcode = getV_CMPOpcode(Pred, Size);
983 if (Opcode == -1)
984 return false;
985
986 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
987 I.getOperand(0).getReg())
988 .add(I.getOperand(2))
989 .add(I.getOperand(3));
990 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
991 *TRI.getBoolRC(), *MRI);
992 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
993 I.eraseFromParent();
994 return Ret;
995}
996
997bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
998 Register Dst = I.getOperand(0).getReg();
999 if (isVCC(Dst, *MRI))
1000 return false;
1001
1002 if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1003 return false;
1004
1005 MachineBasicBlock *BB = I.getParent();
1006 const DebugLoc &DL = I.getDebugLoc();
1007 Register SrcReg = I.getOperand(2).getReg();
1008 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1009 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1010
1011 int Opcode = getV_CMPOpcode(Pred, Size);
1012 if (Opcode == -1)
1013 return false;
1014
1015 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1016 .add(I.getOperand(2))
1017 .add(I.getOperand(3));
1018 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1019 *MRI);
1020 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1021 I.eraseFromParent();
1022 return Ret;
1023}
1024
1025bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1026 MachineBasicBlock *BB = I.getParent();
1027 const DebugLoc &DL = I.getDebugLoc();
1028 Register DstReg = I.getOperand(0).getReg();
1029 const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1030 const bool Is64 = Size == 64;
1031
1032 if (Size != STI.getWavefrontSize())
1033 return false;
1034
1035 Optional<ValueAndVReg> Arg =
1036 getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1037
1038 if (Arg.hasValue()) {
1039 const int64_t Value = Arg.getValue().Value;
1040 if (Value == 0) {
1041 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1042 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1043 } else if (Value == -1) { // all ones
1044 Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1045 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1046 } else
1047 return false;
1048 } else {
1049 Register SrcReg = I.getOperand(2).getReg();
1050 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1051 }
1052
1053 I.eraseFromParent();
1054 return true;
1055}
1056
1057bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1058 Register DstReg = I.getOperand(0).getReg();
1059 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1060 const TargetRegisterClass *DstRC =
1061 TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1062 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1063 return false;
1064
1065 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1066
1067 Module *M = MF->getFunction().getParent();
1068 const MDNode *Metadata = I.getOperand(2).getMetadata();
1069 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1070 auto RelocSymbol = cast<GlobalVariable>(
1071 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1072
1073 MachineBasicBlock *BB = I.getParent();
1074 BuildMI(*BB, &I, I.getDebugLoc(),
1075 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1076 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1077
1078 I.eraseFromParent();
1079 return true;
1080}
1081
1082bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1083 MachineBasicBlock *MBB = I.getParent();
1084 MachineFunction &MF = *MBB->getParent();
1085 const DebugLoc &DL = I.getDebugLoc();
1086
1087 MachineOperand &Dst = I.getOperand(0);
1088 Register DstReg = Dst.getReg();
1089 unsigned Depth = I.getOperand(2).getImm();
1090
1091 const TargetRegisterClass *RC
1092 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1093 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1094 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1095 return false;
1096
1097 // Check for kernel and shader functions
1098 if (Depth != 0 ||
1099 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1100 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1101 .addImm(0);
1102 I.eraseFromParent();
1103 return true;
1104 }
1105
1106 MachineFrameInfo &MFI = MF.getFrameInfo();
1107 // There is a call to @llvm.returnaddress in this function
1108 MFI.setReturnAddressIsTaken(true);
1109
1110 // Get the return address reg and mark it as an implicit live-in
1111 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1112 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1113 AMDGPU::SReg_64RegClass);
1114 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1115 .addReg(LiveIn);
1116 I.eraseFromParent();
1117 return true;
1118}
1119
1120bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1121 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1122 // SelectionDAG uses for wave32 vs wave64.
1123 MachineBasicBlock *BB = MI.getParent();
1124 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1125 .add(MI.getOperand(1));
1126
1127 Register Reg = MI.getOperand(1).getReg();
1128 MI.eraseFromParent();
1129
1130 if (!MRI->getRegClassOrNull(Reg))
1131 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1132 return true;
1133}
1134
1135static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
1136 switch (MF.getFunction().getCallingConv()) {
1137 case CallingConv::AMDGPU_PS:
1138 return 1;
1139 case CallingConv::AMDGPU_VS:
1140 return 2;
1141 case CallingConv::AMDGPU_GS:
1142 return 3;
1143 case CallingConv::AMDGPU_HS:
1144 case CallingConv::AMDGPU_LS:
1145 case CallingConv::AMDGPU_ES:
1146 report_fatal_error("ds_ordered_count unsupported for this calling conv");
1147 case CallingConv::AMDGPU_CS:
1148 case CallingConv::AMDGPU_KERNEL:
1149 case CallingConv::C:
1150 case CallingConv::Fast:
1151 default:
1152 // Assume other calling conventions are various compute callable functions
1153 return 0;
1154 }
1155}
1156
1157bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1158 MachineInstr &MI, Intrinsic::ID IntrID) const {
1159 MachineBasicBlock *MBB = MI.getParent();
1160 MachineFunction *MF = MBB->getParent();
1161 const DebugLoc &DL = MI.getDebugLoc();
1162
1163 unsigned IndexOperand = MI.getOperand(7).getImm();
1164 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1165 bool WaveDone = MI.getOperand(9).getImm() != 0;
1166
1167 if (WaveDone && !WaveRelease)
1168 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1169
1170 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1171 IndexOperand &= ~0x3f;
1172 unsigned CountDw = 0;
1173
1174 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1175 CountDw = (IndexOperand >> 24) & 0xf;
1176 IndexOperand &= ~(0xf << 24);
1177
1178 if (CountDw < 1 || CountDw > 4) {
1179 report_fatal_error(
1180 "ds_ordered_count: dword count must be between 1 and 4");
1181 }
1182 }
1183
1184 if (IndexOperand)
1185 report_fatal_error("ds_ordered_count: bad index operand");
1186
1187 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1188 unsigned ShaderType = getDSShaderTypeValue(*MF);
1189
1190 unsigned Offset0 = OrderedCountIndex << 2;
1191 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1192 (Instruction << 4);
1193
1194 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1195 Offset1 |= (CountDw - 1) << 6;
1196
1197 unsigned Offset = Offset0 | (Offset1 << 8);
1198
1199 Register M0Val = MI.getOperand(2).getReg();
1200 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1201 .addReg(M0Val);
1202
1203 Register DstReg = MI.getOperand(0).getReg();
1204 Register ValReg = MI.getOperand(3).getReg();
1205 MachineInstrBuilder DS =
1206 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1207 .addReg(ValReg)
1208 .addImm(Offset)
1209 .cloneMemRefs(MI);
1210
1211 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1212 return false;
1213
1214 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1215 MI.eraseFromParent();
1216 return Ret;
1217}
1218
1219static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1220 switch (IntrID) {
1221 case Intrinsic::amdgcn_ds_gws_init:
1222 return AMDGPU::DS_GWS_INIT;
1223 case Intrinsic::amdgcn_ds_gws_barrier:
1224 return AMDGPU::DS_GWS_BARRIER;
1225 case Intrinsic::amdgcn_ds_gws_sema_v:
1226 return AMDGPU::DS_GWS_SEMA_V;
1227 case Intrinsic::amdgcn_ds_gws_sema_br:
1228 return AMDGPU::DS_GWS_SEMA_BR;
1229 case Intrinsic::amdgcn_ds_gws_sema_p:
1230 return AMDGPU::DS_GWS_SEMA_P;
1231 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1232 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1233 default:
1234 llvm_unreachable("not a gws intrinsic");
1235 }
1236}
1237
1238bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1239 Intrinsic::ID IID) const {
1240 if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1241 !STI.hasGWSSemaReleaseAll())
1242 return false;
1243
1244 // intrinsic ID, vsrc, offset
1245 const bool HasVSrc = MI.getNumOperands() == 3;
1246 assert(HasVSrc || MI.getNumOperands() == 2);
1247
1248 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1249 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1250 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1251 return false;
1252
1253 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1254 assert(OffsetDef);
1255
1256 unsigned ImmOffset;
1257
1258 MachineBasicBlock *MBB = MI.getParent();
1259 const DebugLoc &DL = MI.getDebugLoc();
1260
1261 MachineInstr *Readfirstlane = nullptr;
1262
1263 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1264 // incoming offset, in case there's an add of a constant. We'll have to put it
1265 // back later.
1266 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1267 Readfirstlane = OffsetDef;
1268 BaseOffset = OffsetDef->getOperand(1).getReg();
1269 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1270 }
1271
1272 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1273 // If we have a constant offset, try to use the 0 in m0 as the base.
1274 // TODO: Look into changing the default m0 initialization value. If the
1275 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1276 // the immediate offset.
1277
1278 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1279 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1280 .addImm(0);
1281 } else {
1282 std::tie(BaseOffset, ImmOffset, OffsetDef)
1283 = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1284
1285 if (Readfirstlane) {
1286 // We have the constant offset now, so put the readfirstlane back on the
1287 // variable component.
1288 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1289 return false;
1290
1291 Readfirstlane->getOperand(1).setReg(BaseOffset);
1292 BaseOffset = Readfirstlane->getOperand(0).getReg();
1293 } else {
1294 if (!RBI.constrainGenericRegister(BaseOffset,
1295 AMDGPU::SReg_32RegClass, *MRI))
1296 return false;
1297 }
1298
1299 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1300 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1301 .addReg(BaseOffset)
1302 .addImm(16);
1303
1304 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1305 .addReg(M0Base);
1306 }
1307
1308 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1309 // offset field) % 64. Some versions of the programming guide omit the m0
1310 // part, or claim it's from offset 0.
1311 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1312
1313 if (HasVSrc) {
1314 Register VSrc = MI.getOperand(1).getReg();
1315 MIB.addReg(VSrc);
1316 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1317 return false;
1318 }
1319
1320 MIB.addImm(ImmOffset)
1321 .addImm(-1) // $gds
1322 .cloneMemRefs(MI);
1323
1324 MI.eraseFromParent();
1325 return true;
1326}
1327
1328bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1329 bool IsAppend) const {
1330 Register PtrBase = MI.getOperand(2).getReg();
1331 LLT PtrTy = MRI->getType(PtrBase);
1332 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1333
1334 unsigned Offset;
1335 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1336
1337 // TODO: Should this try to look through readfirstlane like GWS?
1338 if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
1339 PtrBase = MI.getOperand(2).getReg();
1340 Offset = 0;
1341 }
1342
1343 MachineBasicBlock *MBB = MI.getParent();
1344 const DebugLoc &DL = MI.getDebugLoc();
1345 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1346
1347 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1348 .addReg(PtrBase);
1349 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1350 return false;
1351
1352 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1353 .addImm(Offset)
1354 .addImm(IsGDS ? -1 : 0)
1355 .cloneMemRefs(MI);
1356 MI.eraseFromParent();
1357 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1358}
1359
1360static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1361 bool &IsTexFail) {
1362 if (TexFailCtrl)
1363 IsTexFail = true;
1364
1365 TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1366 TexFailCtrl &= ~(uint64_t)0x1;
1367 LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1368 TexFailCtrl &= ~(uint64_t)0x2;
1369
1370 return TexFailCtrl == 0;
1371}
1372
1373static bool parseCachePolicy(uint64_t Value,
1374 bool *GLC, bool *SLC, bool *DLC) {
1375 if (GLC) {
1376 *GLC = (Value & 0x1) ? 1 : 0;
1377 Value &= ~(uint64_t)0x1;
1378 }
1379 if (SLC) {
1380 *SLC = (Value & 0x2) ? 1 : 0;
1381 Value &= ~(uint64_t)0x2;
1382 }
1383 if (DLC) {
1384 *DLC = (Value & 0x4) ? 1 : 0;
1385 Value &= ~(uint64_t)0x4;
1386 }
1387
1388 return Value == 0;
1389}
1390
1391bool AMDGPUInstructionSelector::selectImageIntrinsic(
1392 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1393 MachineBasicBlock *MBB = MI.getParent();
1394 const DebugLoc &DL = MI.getDebugLoc();
1395
1396 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1397 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1398
1399 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1400 const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1401 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1402 const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1403 AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1404 unsigned IntrOpcode = Intr->BaseOpcode;
1405 const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
1406
1407 const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
1408 MI.getNumExplicitDefs());
1409 int NumVAddr, NumGradients;
1410 std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
1411
1412 Register VDataIn, VDataOut;
1413 LLT VDataTy;
1414 int NumVDataDwords = -1;
1415 bool IsD16 = false;
1416
1417 // XXX - Can we just get the second to last argument for ctrl?
1418 unsigned CtrlIdx; // Index of texfailctrl argument
1419 bool Unorm;
1420 if (!BaseOpcode->Sampler) {
1421 Unorm = true;
1422 CtrlIdx = VAddrIdx + NumVAddr + 1;
1423 } else {
1424 Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
1425 CtrlIdx = VAddrIdx + NumVAddr + 3;
1426 }
1427
1428 bool TFE;
1429 bool LWE;
1430 bool IsTexFail = false;
1431 if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
1432 return false;
1433
1434 const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
1435 const bool IsA16 = (Flags & 1) != 0;
1436 const bool IsG16 = (Flags & 2) != 0;
1437
1438 // A16 implies 16 bit gradients
1439 if (IsA16 && !IsG16)
1440 return false;
1441
1442 unsigned DMask = 0;
1443 unsigned DMaskLanes = 0;
1444
1445 if (BaseOpcode->Atomic) {
1446 VDataOut = MI.getOperand(0).getReg();
1447 VDataIn = MI.getOperand(2).getReg();
1448 LLT Ty = MRI->getType(VDataIn);
1449
1450 // Be careful to allow atomic swap on 16-bit element vectors.
1451 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1452 Ty.getSizeInBits() == 128 :
1453 Ty.getSizeInBits() == 64;
1454
1455 if (BaseOpcode->AtomicX2) {
1456 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1457
1458 DMask = Is64Bit ? 0xf : 0x3;
1459 NumVDataDwords = Is64Bit ? 4 : 2;
1460 } else {
1461 DMask = Is64Bit ? 0x3 : 0x1;
1462 NumVDataDwords = Is64Bit ? 2 : 1;
1463 }
1464 } else {
1465 const int DMaskIdx = 2; // Input/output + intrinsic ID.
1466
1467 DMask = MI.getOperand(DMaskIdx).getImm();
1468 DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1469
1470 if (BaseOpcode->Store) {
1471 VDataIn = MI.getOperand(1).getReg();
1472 VDataTy = MRI->getType(VDataIn);
1473 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1474 } else {
1475 VDataOut = MI.getOperand(0).getReg();
1476 VDataTy = MRI->getType(VDataOut);
1477 NumVDataDwords = DMaskLanes;
1478
1479 // One memoperand is mandatory, except for getresinfo.
1480 // FIXME: Check this in verifier.
1481 if (!MI.memoperands_empty()) {
1482 const MachineMemOperand *MMO = *MI.memoperands_begin();
1483
1484 // Infer d16 from the memory size, as the register type will be mangled by
1485 // unpacked subtargets, or by TFE.
1486 IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
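// For example, a 4-lane d16 load carries an 8-byte memoperand, giving
// (8 * 8) / 4 = 16 bits per lane, which is below the 32-bit threshold.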
1487
1488 if (IsD16 && !STI.hasUnpackedD16VMem())
1489 NumVDataDwords = (DMaskLanes + 1) / 2;
1490 }
1491 }
1492 }
1493
1494 // Optimize _L to _LZ when _L is zero
1495 if (LZMappingInfo) {
1496 // The legalizer replaced the register with an immediate 0 if we need to
1497 // change the opcode.
1498 const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1499 if (Lod.isImm()) {
1500 assert(Lod.getImm() == 0);
1501 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
1502 }
1503 }
1504
1505 // Optimize _mip away, when 'lod' is zero
1506 if (MIPMappingInfo) {
1507 const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1508 if (Lod.isImm()) {
1509 assert(Lod.getImm() == 0);
1510 IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
1511 }
1512 }
1513
1514 // Set G16 opcode
1515 if (IsG16 && !IsA16) {
1516 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1517 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1518 assert(G16MappingInfo);
1519 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1520 }
1521
1522 // TODO: Check this in verifier.
1523 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1524
1525 bool GLC = false;
1526 bool SLC = false;
1527 bool DLC = false;
1528 if (BaseOpcode->Atomic) {
1529 GLC = true; // TODO no-return optimization
1530 if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
1531 IsGFX10 ? &DLC : nullptr))
1532 return false;
1533 } else {
1534 if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
1535 IsGFX10 ? &DLC : nullptr))
1536 return false;
1537 }
1538
1539 int NumVAddrRegs = 0;
1540 int NumVAddrDwords = 0;
1541 for (int I = 0; I < NumVAddr; ++I) {
1542 // Skip the $noregs and 0s inserted during legalization.
1543 MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
1544 if (!AddrOp.isReg())
1545 continue; // XXX - Break?
1546
1547 Register Addr = AddrOp.getReg();
1548 if (!Addr)
1549 break;
1550
1551 ++NumVAddrRegs;
1552 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1553 }
1554
1555 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1556 // NSA, these should have been packed into a single value in the first
1557 // address register.
1558 const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
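// NSA is only used when the addresses were not packed into a single register
// and each remaining address operand is a single dword.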
1559 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1560 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1561 return false;
1562 }
1563
1564 if (IsTexFail)
1565 ++NumVDataDwords;
1566
1567 int Opcode = -1;
1568 if (IsGFX10) {
1569 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1570 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1571 : AMDGPU::MIMGEncGfx10Default,
1572 NumVDataDwords, NumVAddrDwords);
1573 } else {
1574 if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1575 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1576 NumVDataDwords, NumVAddrDwords);
1577 if (Opcode == -1)
1578 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1579 NumVDataDwords, NumVAddrDwords);
1580 }
1581 assert(Opcode != -1);
1582
1583 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1584 .cloneMemRefs(MI);
1585
1586 if (VDataOut) {
1587 if (BaseOpcode->AtomicX2) {
1588 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1589
1590 Register TmpReg = MRI->createVirtualRegister(
1591 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1592 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1593
1594 MIB.addDef(TmpReg);
1595 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1596 .addReg(TmpReg, RegState::Kill, SubReg);
1597
1598 } else {
1599 MIB.addDef(VDataOut); // vdata output
1600 }
1601 }
1602
1603 if (VDataIn)
1604 MIB.addReg(VDataIn); // vdata input
1605
1606 for (int i = 0; i != NumVAddrRegs; ++i) {
1607 MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
1608 if (SrcOp.isReg()) {
1609 assert(SrcOp.getReg() != 0);
1610 MIB.addReg(SrcOp.getReg());
1611 }
1612 }
1613
1614 MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
1615 if (BaseOpcode->Sampler)
1616 MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
1617
1618 MIB.addImm(DMask); // dmask
1619
1620 if (IsGFX10)
1621 MIB.addImm(DimInfo->Encoding);
1622 MIB.addImm(Unorm);
1623 if (IsGFX10)
1624 MIB.addImm(DLC);
1625
1626 MIB.addImm(GLC);
1627 MIB.addImm(SLC);
1628 MIB.addImm(IsA16 && // a16 or r128
1629 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1630 if (IsGFX10)
1631 MIB.addImm(IsA16 ? -1 : 0);
1632
1633 MIB.addImm(TFE); // tfe
1634 MIB.addImm(LWE); // lwe
1635 if (!IsGFX10)
1636 MIB.addImm(DimInfo->DA ? -1 : 0);
1637 if (BaseOpcode->HasD16)
1638 MIB.addImm(IsD16 ? -1 : 0);
1639
1640 MI.eraseFromParent();
1641 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1642}
1643
1644bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1645 MachineInstr &I) const {
1646 unsigned IntrinsicID = I.getIntrinsicID();
1647 switch (IntrinsicID) {
1648 case Intrinsic::amdgcn_end_cf:
1649 return selectEndCfIntrinsic(I);
1650 case Intrinsic::amdgcn_ds_ordered_add:
1651 case Intrinsic::amdgcn_ds_ordered_swap:
1652 return selectDSOrderedIntrinsic(I, IntrinsicID);
1653 case Intrinsic::amdgcn_ds_gws_init:
1654 case Intrinsic::amdgcn_ds_gws_barrier:
1655 case Intrinsic::amdgcn_ds_gws_sema_v:
1656 case Intrinsic::amdgcn_ds_gws_sema_br:
1657 case Intrinsic::amdgcn_ds_gws_sema_p:
1658 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1659 return selectDSGWSIntrinsic(I, IntrinsicID);
1660 case Intrinsic::amdgcn_ds_append:
1661 return selectDSAppendConsume(I, true);
1662 case Intrinsic::amdgcn_ds_consume:
1663 return selectDSAppendConsume(I, false);
1664 default: {
1665 return selectImpl(I, *CoverageInfo);
1666 }
1667 }
1668}
1669
1670bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1671 if (selectImpl(I, *CoverageInfo))
1672 return true;
1673
1674 MachineBasicBlock *BB = I.getParent();
1675 const DebugLoc &DL = I.getDebugLoc();
1676
1677 Register DstReg = I.getOperand(0).getReg();
1678 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1679 assert(Size <= 32 || Size == 64);
1680 const MachineOperand &CCOp = I.getOperand(1);
1681 Register CCReg = CCOp.getReg();
1682 if (!isVCC(CCReg, *MRI)) {
1683 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1684 AMDGPU::S_CSELECT_B32;
1685 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1686 .addReg(CCReg);
1687
1688 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
1689 // bank, because it does not cover the register class that we use to represent
1690 // the scc bank. So we need to manually set the register class here.
1691 if (!MRI->getRegClassOrNull(CCReg))
1692 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1693 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1694 .add(I.getOperand(2))
1695 .add(I.getOperand(3));
1696
1697 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1698 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1699 I.eraseFromParent();
1700 return Ret;
1701 }
1702
1703 // Wide VGPR select should have been split in RegBankSelect.
1704 if (Size > 32)
1705 return false;
1706
1707 MachineInstr *Select =
1708 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1709 .addImm(0)
1710 .add(I.getOperand(3))
1711 .addImm(0)
1712 .add(I.getOperand(2))
1713 .add(I.getOperand(1));
1714
1715 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1716 I.eraseFromParent();
1717 return Ret;
1718}
1719
1720bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
1721 initM0(I);
1722 return selectImpl(I, *CoverageInfo);
1723}
1724
1725static int sizeToSubRegIndex(unsigned Size) {
1726 switch (Size) {
1727 case 32:
1728 return AMDGPU::sub0;
1729 case 64:
1730 return AMDGPU::sub0_sub1;
1731 case 96:
1732 return AMDGPU::sub0_sub1_sub2;
1733 case 128:
1734 return AMDGPU::sub0_sub1_sub2_sub3;
1735 case 256:
1736 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1737 default:
1738 if (Size < 32)
1739 return AMDGPU::sub0;
1740 if (Size > 256)
1741 return -1;
1742 return sizeToSubRegIndex(PowerOf2Ceil(Size));
1743 }
1744}
1745
1746bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1747 Register DstReg = I.getOperand(0).getReg();
1748 Register SrcReg = I.getOperand(1).getReg();
1749 const LLT DstTy = MRI->getType(DstReg);
1750 const LLT SrcTy = MRI->getType(SrcReg);
1751 const LLT S1 = LLT::scalar(1);
1752
1753 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1754 const RegisterBank *DstRB;
1755 if (DstTy == S1) {
1756 // This is a special case. We don't treat s1 for legalization artifacts as
1757 // vcc booleans.
1758 DstRB = SrcRB;
1759 } else {
1760 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1761 if (SrcRB != DstRB)
1762 return false;
1763 }
1764
1765 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1766
1767 unsigned DstSize = DstTy.getSizeInBits();
1768 unsigned SrcSize = SrcTy.getSizeInBits();
1769
1770 const TargetRegisterClass *SrcRC
1771 = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1772 const TargetRegisterClass *DstRC
1773 = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1774 if (!SrcRC || !DstRC)
1775 return false;
1776
1777 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1778 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1779 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1780 return false;
1781 }
1782
1783 if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1784 MachineBasicBlock *MBB = I.getParent();
1785 const DebugLoc &DL = I.getDebugLoc();
1786
1787 Register LoReg = MRI->createVirtualRegister(DstRC);
1788 Register HiReg = MRI->createVirtualRegister(DstRC);
1789 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1790 .addReg(SrcReg, 0, AMDGPU::sub0);
1791 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1792 .addReg(SrcReg, 0, AMDGPU::sub1);
1793
1794 if (IsVALU && STI.hasSDWA()) {
1795 // Write the low 16-bits of the high element into the high 16-bits of the
1796 // low element.
1797 MachineInstr *MovSDWA =
1798 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1799 .addImm(0) // $src0_modifiers
1800 .addReg(HiReg) // $src0
1801 .addImm(0) // $clamp
1802 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
1803 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1804 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
1805 .addReg(LoReg, RegState::Implicit);
1806 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1807 } else {
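// On the SALU path, or when SDWA is unavailable, build the result manually:
// shift the high element into the upper 16 bits, mask the low element to its
// lower 16 bits, and OR the two halves together.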
1808 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1809 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1810 Register ImmReg = MRI->createVirtualRegister(DstRC);
1811 if (IsVALU) {
1812 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1813 .addImm(16)
1814 .addReg(HiReg);
1815 } else {
1816 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1817 .addReg(HiReg)
1818 .addImm(16);
1819 }
1820
1821 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1822 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1823 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1824
1825 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1826 .addImm(0xffff);
1827 BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1828 .addReg(LoReg)
1829 .addReg(ImmReg);
1830 BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1831 .addReg(TmpReg0)
1832 .addReg(TmpReg1);
1833 }
1834
1835 I.eraseFromParent();
1836 return true;
1837 }
1838
1839 if (!DstTy.isScalar())
1840 return false;
1841
1842 if (SrcSize > 32) {
1843 int SubRegIdx = sizeToSubRegIndex(DstSize);
1844 if (SubRegIdx == -1)
1845 return false;
1846
1847 // Deal with weird cases where the class only partially supports the subreg
1848 // index.
1849 const TargetRegisterClass *SrcWithSubRC
1850 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1851 if (!SrcWithSubRC)
1852 return false;
1853
1854 if (SrcWithSubRC != SrcRC) {
1855 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1856 return false;
1857 }
1858
1859 I.getOperand(1).setSubReg(SubRegIdx);
1860 }
1861
1862 I.setDesc(TII.get(TargetOpcode::COPY));
1863 return true;
1864}
1865
1866/// \returns true if a bitmask for \p Size bits will be an inline immediate.
1867static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1868 Mask = maskTrailingOnes<unsigned>(Size);
1869 int SignedMask = static_cast<int>(Mask);
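// Integers in [-16, 64] are AMDGPU inline constants, so a mask in that range
// can be encoded directly without a literal operand.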
1870 return SignedMask >= -16 && SignedMask <= 64;
1871}
1872
1873// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1874const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1875 Register Reg, const MachineRegisterInfo &MRI,
1876 const TargetRegisterInfo &TRI) const {
1877 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1878 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1879 return RB;
1880
1881 // Ignore the type, since we don't use vcc in artifacts.
1882 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1883 return &RBI.getRegBankFromRegClass(*RC, LLT());
1884 return nullptr;
1885}
1886
1887bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1888 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1889 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1890 const DebugLoc &DL = I.getDebugLoc();
1891 MachineBasicBlock &MBB = *I.getParent();
1892 const Register DstReg = I.getOperand(0).getReg();
1893 const Register SrcReg = I.getOperand(1).getReg();
1894
1895 const LLT DstTy = MRI->getType(DstReg);
1896 const LLT SrcTy = MRI->getType(SrcReg);
1897 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1898 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1899 const unsigned DstSize = DstTy.getSizeInBits();
1900 if (!DstTy.isScalar())
1901 return false;
1902
1903 // Artifact casts should never use vcc.
1904 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1905
1906 // FIXME: This should probably be illegal and split earlier.
1907 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1908 if (DstSize <= 32)
1909 return selectCOPY(I);
1910
1911 const TargetRegisterClass *SrcRC =
1912 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1913 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1914 const TargetRegisterClass *DstRC =
1915 TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
1916
1917 Register UndefReg = MRI->createVirtualRegister(SrcRC);
1918 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1919 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1920 .addReg(SrcReg)
1921 .addImm(AMDGPU::sub0)
1922 .addReg(UndefReg)
1923 .addImm(AMDGPU::sub1);
1924 I.eraseFromParent();
1925
1926 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
1927 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
1928 }
1929
1930 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
1931 // 64-bit should have been split up in RegBankSelect
1932
1933 // Try to use an and with a mask if it will save code size.
1934 unsigned Mask;
1935 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1936 MachineInstr *ExtI =
1937 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
1938 .addImm(Mask)
1939 .addReg(SrcReg);
1940 I.eraseFromParent();
1941 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1942 }
1943
1944 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
1945 MachineInstr *ExtI =
1946 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
1947 .addReg(SrcReg)
1948 .addImm(0) // Offset
1949 .addImm(SrcSize); // Width
1950 I.eraseFromParent();
1951 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1952 }
1953
1954 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
1955 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
1956 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
1957 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
1958 return false;
1959
1960 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
1961 const unsigned SextOpc = SrcSize == 8 ?
1962 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
1963 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
1964 .addReg(SrcReg);
1965 I.eraseFromParent();
1966 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
1967 }
1968
1969 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
1970 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
1971
1972 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
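// With a zero offset, the packed operand built below is therefore just
// (SrcSize << 16).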
1973 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
1974 // We need a 64-bit register source, but the high bits don't matter.
1975 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
1976 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1977 unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
1978
1979 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1980 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
1981 .addReg(SrcReg, 0, SubReg)
1982 .addImm(AMDGPU::sub0)
1983 .addReg(UndefReg)
1984 .addImm(AMDGPU::sub1);
1985
1986 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
1987 .addReg(ExtReg)
1988 .addImm(SrcSize << 16);
1989
1990 I.eraseFromParent();
1991 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
1992 }
1993
1994 unsigned Mask;
1995 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1996 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
1997 .addReg(SrcReg)
1998 .addImm(Mask);
1999 } else {
2000 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2001 .addReg(SrcReg)
2002 .addImm(SrcSize << 16);
2003 }
2004
2005 I.eraseFromParent();
2006 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2007 }
2008
2009 return false;
2010}
2011
2012bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2013 MachineBasicBlock *BB = I.getParent();
2014 MachineOperand &ImmOp = I.getOperand(1);
2015
2016 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2017 if (ImmOp.isFPImm()) {
2018 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2019 ImmOp.ChangeToImmediate(Imm.getZExtValue());
2020 } else if (ImmOp.isCImm()) {
2021 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2022 }
2023
2024 Register DstReg = I.getOperand(0).getReg();
2025 unsigned Size;
2026 bool IsSgpr;
2027 const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg());
2028 if (RB) {
2029 IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
2030 Size = MRI->getType(DstReg).getSizeInBits();
2031 } else {
2032 const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg);
2033 IsSgpr = TRI.isSGPRClass(RC);
2034 Size = TRI.getRegSizeInBits(*RC);
2035 }
2036
2037 if (Size != 32 && Size != 64)
2038 return false;
2039
2040 unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2041 if (Size == 32) {
2042 I.setDesc(TII.get(Opcode));
2043 I.addImplicitDefUseOperands(*MF);
2044 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2045 }
2046
2047 const DebugLoc &DL = I.getDebugLoc();
2048
2049 APInt Imm(Size, I.getOperand(1).getImm());
2050
2051 MachineInstr *ResInst;
2052 if (IsSgpr && TII.isInlineConstant(Imm)) {
2053 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2054 .addImm(I.getOperand(1).getImm());
2055 } else {
2056 const TargetRegisterClass *RC = IsSgpr ?
2057 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2058 Register LoReg = MRI->createVirtualRegister(RC);
2059 Register HiReg = MRI->createVirtualRegister(RC);
2060
2061 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2062 .addImm(Imm.trunc(32).getZExtValue());
2063
2064 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2065 .addImm(Imm.ashr(32).getZExtValue());
2066
2067 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2068 .addReg(LoReg)
2069 .addImm(AMDGPU::sub0)
2070 .addReg(HiReg)
2071 .addImm(AMDGPU::sub1);
2072 }
2073
2074 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2075 // work for target-independent opcodes.
2076 I.eraseFromParent();
2077 const TargetRegisterClass *DstRC =
2078 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2079 if (!DstRC)
2080 return true;
2081 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2082}
2083
2084bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2085 // Only manually handle the f64 SGPR case.
2086 //
2087 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2088 // the bit ops theoretically have a second result due to the implicit def of
2089 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2090 // that is easy by disabling the check. The result works, but uses a
2091 // nonsensical sreg32orlds_and_sreg_1 regclass.
2092 //
2093 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2094 // the variadic REG_SEQUENCE operands.
2095
2096 Register Dst = MI.getOperand(0).getReg();
2097 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2098 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2099 MRI->getType(Dst) != LLT::scalar(64))
2100 return false;
2101
2102 Register Src = MI.getOperand(1).getReg();
2103 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2104 if (Fabs)
2105 Src = Fabs->getOperand(1).getReg();
2106
2107 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2108 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2109 return false;
2110
2111 MachineBasicBlock *BB = MI.getParent();
2112 const DebugLoc &DL = MI.getDebugLoc();
2113 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2114 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2115 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2116 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2117
2118 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2119 .addReg(Src, 0, AMDGPU::sub0);
2120 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2121 .addReg(Src, 0, AMDGPU::sub1);
2122 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2123 .addImm(0x80000000);
2124
2125 // Set or toggle sign bit.
2126 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2127 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2128 .addReg(HiReg)
2129 .addReg(ConstReg);
2130 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2131 .addReg(LoReg)
2132 .addImm(AMDGPU::sub0)
2133 .addReg(OpReg)
2134 .addImm(AMDGPU::sub1);
2135 MI.eraseFromParent();
2136 return true;
2137}
2138
2139// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2140bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2141 Register Dst = MI.getOperand(0).getReg();
2142 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2143 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2144 MRI->getType(Dst) != LLT::scalar(64))
2145 return false;
2146
2147 Register Src = MI.getOperand(1).getReg();
2148 MachineBasicBlock *BB = MI.getParent();
2149 const DebugLoc &DL = MI.getDebugLoc();
2150 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2151 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2152 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2153 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2154
2155 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2156 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2157 return false;
2158
2159 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2160 .addReg(Src, 0, AMDGPU::sub0);
2161 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2162 .addReg(Src, 0, AMDGPU::sub1);
2163 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2164 .addImm(0x7fffffff);
2165
2166 // Clear sign bit.
2167 // TODO: Should this use S_BITSET0_*?
2168 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2169 .addReg(HiReg)
2170 .addReg(ConstReg);
2171 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2172 .addReg(LoReg)
2173 .addImm(AMDGPU::sub0)
2174 .addReg(OpReg)
2175 .addImm(AMDGPU::sub1);
2176
2177 MI.eraseFromParent();
2178 return true;
2179}
2180
2181static bool isConstant(const MachineInstr &MI) {
2182 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2183}
2184
2185void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2186 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2187
2188 const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2189
2190 assert(PtrMI);
2191
2192 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2193 return;
2194
2195 GEPInfo GEPInfo(*PtrMI);
2196
2197 for (unsigned i = 1; i != 3; ++i) {
2198 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2199 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2200 assert(OpDef);
2201 if (i == 2 && isConstant(*OpDef)) {
2202 // TODO: Could handle constant base + variable offset, but a combine
2203 // probably should have commuted it.
2204 assert(GEPInfo.Imm == 0);
2205 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2206 continue;
2207 }
2208 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2209 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2210 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2211 else
2212 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2213 }
2214
2215 AddrInfo.push_back(GEPInfo);
2216 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2217}
2218
2219bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2220 if (!MI.hasOneMemOperand())
2221 return false;
2222
2223 const MachineMemOperand *MMO = *MI.memoperands_begin();
2224 const Value *Ptr = MMO->getValue();
2225
2226 // UndefValue means this is a load of a kernel input. These are uniform.
2227 // Sometimes LDS instructions have constant pointers.
2228 // If Ptr is null, then that means this mem operand contains a
2229 // PseudoSourceValue like GOT.
2230 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2231 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2232 return true;
2233
2234 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2235 return true;
2236
2237 const Instruction *I = dyn_cast<Instruction>(Ptr);
2238 return I && I->getMetadata("amdgpu.uniform");
2239}
2240
2241bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2242 for (const GEPInfo &GEPInfo : AddrInfo) {
2243 if (!GEPInfo.VgprParts.empty())
2244 return true;
2245 }
2246 return false;
2247}
2248
2249void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2250 MachineBasicBlock *BB = I.getParent();
2251
2252 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2253 unsigned AS = PtrTy.getAddressSpace();
2254 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2255 STI.ldsRequiresM0Init()) {
2256 // If DS instructions require M0 initialization, insert it before selecting.
2257 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2258 .addImm(-1);
2259 }
2260}
2261
2262bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
2263 initM0(I);
2264 return selectImpl(I, *CoverageInfo);
2265}
2266
2267// TODO: No rtn optimization.
2268bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2269 MachineInstr &MI) const {
2270 Register PtrReg = MI.getOperand(1).getReg();
2271 const LLT PtrTy = MRI->getType(PtrReg);
2272 if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2273 STI.useFlatForGlobal())
2274 return selectImpl(MI, *CoverageInfo);
2275
2276 Register DstReg = MI.getOperand(0).getReg();
2277 const LLT Ty = MRI->getType(DstReg);
2278 const bool Is64 = Ty.getSizeInBits() == 64;
2279 const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2280 Register TmpReg = MRI->createVirtualRegister(
2281 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2282
2283 const DebugLoc &DL = MI.getDebugLoc();
2284 MachineBasicBlock *BB = MI.getParent();
2285
2286 Register VAddr, RSrcReg, SOffset;
2287 int64_t Offset = 0;
2288
2289 unsigned Opcode;
2290 if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2291 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2292 AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2293 } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2294 RSrcReg, SOffset, Offset)) {
2295 Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2296 AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2297 } else
2298 return selectImpl(MI, *CoverageInfo);
2299
2300 auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2301 .addReg(MI.getOperand(2).getReg());
2302
2303 if (VAddr)
2304 MIB.addReg(VAddr);
2305
2306 MIB.addReg(RSrcReg);
2307 if (SOffset)
2308 MIB.addReg(SOffset);
2309 else
2310 MIB.addImm(0);
2311
2312 MIB.addImm(Offset);
2313 MIB.addImm(0); // slc
2314 MIB.cloneMemRefs(MI);
2315
2316 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2317 .addReg(TmpReg, RegState::Kill, SubReg);
2318
2319 MI.eraseFromParent();
2320
2321 MRI->setRegClass(
2322 DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2323 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2324}
2325
2326bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2327 MachineBasicBlock *BB = I.getParent();
2328 MachineOperand &CondOp = I.getOperand(0);
2329 Register CondReg = CondOp.getReg();
2330 const DebugLoc &DL = I.getDebugLoc();
2331
2332 unsigned BrOpcode;
2333 Register CondPhysReg;
2334 const TargetRegisterClass *ConstrainRC;
2335
2336 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2337 // whether the branch is uniform when selecting the instruction. In
2338 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2339 // RegBankSelect knows what it's doing if the branch condition is scc, even
2340 // though it currently does not.
2341 if (!isVCC(CondReg, *MRI)) {
2342 if (MRI->getType(CondReg) != LLT::scalar(32))
2343 return false;
2344
2345 CondPhysReg = AMDGPU::SCC;
2346 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2347 ConstrainRC = &AMDGPU::SReg_32RegClass;
2348 } else {
2349 // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
2350 // Based on the register bank, we sort of know that a VCC producer ands
2351 // inactive lanes with 0. What if there was a logical operation with vcc
2352 // producers in different blocks/with different exec masks?
2353 // FIXME: Should scc->vcc copies and with exec?
2354 CondPhysReg = TRI.getVCC();
2355 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2356 ConstrainRC = TRI.getBoolRC();
2357 }
2358
2359 if (!MRI->getRegClassOrNull(CondReg))
2360 MRI->setRegClass(CondReg, ConstrainRC);
2361
2362 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2363 .addReg(CondReg);
2364 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2365 .addMBB(I.getOperand(1).getMBB());
2366
2367 I.eraseFromParent();
2368 return true;
2369}
2370
2371bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE(
2372 MachineInstr &I) const {
2373 Register DstReg = I.getOperand(0).getReg();
2374 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2375 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2376 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2377 if (IsVGPR)
2378 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2379
2380 return RBI.constrainGenericRegister(
2381 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2382}
2383
2384bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2385 Register DstReg = I.getOperand(0).getReg();
2386 Register SrcReg = I.getOperand(1).getReg();
2387 Register MaskReg = I.getOperand(2).getReg();
2388 LLT Ty = MRI->getType(DstReg);
2389 LLT MaskTy = MRI->getType(MaskReg);
2390
2391 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2392 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2393 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2394 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2395 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2396 return false;
2397
2398 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2399 const TargetRegisterClass &RegRC
2400 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2401
2402 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2403 *MRI);
2404 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2405 *MRI);
2406 const TargetRegisterClass *MaskRC =
2407 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2408
2409 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2410 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2411 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2412 return false;
2413
2414 MachineBasicBlock *BB = I.getParent();
2415 const DebugLoc &DL = I.getDebugLoc();
2416 if (Ty.getSizeInBits() == 32) {
2417 assert(MaskTy.getSizeInBits() == 32 &&
2418 "ptrmask should have been narrowed during legalize");
2419
2420 BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2421 .addReg(SrcReg)
2422 .addReg(MaskReg);
2423 I.eraseFromParent();
2424 return true;
2425 }
2426
2427 Register HiReg = MRI->createVirtualRegister(&RegRC);
2428 Register LoReg = MRI->createVirtualRegister(&RegRC);
2429
2430 // Extract the subregisters from the source pointer.
2431 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2432 .addReg(SrcReg, 0, AMDGPU::sub0);
2433 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2434 .addReg(SrcReg, 0, AMDGPU::sub1);
2435
2436 Register MaskedLo, MaskedHi;
2437
2438 // Try to avoid emitting a bit operation when we only need to touch half of
2439 // the 64-bit pointer.
2440 APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2441
2442 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2443 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2444 if ((MaskOnes & MaskLo32) == MaskLo32) {
2445 // If all the bits in the low half are 1, we only need a copy for it.
2446 MaskedLo = LoReg;
2447 } else {
2448 // Extract the mask subregister and apply the and.
2449 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2450 MaskedLo = MRI->createVirtualRegister(&RegRC);
2451
2452 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2453 .addReg(MaskReg, 0, AMDGPU::sub0);
2454 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2455 .addReg(LoReg)
2456 .addReg(MaskLo);
2457 }
2458
2459 if ((MaskOnes & MaskHi32) == MaskHi32) {
2460 // If all the bits in the high half are 1, we only need a copy for it.
2461 MaskedHi = HiReg;
2462 } else {
2463 Register MaskHi = MRI->createVirtualRegister(&RegRC);
2464 MaskedHi = MRI->createVirtualRegister(&RegRC);
2465
2466 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2467 .addReg(MaskReg, 0, AMDGPU::sub1);
2468 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2469 .addReg(HiReg)
2470 .addReg(MaskHi);
2471 }
2472
2473 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2474 .addReg(MaskedLo)
2475 .addImm(AMDGPU::sub0)
2476 .addReg(MaskedHi)
2477 .addImm(AMDGPU::sub1);
2478 I.eraseFromParent();
2479 return true;
2480}
2481
2482/// Return the register to use for the index value, and the subregister to use
2483/// for the indirectly accessed register.
2484static std::pair<Register, unsigned>
2485computeIndirectRegIndex(MachineRegisterInfo &MRI,
2486 const SIRegisterInfo &TRI,
2487 const TargetRegisterClass *SuperRC,
2488 Register IdxReg,
2489 unsigned EltSize) {
2490 Register IdxBaseReg;
2491 int Offset;
2492 MachineInstr *Unused;
2493
2494 std::tie(IdxBaseReg, Offset, Unused)
2495 = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2496 if (IdxBaseReg == AMDGPU::NoRegister) {
2497 // This will happen if the index is a known constant. This should ordinarily
2498 // be legalized out, but handle it as a register just in case.
2499 assert(Offset == 0);
2500 IdxBaseReg = IdxReg;
2501 }
2502
2503 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2504
2505 // Skip out of bounds offsets, or else we would end up using an undefined
2506 // register.
2507 if (static_cast<unsigned>(Offset) >= SubRegs.size())
2508 return std::make_pair(IdxReg, SubRegs[0]);
2509 return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2510}
2511
2512bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2513 MachineInstr &MI) const {
2514 Register DstReg = MI.getOperand(0).getReg();
2515 Register SrcReg = MI.getOperand(1).getReg();
2516 Register IdxReg = MI.getOperand(2).getReg();
2517
2518 LLT DstTy = MRI->getType(DstReg);
2519 LLT SrcTy = MRI->getType(SrcReg);
2520
2521 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2522 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2523 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2524
2525 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
2526 // into a waterfall loop.
2527 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2528 return false;
2529
2530 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2531 *MRI);
2532 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2533 *MRI);
2534 if (!SrcRC || !DstRC)
2535 return false;
2536 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2537 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2538 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2539 return false;
2540
2541 MachineBasicBlock *BB = MI.getParent();
2542 const DebugLoc &DL = MI.getDebugLoc();
2543 const bool Is64 = DstTy.getSizeInBits() == 64;
2544
2545 unsigned SubReg;
2546 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2547 DstTy.getSizeInBits() / 8);
2548
2549 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2550 if (DstTy.getSizeInBits() != 32 && !Is64)
2551 return false;
2552
2553 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2554 .addReg(IdxReg);
2555
2556 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2557 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2558 .addReg(SrcReg, 0, SubReg)
2559 .addReg(SrcReg, RegState::Implicit);
2560 MI.eraseFromParent();
2561 return true;
2562 }
2563
2564 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2565 return false;
2566
2567 if (!STI.useVGPRIndexMode()) {
2568 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2569 .addReg(IdxReg);
2570 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2571 .addReg(SrcReg, 0, SubReg)
2572 .addReg(SrcReg, RegState::Implicit);
2573 MI.eraseFromParent();
2574 return true;
2575 }
2576
2577 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2578 .addReg(IdxReg)
2579 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2580 BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
2581 .addReg(SrcReg, 0, SubReg)
2582 .addReg(SrcReg, RegState::Implicit)
2583 .addReg(AMDGPU::M0, RegState::Implicit);
2584 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2585
2586 MI.eraseFromParent();
2587 return true;
2588}
2589
2590// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2591bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2592 MachineInstr &MI) const {
2593 Register DstReg = MI.getOperand(0).getReg();
2594 Register VecReg = MI.getOperand(1).getReg();
2595 Register ValReg = MI.getOperand(2).getReg();
2596 Register IdxReg = MI.getOperand(3).getReg();
2597
2598 LLT VecTy = MRI->getType(DstReg);
2599 LLT ValTy = MRI->getType(ValReg);
2600 unsigned VecSize = VecTy.getSizeInBits();
2601 unsigned ValSize = ValTy.getSizeInBits();
2602
2603 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2604 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2605 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2606
2607 assert(VecTy.getElementType() == ValTy);
2608
2609 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
2610 // into a waterfall loop.
2611 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2612 return false;
2613
2614 const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2615 *MRI);
2616 const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2617 *MRI);
2618
2619 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2620 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2621 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2622 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2623 return false;
2624
2625 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2626 return false;
2627
2628 unsigned SubReg;
2629 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2630 ValSize / 8);
2631
2632 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2633 STI.useVGPRIndexMode();
2634
2635 MachineBasicBlock *BB = MI.getParent();
2636 const DebugLoc &DL = MI.getDebugLoc();
2637
2638 if (IndexMode) {
2639 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2640 .addReg(IdxReg)
2641 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2642 } else {
2643 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2644 .addReg(IdxReg);
2645 }
2646
2647 const MCInstrDesc &RegWriteOp
2648 = TII.getIndirectRegWritePseudo(VecSize, ValSize,
2649 VecRB->getID() == AMDGPU::SGPRRegBankID);
2650 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2651 .addReg(VecReg)
2652 .addReg(ValReg)
2653 .addImm(SubReg);
2654
2655 if (IndexMode)
2656 BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2657
2658 MI.eraseFromParent();
2659 return true;
2660}
2661
2662static bool isZeroOrUndef(int X) {
2663 return X == 0 || X == -1;
2664}
2665
2666static bool isOneOrUndef(int X) {
2667 return X == 1 || X == -1;
2668}
2669
2670static bool isZeroOrOneOrUndef(int X) {
2671 return X == 0 || X == 1 || X == -1;
2672}
2673
2674// Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2675// 32-bit register.
2676static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2677 ArrayRef<int> Mask) {
2678 NewMask[0] = Mask[0];
2679 NewMask[1] = Mask[1];
2680 if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2681 return Src0;
2682
2683 assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2684 assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2685
2686 // Shift the mask inputs to be 0/1;
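// e.g. a mask of <2, 3>, which reads both halves from Src1, becomes <0, 1>.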
2687 NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2688 NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2689 return Src1;
2690}
2691
2692// This is only legal with VOP3P instructions as an aid to op_sel matching.
2693bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2694 MachineInstr &MI) const {
2695 Register DstReg = MI.getOperand(0).getReg();
2696 Register Src0Reg = MI.getOperand(1).getReg();
2697 Register Src1Reg = MI.getOperand(2).getReg();
2698 ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2699
2700 const LLT V2S16 = LLT::vector(2, 16);
2701 if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2702 return false;
2703
2704 if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2705 return false;
2706
2707 assert(ShufMask.size() == 2);
2708 assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2709
2710 MachineBasicBlock *MBB = MI.getParent();
2711 const DebugLoc &DL = MI.getDebugLoc();
2712
2713 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2714 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2715 const TargetRegisterClass &RC = IsVALU ?
2716 AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2717
2718 // Handle the degenerate case which should have folded out.
2719 if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2720 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2721
2722 MI.eraseFromParent();
2723 return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2724 }
2725
2726 // A legal VOP3P mask only reads one of the sources.
2727 int Mask[2];
2728 Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2729
2730 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2731 !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2732 return false;
2733
2734 // TODO: This also should have been folded out
2735 if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2736 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2737 .addReg(SrcVec);
2738
2739 MI.eraseFromParent();
2740 return true;
2741 }
2742
2743 if (Mask[0] == 1 && Mask[1] == -1) {
2744 if (IsVALU) {
2745 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2746 .addImm(16)
2747 .addReg(SrcVec);
2748 } else {
2749 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2750 .addReg(SrcVec)
2751 .addImm(16);
2752 }
2753 } else if (Mask[0] == -1 && Mask[1] == 0) {
2754 if (IsVALU) {
2755 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2756 .addImm(16)
2757 .addReg(SrcVec);
2758 } else {
2759 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2760 .addReg(SrcVec)
2761 .addImm(16);
2762 }
2763 } else if (Mask[0] == 0 && Mask[1] == 0) {
2764 if (IsVALU) {
2765 // Write low half of the register into the high half.
2766 MachineInstr *MovSDWA =
2767 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2768 .addImm(0) // $src0_modifiers
2769 .addReg(SrcVec) // $src0
2770 .addImm(0) // $clamp
2771 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2772 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2773 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2774 .addReg(SrcVec, RegState::Implicit);
2775 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2776 } else {
2777 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2778 .addReg(SrcVec)
2779 .addReg(SrcVec);
2780 }
2781 } else if (Mask[0] == 1 && Mask[1] == 1) {
2782 if (IsVALU) {
2783 // Write high half of the register into the low half.
2784 MachineInstr *MovSDWA =
2785 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2786 .addImm(0) // $src0_modifiers
2787 .addReg(SrcVec) // $src0
2788 .addImm(0) // $clamp
2789 .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
2790 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2791 .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
2792 .addReg(SrcVec, RegState::Implicit);
2793 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2794 } else {
2795 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2796 .addReg(SrcVec)
2797 .addReg(SrcVec);
2798 }
2799 } else if (Mask[0] == 1 && Mask[1] == 0) {
2800 if (IsVALU) {
2801 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
2802 .addReg(SrcVec)
2803 .addReg(SrcVec)
2804 .addImm(16);
2805 } else {
2806 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2807 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2808 .addReg(SrcVec)
2809 .addImm(16);
2810 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2811 .addReg(TmpReg)
2812 .addReg(SrcVec);
2813 }
2814 } else
2815 llvm_unreachable("all shuffle masks should be handled");
2816
2817 MI.eraseFromParent();
2818 return true;
2819}
2820
2821bool AMDGPUInstructionSelector::select(MachineInstr &I) {
2822 if (I.isPHI())
2823 return selectPHI(I);
2824
2825 if (!I.isPreISelOpcode()) {
2826 if (I.isCopy())
2827 return selectCOPY(I);
2828 return true;
2829 }
2830
2831 switch (I.getOpcode()) {
2832 case TargetOpcode::G_AND:
2833 case TargetOpcode::G_OR:
2834 case TargetOpcode::G_XOR:
2835 if (selectImpl(I, *CoverageInfo))
2836 return true;
2837 return selectG_AND_OR_XOR(I);
2838 case TargetOpcode::G_ADD:
2839 case TargetOpcode::G_SUB:
2840 if (selectImpl(I, *CoverageInfo))
2841 return true;
2842 return selectG_ADD_SUB(I);
2843 case TargetOpcode::G_UADDO:
2844 case TargetOpcode::G_USUBO:
2845 case TargetOpcode::G_UADDE:
2846 case TargetOpcode::G_USUBE:
2847 return selectG_UADDO_USUBO_UADDE_USUBE(I);
2848 case TargetOpcode::G_INTTOPTR:
2849 case TargetOpcode::G_BITCAST:
2850 case TargetOpcode::G_PTRTOINT:
2851 return selectCOPY(I);
2852 case TargetOpcode::G_CONSTANT:
2853 case TargetOpcode::G_FCONSTANT:
2854 return selectG_CONSTANT(I);
2855 case TargetOpcode::G_FNEG:
2856 if (selectImpl(I, *CoverageInfo))
2857 return true;
2858 return selectG_FNEG(I);
2859 case TargetOpcode::G_FABS:
2860 if (selectImpl(I, *CoverageInfo))
2861 return true;
2862 return selectG_FABS(I);
2863 case TargetOpcode::G_EXTRACT:
2864 return selectG_EXTRACT(I);
2865 case TargetOpcode::G_MERGE_VALUES:
2866 case TargetOpcode::G_BUILD_VECTOR:
2867 case TargetOpcode::G_CONCAT_VECTORS:
2868 return selectG_MERGE_VALUES(I);
2869 case TargetOpcode::G_UNMERGE_VALUES:
2870 return selectG_UNMERGE_VALUES(I);
2871 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2872 return selectG_BUILD_VECTOR_TRUNC(I);
2873 case TargetOpcode::G_PTR_ADD:
2874 return selectG_PTR_ADD(I);
2875 case TargetOpcode::G_IMPLICIT_DEF:
2876 return selectG_IMPLICIT_DEF(I);
2877 case TargetOpcode::G_FREEZE:
2878 return selectCOPY(I);
2879 case TargetOpcode::G_INSERT:
2880 return selectG_INSERT(I);
2881 case TargetOpcode::G_INTRINSIC:
2882 return selectG_INTRINSIC(I);
2883 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
2884 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
2885 case TargetOpcode::G_ICMP:
2886 if (selectG_ICMP(I))
2887 return true;
2888 return selectImpl(I, *CoverageInfo);
2889 case TargetOpcode::G_LOAD:
2890 case TargetOpcode::G_ATOMIC_CMPXCHG:
2891 case TargetOpcode::G_ATOMICRMW_XCHG:
2892 case TargetOpcode::G_ATOMICRMW_ADD:
2893 case TargetOpcode::G_ATOMICRMW_SUB:
2894 case TargetOpcode::G_ATOMICRMW_AND:
2895 case TargetOpcode::G_ATOMICRMW_OR:
2896 case TargetOpcode::G_ATOMICRMW_XOR:
2897 case TargetOpcode::G_ATOMICRMW_MIN:
2898 case TargetOpcode::G_ATOMICRMW_MAX:
2899 case TargetOpcode::G_ATOMICRMW_UMIN:
2900 case TargetOpcode::G_ATOMICRMW_UMAX:
2901 case TargetOpcode::G_ATOMICRMW_FADD:
2902 case AMDGPU::G_AMDGPU_ATOMIC_INC:
2903 case AMDGPU::G_AMDGPU_ATOMIC_DEC:
2904 return selectG_LOAD_ATOMICRMW(I);
2905 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
2906 return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
2907 case TargetOpcode::G_SELECT:
2908 return selectG_SELECT(I);
2909 case TargetOpcode::G_STORE:
2910 return selectG_STORE(I);
2911 case TargetOpcode::G_TRUNC:
2912 return selectG_TRUNC(I);
2913 case TargetOpcode::G_SEXT:
2914 case TargetOpcode::G_ZEXT:
2915 case TargetOpcode::G_ANYEXT:
2916 case TargetOpcode::G_SEXT_INREG:
2917 if (selectImpl(I, *CoverageInfo))
2918 return true;
2919 return selectG_SZA_EXT(I);
2920 case TargetOpcode::G_BRCOND:
2921 return selectG_BRCOND(I);
2922 case TargetOpcode::G_FRAME_INDEX:
2923 case TargetOpcode::G_GLOBAL_VALUE:
2924 return selectG_FRAME_INDEX_GLOBAL_VALUE(I);
2925 case TargetOpcode::G_PTRMASK:
2926 return selectG_PTRMASK(I);
2927 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2928 return selectG_EXTRACT_VECTOR_ELT(I);
2929 case TargetOpcode::G_INSERT_VECTOR_ELT:
2930 return selectG_INSERT_VECTOR_ELT(I);
2931 case TargetOpcode::G_SHUFFLE_VECTOR:
2932 return selectG_SHUFFLE_VECTOR(I);
2933 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2934 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
2935 const AMDGPU::ImageDimIntrinsicInfo *Intr
2936 = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
2937 assert(Intr && "not an image intrinsic with image pseudo");
2938 return selectImageIntrinsic(I, Intr);
2939 }
2940 default:
2941 return selectImpl(I, *CoverageInfo);
2942 }
2943 return false;
2944}
2945
2946InstructionSelector::ComplexRendererFns
2947AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
2948 return {{
2949 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2950 }};
2951
2952}
2953
2954std::pair<Register, unsigned>
2955AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
2956 Register Src = Root.getReg();
2957 Register OrigSrc = Src;
2958 unsigned Mods = 0;
2959 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
2960
2961 if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
2962 Src = MI->getOperand(1).getReg();
2963 Mods |= SISrcMods::NEG;
2964 MI = getDefIgnoringCopies(Src, *MRI);
2965 }
2966
2967 if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
2968 Src = MI->getOperand(1).getReg();
2969 Mods |= SISrcMods::ABS;
2970 }
2971
2972 if (Mods != 0 &&
2973 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
2974 MachineInstr *UseMI = Root.getParent();
2975
2976 // If we looked through copies to find source modifiers on an SGPR operand,
2977 // we now have an SGPR register source. To avoid potentially violating the
2978 // constant bus restriction, we need to insert a copy to a VGPR.
2979 Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
2980 BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
2981 TII.get(AMDGPU::COPY), VGPRSrc)
2982 .addReg(Src);
2983 Src = VGPRSrc;
2984 }
2985
2986 return std::make_pair(Src, Mods);
2987}
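// For example, given MIR of the form
//   %a:vgpr(s32) = G_FABS %x
//   %b:vgpr(s32) = G_FNEG %a
// with %b as the root operand, this returns (%x, SISrcMods::NEG | SISrcMods::ABS).
// The COPY above is only inserted when modifiers were folded but the stripped
// source lives in an SGPR bank.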
2988
2989///
2990/// This will select either an SGPR or VGPR operand and will save us from
2991/// having to write an extra tablegen pattern.
2992InstructionSelector::ComplexRendererFns
2993AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
2994 return {{
2995 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2996 }};
2997}
2998
2999InstructionSelector::ComplexRendererFns
3000AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3001 Register Src;
3002 unsigned Mods;
3003 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3004
3005 return {{
3006 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3007 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3008 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3009 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3010 }};
3011}
3012
3013InstructionSelector::ComplexRendererFns
3014AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3015 return {{
3016 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3017 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3018 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3019 }};
3020}
3021
3022InstructionSelector::ComplexRendererFns
3023AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3024 Register Src;
3025 unsigned Mods;
3026 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3027
3028 return {{
3029 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3030 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3031 }};
3032}
3033
3034InstructionSelector::ComplexRendererFns
3035AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3036 Register Reg = Root.getReg();
3037 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3038 if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3039 Def->getOpcode() == AMDGPU::G_FABS))
3040 return {};
3041 return {{
3042 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3043 }};
3044}
3045
3046std::pair<Register, unsigned>
3047AMDGPUInstructionSelector::selectVOP3PModsImpl(
3048 Register Src, const MachineRegisterInfo &MRI) const {
3049 unsigned Mods = 0;
3050 MachineInstr *MI = MRI.getVRegDef(Src);
3051
3052 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3053 // It's possible to see an f32 fneg here, but unlikely.
3054 // TODO: Treat f32 fneg as only high bit.
3055 MRI.getType(Src) == LLT::vector(2, 16)) {
3056 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3057 Src = MI->getOperand(1).getReg();
3058 MI = MRI.getVRegDef(Src);
Value stored to 'MI' is never read
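// 'MI' is reassigned above but never read again before the function returns,
// which is what the analyzer flags here; the extra lookup presumably only
// becomes useful once the op_sel matching in the TODO below is implemented.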
3059 }
3060
3061 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3062
3063 // Packed instructions do not have abs modifiers.
3064 Mods |= SISrcMods::OP_SEL_1;
3065
3066 return std::make_pair(Src, Mods);
3067}
3068
3069InstructionSelector::ComplexRendererFns
3070AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3071 MachineRegisterInfo &MRI
3072 = Root.getParent()->getParent()->getParent()->getRegInfo();
3073
3074 Register Src;
3075 unsigned Mods;
3076 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3077
3078 return {{
3079 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3080 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3081 }};
3082}
3083
3084InstructionSelector::ComplexRendererFns
3085AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3086 Register Src;
3087 unsigned Mods;
3088 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3089 if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
3090 return None;
3091
3092 return {{
3093 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3094 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3095 }};
3096}
3097
3098InstructionSelector::ComplexRendererFns
3099AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3100 // FIXME: Handle op_sel
3101 return {{
3102 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3103 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3104 }};
3105}
3106
3107InstructionSelector::ComplexRendererFns
3108AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3109 SmallVector<GEPInfo, 4> AddrInfo;
3110 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3111
3112 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3113 return None;
3114
3115 const GEPInfo &GEPInfo = AddrInfo[0];
3116 Optional<int64_t> EncodedImm =
3117 AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3118 if (!EncodedImm)
3119 return None;
3120
3121 unsigned PtrReg = GEPInfo.SgprParts[0];
3122 return {{
3123 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3124 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3125 }};
3126}
3127
3128InstructionSelector::ComplexRendererFns
3129AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3130 SmallVector<GEPInfo, 4> AddrInfo;
3131 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3132
3133 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3134 return None;
3135
3136 const GEPInfo &GEPInfo = AddrInfo[0];
3137 Register PtrReg = GEPInfo.SgprParts[0];
3138 Optional<int64_t> EncodedImm =
3139 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3140 if (!EncodedImm)
3141 return None;
3142
3143 return {{
3144 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3145 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3146 }};
3147}
3148
3149InstructionSelector::ComplexRendererFns
3150AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3151 MachineInstr *MI = Root.getParent();
3152 MachineBasicBlock *MBB = MI->getParent();
3153
3154 SmallVector<GEPInfo, 4> AddrInfo;
3155 getAddrModeInfo(*MI, *MRI, AddrInfo);
3156
3157 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
3158 // then we can select all ptr + 32-bit offsets, not just immediate offsets.
3159 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3160 return None;
3161
3162 const GEPInfo &GEPInfo = AddrInfo[0];
3163 // SGPR offset is unsigned.
3164 if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3165 return None;
3166
3167 // If we make it this far we have a load with a 32-bit immediate offset.
3168 // It is OK to select this using an SGPR offset, because we have already
3169 // failed trying to select this load into one of the _IMM variants since
3170 // the _IMM patterns are considered before the _SGPR patterns.
3171 Register PtrReg = GEPInfo.SgprParts[0];
3172 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3173 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3174 .addImm(GEPInfo.Imm);
3175 return {{
3176 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3177 [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3178 }};
3179}
3180
3181template <bool Signed>
3182InstructionSelector::ComplexRendererFns
3183AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3184 MachineInstr *MI = Root.getParent();
3185
3186 InstructionSelector::ComplexRendererFns Default = {{
3187 [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3188 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset
3189 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc
3190 }};
3191
3192 if (!STI.hasFlatInstOffsets())
3193 return Default;
3194
3195 const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
3196 if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
3197 return Default;
3198
3199 Optional<int64_t> Offset =
3200 getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
3201 if (!Offset.hasValue())
3202 return Default;
3203
3204 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3205 if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
3206 return Default;
3207
3208 Register BasePtr = OpDef->getOperand(1).getReg();
3209
3210 return {{
3211 [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
3212 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
3213 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc
3214 }};
3215}
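// For example, when Root is defined by (G_PTR_ADD %base, %c) and the constant
// %c is a legal FLAT offset for the access's address space, the renderers
// produce (%base, %c, slc=0); otherwise the Default triple (Root, 0, 0) is
// used unchanged.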
3216
3217InstructionSelector::ComplexRendererFns
3218AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3219 return selectFlatOffsetImpl<false>(Root);
3220}
3221
3222InstructionSelector::ComplexRendererFns
3223AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3224 return selectFlatOffsetImpl<true>(Root);
3225}
3226
3227static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3228 auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3229 return PSV && PSV->isStack();
3230}
3231
3232InstructionSelector::ComplexRendererFns
3233AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3234 MachineInstr *MI = Root.getParent();
3235 MachineBasicBlock *MBB = MI->getParent();
3236 MachineFunction *MF = MBB->getParent();
3237 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3238
3239 int64_t Offset = 0;
3240 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3241 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3242 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3243
3244 // TODO: Should this be inside the render function? The iterator seems to
3245 // move.
3246 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3247 HighBits)
3248 .addImm(Offset & ~4095);
3249
3250 return {{[=](MachineInstrBuilder &MIB) { // rsrc
3251 MIB.addReg(Info->getScratchRSrcReg());
3252 },
3253 [=](MachineInstrBuilder &MIB) { // vaddr
3254 MIB.addReg(HighBits);
3255 },
3256 [=](MachineInstrBuilder &MIB) { // soffset
3257 const MachineMemOperand *MMO = *MI->memoperands_begin();
3258 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3259
3260 if (isStackPtrRelative(PtrInfo))
3261 MIB.addReg(Info->getStackPtrOffsetReg());
3262 else
3263 MIB.addImm(0);
3264 },
3265 [=](MachineInstrBuilder &MIB) { // offset
3266 MIB.addImm(Offset & 4095);
3267 }}};
3268 }
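// In the constant-address case above, e.g. Offset = 0x11234 is split into a
// V_MOV_B32 of 0x11000 (Offset & ~4095) for vaddr and an immediate offset
// field of 0x234 (Offset & 4095).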
3269
3270 assert(Offset == 0 || Offset == -1);
3271
3272 // Try to fold a frame index directly into the MUBUF vaddr field, and any
3273 // offsets.
3274 Optional<int> FI;
3275 Register VAddr = Root.getReg();
3276 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3277 if (isBaseWithConstantOffset(Root, *MRI)) {
3278 const MachineOperand &LHS = RootDef->getOperand(1);
3279 const MachineOperand &RHS = RootDef->getOperand(2);
3280 const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3281 const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3282 if (LHSDef && RHSDef) {
3283 int64_t PossibleOffset =
3284 RHSDef->getOperand(1).getCImm()->getSExtValue();
3285 if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3286 (!STI.privateMemoryResourceIsRangeChecked() ||
3287 KnownBits->signBitIsZero(LHS.getReg()))) {
3288 if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3289 FI = LHSDef->getOperand(1).getIndex();
3290 else
3291 VAddr = LHS.getReg();
3292 Offset = PossibleOffset;
3293 }
3294 }
3295 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3296 FI = RootDef->getOperand(1).getIndex();
3297 }
3298 }
3299
3300 return {{[=](MachineInstrBuilder &MIB) { // rsrc
3301 MIB.addReg(Info->getScratchRSrcReg());
3302 },
3303 [=](MachineInstrBuilder &MIB) { // vaddr
3304 if (FI.hasValue())
3305 MIB.addFrameIndex(FI.getValue());
3306 else
3307 MIB.addReg(VAddr);
3308 },
3309 [=](MachineInstrBuilder &MIB) { // soffset
3310 // If we don't know this private access is a local stack object, it
3311 // needs to be relative to the entry point's scratch wave offset.
3312 // TODO: Should split large offsets that don't fit like above.
3313 // TODO: Don't use scratch wave offset just because the offset
3314 // didn't fit.
3315 if (!Info->isEntryFunction() && FI.hasValue())
3316 MIB.addReg(Info->getStackPtrOffsetReg());
3317 else
3318 MIB.addImm(0);
3319 },
3320 [=](MachineInstrBuilder &MIB) { // offset
3321 MIB.addImm(Offset);
3322 }}};
3323}
3324
3325bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3326 int64_t Offset,
3327 unsigned OffsetBits) const {
3328 if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
3329 (OffsetBits == 8 && !isUInt<8>(Offset)))
3330 return false;
3331
3332 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3333 return true;
3334
3335 // On Southern Islands, instructions with a negative base value and an
3336 // offset don't seem to work.
3337 return KnownBits->signBitIsZero(Base);
3338}
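// The callers below pass OffsetBits == 16 for single-offset DS instructions
// and OffsetBits == 8 (per dword slot) for the two-offset forms such as
// ds_read2_b32.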
3339
3340InstructionSelector::ComplexRendererFns
3341AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3342 MachineOperand &Root) const {
3343 MachineInstr *MI = Root.getParent();
3344 MachineBasicBlock *MBB = MI->getParent();
3345
3346 int64_t Offset = 0;
3347 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3348 !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3349 return {};
3350
3351 const MachineFunction *MF = MBB->getParent();
3352 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3353 const MachineMemOperand *MMO = *MI->memoperands_begin();
3354 const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3355
3356 return {{
3357 [=](MachineInstrBuilder &MIB) { // rsrc
3358 MIB.addReg(Info->getScratchRSrcReg());
3359 },
3360 [=](MachineInstrBuilder &MIB) { // soffset
3361 if (isStackPtrRelative(PtrInfo))
3362 MIB.addReg(Info->getStackPtrOffsetReg());
3363 else
3364 MIB.addImm(0);
3365 },
3366 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3367 }};
3368}
3369
3370std::pair<Register, unsigned>
3371AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3372 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3373 if (!RootDef)
3374 return std::make_pair(Root.getReg(), 0);
3375
3376 int64_t ConstAddr = 0;
3377
3378 Register PtrBase;
3379 int64_t Offset;
3380 std::tie(PtrBase, Offset) =
3381 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3382
3383 if (Offset) {
3384 if (isDSOffsetLegal(PtrBase, Offset, 16)) {
3385 // (add n0, c0)
3386 return std::make_pair(PtrBase, Offset);
3387 }
3388 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3389 // TODO
3390
3391
3392 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3393 // TODO
3394
3395 }
3396
3397 return std::make_pair(Root.getReg(), 0);
3398}
3399
3400InstructionSelector::ComplexRendererFns
3401AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3402 Register Reg;
3403 unsigned Offset;
3404 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3405 return {{
3406 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3407 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3408 }};
3409}
3410
3411InstructionSelector::ComplexRendererFns
3412AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3413 Register Reg;
3414 unsigned Offset;
3415 std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
3416 return {{
3417 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3418 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3419 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3420 }};
3421}
3422
3423std::pair<Register, unsigned>
3424AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
3425 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3426 if (!RootDef)
3427 return std::make_pair(Root.getReg(), 0);
3428
3429 int64_t ConstAddr = 0;
3430
3431 Register PtrBase;
3432 int64_t Offset;
3433 std::tie(PtrBase, Offset) =
3434 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3435
3436 if (Offset) {
3437 int64_t DWordOffset0 = Offset / 4;
3438 int64_t DWordOffset1 = DWordOffset0 + 1;
3439 if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
3440 // (add n0, c0)
3441 return std::make_pair(PtrBase, DWordOffset0);
3442 }
3443 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3444 // TODO
3445
3446 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3447 // TODO
3448
3449 }
3450
3451 return std::make_pair(Root.getReg(), 0);
3452}
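// Worked example: for a pointer matched as (G_PTR_ADD %base, 8), DWordOffset0
// is 2 and DWordOffset1 is 3, so the renderers above emit offset0=2 and
// offset1=3 against %base.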
3453
3454/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3455/// the base value with the constant offset. There may be intervening copies
3456/// between \p Root and the identified constant. Returns \p Root, 0 if this does
3457/// not match the pattern.
3458std::pair<Register, int64_t>
3459AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3460 Register Root, const MachineRegisterInfo &MRI) const {
3461 MachineInstr *RootI = MRI.getVRegDef(Root);
3462 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3463 return {Root, 0};
3464
3465 MachineOperand &RHS = RootI->getOperand(2);
3466 Optional<ValueAndVReg> MaybeOffset
3467 = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3468 if (!MaybeOffset)
3469 return {Root, 0};
3470 return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
3471}
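// For example, for %p = G_PTR_ADD %base, %c where %c is a G_CONSTANT of 16
// (possibly behind copies), this returns {%base, 16}; any other shape returns
// {Root, 0}.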
3472
3473static void addZeroImm(MachineInstrBuilder &MIB) {
3474 MIB.addImm(0);
3475}
3476
3477/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3478/// BasePtr is not valid, a null base pointer will be used.
3479static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3480 uint32_t FormatLo, uint32_t FormatHi,
3481 Register BasePtr) {
3482 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3483 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3484 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3485 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3486
3487 B.buildInstr(AMDGPU::S_MOV_B32)
3488 .addDef(RSrc2)
3489 .addImm(FormatLo);
3490 B.buildInstr(AMDGPU::S_MOV_B32)
3491 .addDef(RSrc3)
3492 .addImm(FormatHi);
3493
3494 // Build the half of the subregister with the constants before building the
3495 // full 128-bit register. If we are building multiple resource descriptors,
3496 // this will allow CSEing of the 2-component register.
3497 B.buildInstr(AMDGPU::REG_SEQUENCE)
3498 .addDef(RSrcHi)
3499 .addReg(RSrc2)
3500 .addImm(AMDGPU::sub0)
3501 .addReg(RSrc3)
3502 .addImm(AMDGPU::sub1);
3503
3504 Register RSrcLo = BasePtr;
3505 if (!BasePtr) {
3506 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3507 B.buildInstr(AMDGPU::S_MOV_B64)
3508 .addDef(RSrcLo)
3509 .addImm(0);
3510 }
3511
3512 B.buildInstr(AMDGPU::REG_SEQUENCE)
3513 .addDef(RSrc)
3514 .addReg(RSrcLo)
3515 .addImm(AMDGPU::sub0_sub1)
3516 .addReg(RSrcHi)
3517 .addImm(AMDGPU::sub2_sub3);
3518
3519 return RSrc;
3520}
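// The resulting 128-bit SRD is laid out as sub0_sub1 = BasePtr (or an all-zero
// base when none is supplied) and sub2_sub3 = {FormatLo, FormatHi}, matching
// the two REG_SEQUENCEs above.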
3521
3522static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3523 const SIInstrInfo &TII, Register BasePtr) {
3524 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3525
3526 // FIXME: Why are half the "default" bits ignored based on the addressing
3527 // mode?
3528 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3529}
3530
3531static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3532 const SIInstrInfo &TII, Register BasePtr) {
3533 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3534
3535 // FIXME: Why are half the "default" bits ignored based on the addressing
3536 // mode?
3537 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3538}
3539
3540AMDGPUInstructionSelector::MUBUFAddressData
3541AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3542 MUBUFAddressData Data;
3543 Data.N0 = Src;
3544
3545 Register PtrBase;
3546 int64_t Offset;
3547
3548 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3549 if (isUInt<32>(Offset)) {
3550 Data.N0 = PtrBase;
3551 Data.Offset = Offset;
3552 }
3553
3554 if (MachineInstr *InputAdd
3555 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3556 Data.N2 = InputAdd->getOperand(1).getReg();
3557 Data.N3 = InputAdd->getOperand(2).getReg();
3558
3559 // FIXME: Need to fix extra SGPR->VGPR copies inserted
3560 // FIXME: Don't know that this was defined by operand 0
3561 //
3562 // TODO: Remove this when we have copy folding optimizations after
3563 // RegBankSelect.
3564 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
3565 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
3566 }
3567
3568 return Data;
3569}
3570
3571/// Return whether the addr64 MUBUF mode should be used for the given address.
3572bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
3573 // (ptr_add N2, N3) -> addr64, or
3574 // (ptr_add (ptr_add N2, N3), C1) -> addr64
3575 if (Addr.N2)
3576 return true;
3577
3578 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
3579 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
3580}
3581
3582/// Split an immediate offset \p ImmOffset depending on whether it fits in the
3583/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
3584/// component.
3585void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
3586 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
3587 if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
3588 return;
3589
3590 // Illegal offset, store it in soffset.
3591 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3592 B.buildInstr(AMDGPU::S_MOV_B32)
3593 .addDef(SOffset)
3594 .addImm(ImmOffset);
3595 ImmOffset = 0;
3596}
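// For example, assuming the usual 12-bit unsigned MUBUF immediate field, an
// ImmOffset of 8192 fails isLegalMUBUFImmOffset and is instead materialized
// into SOffset with S_MOV_B32, leaving the immediate field at 0.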
3597
3598bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
3599 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
3600 Register &SOffset, int64_t &Offset) const {
3601 // FIXME: Predicates should stop this from reaching here.
3602 // The addr64 bit was removed for Volcanic Islands.
3603 if (!STI.hasAddr64() || STI.useFlatForGlobal())
3604 return false;
3605
3606 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3607 if (!shouldUseAddr64(AddrData))
3608 return false;
3609
3610 Register N0 = AddrData.N0;
3611 Register N2 = AddrData.N2;
3612 Register N3 = AddrData.N3;
3613 Offset = AddrData.Offset;
3614
3615 // Base pointer for the SRD.
3616 Register SRDPtr;
3617
3618 if (N2) {
3619 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3620 assert(N3);
3621 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3622 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
3623 // addr64, and construct the default resource from a 0 address.
3624 VAddr = N0;
3625 } else {
3626 SRDPtr = N3;
3627 VAddr = N2;
3628 }
3629 } else {
3630 // N2 is not divergent.
3631 SRDPtr = N2;
3632 VAddr = N3;
3633 }
3634 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3635 // Use the default null pointer in the resource
3636 VAddr = N0;
3637 } else {
3638 // N0 -> offset, or
3639 // (N0 + C1) -> offset
3640 SRDPtr = N0;
3641 }
3642
3643 MachineIRBuilder B(*Root.getParent());
3644 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
3645 splitIllegalMUBUFOffset(B, SOffset, Offset);
3646 return true;
3647}
3648
3649bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
3650 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
3651 int64_t &Offset) const {
3652 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3653 if (shouldUseAddr64(AddrData))
3654 return false;
3655
3656 // N0 -> offset, or
3657 // (N0 + C1) -> offset
3658 Register SRDPtr = AddrData.N0;
3659 Offset = AddrData.Offset;
3660
3661 // TODO: Look through extensions for 32-bit soffset.
3662 MachineIRBuilder B(*Root.getParent());
3663
3664 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
3665 splitIllegalMUBUFOffset(B, SOffset, Offset);
3666 return true;
3667}
3668
3669InstructionSelector::ComplexRendererFns
3670AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
3671 Register VAddr;
3672 Register RSrcReg;
3673 Register SOffset;
3674 int64_t Offset = 0;
3675
3676 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3677 return {};
3678
3679 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3680 // pattern.
3681 return {{
3682 [=](MachineInstrBuilder &MIB) { // rsrc
3683 MIB.addReg(RSrcReg);
3684 },
3685 [=](MachineInstrBuilder &MIB) { // vaddr
3686 MIB.addReg(VAddr);
3687 },
3688 [=](MachineInstrBuilder &MIB) { // soffset
3689 if (SOffset)
3690 MIB.addReg(SOffset);
3691 else
3692 MIB.addImm(0);
3693 },
3694 [=](MachineInstrBuilder &MIB) { // offset
3695 MIB.addImm(Offset);
3696 },
3697 addZeroImm, // glc
3698 addZeroImm, // slc
3699 addZeroImm, // tfe
3700 addZeroImm, // dlc
3701 addZeroImm // swz
3702 }};
3703}
3704
3705InstructionSelector::ComplexRendererFns
3706AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
3707 Register RSrcReg;
3708 Register SOffset;
3709 int64_t Offset = 0;
3710
3711 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3712 return {};
3713
3714 return {{
3715 [=](MachineInstrBuilder &MIB) { // rsrc
3716 MIB.addReg(RSrcReg);
3717 },
3718 [=](MachineInstrBuilder &MIB) { // soffset
3719 if (SOffset)
3720 MIB.addReg(SOffset);
3721 else
3722 MIB.addImm(0);
3723 },
3724 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3725 addZeroImm, // glc
3726 addZeroImm, // slc
3727 addZeroImm, // tfe
3728 addZeroImm, // dlc
3729 addZeroImm // swz
3730 }};
3731}
3732
3733InstructionSelector::ComplexRendererFns
3734AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
3735 Register VAddr;
3736 Register RSrcReg;
3737 Register SOffset;
3738 int64_t Offset = 0;
3739
3740 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3741 return {};
3742
3743 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3744 // pattern.
3745 return {{
3746 [=](MachineInstrBuilder &MIB) { // rsrc
3747 MIB.addReg(RSrcReg);
3748 },
3749 [=](MachineInstrBuilder &MIB) { // vaddr
3750 MIB.addReg(VAddr);
3751 },
3752 [=](MachineInstrBuilder &MIB) { // soffset
3753 if (SOffset)
3754 MIB.addReg(SOffset);
3755 else
3756 MIB.addImm(0);
3757 },
3758 [=](MachineInstrBuilder &MIB) { // offset
3759 MIB.addImm(Offset);
3760 },
3761 addZeroImm // slc
3762 }};
3763}
3764
3765InstructionSelector::ComplexRendererFns
3766AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
3767 Register RSrcReg;
3768 Register SOffset;
3769 int64_t Offset = 0;
3770
3771 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3772 return {};
3773
3774 return {{
3775 [=](MachineInstrBuilder &MIB) { // rsrc
3776 MIB.addReg(RSrcReg);
3777 },
3778 [=](MachineInstrBuilder &MIB) { // soffset
3779 if (SOffset)
3780 MIB.addReg(SOffset);
3781 else
3782 MIB.addImm(0);
3783 },
3784 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3785 addZeroImm // slc
3786 }};
3787}
3788
3789/// Get an immediate that must be 32 bits, treated as zero-extended.
3790static Optional<uint64_t> getConstantZext32Val(Register Reg,
3791 const MachineRegisterInfo &MRI) {
3792 // getConstantVRegVal sexts any values, so see if that matters.
3793 Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
3794 if (!OffsetVal || !isInt<32>(*OffsetVal))
3795 return None;
3796 return Lo_32(*OffsetVal);
3797}
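// Note that negative 32-bit values are accepted here: getConstantVRegVal
// returns e.g. -1 sign-extended, isInt<32> passes, and Lo_32 yields
// 0xffffffff.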
3798
3799InstructionSelector::ComplexRendererFns
3800AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
3801 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3802 if (!OffsetVal)
3803 return {};
3804
3805 Optional<int64_t> EncodedImm =
3806 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
3807 if (!EncodedImm)
3808 return {};
3809
3810 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
3811}
3812
3813InstructionSelector::ComplexRendererFns
3814AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
3815 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
3816
3817 Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3818 if (!OffsetVal)
3819 return {};
3820
3821 Optional<int64_t> EncodedImm
3822 = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
3823 if (!EncodedImm)
3824 return {};
3825
3826 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
3827}
3828
3829void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
3830 const MachineInstr &MI,
3831 int OpIdx) const {
3832 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3833 "Expected G_CONSTANT");
3834 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
3835}
3836
3837void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
3838 const MachineInstr &MI,
3839 int OpIdx) const {
3840 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3841 "Expected G_CONSTANT");
3842 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
3843}
3844
3845void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
3846 const MachineInstr &MI,
3847 int OpIdx) const {
3848 assert(OpIdx == -1);
3849
3850 const MachineOperand &Op = MI.getOperand(1);
3851 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
3852 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
3853 else {
3854 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
3855 MIB.addImm(Op.getCImm()->getSExtValue());
3856 }
3857}
3858
3859void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
3860 const MachineInstr &MI,
3861 int OpIdx) const {
3862 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3863 "Expected G_CONSTANT");
3864 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
3865}
3866
3867/// This only really exists to satisfy the DAG type-checking machinery, so it
3868/// is a no-op here.
3869void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
3870 const MachineInstr &MI,
3871 int OpIdx) const {
3872 MIB.addImm(MI.getOperand(OpIdx).getImm());
3873}
3874
3875void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
3876 const MachineInstr &MI,
3877 int OpIdx) const {
3878 assert(OpIdx >= 0 && "expected to match an immediate operand");
3879 MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
3880}
3881
3882void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
3883 const MachineInstr &MI,
3884 int OpIdx) const {
3885 assert(OpIdx >= 0 && "expected to match an immediate operand");
3886 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
3887}
3888
3889void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
3890 const MachineInstr &MI,
3891 int OpIdx) const {
3892 assert(OpIdx >= 0 && "expected to match an immediate operand");
3893 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
3894}
3895
3896void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
3897 const MachineInstr &MI,
3898 int OpIdx) const {
3899 assert(OpIdx >= 0 && "expected to match an immediate operand");
3900 MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
3901}
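// These four renderers unpack a combined cache policy immediate; e.g. a value
// of 0b1010 renders glc=0, slc=1, dlc=0, swz=1 via the bit extractions above.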
3902
3903bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
3904 return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
3905}
3906
3907bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
3908 return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
3909}
3910
3911bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
3912 return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
3913}
3914
3915bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
3916 return TII.isInlineConstant(Imm);
3917}