Bug Summary

File: /build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Warning: line 2204, column 15
Called C++ object pointer is uninitialized
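
For context, here is a minimal, hypothetical C++ sketch (not taken from SIInstrInfo.cpp) of the pattern behind this diagnostic: a member function is called through a pointer that is left unassigned on at least one path, which is the situation the analyzer reports at line 2204 of the annotated source.

    struct Widget {
      void draw() const {}
    };

    // Hypothetical example: when Ready is false, W is never assigned, so the
    // call below goes through an uninitialized object pointer.
    void example(bool Ready) {
      Widget *W;            // no initializer
      if (Ready)
        W = new Widget();   // only assigned on this path
      W->draw();            // analyzer warns: called C++ object pointer is uninitialized
      if (Ready)
        delete W;
    }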

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name SIInstrInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-15/lib/clang/15.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/llvm/lib/Target/AMDGPU -I include -I /build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-15/lib/clang/15.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/= -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o 
/tmp/scan-build-2022-03-11-015528-30204-1 -x c++ /build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

/build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "SIMachineFunctionInfo.h"
21#include "llvm/Analysis/ValueTracking.h"
22#include "llvm/CodeGen/LiveIntervals.h"
23#include "llvm/CodeGen/LiveVariables.h"
24#include "llvm/CodeGen/MachineDominators.h"
25#include "llvm/CodeGen/MachineScheduler.h"
26#include "llvm/CodeGen/RegisterScavenging.h"
27#include "llvm/CodeGen/ScheduleDAG.h"
28#include "llvm/IR/DiagnosticInfo.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include "llvm/MC/MCContext.h"
31#include "llvm/Support/CommandLine.h"
32#include "llvm/Target/TargetMachine.h"
33
34using namespace llvm;
35
36#define DEBUG_TYPE "si-instr-info"
37
38#define GET_INSTRINFO_CTOR_DTOR
39#include "AMDGPUGenInstrInfo.inc"
40
41namespace llvm {
42
43class AAResults;
44
45namespace AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50}
51}
52
53
54// Must be at least 4 to be able to branch over minimum unconditional branch
55// code. This is only for making it possible to write reasonably small tests for
56// long branches.
57static cl::opt<unsigned>
58BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
59 cl::desc("Restrict range of branch instructions (DEBUG)"));
60
61static cl::opt<bool> Fix16BitCopies(
62 "amdgpu-fix-16-bit-physreg-copies",
63 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
64 cl::init(true),
65 cl::ReallyHidden);
66
67SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
68 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
69 RI(ST), ST(ST) {
70 SchedModel.init(&ST);
71}
72
73//===----------------------------------------------------------------------===//
74// TargetInstrInfo callbacks
75//===----------------------------------------------------------------------===//
76
77static unsigned getNumOperandsNoGlue(SDNode *Node) {
78 unsigned N = Node->getNumOperands();
79 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
80 --N;
81 return N;
82}
83
84/// Returns true if both nodes have the same value for the given
85/// operand \p Op, or if both nodes do not have this operand.
86static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
87 unsigned Opc0 = N0->getMachineOpcode();
88 unsigned Opc1 = N1->getMachineOpcode();
89
90 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
91 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
92
93 if (Op0Idx == -1 && Op1Idx == -1)
94 return true;
95
96
97 if ((Op0Idx == -1 && Op1Idx != -1) ||
98 (Op1Idx == -1 && Op0Idx != -1))
99 return false;
100
101 // getNamedOperandIdx returns the index for the MachineInstr's operands,
102 // which includes the result as the first operand. We are indexing into the
103 // MachineSDNode's operands, so we need to skip the result operand to get
104 // the real index.
105 --Op0Idx;
106 --Op1Idx;
107
108 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
109}
110
111bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
112 AAResults *AA) const {
113 if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
114 // Normally VALU use of exec would block the rematerialization, but that
115 // is OK in this case to have an implicit exec read as all VALU do.
116 // We really want all of the generic logic for this except for this.
117
118 // Another potential implicit use is mode register. The core logic of
119 // the RA will not attempt rematerialization if mode is set anywhere
120 // in the function, otherwise it is safe since mode is not changed.
121
122 // There is difference to generic method which does not allow
123 // rematerialization if there are virtual register uses. We allow this,
124 // therefore this method includes SOP instructions as well.
125 return !MI.hasImplicitDef() &&
126 MI.getNumImplicitOperands() == MI.getDesc().getNumImplicitUses() &&
127 !MI.mayRaiseFPException();
128 }
129
130 return false;
131}
132
133// Returns true if the scalar result of a VALU instruction depends on exec.
134static bool resultDependsOnExec(const MachineInstr &MI) {
135 // Ignore comparisons which are only used masked with exec.
136 // This allows some hoisting/sinking of VALU comparisons.
137 if (MI.isCompare()) {
138 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
139 Register DstReg = MI.getOperand(0).getReg();
140 if (!DstReg.isVirtual())
141 return true;
142 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
143 switch (Use.getOpcode()) {
144 case AMDGPU::S_AND_SAVEEXEC_B32:
145 case AMDGPU::S_AND_SAVEEXEC_B64:
146 break;
147 case AMDGPU::S_AND_B32:
148 case AMDGPU::S_AND_B64:
149 if (!Use.readsRegister(AMDGPU::EXEC))
150 return true;
151 break;
152 default:
153 return true;
154 }
155 }
156 return false;
157 }
158
159 switch (MI.getOpcode()) {
160 default:
161 break;
162 case AMDGPU::V_READFIRSTLANE_B32:
163 return true;
164 }
165
166 return false;
167}
168
169bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
170 // Any implicit use of exec by VALU is not a real register read.
171 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
172 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
173}
174
175bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
176 int64_t &Offset0,
177 int64_t &Offset1) const {
178 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
179 return false;
180
181 unsigned Opc0 = Load0->getMachineOpcode();
182 unsigned Opc1 = Load1->getMachineOpcode();
183
184 // Make sure both are actually loads.
185 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
186 return false;
187
188 if (isDS(Opc0) && isDS(Opc1)) {
189
190 // FIXME: Handle this case:
191 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
192 return false;
193
194 // Check base reg.
195 if (Load0->getOperand(0) != Load1->getOperand(0))
196 return false;
197
198 // Skip read2 / write2 variants for simplicity.
199 // TODO: We should report true if the used offsets are adjacent (excluded
200 // st64 versions).
201 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
202 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
203 if (Offset0Idx == -1 || Offset1Idx == -1)
204 return false;
205
206 // XXX - be careful of dataless loads
207 // getNamedOperandIdx returns the index for MachineInstrs. Since they
208 // include the output in the operand list, but SDNodes don't, we need to
209 // subtract the index by one.
210 Offset0Idx -= get(Opc0).NumDefs;
211 Offset1Idx -= get(Opc1).NumDefs;
212 Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
213 Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
214 return true;
215 }
216
217 if (isSMRD(Opc0) && isSMRD(Opc1)) {
218 // Skip time and cache invalidation instructions.
219 if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
220 AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
221 return false;
222
223 assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
224
225 // Check base reg.
226 if (Load0->getOperand(0) != Load1->getOperand(0))
227 return false;
228
229 const ConstantSDNode *Load0Offset =
230 dyn_cast<ConstantSDNode>(Load0->getOperand(1));
231 const ConstantSDNode *Load1Offset =
232 dyn_cast<ConstantSDNode>(Load1->getOperand(1));
233
234 if (!Load0Offset || !Load1Offset)
235 return false;
236
237 Offset0 = Load0Offset->getZExtValue();
238 Offset1 = Load1Offset->getZExtValue();
239 return true;
240 }
241
242 // MUBUF and MTBUF can access the same addresses.
243 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
244
245 // MUBUF and MTBUF have vaddr at different indices.
246 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
247 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
248 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
249 return false;
250
251 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
252 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
253
254 if (OffIdx0 == -1 || OffIdx1 == -1)
255 return false;
256
257 // getNamedOperandIdx returns the index for MachineInstrs. Since they
258 // include the output in the operand list, but SDNodes don't, we need to
259 // subtract the index by one.
260 OffIdx0 -= get(Opc0).NumDefs;
261 OffIdx1 -= get(Opc1).NumDefs;
262
263 SDValue Off0 = Load0->getOperand(OffIdx0);
264 SDValue Off1 = Load1->getOperand(OffIdx1);
265
266 // The offset might be a FrameIndexSDNode.
267 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
268 return false;
269
270 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
271 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
272 return true;
273 }
274
275 return false;
276}
277
278static bool isStride64(unsigned Opc) {
279 switch (Opc) {
280 case AMDGPU::DS_READ2ST64_B32:
281 case AMDGPU::DS_READ2ST64_B64:
282 case AMDGPU::DS_WRITE2ST64_B32:
283 case AMDGPU::DS_WRITE2ST64_B64:
284 return true;
285 default:
286 return false;
287 }
288}
289
290bool SIInstrInfo::getMemOperandsWithOffsetWidth(
291 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
292 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
293 const TargetRegisterInfo *TRI) const {
294 if (!LdSt.mayLoadOrStore())
295 return false;
296
297 unsigned Opc = LdSt.getOpcode();
298 OffsetIsScalable = false;
299 const MachineOperand *BaseOp, *OffsetOp;
300 int DataOpIdx;
301
302 if (isDS(LdSt)) {
303 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
304 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
305 if (OffsetOp) {
306 // Normal, single offset LDS instruction.
307 if (!BaseOp) {
308 // DS_CONSUME/DS_APPEND use M0 for the base address.
309 // TODO: find the implicit use operand for M0 and use that as BaseOp?
310 return false;
311 }
312 BaseOps.push_back(BaseOp);
313 Offset = OffsetOp->getImm();
314 // Get appropriate operand, and compute width accordingly.
315 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
316 if (DataOpIdx == -1)
317 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
318 Width = getOpSize(LdSt, DataOpIdx);
319 } else {
320 // The 2 offset instructions use offset0 and offset1 instead. We can treat
321 // these as a load with a single offset if the 2 offsets are consecutive.
322 // We will use this for some partially aligned loads.
323 const MachineOperand *Offset0Op =
324 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
325 const MachineOperand *Offset1Op =
326 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
327
328 unsigned Offset0 = Offset0Op->getImm();
329 unsigned Offset1 = Offset1Op->getImm();
330 if (Offset0 + 1 != Offset1)
331 return false;
332
333 // Each of these offsets is in element sized units, so we need to convert
334 // to bytes of the individual reads.
335
336 unsigned EltSize;
337 if (LdSt.mayLoad())
338 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
339 else {
340 assert(LdSt.mayStore());
341 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
342 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
343 }
344
345 if (isStride64(Opc))
346 EltSize *= 64;
347
348 BaseOps.push_back(BaseOp);
349 Offset = EltSize * Offset0;
350 // Get appropriate operand(s), and compute width accordingly.
351 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
352 if (DataOpIdx == -1) {
353 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
354 Width = getOpSize(LdSt, DataOpIdx);
355 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
356 Width += getOpSize(LdSt, DataOpIdx);
357 } else {
358 Width = getOpSize(LdSt, DataOpIdx);
359 }
360 }
361 return true;
362 }
363
364 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
365 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
366 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
367 return false;
368 BaseOps.push_back(RSrc);
369 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
370 if (BaseOp && !BaseOp->isFI())
371 BaseOps.push_back(BaseOp);
372 const MachineOperand *OffsetImm =
373 getNamedOperand(LdSt, AMDGPU::OpName::offset);
374 Offset = OffsetImm->getImm();
375 const MachineOperand *SOffset =
376 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
377 if (SOffset) {
378 if (SOffset->isReg())
379 BaseOps.push_back(SOffset);
380 else
381 Offset += SOffset->getImm();
382 }
383 // Get appropriate operand, and compute width accordingly.
384 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
385 if (DataOpIdx == -1)
386 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
387 Width = getOpSize(LdSt, DataOpIdx);
388 return true;
389 }
390
391 if (isMIMG(LdSt)) {
392 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
393 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
394 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
395 if (VAddr0Idx >= 0) {
396 // GFX10 possible NSA encoding.
397 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
398 BaseOps.push_back(&LdSt.getOperand(I));
399 } else {
400 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
401 }
402 Offset = 0;
403 // Get appropriate operand, and compute width accordingly.
404 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
405 Width = getOpSize(LdSt, DataOpIdx);
406 return true;
407 }
408
409 if (isSMRD(LdSt)) {
410 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
411 if (!BaseOp) // e.g. S_MEMTIME
412 return false;
413 BaseOps.push_back(BaseOp);
414 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
415 Offset = OffsetOp ? OffsetOp->getImm() : 0;
416 // Get appropriate operand, and compute width accordingly.
417 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
418 Width = getOpSize(LdSt, DataOpIdx);
419 return true;
420 }
421
422 if (isFLAT(LdSt)) {
423 // Instructions have either vaddr or saddr or both or none.
424 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
425 if (BaseOp)
426 BaseOps.push_back(BaseOp);
427 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
428 if (BaseOp)
429 BaseOps.push_back(BaseOp);
430 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
431 // Get appropriate operand, and compute width accordingly.
432 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
433 if (DataOpIdx == -1)
434 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
435 Width = getOpSize(LdSt, DataOpIdx);
436 return true;
437 }
438
439 return false;
440}
441
442static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
443 ArrayRef<const MachineOperand *> BaseOps1,
444 const MachineInstr &MI2,
445 ArrayRef<const MachineOperand *> BaseOps2) {
446 // Only examine the first "base" operand of each instruction, on the
447 // assumption that it represents the real base address of the memory access.
448 // Other operands are typically offsets or indices from this base address.
449 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
450 return true;
451
452 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
453 return false;
454
455 auto MO1 = *MI1.memoperands_begin();
456 auto MO2 = *MI2.memoperands_begin();
457 if (MO1->getAddrSpace() != MO2->getAddrSpace())
458 return false;
459
460 auto Base1 = MO1->getValue();
461 auto Base2 = MO2->getValue();
462 if (!Base1 || !Base2)
463 return false;
464 Base1 = getUnderlyingObject(Base1);
465 Base2 = getUnderlyingObject(Base2);
466
467 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
468 return false;
469
470 return Base1 == Base2;
471}
472
473bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
474 ArrayRef<const MachineOperand *> BaseOps2,
475 unsigned NumLoads,
476 unsigned NumBytes) const {
477 // If the mem ops (to be clustered) do not have the same base ptr, then they
478 // should not be clustered
479 if (!BaseOps1.empty() && !BaseOps2.empty()) {
480 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
481 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
482 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
483 return false;
484 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
485 // If only one base op is empty, they do not have the same base ptr
486 return false;
487 }
488
489 // In order to avoid register pressure, on an average, the number of DWORDS
490 // loaded together by all clustered mem ops should not exceed 8. This is an
491 // empirical value based on certain observations and performance related
492 // experiments.
493 // The good thing about this heuristic is - it avoids clustering of too many
494 // sub-word loads, and also avoids clustering of wide loads. Below is the
495 // brief summary of how the heuristic behaves for various `LoadSize`.
496 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
497 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
498 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
499 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
500 // (5) LoadSize >= 17: do not cluster
501 const unsigned LoadSize = NumBytes / NumLoads;
502 const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
503 return NumDWORDs <= 8;
504}
505
506// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
507// the first 16 loads will be interleaved with the stores, and the next 16 will
508// be clustered as expected. It should really split into 2 16 store batches.
509//
510// Loads are clustered until this returns false, rather than trying to schedule
511// groups of stores. This also means we have to deal with saying different
512// address space loads should be clustered, and ones which might cause bank
513// conflicts.
514//
515// This might be deprecated so it might not be worth that much effort to fix.
516bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
517 int64_t Offset0, int64_t Offset1,
518 unsigned NumLoads) const {
519 assert(Offset1 > Offset0 &&
520 "Second offset should be larger than first offset!");
521 // If we have less than 16 loads in a row, and the offsets are within 64
522 // bytes, then schedule together.
523
524 // A cacheline is 64 bytes (for global memory).
525 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
526}
527
528static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
529 MachineBasicBlock::iterator MI,
530 const DebugLoc &DL, MCRegister DestReg,
531 MCRegister SrcReg, bool KillSrc,
532 const char *Msg = "illegal SGPR to VGPR copy") {
533 MachineFunction *MF = MBB.getParent();
534 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
535 LLVMContext &C = MF->getFunction().getContext();
536 C.diagnose(IllegalCopy);
537
538 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
539 .addReg(SrcReg, getKillRegState(KillSrc));
540}
541
542/// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible
543/// to directly copy, so an intermediate VGPR needs to be used.
544static void indirectCopyToAGPR(const SIInstrInfo &TII,
545 MachineBasicBlock &MBB,
546 MachineBasicBlock::iterator MI,
547 const DebugLoc &DL, MCRegister DestReg,
548 MCRegister SrcReg, bool KillSrc,
549 RegScavenger &RS,
550 Register ImpDefSuperReg = Register(),
551 Register ImpUseSuperReg = Register()) {
552 const SIRegisterInfo &RI = TII.getRegisterInfo();
553
554 assert(AMDGPU::SReg_32RegClass.contains(SrcReg) ||
555 AMDGPU::AGPR_32RegClass.contains(SrcReg));
556
557 // First try to find defining accvgpr_write to avoid temporary registers.
558 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
559 --Def;
560 if (!Def->definesRegister(SrcReg, &RI))
561 continue;
562 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
563 break;
564
565 MachineOperand &DefOp = Def->getOperand(1);
566 assert(DefOp.isReg() || DefOp.isImm());
567
568 if (DefOp.isReg()) {
569 // Check that register source operand if not clobbered before MI.
570 // Immediate operands are always safe to propagate.
571 bool SafeToPropagate = true;
572 for (auto I = Def; I != MI && SafeToPropagate; ++I)
573 if (I->modifiesRegister(DefOp.getReg(), &RI))
574 SafeToPropagate = false;
575
576 if (!SafeToPropagate)
577 break;
578
579 DefOp.setIsKill(false);
580 }
581
582 MachineInstrBuilder Builder =
583 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
584 .add(DefOp);
585 if (ImpDefSuperReg)
586 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
587
588 if (ImpUseSuperReg) {
589 Builder.addReg(ImpUseSuperReg,
590 getKillRegState(KillSrc) | RegState::Implicit);
591 }
592
593 return;
594 }
595
596 RS.enterBasicBlock(MBB);
597 RS.forward(MI);
598
599 // Ideally we want to have three registers for a long reg_sequence copy
600 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
601 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
602 *MBB.getParent());
603
604 // Registers in the sequence are allocated contiguously so we can just
605 // use register number to pick one of three round-robin temps.
606 unsigned RegNo = DestReg % 3;
607 Register Tmp;
608 if (!TII.getSubtarget().hasGFX90AInsts()) {
609 Tmp = AMDGPU::VGPR32;
610 assert(MBB.getParent()->getRegInfo().isReserved(AMDGPU::VGPR32));
611
612 // Only loop through if there are any free registers left, otherwise
613 // scavenger may report a fatal error without emergency spill slot
614 // or spill with the slot.
615 while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
616 Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
617 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
618 break;
619 Tmp = Tmp2;
620 RS.setRegUsed(Tmp);
621 }
622 } else {
623 Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
624 RS.setRegUsed(Tmp);
625 }
626
627 // Insert copy to temporary VGPR.
628 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
629 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
630 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
631 } else {
632 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
633 }
634
635 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
636 .addReg(SrcReg, getKillRegState(KillSrc));
637 if (ImpUseSuperReg) {
638 UseBuilder.addReg(ImpUseSuperReg,
639 getKillRegState(KillSrc) | RegState::Implicit);
640 }
641
642 MachineInstrBuilder DefBuilder
643 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
644 .addReg(Tmp, RegState::Kill);
645
646 if (ImpDefSuperReg)
647 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
648}
649
650static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
651 MachineBasicBlock::iterator MI, const DebugLoc &DL,
652 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
653 const TargetRegisterClass *RC, bool Forward) {
654 const SIRegisterInfo &RI = TII.getRegisterInfo();
655 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
656 MachineBasicBlock::iterator I = MI;
657 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
658
659 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
660 int16_t SubIdx = BaseIndices[Idx];
661 Register Reg = RI.getSubReg(DestReg, SubIdx);
662 unsigned Opcode = AMDGPU::S_MOV_B32;
663
664 // Is SGPR aligned? If so try to combine with next.
665 Register Src = RI.getSubReg(SrcReg, SubIdx);
666 bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0;
667 bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0;
668 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
669 // Can use SGPR64 copy
670 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
671 SubIdx = RI.getSubRegFromChannel(Channel, 2);
672 Opcode = AMDGPU::S_MOV_B64;
673 Idx++;
674 }
675
676 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx))
677 .addReg(RI.getSubReg(SrcReg, SubIdx))
678 .addReg(SrcReg, RegState::Implicit);
679
680 if (!FirstMI)
681 FirstMI = LastMI;
682
683 if (!Forward)
684 I--;
685 }
686
687 assert(FirstMI && LastMI);
688 if (!Forward)
689 std::swap(FirstMI, LastMI);
690
691 FirstMI->addOperand(
692 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
693
694 if (KillSrc)
695 LastMI->addRegisterKilled(SrcReg, &RI);
696}
697
698void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
699 MachineBasicBlock::iterator MI,
700 const DebugLoc &DL, MCRegister DestReg,
701 MCRegister SrcReg, bool KillSrc) const {
702 const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
703
704 // FIXME: This is hack to resolve copies between 16 bit and 32 bit
705 // registers until all patterns are fixed.
706 if (Fix16BitCopies &&
707 ((RI.getRegSizeInBits(*RC) == 16) ^
708 (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) {
709 MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
710 MCRegister Super = RI.get32BitRegister(RegToFix);
711 assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
712 RegToFix = Super;
713
714 if (DestReg == SrcReg) {
715 // Insert empty bundle since ExpandPostRA expects an instruction here.
716 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
717 return;
718 }
719
720 RC = RI.getPhysRegClass(DestReg);
721 }
722
723 if (RC == &AMDGPU::VGPR_32RegClass) {
724 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
725 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
726 AMDGPU::AGPR_32RegClass.contains(SrcReg));
727 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
728 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
729 BuildMI(MBB, MI, DL, get(Opc), DestReg)
730 .addReg(SrcReg, getKillRegState(KillSrc));
731 return;
732 }
733
734 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
735 RC == &AMDGPU::SReg_32RegClass) {
736 if (SrcReg == AMDGPU::SCC) {
737 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
738 .addImm(1)
739 .addImm(0);
740 return;
741 }
742
743 if (DestReg == AMDGPU::VCC_LO) {
744 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
745 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
746 .addReg(SrcReg, getKillRegState(KillSrc));
747 } else {
748 // FIXME: Hack until VReg_1 removed.
749 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
750 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
751 .addImm(0)
752 .addReg(SrcReg, getKillRegState(KillSrc));
753 }
754
755 return;
756 }
757
758 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
759 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
760 return;
761 }
762
763 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
764 .addReg(SrcReg, getKillRegState(KillSrc));
765 return;
766 }
767
768 if (RC == &AMDGPU::SReg_64RegClass) {
769 if (SrcReg == AMDGPU::SCC) {
770 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
771 .addImm(1)
772 .addImm(0);
773 return;
774 }
775
776 if (DestReg == AMDGPU::VCC) {
777 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
778 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
779 .addReg(SrcReg, getKillRegState(KillSrc));
780 } else {
781 // FIXME: Hack until VReg_1 removed.
782 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
783 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
784 .addImm(0)
785 .addReg(SrcReg, getKillRegState(KillSrc));
786 }
787
788 return;
789 }
790
791 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
792 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
793 return;
794 }
795
796 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
797 .addReg(SrcReg, getKillRegState(KillSrc));
798 return;
799 }
800
801 if (DestReg == AMDGPU::SCC) {
802 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
803 // but SelectionDAG emits such copies for i1 sources.
804 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
805 // This copy can only be produced by patterns
806 // with explicit SCC, which are known to be enabled
807 // only for subtargets with S_CMP_LG_U64 present.
808 assert(ST.hasScalarCompareEq64());
809 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
810 .addReg(SrcReg, getKillRegState(KillSrc))
811 .addImm(0);
812 } else {
813 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
814 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
815 .addReg(SrcReg, getKillRegState(KillSrc))
816 .addImm(0);
817 }
818
819 return;
820 }
821
822 if (RC == &AMDGPU::AGPR_32RegClass) {
823 if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
824 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
825 .addReg(SrcReg, getKillRegState(KillSrc));
826 return;
827 }
828
829 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
830 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
831 .addReg(SrcReg, getKillRegState(KillSrc));
832 return;
833 }
834
835 // FIXME: Pass should maintain scavenger to avoid scan through the block on
836 // every AGPR spill.
837 RegScavenger RS;
838 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS);
839 return;
840 }
841
842 const unsigned Size = RI.getRegSizeInBits(*RC);
843 if (Size == 16) {
844 assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
845 AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
846 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
847 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
848
849 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
850 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
851 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
852 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
853 bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
854 AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
855 AMDGPU::AGPR_LO16RegClass.contains(DestReg);
856 bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
857 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
858 AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
859 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
860 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
861
862 if (IsSGPRDst) {
863 if (!IsSGPRSrc) {
864 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
865 return;
866 }
867
868 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
869 .addReg(NewSrcReg, getKillRegState(KillSrc));
870 return;
871 }
872
873 if (IsAGPRDst || IsAGPRSrc) {
874 if (!DstLow || !SrcLow) {
875 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
876 "Cannot use hi16 subreg with an AGPR!");
877 }
878
879 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
880 return;
881 }
882
883 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
884 if (!DstLow || !SrcLow) {
885 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
886 "Cannot use hi16 subreg on VI!");
887 }
888
889 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
890 .addReg(NewSrcReg, getKillRegState(KillSrc));
891 return;
892 }
893
894 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
895 .addImm(0) // src0_modifiers
896 .addReg(NewSrcReg)
897 .addImm(0) // clamp
898 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
899 : AMDGPU::SDWA::SdwaSel::WORD_1)
900 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
901 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
902 : AMDGPU::SDWA::SdwaSel::WORD_1)
903 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
904 // First implicit operand is $exec.
905 MIB->tieOperands(0, MIB->getNumOperands() - 1);
906 return;
907 }
908
909 const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg);
910 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
911 if (ST.hasMovB64()) {
912 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
913 .addReg(SrcReg, getKillRegState(KillSrc));
914 return;
915 }
916 if (ST.hasPackedFP32Ops()) {
917 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
918 .addImm(SISrcMods::OP_SEL_1)
919 .addReg(SrcReg)
920 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
921 .addReg(SrcReg)
922 .addImm(0) // op_sel_lo
923 .addImm(0) // op_sel_hi
924 .addImm(0) // neg_lo
925 .addImm(0) // neg_hi
926 .addImm(0) // clamp
927 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
928 return;
929 }
930 }
931
932 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
933 if (RI.isSGPRClass(RC)) {
934 if (!RI.isSGPRClass(SrcRC)) {
935 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
936 return;
937 }
938 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
939 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
940 Forward);
941 return;
942 }
943
944 unsigned EltSize = 4;
945 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
946 if (RI.isAGPRClass(RC)) {
947 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
948 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
949 else if (RI.hasVGPRs(SrcRC))
950 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
951 else
952 Opcode = AMDGPU::INSTRUCTION_LIST_END;
953 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
954 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
955 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
956 (RI.isProperlyAlignedRC(*RC) &&
957 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
958 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
959 if (ST.hasMovB64()) {
960 Opcode = AMDGPU::V_MOV_B64_e32;
961 EltSize = 8;
962 } else if (ST.hasPackedFP32Ops()) {
963 Opcode = AMDGPU::V_PK_MOV_B32;
964 EltSize = 8;
965 }
966 }
967
968 // For the cases where we need an intermediate instruction/temporary register
969 // (destination is an AGPR), we need a scavenger.
970 //
971 // FIXME: The pass should maintain this for us so we don't have to re-scan the
972 // whole block for every handled copy.
973 std::unique_ptr<RegScavenger> RS;
974 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
975 RS.reset(new RegScavenger());
976
977 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
978
979 // If there is an overlap, we can't kill the super-register on the last
980 // instruction, since it will also kill the components made live by this def.
981 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
982
983 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
984 unsigned SubIdx;
985 if (Forward)
986 SubIdx = SubIndices[Idx];
987 else
988 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
989
990 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
991
992 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
993 Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register();
994 Register ImpUseSuper = SrcReg;
995 indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
996 RI.getSubReg(SrcReg, SubIdx), UseKill, *RS,
997 ImpDefSuper, ImpUseSuper);
998 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
999 Register DstSubReg = RI.getSubReg(DestReg, SubIdx);
1000 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1001 MachineInstrBuilder MIB =
1002 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg)
1003 .addImm(SISrcMods::OP_SEL_1)
1004 .addReg(SrcSubReg)
1005 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1006 .addReg(SrcSubReg)
1007 .addImm(0) // op_sel_lo
1008 .addImm(0) // op_sel_hi
1009 .addImm(0) // neg_lo
1010 .addImm(0) // neg_hi
1011 .addImm(0) // clamp
1012 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1013 if (Idx == 0)
1014 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1015 } else {
1016 MachineInstrBuilder Builder =
1017 BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
1018 .addReg(RI.getSubReg(SrcReg, SubIdx));
1019 if (Idx == 0)
1020 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1021
1022 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1023 }
1024 }
1025}
1026
1027int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1028 int NewOpc;
1029
1030 // Try to map original to commuted opcode
1031 NewOpc = AMDGPU::getCommuteRev(Opcode);
1032 if (NewOpc != -1)
1033 // Check if the commuted (REV) opcode exists on the target.
1034 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1035
1036 // Try to map commuted to original opcode
1037 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1038 if (NewOpc != -1)
1039 // Check if the original (non-REV) opcode exists on the target.
1040 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1041
1042 return Opcode;
1043}
1044
1045void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1046 MachineBasicBlock::iterator MI,
1047 const DebugLoc &DL, unsigned DestReg,
1048 int64_t Value) const {
1049 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1050 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1051 if (RegClass == &AMDGPU::SReg_32RegClass ||
1052 RegClass == &AMDGPU::SGPR_32RegClass ||
1053 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1054 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1055 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1056 .addImm(Value);
1057 return;
1058 }
1059
1060 if (RegClass == &AMDGPU::SReg_64RegClass ||
1061 RegClass == &AMDGPU::SGPR_64RegClass ||
1062 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1063 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1064 .addImm(Value);
1065 return;
1066 }
1067
1068 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1069 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1070 .addImm(Value);
1071 return;
1072 }
1073 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1074 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1075 .addImm(Value);
1076 return;
1077 }
1078
1079 unsigned EltSize = 4;
1080 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1081 if (RI.isSGPRClass(RegClass)) {
1082 if (RI.getRegSizeInBits(*RegClass) > 32) {
1083 Opcode = AMDGPU::S_MOV_B64;
1084 EltSize = 8;
1085 } else {
1086 Opcode = AMDGPU::S_MOV_B32;
1087 EltSize = 4;
1088 }
1089 }
1090
1091 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1092 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1093 int64_t IdxValue = Idx == 0 ? Value : 0;
1094
1095 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1096 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1097 Builder.addImm(IdxValue);
1098 }
1099}
1100
1101const TargetRegisterClass *
1102SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1103 return &AMDGPU::VGPR_32RegClass;
1104}
1105
1106void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1107 MachineBasicBlock::iterator I,
1108 const DebugLoc &DL, Register DstReg,
1109 ArrayRef<MachineOperand> Cond,
1110 Register TrueReg,
1111 Register FalseReg) const {
1112 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1113 const TargetRegisterClass *BoolXExecRC =
1114 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1115 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1116 "Not a VGPR32 reg");
1117
1118 if (Cond.size() == 1) {
1119 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1120 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1121 .add(Cond[0]);
1122 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1123 .addImm(0)
1124 .addReg(FalseReg)
1125 .addImm(0)
1126 .addReg(TrueReg)
1127 .addReg(SReg);
1128 } else if (Cond.size() == 2) {
1129 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1130 switch (Cond[0].getImm()) {
1131 case SIInstrInfo::SCC_TRUE: {
1132 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1133 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1134 : AMDGPU::S_CSELECT_B64), SReg)
1135 .addImm(1)
1136 .addImm(0);
1137 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1138 .addImm(0)
1139 .addReg(FalseReg)
1140 .addImm(0)
1141 .addReg(TrueReg)
1142 .addReg(SReg);
1143 break;
1144 }
1145 case SIInstrInfo::SCC_FALSE: {
1146 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1147 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1148 : AMDGPU::S_CSELECT_B64), SReg)
1149 .addImm(0)
1150 .addImm(1);
1151 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1152 .addImm(0)
1153 .addReg(FalseReg)
1154 .addImm(0)
1155 .addReg(TrueReg)
1156 .addReg(SReg);
1157 break;
1158 }
1159 case SIInstrInfo::VCCNZ: {
1160 MachineOperand RegOp = Cond[1];
1161 RegOp.setImplicit(false);
1162 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1163 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1164 .add(RegOp);
1165 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1166 .addImm(0)
1167 .addReg(FalseReg)
1168 .addImm(0)
1169 .addReg(TrueReg)
1170 .addReg(SReg);
1171 break;
1172 }
1173 case SIInstrInfo::VCCZ: {
1174 MachineOperand RegOp = Cond[1];
1175 RegOp.setImplicit(false);
1176 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1177 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1178 .add(RegOp);
1179 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1180 .addImm(0)
1181 .addReg(TrueReg)
1182 .addImm(0)
1183 .addReg(FalseReg)
1184 .addReg(SReg);
1185 break;
1186 }
1187 case SIInstrInfo::EXECNZ: {
1188 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1189 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1190 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1191 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1192 .addImm(0);
1193 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1194 : AMDGPU::S_CSELECT_B64), SReg)
1195 .addImm(1)
1196 .addImm(0);
1197 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1198 .addImm(0)
1199 .addReg(FalseReg)
1200 .addImm(0)
1201 .addReg(TrueReg)
1202 .addReg(SReg);
1203 break;
1204 }
1205 case SIInstrInfo::EXECZ: {
1206 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1207 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1208 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1209 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1210 .addImm(0);
1211 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1212 : AMDGPU::S_CSELECT_B64), SReg)
1213 .addImm(0)
1214 .addImm(1);
1215 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1216 .addImm(0)
1217 .addReg(FalseReg)
1218 .addImm(0)
1219 .addReg(TrueReg)
1220 .addReg(SReg);
1221 llvm_unreachable("Unhandled branch predicate EXECZ");
1222 break;
1223 }
1224 default:
1225 llvm_unreachable("invalid branch predicate");
1226 }
1227 } else {
1228 llvm_unreachable("Can only handle Cond size 1 or 2");
1229 }
1230}
1231
1232Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1233 MachineBasicBlock::iterator I,
1234 const DebugLoc &DL,
1235 Register SrcReg, int Value) const {
1236 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1237 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1238 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1239 .addImm(Value)
1240 .addReg(SrcReg);
1241
1242 return Reg;
1243}
1244
1245Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1246 MachineBasicBlock::iterator I,
1247 const DebugLoc &DL,
1248 Register SrcReg, int Value) const {
1249 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1250 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1251 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1252 .addImm(Value)
1253 .addReg(SrcReg);
1254
1255 return Reg;
1256}
1257
1258unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1259
1260 if (RI.isAGPRClass(DstRC))
1261 return AMDGPU::COPY;
1262 if (RI.getRegSizeInBits(*DstRC) == 32) {
1263 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1264 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
1265 return AMDGPU::S_MOV_B64;
1266 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
1267 return AMDGPU::V_MOV_B64_PSEUDO;
1268 }
1269 return AMDGPU::COPY;
1270}
1271
1272const MCInstrDesc &
1273SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1274 bool IsIndirectSrc) const {
1275 if (IsIndirectSrc) {
1276 if (VecSize <= 32) // 4 bytes
1277 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1278 if (VecSize <= 64) // 8 bytes
1279 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1280 if (VecSize <= 96) // 12 bytes
1281 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1282 if (VecSize <= 128) // 16 bytes
1283 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1284 if (VecSize <= 160) // 20 bytes
1285 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1286 if (VecSize <= 256) // 32 bytes
1287 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1288 if (VecSize <= 512) // 64 bytes
1289 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1290 if (VecSize <= 1024) // 128 bytes
1291 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1292
1293 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegReadGPRIDX pseudos"
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1293)
;
1294 }
1295
1296 if (VecSize <= 32) // 4 bytes
1297 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1298 if (VecSize <= 64) // 8 bytes
1299 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1300 if (VecSize <= 96) // 12 bytes
1301 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1302 if (VecSize <= 128) // 16 bytes
1303 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1304 if (VecSize <= 160) // 20 bytes
1305 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1306 if (VecSize <= 256) // 32 bytes
1307 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1308 if (VecSize <= 512) // 64 bytes
1309 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1310 if (VecSize <= 1024) // 128 bytes
1311 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1312
1313 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWriteGPRIDX pseudos"
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1313)
;
1314}
1315
1316static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1317 if (VecSize <= 32) // 4 bytes
1318 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1319 if (VecSize <= 64) // 8 bytes
1320 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1321 if (VecSize <= 96) // 12 bytes
1322 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1323 if (VecSize <= 128) // 16 bytes
1324 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1325 if (VecSize <= 160) // 20 bytes
1326 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1327 if (VecSize <= 256) // 32 bytes
1328 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1329 if (VecSize <= 512) // 64 bytes
1330 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1331 if (VecSize <= 1024) // 128 bytes
1332 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1333
1334 llvm_unreachable("unsupported size for IndirectRegWrite pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWrite pseudos"
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1334)
;
1335}
1336
1337static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1338 if (VecSize <= 32) // 4 bytes
1339 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1340 if (VecSize <= 64) // 8 bytes
1341 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1342 if (VecSize <= 96) // 12 bytes
1343 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1344 if (VecSize <= 128) // 16 bytes
1345 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1346 if (VecSize <= 160) // 20 bytes
1347 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1348 if (VecSize <= 256) // 32 bytes
1349 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1350 if (VecSize <= 512) // 64 bytes
1351 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1352 if (VecSize <= 1024) // 128 bytes
1353 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1354
1355 llvm_unreachable("unsupported size for IndirectRegWrite pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWrite pseudos"
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1355)
;
1356}
1357
1358static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1359 if (VecSize <= 64) // 8 bytes
1360 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1361 if (VecSize <= 128) // 16 bytes
1362 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1363 if (VecSize <= 256) // 32 bytes
1364 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1365 if (VecSize <= 512) // 64 bytes
1366 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1367 if (VecSize <= 1024) // 128 bytes
1368 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1369
1370 llvm_unreachable("unsupported size for IndirectRegWrite pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWrite pseudos"
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1370)
;
1371}
1372
1373const MCInstrDesc &
1374SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1375 bool IsSGPR) const {
1376 if (IsSGPR) {
1377 switch (EltSize) {
1378 case 32:
1379 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1380 case 64:
1381 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1382 default:
1383 llvm_unreachable("invalid reg indexing elt size")::llvm::llvm_unreachable_internal("invalid reg indexing elt size"
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1383)
;
1384 }
1385 }
1386
1387 assert(EltSize == 32 && "invalid reg indexing elt size");
1388 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1389}
1390
1391static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1392 switch (Size) {
1393 case 4:
1394 return AMDGPU::SI_SPILL_S32_SAVE;
1395 case 8:
1396 return AMDGPU::SI_SPILL_S64_SAVE;
1397 case 12:
1398 return AMDGPU::SI_SPILL_S96_SAVE;
1399 case 16:
1400 return AMDGPU::SI_SPILL_S128_SAVE;
1401 case 20:
1402 return AMDGPU::SI_SPILL_S160_SAVE;
1403 case 24:
1404 return AMDGPU::SI_SPILL_S192_SAVE;
1405 case 28:
1406 return AMDGPU::SI_SPILL_S224_SAVE;
1407 case 32:
1408 return AMDGPU::SI_SPILL_S256_SAVE;
1409 case 64:
1410 return AMDGPU::SI_SPILL_S512_SAVE;
1411 case 128:
1412 return AMDGPU::SI_SPILL_S1024_SAVE;
1413 default:
1414 llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp"
, 1414)
;
1415 }
1416}
1417
1418static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1419 switch (Size) {
1420 case 4:
1421 return AMDGPU::SI_SPILL_V32_SAVE;
1422 case 8:
1423 return AMDGPU::SI_SPILL_V64_SAVE;
1424 case 12:
1425 return AMDGPU::SI_SPILL_V96_SAVE;
1426 case 16:
1427 return AMDGPU::SI_SPILL_V128_SAVE;
1428 case 20:
1429 return AMDGPU::SI_SPILL_V160_SAVE;
1430 case 24:
1431 return AMDGPU::SI_SPILL_V192_SAVE;
1432 case 28:
1433 return AMDGPU::SI_SPILL_V224_SAVE;
1434 case 32:
1435 return AMDGPU::SI_SPILL_V256_SAVE;
1436 case 64:
1437 return AMDGPU::SI_SPILL_V512_SAVE;
1438 case 128:
1439 return AMDGPU::SI_SPILL_V1024_SAVE;
1440 default:
1441 llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp"
, 1441)
;
1442 }
1443}
1444
1445static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1446 switch (Size) {
1447 case 4:
1448 return AMDGPU::SI_SPILL_A32_SAVE;
1449 case 8:
1450 return AMDGPU::SI_SPILL_A64_SAVE;
1451 case 12:
1452 return AMDGPU::SI_SPILL_A96_SAVE;
1453 case 16:
1454 return AMDGPU::SI_SPILL_A128_SAVE;
1455 case 20:
1456 return AMDGPU::SI_SPILL_A160_SAVE;
1457 case 24:
1458 return AMDGPU::SI_SPILL_A192_SAVE;
1459 case 28:
1460 return AMDGPU::SI_SPILL_A224_SAVE;
1461 case 32:
1462 return AMDGPU::SI_SPILL_A256_SAVE;
1463 case 64:
1464 return AMDGPU::SI_SPILL_A512_SAVE;
1465 case 128:
1466 return AMDGPU::SI_SPILL_A1024_SAVE;
1467 default:
1468 llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp"
, 1468)
;
1469 }
1470}
1471
1472static unsigned getAVSpillSaveOpcode(unsigned Size) {
1473 switch (Size) {
1474 case 4:
1475 return AMDGPU::SI_SPILL_AV32_SAVE;
1476 case 8:
1477 return AMDGPU::SI_SPILL_AV64_SAVE;
1478 case 12:
1479 return AMDGPU::SI_SPILL_AV96_SAVE;
1480 case 16:
1481 return AMDGPU::SI_SPILL_AV128_SAVE;
1482 case 20:
1483 return AMDGPU::SI_SPILL_AV160_SAVE;
1484 case 24:
1485 return AMDGPU::SI_SPILL_AV192_SAVE;
1486 case 28:
1487 return AMDGPU::SI_SPILL_AV224_SAVE;
1488 case 32:
1489 return AMDGPU::SI_SPILL_AV256_SAVE;
1490 case 64:
1491 return AMDGPU::SI_SPILL_AV512_SAVE;
1492 case 128:
1493 return AMDGPU::SI_SPILL_AV1024_SAVE;
1494 default:
1495 llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp"
, 1495)
;
1496 }
1497}
1498
1499void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
1500 MachineBasicBlock::iterator MI,
1501 Register SrcReg, bool isKill,
1502 int FrameIndex,
1503 const TargetRegisterClass *RC,
1504 const TargetRegisterInfo *TRI) const {
1505 MachineFunction *MF = MBB.getParent();
1506 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1507 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1508 const DebugLoc &DL = MBB.findDebugLoc(MI);
1509
1510 MachinePointerInfo PtrInfo
1511 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1512 MachineMemOperand *MMO = MF->getMachineMemOperand(
1513 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1514 FrameInfo.getObjectAlign(FrameIndex));
1515 unsigned SpillSize = TRI->getSpillSize(*RC);
1516
1517 MachineRegisterInfo &MRI = MF->getRegInfo();
1518 if (RI.isSGPRClass(RC)) {
1519 MFI->setHasSpilledSGPRs();
1520 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1521 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1522 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1523
1524 // We are only allowed to create one new instruction when spilling
1525 // registers, so we need to use pseudo instruction for spilling SGPRs.
1526 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1527
1528 // The SGPR spill/restore instructions only work on number sgprs, so we need
1529 // to make sure we are using the correct register class.
1530 if (SrcReg.isVirtual() && SpillSize == 4) {
1531 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1532 }
1533
1534 BuildMI(MBB, MI, DL, OpDesc)
1535 .addReg(SrcReg, getKillRegState(isKill)) // data
1536 .addFrameIndex(FrameIndex) // addr
1537 .addMemOperand(MMO)
1538 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1539
1540 if (RI.spillSGPRToVGPR())
1541 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1542 return;
1543 }
1544
1545 unsigned Opcode = RI.isVectorSuperClass(RC) ? getAVSpillSaveOpcode(SpillSize)
1546 : RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize)
1547 : getVGPRSpillSaveOpcode(SpillSize);
1548 MFI->setHasSpilledVGPRs();
1549
1550 BuildMI(MBB, MI, DL, get(Opcode))
1551 .addReg(SrcReg, getKillRegState(isKill)) // data
1552 .addFrameIndex(FrameIndex) // addr
1553 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1554 .addImm(0) // offset
1555 .addMemOperand(MMO);
1556}
1557
1558static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1559 switch (Size) {
1560 case 4:
1561 return AMDGPU::SI_SPILL_S32_RESTORE;
1562 case 8:
1563 return AMDGPU::SI_SPILL_S64_RESTORE;
1564 case 12:
1565 return AMDGPU::SI_SPILL_S96_RESTORE;
1566 case 16:
1567 return AMDGPU::SI_SPILL_S128_RESTORE;
1568 case 20:
1569 return AMDGPU::SI_SPILL_S160_RESTORE;
1570 case 24:
1571 return AMDGPU::SI_SPILL_S192_RESTORE;
1572 case 28:
1573 return AMDGPU::SI_SPILL_S224_RESTORE;
1574 case 32:
1575 return AMDGPU::SI_SPILL_S256_RESTORE;
1576 case 64:
1577 return AMDGPU::SI_SPILL_S512_RESTORE;
1578 case 128:
1579 return AMDGPU::SI_SPILL_S1024_RESTORE;
1580 default:
1581 llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp"
, 1581)
;
1582 }
1583}
1584
1585static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1586 switch (Size) {
1587 case 4:
1588 return AMDGPU::SI_SPILL_V32_RESTORE;
1589 case 8:
1590 return AMDGPU::SI_SPILL_V64_RESTORE;
1591 case 12:
1592 return AMDGPU::SI_SPILL_V96_RESTORE;
1593 case 16:
1594 return AMDGPU::SI_SPILL_V128_RESTORE;
1595 case 20:
1596 return AMDGPU::SI_SPILL_V160_RESTORE;
1597 case 24:
1598 return AMDGPU::SI_SPILL_V192_RESTORE;
1599 case 28:
1600 return AMDGPU::SI_SPILL_V224_RESTORE;
1601 case 32:
1602 return AMDGPU::SI_SPILL_V256_RESTORE;
1603 case 64:
1604 return AMDGPU::SI_SPILL_V512_RESTORE;
1605 case 128:
1606 return AMDGPU::SI_SPILL_V1024_RESTORE;
1607 default:
1608 llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp"
, 1608)
;
1609 }
1610}
1611
1612static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1613 switch (Size) {
1614 case 4:
1615 return AMDGPU::SI_SPILL_A32_RESTORE;
1616 case 8:
1617 return AMDGPU::SI_SPILL_A64_RESTORE;
1618 case 12:
1619 return AMDGPU::SI_SPILL_A96_RESTORE;
1620 case 16:
1621 return AMDGPU::SI_SPILL_A128_RESTORE;
1622 case 20:
1623 return AMDGPU::SI_SPILL_A160_RESTORE;
1624 case 24:
1625 return AMDGPU::SI_SPILL_A192_RESTORE;
1626 case 28:
1627 return AMDGPU::SI_SPILL_A224_RESTORE;
1628 case 32:
1629 return AMDGPU::SI_SPILL_A256_RESTORE;
1630 case 64:
1631 return AMDGPU::SI_SPILL_A512_RESTORE;
1632 case 128:
1633 return AMDGPU::SI_SPILL_A1024_RESTORE;
1634 default:
1635 llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp"
, 1635)
;
1636 }
1637}
1638
1639static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1640 switch (Size) {
1641 case 4:
1642 return AMDGPU::SI_SPILL_AV32_RESTORE;
1643 case 8:
1644 return AMDGPU::SI_SPILL_AV64_RESTORE;
1645 case 12:
1646 return AMDGPU::SI_SPILL_AV96_RESTORE;
1647 case 16:
1648 return AMDGPU::SI_SPILL_AV128_RESTORE;
1649 case 20:
1650 return AMDGPU::SI_SPILL_AV160_RESTORE;
1651 case 24:
1652 return AMDGPU::SI_SPILL_AV192_RESTORE;
1653 case 28:
1654 return AMDGPU::SI_SPILL_AV224_RESTORE;
1655 case 32:
1656 return AMDGPU::SI_SPILL_AV256_RESTORE;
1657 case 64:
1658 return AMDGPU::SI_SPILL_AV512_RESTORE;
1659 case 128:
1660 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1661 default:
1662 llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp"
, 1662)
;
1663 }
1664}
1665
1666void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1667 MachineBasicBlock::iterator MI,
1668 Register DestReg, int FrameIndex,
1669 const TargetRegisterClass *RC,
1670 const TargetRegisterInfo *TRI) const {
1671 MachineFunction *MF = MBB.getParent();
1672 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1673 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1674 const DebugLoc &DL = MBB.findDebugLoc(MI);
1675 unsigned SpillSize = TRI->getSpillSize(*RC);
1676
1677 MachinePointerInfo PtrInfo
1678 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1679
1680 MachineMemOperand *MMO = MF->getMachineMemOperand(
1681 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1682 FrameInfo.getObjectAlign(FrameIndex));
1683
1684 if (RI.isSGPRClass(RC)) {
1685 MFI->setHasSpilledSGPRs();
1686 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1687 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1688 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1689
1690 // FIXME: Maybe this should not include a memoperand because it will be
1691 // lowered to non-memory instructions.
1692 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1693 if (DestReg.isVirtual() && SpillSize == 4) {
1694 MachineRegisterInfo &MRI = MF->getRegInfo();
1695 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1696 }
1697
1698 if (RI.spillSGPRToVGPR())
1699 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1700 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1701 .addFrameIndex(FrameIndex) // addr
1702 .addMemOperand(MMO)
1703 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1704
1705 return;
1706 }
1707
1708 unsigned Opcode = RI.isVectorSuperClass(RC)
1709 ? getAVSpillRestoreOpcode(SpillSize)
1710 : RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
1711 : getVGPRSpillRestoreOpcode(SpillSize);
1712 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1713 .addFrameIndex(FrameIndex) // vaddr
1714 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1715 .addImm(0) // offset
1716 .addMemOperand(MMO);
1717}
1718
1719void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1720 MachineBasicBlock::iterator MI) const {
1721 insertNoops(MBB, MI, 1);
1722}
1723
1724void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1725 MachineBasicBlock::iterator MI,
1726 unsigned Quantity) const {
1727 DebugLoc DL = MBB.findDebugLoc(MI);
1728 while (Quantity > 0) {
1729 unsigned Arg = std::min(Quantity, 8u);
1730 Quantity -= Arg;
1731 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1732 }
1733}
1734
1735void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1736 auto MF = MBB.getParent();
1737 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1738
1739 assert(Info->isEntryFunction());
1740
1741 if (MBB.succ_empty()) {
1742 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1743 if (HasNoTerminator) {
1744 if (Info->returnsVoid()) {
1745 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1746 } else {
1747 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1748 }
1749 }
1750 }
1751}
1752
1753unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
1754 switch (MI.getOpcode()) {
1755 default:
1756 if (MI.isMetaInstruction())
1757 return 0;
1758 return 1; // FIXME: Do wait states equal cycles?
1759
1760 case AMDGPU::S_NOP:
1761 return MI.getOperand(0).getImm() + 1;
1762
1763 // FIXME: Any other pseudo instruction?
1764 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
1765 // hazard, even if one exists, won't really be visible. Should we handle it?
1766 case AMDGPU::SI_MASKED_UNREACHABLE:
1767 case AMDGPU::WAVE_BARRIER:
1768 return 0;
1769 }
1770}
1771
1772bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1773 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1774 MachineBasicBlock &MBB = *MI.getParent();
1775 DebugLoc DL = MBB.findDebugLoc(MI);
1776 switch (MI.getOpcode()) {
1. Control jumps to 'case V_MOV_B64_DPP_PSEUDO:' at line 1898
1777 default: return TargetInstrInfo::expandPostRAPseudo(MI);
1778 case AMDGPU::S_MOV_B64_term:
1779 // This is only a terminator to get the correct spill code placement during
1780 // register allocation.
1781 MI.setDesc(get(AMDGPU::S_MOV_B64));
1782 break;
1783
1784 case AMDGPU::S_MOV_B32_term:
1785 // This is only a terminator to get the correct spill code placement during
1786 // register allocation.
1787 MI.setDesc(get(AMDGPU::S_MOV_B32));
1788 break;
1789
1790 case AMDGPU::S_XOR_B64_term:
1791 // This is only a terminator to get the correct spill code placement during
1792 // register allocation.
1793 MI.setDesc(get(AMDGPU::S_XOR_B64));
1794 break;
1795
1796 case AMDGPU::S_XOR_B32_term:
1797 // This is only a terminator to get the correct spill code placement during
1798 // register allocation.
1799 MI.setDesc(get(AMDGPU::S_XOR_B32));
1800 break;
1801 case AMDGPU::S_OR_B64_term:
1802 // This is only a terminator to get the correct spill code placement during
1803 // register allocation.
1804 MI.setDesc(get(AMDGPU::S_OR_B64));
1805 break;
1806 case AMDGPU::S_OR_B32_term:
1807 // This is only a terminator to get the correct spill code placement during
1808 // register allocation.
1809 MI.setDesc(get(AMDGPU::S_OR_B32));
1810 break;
1811
1812 case AMDGPU::S_ANDN2_B64_term:
1813 // This is only a terminator to get the correct spill code placement during
1814 // register allocation.
1815 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1816 break;
1817
1818 case AMDGPU::S_ANDN2_B32_term:
1819 // This is only a terminator to get the correct spill code placement during
1820 // register allocation.
1821 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
1822 break;
1823
1824 case AMDGPU::S_AND_B64_term:
1825 // This is only a terminator to get the correct spill code placement during
1826 // register allocation.
1827 MI.setDesc(get(AMDGPU::S_AND_B64));
1828 break;
1829
1830 case AMDGPU::S_AND_B32_term:
1831 // This is only a terminator to get the correct spill code placement during
1832 // register allocation.
1833 MI.setDesc(get(AMDGPU::S_AND_B32));
1834 break;
1835
1836 case AMDGPU::V_MOV_B64_PSEUDO: {
1837 Register Dst = MI.getOperand(0).getReg();
1838 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1839 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1840
1841 const MachineOperand &SrcOp = MI.getOperand(1);
1842 // FIXME: Will this work for 64-bit floating point immediates?
1843 assert(!SrcOp.isFPImm());
1844 if (ST.hasMovB64()) {
1845 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
1846 if (!isLiteralConstant(MI, 1) || isUInt<32>(SrcOp.getImm()))
1847 break;
1848 }
1849 if (SrcOp.isImm()) {
1850 APInt Imm(64, SrcOp.getImm());
1851 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
1852 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
1853 if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) {
1854 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
1855 .addImm(SISrcMods::OP_SEL_1)
1856 .addImm(Lo.getSExtValue())
1857 .addImm(SISrcMods::OP_SEL_1)
1858 .addImm(Lo.getSExtValue())
1859 .addImm(0) // op_sel_lo
1860 .addImm(0) // op_sel_hi
1861 .addImm(0) // neg_lo
1862 .addImm(0) // neg_hi
1863 .addImm(0); // clamp
1864 } else {
1865 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1866 .addImm(Lo.getSExtValue())
1867 .addReg(Dst, RegState::Implicit | RegState::Define);
1868 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1869 .addImm(Hi.getSExtValue())
1870 .addReg(Dst, RegState::Implicit | RegState::Define);
1871 }
1872 } else {
1873 assert(SrcOp.isReg());
1874 if (ST.hasPackedFP32Ops() &&
1875 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
1876 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
1877 .addImm(SISrcMods::OP_SEL_1) // src0_mod
1878 .addReg(SrcOp.getReg())
1879 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
1880 .addReg(SrcOp.getReg())
1881 .addImm(0) // op_sel_lo
1882 .addImm(0) // op_sel_hi
1883 .addImm(0) // neg_lo
1884 .addImm(0) // neg_hi
1885 .addImm(0); // clamp
1886 } else {
1887 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1888 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1889 .addReg(Dst, RegState::Implicit | RegState::Define);
1890 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1891 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1892 .addReg(Dst, RegState::Implicit | RegState::Define);
1893 }
1894 }
1895 MI.eraseFromParent();
1896 break;
1897 }
1898 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
1899 expandMovDPP64(MI);
2. Calling 'SIInstrInfo::expandMovDPP64'
1900 break;
1901 }
1902 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
1903 const MachineOperand &SrcOp = MI.getOperand(1);
1904 assert(!SrcOp.isFPImm());
1905 APInt Imm(64, SrcOp.getImm());
1906 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
1907 MI.setDesc(get(AMDGPU::S_MOV_B64));
1908 break;
1909 }
1910
1911 Register Dst = MI.getOperand(0).getReg();
1912 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1913 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1914
1915 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
1916 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
1917 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
1918 .addImm(Lo.getSExtValue())
1919 .addReg(Dst, RegState::Implicit | RegState::Define);
1920 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
1921 .addImm(Hi.getSExtValue())
1922 .addReg(Dst, RegState::Implicit | RegState::Define);
1923 MI.eraseFromParent();
1924 break;
1925 }
1926 case AMDGPU::V_SET_INACTIVE_B32: {
1927 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
1928 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1929 // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
1930 // optimizations (mainly Register Coalescer) aware of WWM register liveness.
1931 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1932 .add(MI.getOperand(1));
1933 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
1934 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
1935 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1936 .add(MI.getOperand(2));
1937 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
1938 .addReg(Exec);
1939 MI.eraseFromParent();
1940 break;
1941 }
1942 case AMDGPU::V_SET_INACTIVE_B64: {
1943 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
1944 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1945 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1946 MI.getOperand(0).getReg())
1947 .add(MI.getOperand(1));
1948 expandPostRAPseudo(*Copy);
1949 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
1950 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
1951 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1952 MI.getOperand(0).getReg())
1953 .add(MI.getOperand(2));
1954 expandPostRAPseudo(*Copy);
1955 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
1956 .addReg(Exec);
1957 MI.eraseFromParent();
1958 break;
1959 }
1960 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
1961 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
1962 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
1963 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
1964 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
1965 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
1966 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
1967 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
1968 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
1969 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
1970 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
1971 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
1972 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
1973 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
1974 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
1975 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
1976 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
1977 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
1978 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
1979 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
1980 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
1981 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
1982
1983 unsigned Opc;
1984 if (RI.hasVGPRs(EltRC)) {
1985 Opc = AMDGPU::V_MOVRELD_B32_e32;
1986 } else {
1987 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
1988 : AMDGPU::S_MOVRELD_B32;
1989 }
1990
1991 const MCInstrDesc &OpDesc = get(Opc);
1992 Register VecReg = MI.getOperand(0).getReg();
1993 bool IsUndef = MI.getOperand(1).isUndef();
1994 unsigned SubReg = MI.getOperand(3).getImm();
1995 assert(VecReg == MI.getOperand(1).getReg());
1996
1997 MachineInstrBuilder MIB =
1998 BuildMI(MBB, MI, DL, OpDesc)
1999 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2000 .add(MI.getOperand(2))
2001 .addReg(VecReg, RegState::ImplicitDefine)
2002 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2003
2004 const int ImpDefIdx =
2005 OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
2006 const int ImpUseIdx = ImpDefIdx + 1;
2007 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2008 MI.eraseFromParent();
2009 break;
2010 }
2011 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2012 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2013 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2014 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2015 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2016 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2017 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2018 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2019 assert(ST.useVGPRIndexMode());
2020 Register VecReg = MI.getOperand(0).getReg();
2021 bool IsUndef = MI.getOperand(1).isUndef();
2022 Register Idx = MI.getOperand(3).getReg();
2023 Register SubReg = MI.getOperand(4).getImm();
2024
2025 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2026 .addReg(Idx)
2027 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2028 SetOn->getOperand(3).setIsUndef();
2029
2030 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2031 MachineInstrBuilder MIB =
2032 BuildMI(MBB, MI, DL, OpDesc)
2033 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2034 .add(MI.getOperand(2))
2035 .addReg(VecReg, RegState::ImplicitDefine)
2036 .addReg(VecReg,
2037 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2038
2039 const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
2040 const int ImpUseIdx = ImpDefIdx + 1;
2041 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2042
2043 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2044
2045 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2046
2047 MI.eraseFromParent();
2048 break;
2049 }
2050 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2051 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2052 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2053 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2054 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2055 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2056 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2057 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2058 assert(ST.useVGPRIndexMode());
2059 Register Dst = MI.getOperand(0).getReg();
2060 Register VecReg = MI.getOperand(1).getReg();
2061 bool IsUndef = MI.getOperand(1).isUndef();
2062 Register Idx = MI.getOperand(2).getReg();
2063 Register SubReg = MI.getOperand(3).getImm();
2064
2065 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2066 .addReg(Idx)
2067 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2068 SetOn->getOperand(3).setIsUndef();
2069
2070 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2071 .addDef(Dst)
2072 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2073 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2074
2075 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2076
2077 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2078
2079 MI.eraseFromParent();
2080 break;
2081 }
2082 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2083 MachineFunction &MF = *MBB.getParent();
2084 Register Reg = MI.getOperand(0).getReg();
2085 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2086 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2087
2088 // Create a bundle so these instructions won't be re-ordered by the
2089 // post-RA scheduler.
2090 MIBundleBuilder Bundler(MBB, MI);
2091 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2092
2093 // Add 32-bit offset from this instruction to the start of the
2094 // constant data.
2095 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
2096 .addReg(RegLo)
2097 .add(MI.getOperand(1)));
2098
2099 MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2100 .addReg(RegHi);
2101 MIB.add(MI.getOperand(2));
2102
2103 Bundler.append(MIB);
2104 finalizeBundle(MBB, Bundler.begin());
2105
2106 MI.eraseFromParent();
2107 break;
2108 }
2109 case AMDGPU::ENTER_STRICT_WWM: {
2110 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2111 // Whole Wave Mode is entered.
2112 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2113 : AMDGPU::S_OR_SAVEEXEC_B64));
2114 break;
2115 }
2116 case AMDGPU::ENTER_STRICT_WQM: {
2117 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2118 // STRICT_WQM is entered.
2119 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2120 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2121 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2122 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2123 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2124
2125 MI.eraseFromParent();
2126 break;
2127 }
2128 case AMDGPU::EXIT_STRICT_WWM:
2129 case AMDGPU::EXIT_STRICT_WQM: {
2130 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2131 // WWM/STRICT_WQM is exited.
2132 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2133 break;
2134 }
2135 case AMDGPU::SI_RETURN: {
2136 const MachineFunction *MF = MBB.getParent();
2137 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2138 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2139 // Hiding the return address use with SI_RETURN may lead to extra kills in
2140 // the function and missing live-ins. We are fine in practice because callee
2141 // saved register handling ensures the register value is restored before
2142 // RET, but we need the undef flag here to appease the MachineVerifier
2143 // liveness checks.
2144 MachineInstrBuilder MIB =
2145 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2146 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2147
2148 MIB.copyImplicitOps(MI);
2149 MI.eraseFromParent();
2150 break;
2151 }
2152 }
2153 return true;
2154}
2155
2156std::pair<MachineInstr*, MachineInstr*>
2157SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2158 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
3. '?' condition is true
2159
2160 MachineBasicBlock &MBB = *MI.getParent();
2161 DebugLoc DL = MBB.findDebugLoc(MI);
2162 MachineFunction *MF = MBB.getParent();
2163 MachineRegisterInfo &MRI = MF->getRegInfo();
2164 Register Dst = MI.getOperand(0).getReg();
2165 unsigned Part = 0;
2166 MachineInstr *Split[2];
2167
2168 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
4. Assuming '__begin1' is equal to '__end1'
2169 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2170 if (Dst.isPhysical()) {
2171 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2172 } else {
2173 assert(MRI.isSSA());
2174 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2175 MovDPP.addDef(Tmp);
2176 }
2177
2178 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2179 const MachineOperand &SrcOp = MI.getOperand(I);
2180 assert(!SrcOp.isFPImm());
2181 if (SrcOp.isImm()) {
2182 APInt Imm(64, SrcOp.getImm());
2183 Imm.ashrInPlace(Part * 32);
2184 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2185 } else {
2186 assert(SrcOp.isReg());
2187 Register Src = SrcOp.getReg();
2188 if (Src.isPhysical())
2189 MovDPP.addReg(RI.getSubReg(Src, Sub));
2190 else
2191 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2192 }
2193 }
2194
2195 for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
2196 MovDPP.addImm(MI.getOperand(I).getImm());
2197
2198 Split[Part] = MovDPP;
2199 ++Part;
2200 }
2201
2202 if (Dst.isVirtual())
5. Calling 'Register::isVirtual'
11. Returning from 'Register::isVirtual'
12. Taking true branch
2203 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2204 .addReg(Split[0]->getOperand(0).getReg())
13. Called C++ object pointer is uninitialized
2205 .addImm(AMDGPU::sub0)
2206 .addReg(Split[1]->getOperand(0).getReg())
2207 .addImm(AMDGPU::sub1);
2208
2209 MI.eraseFromParent();
2210 return std::make_pair(Split[0], Split[1]);
2211}
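Note on the report above: the Split[2] array in expandMovDPP64 is written only inside the loop over { AMDGPU::sub0, AMDGPU::sub1 }, and the analyzer's step 4 assumes that loop runs zero times, so the Split[0]->getOperand(0) call at line 2204 reads an uninitialized MachineInstr pointer (step 13). The following is a minimal, self-contained sketch of that pattern with hypothetical names (Node, Parts, firstValue); it is not the LLVM code itself, only an illustration of what the checker is modeling.

// Sketch of the flagged pattern: an array of pointers that is assigned only
// inside a loop and dereferenced unconditionally afterwards. If the loop body
// is assumed to execute zero times, Parts[0] is an uninitialized pointer.
#include <cstdio>

struct Node { int Value; };

static int firstValue(const int *Begin, const int *End) {
  Node Storage[2];
  Node *Parts[2];                        // never value-initialized
  unsigned Idx = 0;
  for (const int *It = Begin; It != End && Idx < 2; ++It, ++Idx) {
    Storage[Idx].Value = *It;
    Parts[Idx] = &Storage[Idx];          // the only writes to Parts
  }
  return Parts[0]->Value;                // uninitialized deref if Begin == End
}

int main() {
  int Vals[2] = {1, 2};
  std::printf("%d\n", firstValue(Vals, Vals + 2)); // loop runs twice: Parts[0] is set
  // firstValue(Vals, Vals) would take the zero-iteration path and hit the bug.
  return 0;
}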
2212
2213bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
2214 MachineOperand &Src0,
2215 unsigned Src0OpName,
2216 MachineOperand &Src1,
2217 unsigned Src1OpName) const {
2218 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2219 if (!Src0Mods)
2220 return false;
2221
2222 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2223 assert(Src1Mods &&(static_cast <bool> (Src1Mods && "All commutable instructions have both src0 and src1 modifiers"
) ? void (0) : __assert_fail ("Src1Mods && \"All commutable instructions have both src0 and src1 modifiers\""
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 2224, __extension__
__PRETTY_FUNCTION__))
2224 "All commutable instructions have both src0 and src1 modifiers")(static_cast <bool> (Src1Mods && "All commutable instructions have both src0 and src1 modifiers"
) ? void (0) : __assert_fail ("Src1Mods && \"All commutable instructions have both src0 and src1 modifiers\""
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 2224, __extension__
__PRETTY_FUNCTION__))
;
2225
2226 int Src0ModsVal = Src0Mods->getImm();
2227 int Src1ModsVal = Src1Mods->getImm();
2228
2229 Src1Mods->setImm(Src0ModsVal);
2230 Src0Mods->setImm(Src1ModsVal);
2231 return true;
2232}
2233
2234static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2235 MachineOperand &RegOp,
2236 MachineOperand &NonRegOp) {
2237 Register Reg = RegOp.getReg();
2238 unsigned SubReg = RegOp.getSubReg();
2239 bool IsKill = RegOp.isKill();
2240 bool IsDead = RegOp.isDead();
2241 bool IsUndef = RegOp.isUndef();
2242 bool IsDebug = RegOp.isDebug();
2243
2244 if (NonRegOp.isImm())
2245 RegOp.ChangeToImmediate(NonRegOp.getImm());
2246 else if (NonRegOp.isFI())
2247 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2248 else if (NonRegOp.isGlobal()) {
2249 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2250 NonRegOp.getTargetFlags());
2251 } else
2252 return nullptr;
2253
2254 // Make sure we don't reinterpret a subreg index in the target flags.
2255 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2256
2257 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2258 NonRegOp.setSubReg(SubReg);
2259
2260 return &MI;
2261}
2262
2263MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2264 unsigned Src0Idx,
2265 unsigned Src1Idx) const {
2266 assert(!NewMI && "this should never be used")(static_cast <bool> (!NewMI && "this should never be used"
) ? void (0) : __assert_fail ("!NewMI && \"this should never be used\""
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 2266, __extension__
__PRETTY_FUNCTION__))
;
2267
2268 unsigned Opc = MI.getOpcode();
2269 int CommutedOpcode = commuteOpcode(Opc);
2270 if (CommutedOpcode == -1)
2271 return nullptr;
2272
2273 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2274 static_cast<int>(Src0Idx) &&
2275 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2276 static_cast<int>(Src1Idx) &&
2277 "inconsistency with findCommutedOpIndices");
2278
2279 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2280 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2281
2282 MachineInstr *CommutedMI = nullptr;
2283 if (Src0.isReg() && Src1.isReg()) {
2284 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2285 // Be sure to copy the source modifiers to the right place.
2286 CommutedMI
2287 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2288 }
2289
2290 } else if (Src0.isReg() && !Src1.isReg()) {
2291 // src0 should always be able to support any operand type, so no need to
2292 // check operand legality.
2293 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2294 } else if (!Src0.isReg() && Src1.isReg()) {
2295 if (isOperandLegal(MI, Src1Idx, &Src0))
2296 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2297 } else {
2298 // FIXME: Found two non registers to commute. This does happen.
2299 return nullptr;
2300 }
2301
2302 if (CommutedMI) {
2303 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2304 Src1, AMDGPU::OpName::src1_modifiers);
2305
2306 CommutedMI->setDesc(get(CommutedOpcode));
2307 }
2308
2309 return CommutedMI;
2310}
2311
2312// This needs to be implemented because the source modifiers may be inserted
2313// between the true commutable operands, and the base
2314// TargetInstrInfo::commuteInstruction uses it.
2315bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2316 unsigned &SrcOpIdx0,
2317 unsigned &SrcOpIdx1) const {
2318 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2319}
2320
2321bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
2322 unsigned &SrcOpIdx1) const {
2323 if (!Desc.isCommutable())
2324 return false;
2325
2326 unsigned Opc = Desc.getOpcode();
2327 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2328 if (Src0Idx == -1)
2329 return false;
2330
2331 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2332 if (Src1Idx == -1)
2333 return false;
2334
2335 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2336}
2337
2338bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2339 int64_t BrOffset) const {
2340 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2341 // block is unanalyzable.
2342 assert(BranchOp != AMDGPU::S_SETPC_B64);
2343
2344 // Convert to dwords.
2345 BrOffset /= 4;
2346
2347 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2348 // from the next instruction.
2349 BrOffset -= 1;
2350
2351 return isIntN(BranchOffsetBits, BrOffset);
2352}
2353
2354MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
2355 const MachineInstr &MI) const {
2356 if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
2357 // This would be a difficult analysis to perform, but can always be legal so
2358 // there's no need to analyze it.
2359 return nullptr;
2360 }
2361
2362 return MI.getOperand(0).getMBB();
2363}
2364
2365void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2366 MachineBasicBlock &DestBB,
2367 MachineBasicBlock &RestoreBB,
2368 const DebugLoc &DL, int64_t BrOffset,
2369 RegScavenger *RS) const {
2370 assert(RS && "RegScavenger required for long branching")(static_cast <bool> (RS && "RegScavenger required for long branching"
) ? void (0) : __assert_fail ("RS && \"RegScavenger required for long branching\""
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 2370, __extension__
__PRETTY_FUNCTION__))
;
2371 assert(MBB.empty() &&(static_cast <bool> (MBB.empty() && "new block should be inserted for expanding unconditional branch"
) ? void (0) : __assert_fail ("MBB.empty() && \"new block should be inserted for expanding unconditional branch\""
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 2372, __extension__
__PRETTY_FUNCTION__))
2372 "new block should be inserted for expanding unconditional branch")(static_cast <bool> (MBB.empty() && "new block should be inserted for expanding unconditional branch"
) ? void (0) : __assert_fail ("MBB.empty() && \"new block should be inserted for expanding unconditional branch\""
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 2372, __extension__
__PRETTY_FUNCTION__))
;
2373 assert(MBB.pred_size() == 1);
2374 assert(RestoreBB.empty() &&
2375 "restore block should be inserted for restoring clobbered registers");
2376
2377 MachineFunction *MF = MBB.getParent();
2378 MachineRegisterInfo &MRI = MF->getRegInfo();
2379
2380 // FIXME: Virtual register workaround for RegScavenger not working with empty
2381 // blocks.
2382 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2383
2384 auto I = MBB.end();
2385
2386 // We need to compute the offset relative to the instruction immediately after
2387 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2388 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2389
2390 auto &MCCtx = MF->getContext();
2391 MCSymbol *PostGetPCLabel =
2392 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2393 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2394
2395 MCSymbol *OffsetLo =
2396 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2397 MCSymbol *OffsetHi =
2398 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2399 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2400 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2401 .addReg(PCReg, 0, AMDGPU::sub0)
2402 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2403 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2404 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2405 .addReg(PCReg, 0, AMDGPU::sub1)
2406 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2407
2408 // Insert the indirect branch after the other terminator.
2409 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2410 .addReg(PCReg);
2411
2412 // FIXME: If spilling is necessary, this will fail because this scavenger has
2413 // no emergency stack slots. It is non-trivial to spill in this situation,
2414 // because the restore code needs to be specially placed after the
2415 // jump. BranchRelaxation then needs to be made aware of the newly inserted
2416 // block.
2417 //
2418 // If a spill is needed for the pc register pair, we need to insert a spill
2419 // restore block right before the destination block, and insert a short branch
2420 // into the old destination block's fallthrough predecessor.
2421 // e.g.:
2422 //
2423 // s_cbranch_scc0 skip_long_branch:
2424 //
2425 // long_branch_bb:
2426 // spill s[8:9]
2427 // s_getpc_b64 s[8:9]
2428 // s_add_u32 s8, s8, restore_bb
2429 // s_addc_u32 s9, s9, 0
2430 // s_setpc_b64 s[8:9]
2431 //
2432 // skip_long_branch:
2433 // foo;
2434 //
2435 // .....
2436 //
2437 // dest_bb_fallthrough_predecessor:
2438 // bar;
2439 // s_branch dest_bb
2440 //
2441 // restore_bb:
2442 // restore s[8:9]
2443 // fallthrough dest_bb
2444 ///
2445 // dest_bb:
2446 // buzz;
2447
2448 RS->enterBasicBlockEnd(MBB);
2449 Register Scav = RS->scavengeRegisterBackwards(
2450 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2451 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2452 if (Scav) {
2453 RS->setRegUsed(Scav);
2454 MRI.replaceRegWith(PCReg, Scav);
2455 MRI.clearVirtRegs();
2456 } else {
2457 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
2458 // SGPR spill.
2459 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2460 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2461 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2462 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2463 MRI.clearVirtRegs();
2464 }
2465
2466 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2467 // Now, the distance could be defined.
2468 auto *Offset = MCBinaryExpr::createSub(
2469 MCSymbolRefExpr::create(DestLabel, MCCtx),
2470 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2471 // Add offset assignments.
2472 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2473 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2474 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2475 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2476}
2477
2478unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2479 switch (Cond) {
2480 case SIInstrInfo::SCC_TRUE:
2481 return AMDGPU::S_CBRANCH_SCC1;
2482 case SIInstrInfo::SCC_FALSE:
2483 return AMDGPU::S_CBRANCH_SCC0;
2484 case SIInstrInfo::VCCNZ:
2485 return AMDGPU::S_CBRANCH_VCCNZ;
2486 case SIInstrInfo::VCCZ:
2487 return AMDGPU::S_CBRANCH_VCCZ;
2488 case SIInstrInfo::EXECNZ:
2489 return AMDGPU::S_CBRANCH_EXECNZ;
2490 case SIInstrInfo::EXECZ:
2491 return AMDGPU::S_CBRANCH_EXECZ;
2492 default:
2493 llvm_unreachable("invalid branch predicate")::llvm::llvm_unreachable_internal("invalid branch predicate",
"llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 2493)
;
2494 }
2495}
2496
2497SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
2498 switch (Opcode) {
2499 case AMDGPU::S_CBRANCH_SCC0:
2500 return SCC_FALSE;
2501 case AMDGPU::S_CBRANCH_SCC1:
2502 return SCC_TRUE;
2503 case AMDGPU::S_CBRANCH_VCCNZ:
2504 return VCCNZ;
2505 case AMDGPU::S_CBRANCH_VCCZ:
2506 return VCCZ;
2507 case AMDGPU::S_CBRANCH_EXECNZ:
2508 return EXECNZ;
2509 case AMDGPU::S_CBRANCH_EXECZ:
2510 return EXECZ;
2511 default:
2512 return INVALID_BR;
2513 }
2514}
2515
2516bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
2517 MachineBasicBlock::iterator I,
2518 MachineBasicBlock *&TBB,
2519 MachineBasicBlock *&FBB,
2520 SmallVectorImpl<MachineOperand> &Cond,
2521 bool AllowModify) const {
2522 if (I->getOpcode() == AMDGPU::S_BRANCH) {
2523 // Unconditional Branch
2524 TBB = I->getOperand(0).getMBB();
2525 return false;
2526 }
2527
2528 MachineBasicBlock *CondBB = nullptr;
2529
2530 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
2531 CondBB = I->getOperand(1).getMBB();
2532 Cond.push_back(I->getOperand(0));
2533 } else {
2534 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
2535 if (Pred == INVALID_BR)
2536 return true;
2537
2538 CondBB = I->getOperand(0).getMBB();
2539 Cond.push_back(MachineOperand::CreateImm(Pred));
2540 Cond.push_back(I->getOperand(1)); // Save the branch register.
2541 }
2542 ++I;
2543
2544 if (I == MBB.end()) {
2545 // Conditional branch followed by fall-through.
2546 TBB = CondBB;
2547 return false;
2548 }
2549
2550 if (I->getOpcode() == AMDGPU::S_BRANCH) {
2551 TBB = CondBB;
2552 FBB = I->getOperand(0).getMBB();
2553 return false;
2554 }
2555
2556 return true;
2557}
2558
2559bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
2560 MachineBasicBlock *&FBB,
2561 SmallVectorImpl<MachineOperand> &Cond,
2562 bool AllowModify) const {
2563 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
2564 auto E = MBB.end();
2565 if (I == E)
2566 return false;
2567
2568 // Skip over the instructions that are artificially marked as terminators for special
2569 // exec management.
2570 while (I != E && !I->isBranch() && !I->isReturn()) {
2571 switch (I->getOpcode()) {
2572 case AMDGPU::S_MOV_B64_term:
2573 case AMDGPU::S_XOR_B64_term:
2574 case AMDGPU::S_OR_B64_term:
2575 case AMDGPU::S_ANDN2_B64_term:
2576 case AMDGPU::S_AND_B64_term:
2577 case AMDGPU::S_MOV_B32_term:
2578 case AMDGPU::S_XOR_B32_term:
2579 case AMDGPU::S_OR_B32_term:
2580 case AMDGPU::S_ANDN2_B32_term:
2581 case AMDGPU::S_AND_B32_term:
2582 break;
2583 case AMDGPU::SI_IF:
2584 case AMDGPU::SI_ELSE:
2585 case AMDGPU::SI_KILL_I1_TERMINATOR:
2586 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
2587 // FIXME: It's messy that these need to be considered here at all.
2588 return true;
2589 default:
2590 llvm_unreachable("unexpected non-branch terminator inst")::llvm::llvm_unreachable_internal("unexpected non-branch terminator inst"
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 2590)
;
2591 }
2592
2593 ++I;
2594 }
2595
2596 if (I == E)
2597 return false;
2598
2599 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
2600}
2601
2602unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
2603 int *BytesRemoved) const {
2604 unsigned Count = 0;
2605 unsigned RemovedSize = 0;
2606 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
2607 // Skip over artificial terminators when removing instructions.
2608 if (MI.isBranch() || MI.isReturn()) {
2609 RemovedSize += getInstSizeInBytes(MI);
2610 MI.eraseFromParent();
2611 ++Count;
2612 }
2613 }
2614
2615 if (BytesRemoved)
2616 *BytesRemoved = RemovedSize;
2617
2618 return Count;
2619}
2620
2621// Copy the flags onto the implicit condition register operand.
2622static void preserveCondRegFlags(MachineOperand &CondReg,
2623 const MachineOperand &OrigCond) {
2624 CondReg.setIsUndef(OrigCond.isUndef());
2625 CondReg.setIsKill(OrigCond.isKill());
2626}
2627
2628unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
2629 MachineBasicBlock *TBB,
2630 MachineBasicBlock *FBB,
2631 ArrayRef<MachineOperand> Cond,
2632 const DebugLoc &DL,
2633 int *BytesAdded) const {
2634 if (!FBB && Cond.empty()) {
2635 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
2636 .addMBB(TBB);
2637 if (BytesAdded)
2638 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
2639 return 1;
2640 }
2641
2642 if (Cond.size() == 1 && Cond[0].isReg()) {
2643 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
2644 .add(Cond[0])
2645 .addMBB(TBB);
2646 return 1;
2647 }
2648
2649 assert(TBB && Cond[0].isImm());
2650
2651 unsigned Opcode
2652 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
2653
2654 if (!FBB) {
2655 Cond[1].isUndef();
2656 MachineInstr *CondBr =
2657 BuildMI(&MBB, DL, get(Opcode))
2658 .addMBB(TBB);
2659
2660 // Copy the flags onto the implicit condition register operand.
2661 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
2662 fixImplicitOperands(*CondBr);
2663
2664 if (BytesAdded)
2665 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
2666 return 1;
2667 }
2668
2669 assert(TBB && FBB);
2670
2671 MachineInstr *CondBr =
2672 BuildMI(&MBB, DL, get(Opcode))
2673 .addMBB(TBB);
2674 fixImplicitOperands(*CondBr);
2675 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
2676 .addMBB(FBB);
2677
2678 MachineOperand &CondReg = CondBr->getOperand(1);
2679 CondReg.setIsUndef(Cond[1].isUndef());
2680 CondReg.setIsKill(Cond[1].isKill());
2681
2682 if (BytesAdded)
2683 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
2684
2685 return 2;
2686}
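
The BytesAdded estimates above charge 4 bytes per emitted branch, or 8 when ST.hasOffset3fBug() is set, presumably to cover an extra workaround instruction on those subtargets. A small sketch of that bookkeeping, with the subtarget query modeled as a plain bool:

  #include <cassert>

  // Mirrors the size accounting above: each branch is 4 bytes, doubled on
  // subtargets where the offset-0x3f workaround applies.
  constexpr int branchBytes(int NumBranches, bool HasOffset3fBug) {
    return NumBranches * (HasOffset3fBug ? 8 : 4);
  }

  int main() {
    assert(branchBytes(1, false) == 4);  // lone S_BRANCH or lone conditional branch
    assert(branchBytes(2, false) == 8);  // conditional branch plus S_BRANCH
    assert(branchBytes(2, true) == 16);  // same pair with the workaround
    return 0;
  }
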
2687
2688bool SIInstrInfo::reverseBranchCondition(
2689 SmallVectorImpl<MachineOperand> &Cond) const {
2690 if (Cond.size() != 2) {
2691 return true;
2692 }
2693
2694 if (Cond[0].isImm()) {
2695 Cond[0].setImm(-Cond[0].getImm());
2696 return false;
2697 }
2698
2699 return true;
2700}
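
reverseBranchCondition flips the predicate by negating the immediate, which only works if each BranchPredicate enumerator and its inverse are arithmetic negations of one another (insertSelect below relies on the same property when it negates Pred). A standalone sketch of that convention; the enumerator values here are illustrative assumptions, not copied from SIInstrInfo.h:

  #include <cassert>

  // Assumed layout: each predicate and its inverse are negatives of one another,
  // with 0 reserved for the invalid case.
  enum BranchPredicate : int {
    INVALID_BR = 0,
    SCC_TRUE = 1, SCC_FALSE = -1,
    VCCNZ = 2,    VCCZ = -2,
    EXECNZ = -3,  EXECZ = 3,
  };

  BranchPredicate reverse(BranchPredicate P) {
    return static_cast<BranchPredicate>(-P);
  }

  int main() {
    assert(reverse(SCC_TRUE) == SCC_FALSE);
    assert(reverse(VCCZ) == VCCNZ);
    assert(reverse(reverse(EXECNZ)) == EXECNZ); // double reversal is the identity
    return 0;
  }
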
2701
2702bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
2703 ArrayRef<MachineOperand> Cond,
2704 Register DstReg, Register TrueReg,
2705 Register FalseReg, int &CondCycles,
2706 int &TrueCycles, int &FalseCycles) const {
2707 switch (Cond[0].getImm()) {
2708 case VCCNZ:
2709 case VCCZ: {
2710 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2711 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2712 if (MRI.getRegClass(FalseReg) != RC)
2713 return false;
2714
2715 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
2716 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
2717
2718 // Limit to equal cost for branch vs. N v_cndmask_b32s.
2719 return RI.hasVGPRs(RC) && NumInsts <= 6;
2720 }
2721 case SCC_TRUE:
2722 case SCC_FALSE: {
2723 // FIXME: We could insert for VGPRs if we could replace the original compare
2724 // with a vector one.
2725 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2726 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2727 if (MRI.getRegClass(FalseReg) != RC)
2728 return false;
2729
2730 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
2731
2732 // Multiples of 8 can do s_cselect_b64
2733 if (NumInsts % 2 == 0)
2734 NumInsts /= 2;
2735
2736 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
2737 return RI.isSGPRClass(RC);
2738 }
2739 default:
2740 return false;
2741 }
2742}
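
The cycle estimates above are just the number of 32-bit pieces in the register class, with the SCC (SALU) path halved when the piece count is even because a single s_cselect_b64 covers two pieces. A small sketch of that arithmetic under those assumptions:

  #include <cassert>

  // Number of select instructions for a value of the given bit width.
  // UseSALU64 models the SCC path, which can use 64-bit selects for even counts.
  int numSelectInsts(int BitWidth, bool UseSALU64) {
    int NumInsts = BitWidth / 32;
    if (UseSALU64 && NumInsts % 2 == 0)
      NumInsts /= 2;
    return NumInsts;
  }

  int main() {
    assert(numSelectInsts(32, false) == 1);  // single v_cndmask_b32
    assert(numSelectInsts(128, false) == 4); // four v_cndmask_b32
    assert(numSelectInsts(128, true) == 2);  // two s_cselect_b64
    assert(numSelectInsts(96, true) == 3);   // odd count stays at three s_cselect_b32
    return 0;
  }
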
2743
2744void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
2745 MachineBasicBlock::iterator I, const DebugLoc &DL,
2746 Register DstReg, ArrayRef<MachineOperand> Cond,
2747 Register TrueReg, Register FalseReg) const {
2748 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
2749 if (Pred == VCCZ || Pred == SCC_FALSE) {
2750 Pred = static_cast<BranchPredicate>(-Pred);
2751 std::swap(TrueReg, FalseReg);
2752 }
2753
2754 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2755 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
2756 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
2757
2758 if (DstSize == 32) {
2759 MachineInstr *Select;
2760 if (Pred == SCC_TRUE) {
2761 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
2762 .addReg(TrueReg)
2763 .addReg(FalseReg);
2764 } else {
2765 // Instruction's operands are backwards from what is expected.
2766 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
2767 .addReg(FalseReg)
2768 .addReg(TrueReg);
2769 }
2770
2771 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
2772 return;
2773 }
2774
2775 if (DstSize == 64 && Pred == SCC_TRUE) {
2776 MachineInstr *Select =
2777 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
2778 .addReg(TrueReg)
2779 .addReg(FalseReg);
2780
2781 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
2782 return;
2783 }
2784
2785 static const int16_t Sub0_15[] = {
2786 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
2787 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
2788 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
2789 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
2790 };
2791
2792 static const int16_t Sub0_15_64[] = {
2793 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
2794 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
2795 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
2796 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
2797 };
2798
2799 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
2800 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
2801 const int16_t *SubIndices = Sub0_15;
2802 int NElts = DstSize / 32;
2803
2804 // 64-bit select is only available for SALU.
2805 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
2806 if (Pred == SCC_TRUE) {
2807 if (NElts % 2) {
2808 SelOp = AMDGPU::S_CSELECT_B32;
2809 EltRC = &AMDGPU::SGPR_32RegClass;
2810 } else {
2811 SelOp = AMDGPU::S_CSELECT_B64;
2812 EltRC = &AMDGPU::SGPR_64RegClass;
2813 SubIndices = Sub0_15_64;
2814 NElts /= 2;
2815 }
2816 }
2817
2818 MachineInstrBuilder MIB = BuildMI(
2819 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
2820
2821 I = MIB->getIterator();
2822
2823 SmallVector<Register, 8> Regs;
2824 for (int Idx = 0; Idx != NElts; ++Idx) {
2825 Register DstElt = MRI.createVirtualRegister(EltRC);
2826 Regs.push_back(DstElt);
2827
2828 unsigned SubIdx = SubIndices[Idx];
2829
2830 MachineInstr *Select;
2831 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
2832 Select =
2833 BuildMI(MBB, I, DL, get(SelOp), DstElt)
2834 .addReg(FalseReg, 0, SubIdx)
2835 .addReg(TrueReg, 0, SubIdx);
2836 } else {
2837 Select =
2838 BuildMI(MBB, I, DL, get(SelOp), DstElt)
2839 .addReg(TrueReg, 0, SubIdx)
2840 .addReg(FalseReg, 0, SubIdx);
2841 }
2842
2843 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
2844 fixImplicitOperands(*Select);
2845
2846 MIB.addReg(DstElt)
2847 .addImm(SubIdx);
2848 }
2849}
2850
2851bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
2852 switch (MI.getOpcode()) {
2853 case AMDGPU::V_MOV_B32_e32:
2854 case AMDGPU::V_MOV_B32_e64:
2855 case AMDGPU::V_MOV_B64_PSEUDO:
2856 case AMDGPU::V_MOV_B64_e32:
2857 case AMDGPU::V_MOV_B64_e64:
2858 case AMDGPU::S_MOV_B32:
2859 case AMDGPU::S_MOV_B64:
2860 case AMDGPU::COPY:
2861 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
2862 case AMDGPU::V_ACCVGPR_READ_B32_e64:
2863 case AMDGPU::V_ACCVGPR_MOV_B32:
2864 return true;
2865 default:
2866 return false;
2867 }
2868}
2869
2870unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
2871 unsigned Kind) const {
2872 switch (Kind) {
2873 case PseudoSourceValue::Stack:
2874 case PseudoSourceValue::FixedStack:
2875 return AMDGPUAS::PRIVATE_ADDRESS;
2876 case PseudoSourceValue::ConstantPool:
2877 case PseudoSourceValue::GOT:
2878 case PseudoSourceValue::JumpTable:
2879 case PseudoSourceValue::GlobalValueCallEntry:
2880 case PseudoSourceValue::ExternalSymbolCallEntry:
2881 case PseudoSourceValue::TargetCustom:
2882 return AMDGPUAS::CONSTANT_ADDRESS;
2883 }
2884 return AMDGPUAS::FLAT_ADDRESS;
2885}
2886
2887static void removeModOperands(MachineInstr &MI) {
2888 unsigned Opc = MI.getOpcode();
2889 int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2890 AMDGPU::OpName::src0_modifiers);
2891 int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2892 AMDGPU::OpName::src1_modifiers);
2893 int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2894 AMDGPU::OpName::src2_modifiers);
2895
2896 MI.RemoveOperand(Src2ModIdx);
2897 MI.RemoveOperand(Src1ModIdx);
2898 MI.RemoveOperand(Src0ModIdx);
2899}
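
removeModOperands deletes the three modifier operands from the highest index down; removing a lower index first would shift the later operands and invalidate the indices already computed. The same pitfall applies to any index-based container, as this unrelated std::vector sketch shows:

  #include <cassert>
  #include <vector>

  int main() {
    std::vector<int> Ops = {10, 11, 12, 13, 14};
    // Indices gathered up front, like the getNamedOperandIdx calls above.
    int IdxA = 1, IdxB = 3;

    // Erase the higher index first so the lower one is still valid afterwards.
    Ops.erase(Ops.begin() + IdxB);
    Ops.erase(Ops.begin() + IdxA);

    assert((Ops == std::vector<int>{10, 12, 14}));
    return 0;
  }
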
2900
2901bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
2902 Register Reg, MachineRegisterInfo *MRI) const {
2903 if (!MRI->hasOneNonDBGUse(Reg))
2904 return false;
2905
2906 switch (DefMI.getOpcode()) {
2907 default:
2908 return false;
2909 case AMDGPU::S_MOV_B64:
2910 // TODO: We could fold 64-bit immediates, but this gets complicated
2911 // when there are sub-registers.
2912 return false;
2913
2914 case AMDGPU::V_MOV_B32_e32:
2915 case AMDGPU::S_MOV_B32:
2916 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
2917 break;
2918 }
2919
2920 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
2921 assert(ImmOp);
2922 // FIXME: We could handle FrameIndex values here.
2923 if (!ImmOp->isImm())
2924 return false;
2925
2926 unsigned Opc = UseMI.getOpcode();
2927 if (Opc == AMDGPU::COPY) {
2928 Register DstReg = UseMI.getOperand(0).getReg();
2929 bool Is16Bit = getOpSize(UseMI, 0) == 2;
2930 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
2931 unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2932 APInt Imm(32, ImmOp->getImm());
2933
2934 if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
2935 Imm = Imm.ashr(16);
2936
2937 if (RI.isAGPR(*MRI, DstReg)) {
2938 if (!isInlineConstant(Imm))
2939 return false;
2940 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2941 }
2942
2943 if (Is16Bit) {
2944 if (isVGPRCopy)
2945 return false; // Do not clobber vgpr_hi16
2946
2947 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
2948 return false;
2949
2950 UseMI.getOperand(0).setSubReg(0);
2951 if (DstReg.isPhysical()) {
2952 DstReg = RI.get32BitRegister(DstReg);
2953 UseMI.getOperand(0).setReg(DstReg);
2954 }
2955 assert(UseMI.getOperand(1).getReg().isVirtual());
2956 }
2957
2958 UseMI.setDesc(get(NewOpc));
2959 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
2960 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
2961 return true;
2962 }
2963
2964 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
2965 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
2966 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
2967 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) {
2968 // Don't fold if we are using source or output modifiers. The new VOP2
2969 // instructions don't have them.
2970 if (hasAnyModifiersSet(UseMI))
2971 return false;
2972
2973 // If this is a free constant, there's no reason to do this.
2974 // TODO: We could fold this here instead of letting SIFoldOperands do it
2975 // later.
2976 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
2977
2978 // Any src operand can be used for the legality check.
2979 if (isInlineConstant(UseMI, *Src0, *ImmOp))
2980 return false;
2981
2982 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
2983 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
2984 bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
2985 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64;
2986 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
2987 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
2988
2989 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
2990 // We should only expect these to be on src0 due to canonicalization.
2991 if (Src0->isReg() && Src0->getReg() == Reg) {
2992 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2993 return false;
2994
2995 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
2996 return false;
2997
2998 unsigned NewOpc =
2999 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16)
3000 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3001 if (pseudoToMCOpcode(NewOpc) == -1)
3002 return false;
3003
3004 // We need to swap operands 0 and 1 since the madmk constant is at operand 1.
3005
3006 const int64_t Imm = ImmOp->getImm();
3007
3008 // FIXME: This would be a lot easier if we could return a new instruction
3009 // instead of having to modify in place.
3010
3011 // Remove these first since they are at the end.
3012 UseMI.RemoveOperand(
3013 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
3014 UseMI.RemoveOperand(
3015 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
3016
3017 Register Src1Reg = Src1->getReg();
3018 unsigned Src1SubReg = Src1->getSubReg();
3019 Src0->setReg(Src1Reg);
3020 Src0->setSubReg(Src1SubReg);
3021 Src0->setIsKill(Src1->isKill());
3022
3023 if (Opc == AMDGPU::V_MAC_F32_e64 ||
3024 Opc == AMDGPU::V_MAC_F16_e64 ||
3025 Opc == AMDGPU::V_FMAC_F32_e64 ||
3026 Opc == AMDGPU::V_FMAC_F16_e64)
3027 UseMI.untieRegOperand(
3028 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3029
3030 Src1->ChangeToImmediate(Imm);
3031
3032 removeModOperands(UseMI);
3033 UseMI.setDesc(get(NewOpc));
3034
3035 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3036 if (DeleteDef)
3037 DefMI.eraseFromParent();
3038
3039 return true;
3040 }
3041
3042 // Added part is the constant: Use v_madak_{f16, f32}.
3043 if (Src2->isReg() && Src2->getReg() == Reg) {
3044 // Not allowed to use constant bus for another operand.
3045 // We can however allow an inline immediate as src0.
3046 bool Src0Inlined = false;
3047 if (Src0->isReg()) {
3048 // Try to inline constant if possible.
3049 // If the Def is a move-immediate and it has a single use,
3050 // inlining the constant here saves a VGPR.
3051 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3052 if (Def && Def->isMoveImmediate() &&
3053 isInlineConstant(Def->getOperand(1)) &&
3054 MRI->hasOneUse(Src0->getReg())) {
3055 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3056 Src0Inlined = true;
3057 } else if ((Src0->getReg().isPhysical() &&
3058 (ST.getConstantBusLimit(Opc) <= 1 &&
3059 RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
3060 (Src0->getReg().isVirtual() &&
3061 (ST.getConstantBusLimit(Opc) <= 1 &&
3062 RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
3063 return false;
3064 // VGPR is okay as Src0 - fallthrough
3065 }
3066
3067 if (Src1->isReg() && !Src0Inlined) {
3068 // We have one slot for an inlinable constant so far - try to fill it.
3069 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3070 if (Def && Def->isMoveImmediate() &&
3071 isInlineConstant(Def->getOperand(1)) &&
3072 MRI->hasOneUse(Src1->getReg()) &&
3073 commuteInstruction(UseMI)) {
3074 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3075 } else if ((Src1->getReg().isPhysical() &&
3076 RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
3077 (Src1->getReg().isVirtual() &&
3078 RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
3079 return false;
3080 // VGPR is okay as Src1 - fallthrough
3081 }
3082
3083 unsigned NewOpc =
3084 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
3085 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3086 if (pseudoToMCOpcode(NewOpc) == -1)
3087 return false;
3088
3089 const int64_t Imm = ImmOp->getImm();
3090
3091 // FIXME: This would be a lot easier if we could return a new instruction
3092 // instead of having to modify in place.
3093
3094 // Remove these first since they are at the end.
3095 UseMI.RemoveOperand(
3096 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
3097 UseMI.RemoveOperand(
3098 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
3099
3100 if (Opc == AMDGPU::V_MAC_F32_e64 ||
3101 Opc == AMDGPU::V_MAC_F16_e64 ||
3102 Opc == AMDGPU::V_FMAC_F32_e64 ||
3103 Opc == AMDGPU::V_FMAC_F16_e64)
3104 UseMI.untieRegOperand(
3105 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3106
3107 // ChangeToImmediate adds Src2 back to the instruction.
3108 Src2->ChangeToImmediate(Imm);
3109
3110 // These come before src2.
3111 removeModOperands(UseMI);
3112 UseMI.setDesc(get(NewOpc));
3113 // It might happen that UseMI was commuted and we now have an SGPR as
3114 // src1. If so, an inline constant together with an SGPR is illegal,
3115 // so the operands must be legalized.
3116 legalizeOperands(UseMI);
3117
3118 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3119 if (DeleteDef)
3120 DefMI.eraseFromParent();
3121
3122 return true;
3123 }
3124 }
3125
3126 return false;
3127}
3128
3129static bool
3130memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3131 ArrayRef<const MachineOperand *> BaseOps2) {
3132 if (BaseOps1.size() != BaseOps2.size())
3133 return false;
3134 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3135 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3136 return false;
3137 }
3138 return true;
3139}
3140
3141static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
3142 int WidthB, int OffsetB) {
3143 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3144 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3145 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3146 return LowOffset + LowWidth <= HighOffset;
3147}
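
offsetsDoNotOverlap treats each access as the half-open byte range [Offset, Offset + Width) and reports disjointness when the lower range ends at or before the higher one starts. A worked copy of the same check on plain ints:

  #include <cassert>

  // Same logic as offsetsDoNotOverlap above.
  bool disjoint(int WidthA, int OffsetA, int WidthB, int OffsetB) {
    int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
    int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
    int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
    return LowOffset + LowWidth <= HighOffset;
  }

  int main() {
    assert(disjoint(4, 0, 4, 4));   // [0,4) and [4,8) touch but do not overlap
    assert(!disjoint(8, 0, 4, 4));  // [0,8) covers [4,8)
    assert(disjoint(4, 16, 8, 0));  // order of the two accesses does not matter
    return 0;
  }
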
3148
3149bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3150 const MachineInstr &MIb) const {
3151 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3152 int64_t Offset0, Offset1;
3153 unsigned Dummy0, Dummy1;
3154 bool Offset0IsScalable, Offset1IsScalable;
3155 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3156 Dummy0, &RI) ||
3157 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3158 Dummy1, &RI))
3159 return false;
3160
3161 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3162 return false;
3163
3164 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3165 // FIXME: Handle ds_read2 / ds_write2.
3166 return false;
3167 }
3168 unsigned Width0 = MIa.memoperands().front()->getSize();
3169 unsigned Width1 = MIb.memoperands().front()->getSize();
3170 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3171}
3172
3173bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3174 const MachineInstr &MIb) const {
3175 assert(MIa.mayLoadOrStore() &&
3176 "MIa must load from or modify a memory location");
3177 assert(MIb.mayLoadOrStore() &&
3178 "MIb must load from or modify a memory location");
3179
3180 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3181 return false;
3182
3183 // XXX - Can we relax this between address spaces?
3184 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3185 return false;
3186
3187 // TODO: Should we check the address space from the MachineMemOperand? That
3188 // would allow us to distinguish objects we know don't alias based on the
3189 // underlying address space, even if it was lowered to a different one,
3190 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3191 // buffer.
3192 if (isDS(MIa)) {
3193 if (isDS(MIb))
3194 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3195
3196 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3197 }
3198
3199 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3200 if (isMUBUF(MIb) || isMTBUF(MIb))
3201 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3202
3203 return !isFLAT(MIb) && !isSMRD(MIb);
3204 }
3205
3206 if (isSMRD(MIa)) {
3207 if (isSMRD(MIb))
3208 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3209
3210 return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
3211 }
3212
3213 if (isFLAT(MIa)) {
3214 if (isFLAT(MIb))
3215 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3216
3217 return false;
3218 }
3219
3220 return false;
3221}
3222
3223static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3224 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3225 if (Reg.isPhysical())
3226 return false;
3227 auto *Def = MRI.getUniqueVRegDef(Reg);
3228 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3229 Imm = Def->getOperand(1).getImm();
3230 if (DefMI)
3231 *DefMI = Def;
3232 return true;
3233 }
3234 return false;
3235}
3236
3237static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3238 MachineInstr **DefMI = nullptr) {
3239 if (!MO->isReg())
3240 return false;
3241 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3242 const MachineRegisterInfo &MRI = MF->getRegInfo();
3243 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3244}
3245
3246static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3247 MachineInstr &NewMI) {
3248 if (LV) {
3249 unsigned NumOps = MI.getNumOperands();
3250 for (unsigned I = 1; I < NumOps; ++I) {
3251 MachineOperand &Op = MI.getOperand(I);
3252 if (Op.isReg() && Op.isKill())
3253 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3254 }
3255 }
3256}
3257
3258MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3259 LiveVariables *LV,
3260 LiveIntervals *LIS) const {
3261 MachineBasicBlock &MBB = *MI.getParent();
3262 unsigned Opc = MI.getOpcode();
3263
3264 // Handle MFMA.
3265 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3266 if (NewMFMAOpc != -1) {
3267 MachineInstrBuilder MIB =
3268 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3269 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3270 MIB.add(MI.getOperand(I));
3271 updateLiveVariables(LV, MI, *MIB);
3272 if (LIS)
3273 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3274 return MIB;
3275 }
3276
3277 // Handle MAC/FMAC.
3278 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3279 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
3280 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3281 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3282 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3283 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3284 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3285 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3286 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3287 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3288 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3289 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3290 bool Src0Literal = false;
3291
3292 switch (Opc) {
3293 default:
3294 return nullptr;
3295 case AMDGPU::V_MAC_F16_e64:
3296 case AMDGPU::V_FMAC_F16_e64:
3297 case AMDGPU::V_MAC_F32_e64:
3298 case AMDGPU::V_MAC_LEGACY_F32_e64:
3299 case AMDGPU::V_FMAC_F32_e64:
3300 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3301 case AMDGPU::V_FMAC_F64_e64:
3302 break;
3303 case AMDGPU::V_MAC_F16_e32:
3304 case AMDGPU::V_FMAC_F16_e32:
3305 case AMDGPU::V_MAC_F32_e32:
3306 case AMDGPU::V_MAC_LEGACY_F32_e32:
3307 case AMDGPU::V_FMAC_F32_e32:
3308 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3309 case AMDGPU::V_FMAC_F64_e32: {
3310 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3311 AMDGPU::OpName::src0);
3312 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3313 if (!Src0->isReg() && !Src0->isImm())
3314 return nullptr;
3315
3316 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3317 Src0Literal = true;
3318
3319 break;
3320 }
3321 }
3322
3323 MachineInstrBuilder MIB;
3324 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3325 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3326 const MachineOperand *Src0Mods =
3327 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3328 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3329 const MachineOperand *Src1Mods =
3330 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3331 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3332 const MachineOperand *Src2Mods =
3333 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3334 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3335 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3336
3337 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3338 !IsLegacy &&
3339 // If we have an SGPR input, we will violate the constant bus restriction.
3340 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3341 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3342 MachineInstr *DefMI;
3343 const auto killDef = [&DefMI, &MBB, this]() -> void {
3344 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3345 // The only user is the instruction which will be killed.
3346 if (!MRI.hasOneNonDBGUse(DefMI->getOperand(0).getReg()))
3347 return;
3348 // We cannot just remove DefMI here; the calling pass would crash.
3349 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3350 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3351 DefMI->RemoveOperand(I);
3352 };
3353
3354 int64_t Imm;
3355 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3356 unsigned NewOpc =
3357 IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
3358 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3359 if (pseudoToMCOpcode(NewOpc) != -1) {
3360 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3361 .add(*Dst)
3362 .add(*Src0)
3363 .add(*Src1)
3364 .addImm(Imm);
3365 updateLiveVariables(LV, MI, *MIB);
3366 if (LIS)
3367 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3368 killDef();
3369 return MIB;
3370 }
3371 }
3372 unsigned NewOpc = IsFMA
3373 ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
3374 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3375 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3376 if (pseudoToMCOpcode(NewOpc) != -1) {
3377 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3378 .add(*Dst)
3379 .add(*Src0)
3380 .addImm(Imm)
3381 .add(*Src2);
3382 updateLiveVariables(LV, MI, *MIB);
3383 if (LIS)
3384 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3385 killDef();
3386 return MIB;
3387 }
3388 }
3389 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
3390 if (Src0Literal) {
3391 Imm = Src0->getImm();
3392 DefMI = nullptr;
3393 }
3394 if (pseudoToMCOpcode(NewOpc) != -1 &&
3395 isOperandLegal(
3396 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
3397 Src1)) {
3398 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3399 .add(*Dst)
3400 .add(*Src1)
3401 .addImm(Imm)
3402 .add(*Src2);
3403 updateLiveVariables(LV, MI, *MIB);
3404 if (LIS)
3405 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3406 if (DefMI)
3407 killDef();
3408 return MIB;
3409 }
3410 }
3411 }
3412
3413 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
3414 // because VOP3 does not allow a literal operand.
3415 // TODO: Remove this restriction for GFX10.
3416 if (Src0Literal)
3417 return nullptr;
3418
3419 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
3420 : IsF64 ? AMDGPU::V_FMA_F64_e64
3421 : IsLegacy
3422 ? AMDGPU::V_FMA_LEGACY_F32_e64
3423 : AMDGPU::V_FMA_F32_e64
3424 : IsF16 ? AMDGPU::V_MAD_F16_e64
3425 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
3426 : AMDGPU::V_MAD_F32_e64;
3427 if (pseudoToMCOpcode(NewOpc) == -1)
3428 return nullptr;
3429
3430 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3431 .add(*Dst)
3432 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
3433 .add(*Src0)
3434 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
3435 .add(*Src1)
3436 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
3437 .add(*Src2)
3438 .addImm(Clamp ? Clamp->getImm() : 0)
3439 .addImm(Omod ? Omod->getImm() : 0);
3440 updateLiveVariables(LV, MI, *MIB);
3441 if (LIS)
3442 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3443 return MIB;
3444}
3445
3446// It's not generally safe to move VALU instructions across these since it will
3447// start using the register as a base index rather than directly.
3448// XXX - Why isn't hasSideEffects sufficient for these?
3449static bool changesVGPRIndexingMode(const MachineInstr &MI) {
3450 switch (MI.getOpcode()) {
3451 case AMDGPU::S_SET_GPR_IDX_ON:
3452 case AMDGPU::S_SET_GPR_IDX_MODE:
3453 case AMDGPU::S_SET_GPR_IDX_OFF:
3454 return true;
3455 default:
3456 return false;
3457 }
3458}
3459
3460bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
3461 const MachineBasicBlock *MBB,
3462 const MachineFunction &MF) const {
3463 // We skip the base implementation's check for SP writes; that check was
3464 // apparently added due to compile-time concerns.
3465 //
3466 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
3467 // but is probably avoidable.
3468
3469 // Copied from base implementation.
3470 // Terminators and labels can't be scheduled around.
3471 if (MI.isTerminator() || MI.isPosition())
3472 return true;
3473
3474 // INLINEASM_BR can jump to another block
3475 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
3476 return true;
3477
3478 // Target-independent instructions do not have an implicit-use of EXEC, even
3479 // when they operate on VGPRs. Treating EXEC modifications as scheduling
3480 // boundaries prevents incorrect movements of such instructions.
3481 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
3482 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
3483 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
3484 changesVGPRIndexingMode(MI);
3485}
3486
3487bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
3488 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
3489 Opcode == AMDGPU::DS_GWS_INIT ||
3490 Opcode == AMDGPU::DS_GWS_SEMA_V ||
3491 Opcode == AMDGPU::DS_GWS_SEMA_BR ||
3492 Opcode == AMDGPU::DS_GWS_SEMA_P ||
3493 Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
3494 Opcode == AMDGPU::DS_GWS_BARRIER;
3495}
3496
3497bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
3498 // Skip the full operand and register-alias search that modifiesRegister
3499 // does. Only a handful of instructions touch this register, it is only an
3500 // implicit def, and it doesn't alias any other registers.
3501 if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) {
3502 for (; ImpDef && *ImpDef; ++ImpDef) {
3503 if (*ImpDef == AMDGPU::MODE)
3504 return true;
3505 }
3506 }
3507
3508 return false;
3509}
3510
3511bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
3512 unsigned Opcode = MI.getOpcode();
3513
3514 if (MI.mayStore() && isSMRD(MI))
3515 return true; // scalar store or atomic
3516
3517 // This will terminate the function when other lanes may need to continue.
3518 if (MI.isReturn())
3519 return true;
3520
3521 // These instructions cause shader I/O that may cause hardware lockups
3522 // when executed with an empty EXEC mask.
3523 //
3524 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
3525 // EXEC = 0, but checking for that case here seems not worth it
3526 // given the typical code patterns.
3527 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
3528 isEXP(Opcode) ||
3529 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
3530 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
3531 return true;
3532
3533 if (MI.isCall() || MI.isInlineAsm())
3534 return true; // conservative assumption
3535
3536 // A mode change is a scalar operation that influences vector instructions.
3537 if (modifiesModeRegister(MI))
3538 return true;
3539
3540 // These are like SALU instructions in terms of effects, so it's questionable
3541 // whether we should return true for those.
3542 //
3543 // However, executing them with EXEC = 0 causes them to operate on undefined
3544 // data, which we avoid by returning true here.
3545 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
3546 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32)
3547 return true;
3548
3549 return false;
3550}
3551
3552bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
3553 const MachineInstr &MI) const {
3554 if (MI.isMetaInstruction())
3555 return false;
3556
3557 // This won't read exec if this is an SGPR->SGPR copy.
3558 if (MI.isCopyLike()) {
3559 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
3560 return true;
3561
3562 // Make sure this isn't copying exec as a normal operand
3563 return MI.readsRegister(AMDGPU::EXEC, &RI);
3564 }
3565
3566 // Make a conservative assumption about the callee.
3567 if (MI.isCall())
3568 return true;
3569
3570 // Be conservative with any unhandled generic opcodes.
3571 if (!isTargetSpecificOpcode(MI.getOpcode()))
3572 return true;
3573
3574 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
3575}
3576
3577bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
3578 switch (Imm.getBitWidth()) {
3579 case 1: // This likely will be a condition code mask.
3580 return true;
3581
3582 case 32:
3583 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
3584 ST.hasInv2PiInlineImm());
3585 case 64:
3586 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
3587 ST.hasInv2PiInlineImm());
3588 case 16:
3589 return ST.has16BitInsts() &&
3590 AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
3591 ST.hasInv2PiInlineImm());
3592 default:
3593 llvm_unreachable("invalid bitwidth")::llvm::llvm_unreachable_internal("invalid bitwidth", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp"
, 3593)
;
3594 }
3595}
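
The isInlinableLiteral* helpers used above accept only the values the hardware can encode directly in the instruction word, so no extra literal dword is needed. A deliberately simplified approximation for the 32-bit integer case, assuming the commonly cited -16..64 range and ignoring the floating-point and inv(2*pi) encodings the real helpers also handle:

  #include <cassert>
  #include <cstdint>

  // Rough approximation: 32-bit integer inline constants cover -16..64 inclusive.
  // The real AMDGPU::isInlinableLiteral32 also accepts selected FP bit patterns.
  bool isInlinableInt32(int64_t Imm) {
    return Imm >= -16 && Imm <= 64;
  }

  int main() {
    assert(isInlinableInt32(0));
    assert(isInlinableInt32(64));
    assert(isInlinableInt32(-16));
    assert(!isInlinableInt32(65));  // needs a literal dword instead
    assert(!isInlinableInt32(-17));
    return 0;
  }
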
3596
3597bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
3598 uint8_t OperandType) const {
3599 if (!MO.isImm() ||
3600 OperandType < AMDGPU::OPERAND_SRC_FIRST ||
3601 OperandType > AMDGPU::OPERAND_SRC_LAST)
3602 return false;
3603
3604 // MachineOperand provides no way to tell the true operand size, since it only
3605 // records a 64-bit value. We need to know the size to determine if a 32-bit
3606 // floating point immediate bit pattern is legal for an integer immediate. It
3607 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
3608
3609 int64_t Imm = MO.getImm();
3610 switch (OperandType) {
3611 case AMDGPU::OPERAND_REG_IMM_INT32:
3612 case AMDGPU::OPERAND_REG_IMM_FP32:
3613 case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
3614 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
3615 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
3616 case AMDGPU::OPERAND_REG_IMM_V2FP32:
3617 case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
3618 case AMDGPU::OPERAND_REG_IMM_V2INT32:
3619 case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
3620 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
3621 case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
3622 int32_t Trunc = static_cast<int32_t>(Imm);
3623 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
3624 }
3625 case AMDGPU::OPERAND_REG_IMM_INT64:
3626 case AMDGPU::OPERAND_REG_IMM_FP64:
3627 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
3628 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
3629 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
3630 return AMDGPU::isInlinableLiteral64(MO.getImm(),
3631 ST.hasInv2PiInlineImm());
3632 case AMDGPU::OPERAND_REG_IMM_INT16:
3633 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
3634 case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
3635 // We would expect inline immediates to not be concerned with an integer/fp
3636 // distinction. However, in the case of 16-bit integer operations, the
3637 // "floating point" values appear to not work. It seems read the low 16-bits
3638 // of 32-bit immediates, which happens to always work for the integer
3639 // values.
3640 //
3641 // See llvm bugzilla 46302.
3642 //
3643 // TODO: Theoretically we could use op-sel to use the high bits of the
3644 // 32-bit FP values.
3645 return AMDGPU::isInlinableIntLiteral(Imm);
3646 case AMDGPU::OPERAND_REG_IMM_V2INT16:
3647 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
3648 case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
3649 // This suffers the same problem as the scalar 16-bit cases.
3650 return AMDGPU::isInlinableIntLiteralV216(Imm);
3651 case AMDGPU::OPERAND_REG_IMM_FP16:
3652 case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
3653 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
3654 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
3655 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
3656 // A few special case instructions have 16-bit operands on subtargets
3657 // where 16-bit instructions are not legal.
3658 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
3659 // constants in these cases
3660 int16_t Trunc = static_cast<int16_t>(Imm);
3661 return ST.has16BitInsts() &&
3662 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
3663 }
3664
3665 return false;
3666 }
3667 case AMDGPU::OPERAND_REG_IMM_V2FP16:
3668 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
3669 case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
3670 uint32_t Trunc = static_cast<uint32_t>(Imm);
3671 return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
3672 }
3673 case AMDGPU::OPERAND_KIMM32:
3674 case AMDGPU::OPERAND_KIMM16:
3675 return false;
3676 default:
3677 llvm_unreachable("invalid bitwidth")::llvm::llvm_unreachable_internal("invalid bitwidth", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp"
, 3677)
;
3678 }
3679}
3680
3681bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
3682 const MCOperandInfo &OpInfo) const {
3683 switch (MO.getType()) {
3684 case MachineOperand::MO_Register:
3685 return false;
3686 case MachineOperand::MO_Immediate:
3687 return !isInlineConstant(MO, OpInfo);
3688 case MachineOperand::MO_FrameIndex:
3689 case MachineOperand::MO_MachineBasicBlock:
3690 case MachineOperand::MO_ExternalSymbol:
3691 case MachineOperand::MO_GlobalAddress:
3692 case MachineOperand::MO_MCSymbol:
3693 return true;
3694 default:
3695 llvm_unreachable("unexpected operand type")::llvm::llvm_unreachable_internal("unexpected operand type", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp"
, 3695)
;
3696 }
3697}
3698
3699static bool compareMachineOp(const MachineOperand &Op0,
3700 const MachineOperand &Op1) {
3701 if (Op0.getType() != Op1.getType())
3702 return false;
3703
3704 switch (Op0.getType()) {
3705 case MachineOperand::MO_Register:
3706 return Op0.getReg() == Op1.getReg();
3707 case MachineOperand::MO_Immediate:
3708 return Op0.getImm() == Op1.getImm();
3709 default:
3710 llvm_unreachable("Didn't expect to be comparing these operand types")::llvm::llvm_unreachable_internal("Didn't expect to be comparing these operand types"
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 3710)
;
3711 }
3712}
3713
3714bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
3715 const MachineOperand &MO) const {
3716 const MCInstrDesc &InstDesc = MI.getDesc();
3717 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
3718
3719 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
3720
3721 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
3722 return true;
3723
3724 if (OpInfo.RegClass < 0)
3725 return false;
3726
3727 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
3728 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
3729 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3730 AMDGPU::OpName::src2))
3731 return false;
3732 return RI.opCanUseInlineConstant(OpInfo.OperandType);
3733 }
3734
3735 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
3736 return false;
3737
3738 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
3739 return true;
3740
3741 return ST.hasVOP3Literal();
3742}
3743
3744bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
3745 // GFX90A does not have V_MUL_LEGACY_F32_e32.
3746 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
3747 return false;
3748
3749 int Op32 = AMDGPU::getVOPe32(Opcode);
3750 if (Op32 == -1)
3751 return false;
3752
3753 return pseudoToMCOpcode(Op32) != -1;
3754}
3755
3756bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
3757 // The src0_modifier operand is present on all instructions
3758 // that have modifiers.
3759
3760 return AMDGPU::getNamedOperandIdx(Opcode,
3761 AMDGPU::OpName::src0_modifiers) != -1;
3762}
3763
3764bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
3765 unsigned OpName) const {
3766 const MachineOperand *Mods = getNamedOperand(MI, OpName);
3767 return Mods && Mods->getImm();
3768}
3769
3770bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
3771 return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
3772 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
3773 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
3774 hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
3775 hasModifiersSet(MI, AMDGPU::OpName::omod);
3776}
3777
3778bool SIInstrInfo::canShrink(const MachineInstr &MI,
3779 const MachineRegisterInfo &MRI) const {
3780 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3781 // Can't shrink instruction with three operands.
3782 if (Src2) {
3783 switch (MI.getOpcode()) {
3784 default: return false;
3785
3786 case AMDGPU::V_ADDC_U32_e64:
3787 case AMDGPU::V_SUBB_U32_e64:
3788 case AMDGPU::V_SUBBREV_U32_e64: {
3789 const MachineOperand *Src1
3790 = getNamedOperand(MI, AMDGPU::OpName::src1);
3791 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
3792 return false;
3793 // Additional verification is needed for sdst/src2.
3794 return true;
3795 }
3796 case AMDGPU::V_MAC_F16_e64:
3797 case AMDGPU::V_MAC_F32_e64:
3798 case AMDGPU::V_MAC_LEGACY_F32_e64:
3799 case AMDGPU::V_FMAC_F16_e64:
3800 case AMDGPU::V_FMAC_F32_e64:
3801 case AMDGPU::V_FMAC_F64_e64:
3802 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3803 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
3804 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
3805 return false;
3806 break;
3807
3808 case AMDGPU::V_CNDMASK_B32_e64:
3809 break;
3810 }
3811 }
3812
3813 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3814 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
3815 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
3816 return false;
3817
3818 // We don't need to check src0, all input types are legal, so just make sure
3819 // src0 isn't using any modifiers.
3820 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
3821 return false;
3822
3823 // Can it be shrunk to a valid 32 bit opcode?
3824 if (!hasVALU32BitEncoding(MI.getOpcode()))
3825 return false;
3826
3827 // Check output modifiers
3828 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
3829 !hasModifiersSet(MI, AMDGPU::OpName::clamp);
3830}
3831
3832// Set VCC operand with all flags from \p Orig, except for setting it as
3833// implicit.
3834static void copyFlagsToImplicitVCC(MachineInstr &MI,
3835 const MachineOperand &Orig) {
3836
3837 for (MachineOperand &Use : MI.implicit_operands()) {
3838 if (Use.isUse() &&
3839 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
3840 Use.setIsUndef(Orig.isUndef());
3841 Use.setIsKill(Orig.isKill());
3842 return;
3843 }
3844 }
3845}
3846
3847MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
3848 unsigned Op32) const {
3849 MachineBasicBlock *MBB = MI.getParent();
3850 MachineInstrBuilder Inst32 =
3851 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
3852 .setMIFlags(MI.getFlags());
3853
3854 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
3855 // For VOPC instructions, this is replaced by an implicit def of vcc.
3856 int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
3857 if (Op32DstIdx != -1) {
3858 // dst
3859 Inst32.add(MI.getOperand(0));
3860 } else {
3861 assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
3862 (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
3863 "Unexpected case");
3864 }
3865
3866 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
3867
3868 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3869 if (Src1)
3870 Inst32.add(*Src1);
3871
3872 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3873
3874 if (Src2) {
3875 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
3876 if (Op32Src2Idx != -1) {
3877 Inst32.add(*Src2);
3878 } else {
3879 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
3880 // replaced with an implicit read of vcc or vcc_lo. The implicit read
3881 // of vcc was already added during the initial BuildMI, but we
3882 // 1) may need to change vcc to vcc_lo to preserve the original register
3883 // 2) have to preserve the original flags.
3884 fixImplicitOperands(*Inst32);
3885 copyFlagsToImplicitVCC(*Inst32, *Src2);
3886 }
3887 }
3888
3889 return Inst32;
3890}
3891
3892bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
3893 const MachineOperand &MO,
3894 const MCOperandInfo &OpInfo) const {
3895 // Literal constants use the constant bus.
3896 //if (isLiteralConstantLike(MO, OpInfo))
3897 // return true;
3898 if (MO.isImm())
3899 return !isInlineConstant(MO, OpInfo);
3900
3901 if (!MO.isReg())
3902 return true; // Misc other operands like FrameIndex
3903
3904 if (!MO.isUse())
3905 return false;
3906
3907 if (MO.getReg().isVirtual())
3908 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
3909
3910 // Null is free
3911 if (MO.getReg() == AMDGPU::SGPR_NULL)
3912 return false;
3913
3914 // SGPRs use the constant bus
3915 if (MO.isImplicit()) {
3916 return MO.getReg() == AMDGPU::M0 ||
3917 MO.getReg() == AMDGPU::VCC ||
3918 MO.getReg() == AMDGPU::VCC_LO;
3919 } else {
3920 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
3921 AMDGPU::SReg_64RegClass.contains(MO.getReg());
3922 }
3923}
3924
3925static Register findImplicitSGPRRead(const MachineInstr &MI) {
3926 for (const MachineOperand &MO : MI.implicit_operands()) {
3927 // We only care about reads.
3928 if (MO.isDef())
3929 continue;
3930
3931 switch (MO.getReg()) {
3932 case AMDGPU::VCC:
3933 case AMDGPU::VCC_LO:
3934 case AMDGPU::VCC_HI:
3935 case AMDGPU::M0:
3936 case AMDGPU::FLAT_SCR:
3937 return MO.getReg();
3938
3939 default:
3940 break;
3941 }
3942 }
3943
3944 return AMDGPU::NoRegister;
3945}
3946
3947static bool shouldReadExec(const MachineInstr &MI) {
3948 if (SIInstrInfo::isVALU(MI)) {
3949 switch (MI.getOpcode()) {
3950 case AMDGPU::V_READLANE_B32:
3951 case AMDGPU::V_WRITELANE_B32:
3952 return false;
3953 }
3954
3955 return true;
3956 }
3957
3958 if (MI.isPreISelOpcode() ||
3959 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
3960 SIInstrInfo::isSALU(MI) ||
3961 SIInstrInfo::isSMRD(MI))
3962 return false;
3963
3964 return true;
3965}
3966
3967static bool isSubRegOf(const SIRegisterInfo &TRI,
3968 const MachineOperand &SuperVec,
3969 const MachineOperand &SubReg) {
3970 if (SubReg.getReg().isPhysical())
3971 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
3972
3973 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
3974 SubReg.getReg() == SuperVec.getReg();
3975}
3976
3977bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
3978 StringRef &ErrInfo) const {
3979 uint16_t Opcode = MI.getOpcode();
3980 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
3981 return true;
3982
3983 const MachineFunction *MF = MI.getParent()->getParent();
3984 const MachineRegisterInfo &MRI = MF->getRegInfo();
3985
3986 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
3987 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
3988 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
3989
3990 // Make sure the number of operands is correct.
3991 const MCInstrDesc &Desc = get(Opcode);
3992 if (!Desc.isVariadic() &&
3993 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
3994 ErrInfo = "Instruction has wrong number of operands.";
3995 return false;
3996 }
3997
3998 if (MI.isInlineAsm()) {
3999 // Verify register classes for inlineasm constraints.
4000 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4001 I != E; ++I) {
4002 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4003 if (!RC)
4004 continue;
4005
4006 const MachineOperand &Op = MI.getOperand(I);
4007 if (!Op.isReg())
4008 continue;
4009
4010 Register Reg = Op.getReg();
4011 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4012 ErrInfo = "inlineasm operand has incorrect register class.";
4013 return false;
4014 }
4015 }
4016
4017 return true;
4018 }
4019
4020 if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4021 ErrInfo = "missing memory operand from MIMG instruction.";
4022 return false;
4023 }
4024
4025 // Make sure the register classes are correct.
4026 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4027 const MachineOperand &MO = MI.getOperand(i);
4028 if (MO.isFPImm()) {
4029 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4030 "all fp values to integers.";
4031 return false;
4032 }
4033
4034 int RegClass = Desc.OpInfo[i].RegClass;
4035
4036 switch (Desc.OpInfo[i].OperandType) {
4037 case MCOI::OPERAND_REGISTER:
4038 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4039 ErrInfo = "Illegal immediate value for operand.";
4040 return false;
4041 }
4042 break;
4043 case AMDGPU::OPERAND_REG_IMM_INT32:
4044 case AMDGPU::OPERAND_REG_IMM_FP32:
4045 case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
4046 break;
4047 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4048 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4049 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4050 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4051 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4052 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
4053 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4054 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4055 case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
4056 case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
4057 case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
4058 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4059 ErrInfo = "Illegal immediate value for operand.";
4060 return false;
4061 }
4062 break;
4063 }
4064 case MCOI::OPERAND_IMMEDIATE:
4065 case AMDGPU::OPERAND_KIMM32:
4066 // Check if this operand is an immediate.
4067 // FrameIndex operands will be replaced by immediates, so they are
4068 // allowed.
4069 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4070 ErrInfo = "Expected immediate, but got non-immediate";
4071 return false;
4072 }
4073 LLVM_FALLTHROUGH;
4074 default:
4075 continue;
4076 }
4077
4078 if (!MO.isReg())
4079 continue;
4080 Register Reg = MO.getReg();
4081 if (!Reg)
4082 continue;
4083
4084 // FIXME: Ideally we would have separate instruction definitions with the
4085 // aligned register constraint.
4086 // FIXME: We do not verify inline asm operands, but custom inline asm
4087 // verification is broken anyway
4088 if (ST.needsAlignedVGPRs()) {
4089 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4090 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4091 const TargetRegisterClass *SubRC =
4092 RI.getSubRegClass(RC, MO.getSubReg());
4093 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4094 if (RC)
4095 RC = SubRC;
4096 }
4097
4098 // Check that this is the aligned version of the class.
4099 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4100 ErrInfo = "Subtarget requires even aligned vector registers";
4101 return false;
4102 }
4103 }
4104
4105 if (RegClass != -1) {
4106 if (Reg.isVirtual())
4107 continue;
4108
4109 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4110 if (!RC->contains(Reg)) {
4111 ErrInfo = "Operand has incorrect register class.";
4112 return false;
4113 }
4114 }
4115 }
4116
4117 // Verify SDWA
4118 if (isSDWA(MI)) {
4119 if (!ST.hasSDWA()) {
4120 ErrInfo = "SDWA is not supported on this target";
4121 return false;
4122 }
4123
4124 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4125
4126 const int OpIndices[] = {DstIdx, Src0Idx, Src1Idx, Src2Idx};
4127
4128 for (int OpIdx : OpIndices) {
4129 if (OpIdx == -1)
4130 continue;
4131 const MachineOperand &MO = MI.getOperand(OpIdx);
4132
4133 if (!ST.hasSDWAScalar()) {
4134 // Only VGPRs are allowed on VI
4135 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4136 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4137 return false;
4138 }
4139 } else {
4140 // No immediates on GFX9
4141 if (!MO.isReg()) {
4142 ErrInfo =
4143 "Only reg allowed as operands in SDWA instructions on GFX9+";
4144 return false;
4145 }
4146 }
4147 }
4148
4149 if (!ST.hasSDWAOmod()) {
4150 // No omod allowed on VI
4151 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4152 if (OMod != nullptr &&
4153 (!OMod->isImm() || OMod->getImm() != 0)) {
4154 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4155 return false;
4156 }
4157 }
4158
4159 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4160 if (isVOPC(BasicOpcode)) {
4161 if (!ST.hasSDWASdst() && DstIdx != -1) {
4162 // Only vcc allowed as dst on VI for VOPC
4163 const MachineOperand &Dst = MI.getOperand(DstIdx);
4164 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4165 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4166 return false;
4167 }
4168 } else if (!ST.hasSDWAOutModsVOPC()) {
4169 // No clamp allowed on GFX9 for VOPC
4170 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4171 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4172 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4173 return false;
4174 }
4175
4176 // No omod allowed on GFX9 for VOPC
4177 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4178 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4179 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4180 return false;
4181 }
4182 }
4183 }
4184
4185 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4186 if (DstUnused && DstUnused->isImm() &&
4187 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4188 const MachineOperand &Dst = MI.getOperand(DstIdx);
4189 if (!Dst.isReg() || !Dst.isTied()) {
4190 ErrInfo = "Dst register should have tied register";
4191 return false;
4192 }
4193
4194 const MachineOperand &TiedMO =
4195 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4196 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4197 ErrInfo =
4198 "Dst register should be tied to implicit use of preserved register";
4199 return false;
4200 } else if (TiedMO.getReg().isPhysical() &&
4201 Dst.getReg() != TiedMO.getReg()) {
4202 ErrInfo = "Dst register should use same physical register as preserved";
4203 return false;
4204 }
4205 }
4206 }
4207
4208 // Verify MIMG
4209 if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
4210 // Ensure that the return type used is large enough for all the options
4211 // being used. TFE/LWE require an extra result register.
4212 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4213 if (DMask) {
4214 uint64_t DMaskImm = DMask->getImm();
4215 uint32_t RegCount =
4216 isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
4217 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4218 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4219 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4220
4221 // Adjust for packed 16 bit values
4222 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4223 RegCount >>= 1;
4224
4225 // Adjust if using LWE or TFE
4226 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4227 RegCount += 1;
4228
4229 const uint32_t DstIdx =
4230 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4231 const MachineOperand &Dst = MI.getOperand(DstIdx);
4232 if (Dst.isReg()) {
4233 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4234 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4235 if (RegCount > DstSize) {
4236 ErrInfo = "MIMG instruction returns too many registers for dst "
4237 "register class";
4238 return false;
4239 }
4240 }
4241 }
4242 }
4243
4244 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4245 if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
4246 && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
4247 // Only look at the true operands. Only a real operand can use the constant
4248 // bus, and we don't want to check pseudo-operands like the source modifier
4249 // flags.
4250 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
4251
4252 unsigned ConstantBusCount = 0;
4253 bool UsesLiteral = false;
4254 const MachineOperand *LiteralVal = nullptr;
4255
4256 if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
4257 ++ConstantBusCount;
4258
4259 SmallVector<Register, 2> SGPRsUsed;
4260 Register SGPRUsed;
4261
4262 for (int OpIdx : OpIndices) {
4263 if (OpIdx == -1)
4264 break;
4265 const MachineOperand &MO = MI.getOperand(OpIdx);
4266 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
4267 if (MO.isReg()) {
4268 SGPRUsed = MO.getReg();
4269 if (llvm::all_of(SGPRsUsed, [SGPRUsed](unsigned SGPR) {
4270 return SGPRUsed != SGPR;
4271 })) {
4272 ++ConstantBusCount;
4273 SGPRsUsed.push_back(SGPRUsed);
4274 }
4275 } else {
4276 if (!UsesLiteral) {
4277 ++ConstantBusCount;
4278 UsesLiteral = true;
4279 LiteralVal = &MO;
4280 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4281 assert(isVOP3(MI));
4282 ErrInfo = "VOP3 instruction uses more than one literal";
4283 return false;
4284 }
4285 }
4286 }
4287 }
4288
4289 SGPRUsed = findImplicitSGPRRead(MI);
4290 if (SGPRUsed != AMDGPU::NoRegister) {
4291 // Implicit uses may safely overlap true operands
4292 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4293 return !RI.regsOverlap(SGPRUsed, SGPR);
4294 })) {
4295 ++ConstantBusCount;
4296 SGPRsUsed.push_back(SGPRUsed);
4297 }
4298 }
4299
4300 // v_writelane_b32 is an exception to the constant bus restriction:
4301 // vsrc0 can be an SGPR, constant, or m0; the lane select an SGPR, m0, or inline constant
4302 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4303 Opcode != AMDGPU::V_WRITELANE_B32) {
4304 ErrInfo = "VOP* instruction violates constant bus restriction";
4305 return false;
4306 }
4307
4308 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4309 ErrInfo = "VOP3 instruction uses literal";
4310 return false;
4311 }
4312 }
4313
4314 // Special case for writelane - this can break the multiple constant bus rule,
4315 // but still can't use more than one SGPR register
4316 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4317 unsigned SGPRCount = 0;
4318 Register SGPRUsed = AMDGPU::NoRegister;
4319
4320 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
4321 if (OpIdx == -1)
4322 break;
4323
4324 const MachineOperand &MO = MI.getOperand(OpIdx);
4325
4326 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
4327 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4328 if (MO.getReg() != SGPRUsed)
4329 ++SGPRCount;
4330 SGPRUsed = MO.getReg();
4331 }
4332 }
4333 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
4334 ErrInfo = "WRITELANE instruction violates constant bus restriction";
4335 return false;
4336 }
4337 }
4338 }
4339
4340 // Verify misc. restrictions on specific instructions.
4341 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
4342 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
4343 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4344 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4345 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
4346 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
4347 if (!compareMachineOp(Src0, Src1) &&
4348 !compareMachineOp(Src0, Src2)) {
4349 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
4350 return false;
4351 }
4352 }
4353 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
4354 SISrcMods::ABS) ||
4355 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
4356 SISrcMods::ABS) ||
4357 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
4358 SISrcMods::ABS)) {
4359 ErrInfo = "ABS not allowed in VOP3B instructions";
4360 return false;
4361 }
4362 }
4363
4364 if (isSOP2(MI) || isSOPC(MI)) {
4365 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4366 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4367 unsigned Immediates = 0;
4368
4369 if (!Src0.isReg() &&
4370 !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType))
4371 Immediates++;
4372 if (!Src1.isReg() &&
4373 !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType))
4374 Immediates++;
4375
4376 if (Immediates > 1) {
4377 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
4378 return false;
4379 }
4380 }
4381
4382 if (isSOPK(MI)) {
4383 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
4384 if (Desc.isBranch()) {
4385 if (!Op->isMBB()) {
4386 ErrInfo = "invalid branch target for SOPK instruction";
4387 return false;
4388 }
4389 } else {
4390 uint64_t Imm = Op->getImm();
4391 if (sopkIsZext(MI)) {
4392 if (!isUInt<16>(Imm)) {
4393 ErrInfo = "invalid immediate for SOPK instruction";
4394 return false;
4395 }
4396 } else {
4397 if (!isInt<16>(Imm)) {
4398 ErrInfo = "invalid immediate for SOPK instruction";
4399 return false;
4400 }
4401 }
4402 }
4403 }
4404
4405 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
4406 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
4407 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
4408 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
4409 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
4410 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
4411
4412 const unsigned StaticNumOps = Desc.getNumOperands() +
4413 Desc.getNumImplicitUses();
4414 const unsigned NumImplicitOps = IsDst ? 2 : 1;
4415
4416 // Allow additional implicit operands. This allows a fixup done by the post
4417 // RA scheduler where the main implicit operand is killed and implicit-defs
4418 // are added for sub-registers that remain live after this instruction.
4419 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
4420 ErrInfo = "missing implicit register operands";
4421 return false;
4422 }
4423
4424 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4425 if (IsDst) {
4426 if (!Dst->isUse()) {
4427 ErrInfo = "v_movreld_b32 vdst should be a use operand";
4428 return false;
4429 }
4430
4431 unsigned UseOpIdx;
4432 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
4433 UseOpIdx != StaticNumOps + 1) {
4434 ErrInfo = "movrel implicit operands should be tied";
4435 return false;
4436 }
4437 }
4438
4439 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4440 const MachineOperand &ImpUse
4441 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
4442 if (!ImpUse.isReg() || !ImpUse.isUse() ||
4443 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
4444 ErrInfo = "src0 should be subreg of implicit vector use";
4445 return false;
4446 }
4447 }
4448
4449 // Make sure we aren't losing exec uses in the td files. This mostly requires
4450 // being careful when using let Uses to try to add other use registers.
4451 if (shouldReadExec(MI)) {
4452 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
4453 ErrInfo = "VALU instruction does not implicitly read exec mask";
4454 return false;
4455 }
4456 }
4457
4458 if (isSMRD(MI)) {
4459 if (MI.mayStore()) {
4460 // The register offset form of scalar stores may only use m0 as the
4461 // soffset register.
4462 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
4463 if (Soff && Soff->getReg() != AMDGPU::M0) {
4464 ErrInfo = "scalar stores must use m0 as offset register";
4465 return false;
4466 }
4467 }
4468 }
4469
4470 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
4471 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
4472 if (Offset->getImm() != 0) {
4473 ErrInfo = "subtarget does not support offsets in flat instructions";
4474 return false;
4475 }
4476 }
4477
4478 if (isMIMG(MI)) {
4479 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
4480 if (DimOp) {
4481 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
4482 AMDGPU::OpName::vaddr0);
4483 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
4484 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
4485 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4486 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
4487 const AMDGPU::MIMGDimInfo *Dim =
4488 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
4489
4490 if (!Dim) {
4491 ErrInfo = "dim is out of range";
4492 return false;
4493 }
4494
4495 bool IsA16 = false;
4496 if (ST.hasR128A16()) {
4497 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
4498 IsA16 = R128A16->getImm() != 0;
4499 } else if (ST.hasGFX10A16()) {
4500 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
4501 IsA16 = A16->getImm() != 0;
4502 }
4503
4504 bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
4505
4506 unsigned AddrWords =
4507 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
4508
4509 unsigned VAddrWords;
4510 if (IsNSA) {
4511 VAddrWords = SRsrcIdx - VAddr0Idx;
4512 } else {
4513 const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
4514 VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
4515 if (AddrWords > 8)
4516 AddrWords = 16;
4517 }
4518
4519 if (VAddrWords != AddrWords) {
4520 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
4521 << " but got " << VAddrWords << "\n");
4522 ErrInfo = "bad vaddr size";
4523 return false;
4524 }
4525 }
4526 }
4527
4528 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
4529 if (DppCt) {
4530 using namespace AMDGPU::DPP;
4531
4532 unsigned DC = DppCt->getImm();
4533 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
4534 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
4535 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
4536 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
4537 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
4538 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
4539 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
4540 ErrInfo = "Invalid dpp_ctrl value";
4541 return false;
4542 }
4543 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
4544 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
4545 ErrInfo = "Invalid dpp_ctrl value: "
4546 "wavefront shifts are not supported on GFX10+";
4547 return false;
4548 }
4549 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
4550 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
4551 ErrInfo = "Invalid dpp_ctrl value: "
4552 "broadcasts are not supported on GFX10+";
4553 return false;
4554 }
4555 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
4556 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
4557 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
4558 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
4559 !ST.hasGFX90AInsts()) {
4560 ErrInfo = "Invalid dpp_ctrl value: "
4561 "row_newbroadcast/row_share is not supported before "
4562 "GFX90A/GFX10";
4563 return false;
4564 } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
4565 ErrInfo = "Invalid dpp_ctrl value: "
4566 "row_share and row_xmask are not supported before GFX10";
4567 return false;
4568 }
4569 }
4570
4571 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4572 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4573
4574 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
4575 ((DstIdx >= 0 &&
4576 (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID ||
4577 Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) ||
4578 ((Src0Idx >= 0 &&
4579 (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID ||
4580 Desc.OpInfo[Src0Idx].RegClass ==
4581 AMDGPU::VReg_64_Align2RegClassID)))) &&
4582 !AMDGPU::isLegal64BitDPPControl(DC)) {
4583 ErrInfo = "Invalid dpp_ctrl value: "
4584 "64 bit dpp only support row_newbcast";
4585 return false;
4586 }
4587 }
4588
4589 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
4590 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4591 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
4592 : AMDGPU::OpName::vdata;
4593 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
4594 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
4595 if (Data && !Data->isReg())
4596 Data = nullptr;
4597
4598 if (ST.hasGFX90AInsts()) {
4599 if (Dst && Data &&
4600 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
4601 ErrInfo = "Invalid register class: "
4602 "vdata and vdst should be both VGPR or AGPR";
4603 return false;
4604 }
4605 if (Data && Data2 &&
4606 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
4607 ErrInfo = "Invalid register class: "
4608 "both data operands should be VGPR or AGPR";
4609 return false;
4610 }
4611 } else {
4612 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
4613 (Data && RI.isAGPR(MRI, Data->getReg())) ||
4614 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
4615 ErrInfo = "Invalid register class: "
4616 "agpr loads and stores not supported on this GPU";
4617 return false;
4618 }
4619 }
4620 }
4621
4622 if (ST.needsAlignedVGPRs() &&
4623 (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
4624 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
4625 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) {
4626 const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0);
4627 Register Reg = Op->getReg();
4628 bool Aligned = true;
4629 if (Reg.isPhysical()) {
4630 Aligned = !(RI.getHWRegIndex(Reg) & 1);
4631 } else {
4632 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
4633 Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
4634 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
4635 }
4636
4637 if (!Aligned) {
4638 ErrInfo = "Subtarget requires even aligned vector registers "
4639 "for DS_GWS instructions";
4640 return false;
4641 }
4642 }
4643
4644 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
4645 const MachineOperand &SrcOp = MI.getOperand(1);
4646 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
4647 ErrInfo = "pseudo expects only physical SGPRs";
4648 return false;
4649 }
4650 }
4651
4652 return true;
4653}
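// The VOP* checks in verifyInstruction above enforce the hardware's
// constant-bus budget. The following is a minimal, self-contained sketch
// (plain C++, no LLVM types; the operand encoding is an assumption made
// purely for illustration, with non-negative values standing for SGPR ids
// and negative values for literals) of that accounting: each distinct SGPR
// occupies one bus slot, and all literal uses together occupy at most one.
#include <set>
#include <vector>

static bool fitsConstantBusSketch(const std::vector<int> &SrcOperands,
                                  unsigned BusLimit) {
  std::set<int> UniqueSGPRs;
  bool UsesLiteral = false;
  unsigned BusUses = 0;
  for (int Op : SrcOperands) {
    if (Op >= 0) {
      // A previously unseen SGPR takes a new bus slot.
      if (UniqueSGPRs.insert(Op).second)
        ++BusUses;
    } else if (!UsesLiteral) {
      // Literal operands share a single bus slot between them.
      UsesLiteral = true;
      ++BusUses;
    }
  }
  return BusUses <= BusLimit; // e.g. 1 before GFX10, 2 on GFX10+.
}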
4654
4655unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
4656 switch (MI.getOpcode()) {
4657 default: return AMDGPU::INSTRUCTION_LIST_END;
4658 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
4659 case AMDGPU::COPY: return AMDGPU::COPY;
4660 case AMDGPU::PHI: return AMDGPU::PHI;
4661 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
4662 case AMDGPU::WQM: return AMDGPU::WQM;
4663 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
4664 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
4665 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
4666 case AMDGPU::S_MOV_B32: {
4667 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4668 return MI.getOperand(1).isReg() ||
4669 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
4670 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
4671 }
4672 case AMDGPU::S_ADD_I32:
4673 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
4674 case AMDGPU::S_ADDC_U32:
4675 return AMDGPU::V_ADDC_U32_e32;
4676 case AMDGPU::S_SUB_I32:
4677 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
4678 // FIXME: These are not consistently handled, and selected when the carry is
4679 // used.
4680 case AMDGPU::S_ADD_U32:
4681 return AMDGPU::V_ADD_CO_U32_e32;
4682 case AMDGPU::S_SUB_U32:
4683 return AMDGPU::V_SUB_CO_U32_e32;
4684 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
4685 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
4686 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
4687 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
4688 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
4689 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
4690 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
4691 case AMDGPU::S_XNOR_B32:
4692 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
4693 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
4694 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
4695 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
4696 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
4697 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
4698 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
4699 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
4700 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
4701 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
4702 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
4703 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
4704 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
4705 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
4706 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
4707 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
4708 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
4709 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
4710 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
4711 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
4712 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
4713 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
4714 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
4715 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
4716 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
4717 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
4718 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
4719 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
4720 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
4721 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
4722 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
4723 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
4724 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
4725 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
4726 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
4727 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
4728 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
4729 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
4730 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
4731 }
4732 llvm_unreachable(
4733 "Unexpected scalar opcode without corresponding vector one!");
4734}
4735
4736static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST,
4737 const MachineRegisterInfo &MRI,
4738 const MCInstrDesc &TID,
4739 unsigned RCID,
4740 bool IsAllocatable) {
4741 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
4742 (((TID.mayLoad() || TID.mayStore()) &&
4743 !(TID.TSFlags & SIInstrFlags::VGPRSpill)) ||
4744 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
4745 switch (RCID) {
4746 case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID;
4747 case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID;
4748 case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID;
4749 case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID;
4750 case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID;
4751 default:
4752 break;
4753 }
4754 }
4755 return RCID;
4756}
4757
4758const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
4759 unsigned OpNum, const TargetRegisterInfo *TRI,
4760 const MachineFunction &MF)
4761 const {
4762 if (OpNum >= TID.getNumOperands())
4763 return nullptr;
4764 auto RegClass = TID.OpInfo[OpNum].RegClass;
4765 bool IsAllocatable = false;
4766 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
4767 // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
4768 // with two data operands. Request a register class constrained to VGPR only
4769 // if both operands are present, as Machine Copy Propagation cannot check
4770 // this constraint (and possibly other passes cannot either).
4771 //
4772 // The check is limited to FLAT and DS because atomics in non-flat encoding
4773 // have their vdst and vdata tied to be the same register.
4774 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
4775 AMDGPU::OpName::vdst);
4776 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
4777 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
4778 : AMDGPU::OpName::vdata);
4779 if (DataIdx != -1) {
4780 IsAllocatable = VDstIdx != -1 ||
4781 AMDGPU::getNamedOperandIdx(TID.Opcode,
4782 AMDGPU::OpName::data1) != -1;
4783 }
4784 }
4785 RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass,
4786 IsAllocatable);
4787 return RI.getRegClass(RegClass);
4788}
4789
4790const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
4791 unsigned OpNo) const {
4792 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4793 const MCInstrDesc &Desc = get(MI.getOpcode());
4794 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
4795 Desc.OpInfo[OpNo].RegClass == -1) {
4796 Register Reg = MI.getOperand(OpNo).getReg();
4797
4798 if (Reg.isVirtual())
4799 return MRI.getRegClass(Reg);
4800 return RI.getPhysRegClass(Reg);
4801 }
4802
4803 unsigned RCID = Desc.OpInfo[OpNo].RegClass;
4804 RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true);
4805 return RI.getRegClass(RCID);
4806}
4807
4808void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
4809 MachineBasicBlock::iterator I = MI;
4810 MachineBasicBlock *MBB = MI.getParent();
4811 MachineOperand &MO = MI.getOperand(OpIdx);
4812 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
4813 unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
4814 const TargetRegisterClass *RC = RI.getRegClass(RCID);
4815 unsigned Size = RI.getRegSizeInBits(*RC);
4816 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
4817 if (MO.isReg())
4818 Opcode = AMDGPU::COPY;
4819 else if (RI.isSGPRClass(RC))
4820 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
4821
4822 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
4823 const TargetRegisterClass *VRC64 = RI.getVGPR64Class();
4824 if (RI.getCommonSubClass(VRC64, VRC))
4825 VRC = VRC64;
4826 else
4827 VRC = &AMDGPU::VGPR_32RegClass;
4828
4829 Register Reg = MRI.createVirtualRegister(VRC);
4830 DebugLoc DL = MBB->findDebugLoc(I);
4831 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
4832 MO.ChangeToRegister(Reg, false);
4833}
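// A hedged sketch of the move-opcode choice made by legalizeOpWithMove
// above; purely illustrative, using strings instead of real opcode enums.
// Register operands become plain copies, SGPR destinations get scalar
// moves sized to the class, and everything else gets a VALU move.
#include <string>

static std::string pickMoveOpcodeSketch(bool OperandIsReg,
                                        bool DstIsSGPRClass,
                                        unsigned SizeInBits) {
  if (OperandIsReg)
    return "COPY";
  if (DstIsSGPRClass)
    return SizeInBits == 64 ? "S_MOV_B64" : "S_MOV_B32";
  return SizeInBits == 64 ? "V_MOV_B64_PSEUDO" : "V_MOV_B32_e32";
}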
4834
4835unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
4836 MachineRegisterInfo &MRI,
4837 MachineOperand &SuperReg,
4838 const TargetRegisterClass *SuperRC,
4839 unsigned SubIdx,
4840 const TargetRegisterClass *SubRC)
4841 const {
4842 MachineBasicBlock *MBB = MI->getParent();
4843 DebugLoc DL = MI->getDebugLoc();
4844 Register SubReg = MRI.createVirtualRegister(SubRC);
4845
4846 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
4847 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
4848 .addReg(SuperReg.getReg(), 0, SubIdx);
4849 return SubReg;
4850 }
4851
4852 // Just in case the super register is itself a sub-register, copy it to a new
4853 // value so we don't need to worry about merging its subreg index with the
4854 // SubIdx passed to this function. The register coalescer should be able to
4855 // eliminate this extra copy.
4856 Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
4857
4858 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
4859 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
4860
4861 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
4862 .addReg(NewSuperReg, 0, SubIdx);
4863
4864 return SubReg;
4865}
4866
4867MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
4868 MachineBasicBlock::iterator MII,
4869 MachineRegisterInfo &MRI,
4870 MachineOperand &Op,
4871 const TargetRegisterClass *SuperRC,
4872 unsigned SubIdx,
4873 const TargetRegisterClass *SubRC) const {
4874 if (Op.isImm()) {
4875 if (SubIdx == AMDGPU::sub0)
4876 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
4877 if (SubIdx == AMDGPU::sub1)
4878 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
4879
4880 llvm_unreachable("Unhandled register index for immediate")::llvm::llvm_unreachable_internal("Unhandled register index for immediate"
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 4880)
;
4881 }
4882
4883 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
4884 SubIdx, SubRC);
4885 return MachineOperand::CreateReg(SubReg, false);
4886}
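// A small worked example of the immediate split performed above: sub0
// receives the low 32 bits of the 64-bit immediate and sub1 the high 32
// bits. The helper name is illustrative only.
#include <cstdint>

static int32_t extractImmHalfSketch(int64_t Imm, bool IsSub1) {
  return IsSub1 ? static_cast<int32_t>(Imm >> 32)
                : static_cast<int32_t>(Imm);
}
// e.g. extractImmHalfSketch(0x0000000100000002LL, false) == 2 and
//      extractImmHalfSketch(0x0000000100000002LL, true)  == 1.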
4887
4888// Change the order of operands from (0, 1, 2) to (0, 2, 1)
4889void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
4890 assert(Inst.getNumExplicitOperands() == 3);
4891 MachineOperand Op1 = Inst.getOperand(1);
4892 Inst.RemoveOperand(1);
4893 Inst.addOperand(Op1);
4894}
4895
4896bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
4897 const MCOperandInfo &OpInfo,
4898 const MachineOperand &MO) const {
4899 if (!MO.isReg())
4900 return false;
4901
4902 Register Reg = MO.getReg();
4903
4904 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
4905 if (Reg.isPhysical())
4906 return DRC->contains(Reg);
4907
4908 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
4909
4910 if (MO.getSubReg()) {
4911 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
4912 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
4913 if (!SuperRC)
4914 return false;
4915
4916 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
4917 if (!DRC)
4918 return false;
4919 }
4920 return RC->hasSuperClassEq(DRC);
4921}
4922
4923bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
4924 const MCOperandInfo &OpInfo,
4925 const MachineOperand &MO) const {
4926 if (MO.isReg())
4927 return isLegalRegOperand(MRI, OpInfo, MO);
4928
4929 // Handle non-register types that are treated like immediates.
4930 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4931 return true;
4932}
4933
4934bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
4935 const MachineOperand *MO) const {
4936 const MachineFunction &MF = *MI.getParent()->getParent();
4937 const MachineRegisterInfo &MRI = MF.getRegInfo();
4938 const MCInstrDesc &InstDesc = MI.getDesc();
4939 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
4940 const TargetRegisterClass *DefinedRC =
4941 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
4942 if (!MO)
4943 MO = &MI.getOperand(OpIdx);
4944
4945 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
4946 int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
4947 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
4948 if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--)
4949 return false;
4950
4951 SmallDenseSet<RegSubRegPair> SGPRsUsed;
4952 if (MO->isReg())
4953 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
4954
4955 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4956 if (i == OpIdx)
4957 continue;
4958 const MachineOperand &Op = MI.getOperand(i);
4959 if (Op.isReg()) {
4960 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
4961 if (!SGPRsUsed.count(SGPR) &&
4962 usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
4963 if (--ConstantBusLimit <= 0)
4964 return false;
4965 SGPRsUsed.insert(SGPR);
4966 }
4967 } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
4968 if (--ConstantBusLimit <= 0)
4969 return false;
4970 } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) &&
4971 isLiteralConstantLike(Op, InstDesc.OpInfo[i])) {
4972 if (!VOP3LiteralLimit--)
4973 return false;
4974 if (--ConstantBusLimit <= 0)
4975 return false;
4976 }
4977 }
4978 }
4979
4980 if (MO->isReg()) {
4981 assert(DefinedRC);
4982 if (!isLegalRegOperand(MRI, OpInfo, *MO))
4983 return false;
4984 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
4985 if (IsAGPR && !ST.hasMAIInsts())
4986 return false;
4987 unsigned Opc = MI.getOpcode();
4988 if (IsAGPR &&
4989 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
4990 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
4991 return false;
4992 // Atomics should have both vdst and vdata either vgpr or agpr.
4993 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
4994 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
4995 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
4996 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
4997 MI.getOperand(DataIdx).isReg() &&
4998 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
4999 return false;
5000 if ((int)OpIdx == DataIdx) {
5001 if (VDstIdx != -1 &&
5002 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5003 return false;
5004 // DS instructions with 2 src operands also must have tied RC.
5005 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5006 AMDGPU::OpName::data1);
5007 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5008 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5009 return false;
5010 }
5011 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5012 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5013 RI.isSGPRReg(MRI, MO->getReg()))
5014 return false;
5015 return true;
5016 }
5017
5018 // Handle non-register types that are treated like immediates.
5019 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5020
5021 if (!DefinedRC) {
5022 // This operand expects an immediate.
5023 return true;
5024 }
5025
5026 return isImmOperandLegal(MI, OpIdx, *MO);
5027}
5028
5029void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
5030 MachineInstr &MI) const {
5031 unsigned Opc = MI.getOpcode();
5032 const MCInstrDesc &InstrDesc = get(Opc);
5033
5034 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5035 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5036
5037 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5038 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5039
5040 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
5041 // we need to only have one constant bus use before GFX10.
5042 bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
5043 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 &&
5044 Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) ||
5045 isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx])))
5046 legalizeOpWithMove(MI, Src0Idx);
5047
5048 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5049 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5050 // src0/src1 with V_READFIRSTLANE.
5051 if (Opc == AMDGPU::V_WRITELANE_B32) {
5052 const DebugLoc &DL = MI.getDebugLoc();
5053 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5054 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5055 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5056 .add(Src0);
5057 Src0.ChangeToRegister(Reg, false);
5058 }
5059 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5060 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5061 const DebugLoc &DL = MI.getDebugLoc();
5062 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5063 .add(Src1);
5064 Src1.ChangeToRegister(Reg, false);
5065 }
5066 return;
5067 }
5068
5069 // No VOP2 instructions support AGPRs.
5070 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5071 legalizeOpWithMove(MI, Src0Idx);
5072
5073 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5074 legalizeOpWithMove(MI, Src1Idx);
5075
5076 // VOP2 src0 instructions support all operand types, so we don't need to check
5077 // their legality. If src1 is already legal, we don't need to do anything.
5078 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
5079 return;
5080
5081 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
5082 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
5083 // select is uniform.
5084 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
5085 RI.isVGPR(MRI, Src1.getReg())) {
5086 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5087 const DebugLoc &DL = MI.getDebugLoc();
5088 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5089 .add(Src1);
5090 Src1.ChangeToRegister(Reg, false);
5091 return;
5092 }
5093
5094 // We do not use commuteInstruction here because it is too aggressive and will
5095 // commute if it is possible. We only want to commute here if it improves
5096 // legality. This can be called a fairly large number of times so don't waste
5097 // compile time pointlessly swapping and checking legality again.
5098 if (HasImplicitSGPR || !MI.isCommutable()) {
5099 legalizeOpWithMove(MI, Src1Idx);
5100 return;
5101 }
5102
5103 // If src0 can be used as src1, commuting will make the operands legal.
5104 // Otherwise we have to give up and insert a move.
5105 //
5106 // TODO: Other immediate-like operand kinds could be commuted if there was a
5107 // MachineOperand::ChangeTo* for them.
5108 if ((!Src1.isImm() && !Src1.isReg()) ||
5109 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
5110 legalizeOpWithMove(MI, Src1Idx);
5111 return;
5112 }
5113
5114 int CommutedOpc = commuteOpcode(MI);
5115 if (CommutedOpc == -1) {
5116 legalizeOpWithMove(MI, Src1Idx);
5117 return;
5118 }
5119
5120 MI.setDesc(get(CommutedOpc));
5121
5122 Register Src0Reg = Src0.getReg();
5123 unsigned Src0SubReg = Src0.getSubReg();
5124 bool Src0Kill = Src0.isKill();
5125
5126 if (Src1.isImm())
5127 Src0.ChangeToImmediate(Src1.getImm());
5128 else if (Src1.isReg()) {
5129 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
5130 Src0.setSubReg(Src1.getSubReg());
5131 } else
5132 llvm_unreachable("Should only have register or immediate operands")::llvm::llvm_unreachable_internal("Should only have register or immediate operands"
, "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 5132)
;
5133
5134 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
5135 Src1.setSubReg(Src0SubReg);
5136 fixImplicitOperands(MI);
5137}
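// A standalone sketch (an assumed simplification, not the real decision
// code) of the commute-or-move choice at the end of legalizeOperandsVOP2:
// commuting is only worthwhile when the instruction is commutable, no
// implicit SGPR read forces a move anyway, src0 would be legal in the
// src1 slot, and a commuted opcode exists.
static bool shouldCommuteInsteadOfMoveSketch(bool HasImplicitSGPR,
                                             bool IsCommutable,
                                             bool Src0LegalAsSrc1,
                                             bool HasCommutedOpcode) {
  if (HasImplicitSGPR || !IsCommutable)
    return false; // must materialize src1 with a move
  if (!Src0LegalAsSrc1 || !HasCommutedOpcode)
    return false; // commuting would not make the operands legal
  return true;
}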
5138
5139 // Legalize VOP3 operands. All operand types are supported for any operand,
5140 // but only one literal constant is allowed, and only starting from GFX10.
5141void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
5142 MachineInstr &MI) const {
5143 unsigned Opc = MI.getOpcode();
5144
5145 int VOP3Idx[3] = {
5146 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
5147 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
5148 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
5149 };
5150
5151 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
5152 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
5153 // src1 and src2 must be scalar
5154 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
5155 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
5156 const DebugLoc &DL = MI.getDebugLoc();
5157 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
5158 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5159 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5160 .add(Src1);
5161 Src1.ChangeToRegister(Reg, false);
5162 }
5163 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
5164 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5165 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5166 .add(Src2);
5167 Src2.ChangeToRegister(Reg, false);
5168 }
5169 }
5170
5171 // Find the one SGPR operand we are allowed to use.
5172 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
5173 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
5174 SmallDenseSet<unsigned> SGPRsUsed;
5175 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
5176 if (SGPRReg != AMDGPU::NoRegister) {
5177 SGPRsUsed.insert(SGPRReg);
5178 --ConstantBusLimit;
5179 }
5180
5181 for (int Idx : VOP3Idx) {
5182 if (Idx == -1)
5183 break;
5184 MachineOperand &MO = MI.getOperand(Idx);
5185
5186 if (!MO.isReg()) {
5187 if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx]))
5188 continue;
5189
5190 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
5191 --LiteralLimit;
5192 --ConstantBusLimit;
5193 continue;
5194 }
5195
5196 --LiteralLimit;
5197 --ConstantBusLimit;
5198 legalizeOpWithMove(MI, Idx);
5199 continue;
5200 }
5201
5202 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
5203 !isOperandLegal(MI, Idx, &MO)) {
5204 legalizeOpWithMove(MI, Idx);
5205 continue;
5206 }
5207
5208 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
5209 continue; // VGPRs are legal
5210
5211 // We can use one SGPR in each VOP3 instruction prior to GFX10
5212 // and two starting from GFX10.
5213 if (SGPRsUsed.count(MO.getReg()))
5214 continue;
5215 if (ConstantBusLimit > 0) {
5216 SGPRsUsed.insert(MO.getReg());
5217 --ConstantBusLimit;
5218 continue;
5219 }
5220
5221 // If we make it this far, then the operand is not legal and we must
5222 // legalize it.
5223 legalizeOpWithMove(MI, Idx);
5224 }
5225}
5226
5227Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
5228 MachineRegisterInfo &MRI) const {
5229 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
5230 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
5231 Register DstReg = MRI.createVirtualRegister(SRC);
5232 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
5233
5234 if (RI.hasAGPRs(VRC)) {
5235 VRC = RI.getEquivalentVGPRClass(VRC);
5236 Register NewSrcReg = MRI.createVirtualRegister(VRC);
5237 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
5238 get(TargetOpcode::COPY), NewSrcReg)
5239 .addReg(SrcReg);
5240 SrcReg = NewSrcReg;
5241 }
5242
5243 if (SubRegs == 1) {
5244 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
5245 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5246 .addReg(SrcReg);
5247 return DstReg;
5248 }
5249
5250 SmallVector<unsigned, 8> SRegs;
5251 for (unsigned i = 0; i < SubRegs; ++i) {
5252 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5253 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
5254 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
5255 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
5256 SRegs.push_back(SGPR);
5257 }
5258
5259 MachineInstrBuilder MIB =
5260 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
5261 get(AMDGPU::REG_SEQUENCE), DstReg);
5262 for (unsigned i = 0; i < SubRegs; ++i) {
5263 MIB.addReg(SRegs[i]);
5264 MIB.addImm(RI.getSubRegFromChannel(i));
5265 }
5266 return DstReg;
5267}
5268
5269void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
5270 MachineInstr &MI) const {
5271
5272 // If the pointer is stored in VGPRs, then we need to move it to
5273 // SGPRs using v_readfirstlane. This is safe because we only select
5274 // loads with uniform pointers to SMRD instructions, so we know the
5275 // pointer value is uniform.
5276 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
5277 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
5278 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
5279 SBase->setReg(SGPR);
5280 }
5281 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
5282 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
5283 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
5284 SOff->setReg(SGPR);
5285 }
5286}
5287
5288bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
5289 unsigned Opc = Inst.getOpcode();
5290 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
5291 if (OldSAddrIdx < 0)
5292 return false;
5293
5294 assert(isSegmentSpecificFLAT(Inst));
5295
5296 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
5297 if (NewOpc < 0)
5298 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
5299 if (NewOpc < 0)
5300 return false;
5301
5302 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
5303 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
5304 if (RI.isSGPRReg(MRI, SAddr.getReg()))
5305 return false;
5306
5307 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
5308 if (NewVAddrIdx < 0)
5309 return false;
5310
5311 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
5312
5313 // Check vaddr, it shall be zero or absent.
5314 MachineInstr *VAddrDef = nullptr;
5315 if (OldVAddrIdx >= 0) {
5316 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
5317 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
5318 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
5319 !VAddrDef->getOperand(1).isImm() ||
5320 VAddrDef->getOperand(1).getImm() != 0)
5321 return false;
5322 }
5323
5324 const MCInstrDesc &NewDesc = get(NewOpc);
5325 Inst.setDesc(NewDesc);
5326
5327 // Callers expect iterator to be valid after this call, so modify the
5328 // instruction in place.
5329 if (OldVAddrIdx == NewVAddrIdx) {
5330 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
5331 // Clear use list from the old vaddr holding a zero register.
5332 MRI.removeRegOperandFromUseList(&NewVAddr);
5333 MRI.moveOperands(&NewVAddr, &SAddr, 1);
5334 Inst.RemoveOperand(OldSAddrIdx);
5335 // Update the use list with the pointer we have just moved from vaddr to
5336 // saddr position. Otherwise new vaddr will be missing from the use list.
5337 MRI.removeRegOperandFromUseList(&NewVAddr);
5338 MRI.addRegOperandToUseList(&NewVAddr);
5339 } else {
5340 assert(OldSAddrIdx == NewVAddrIdx);
5341
5342 if (OldVAddrIdx >= 0) {
5343 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
5344 AMDGPU::OpName::vdst_in);
5345
5346 // RemoveOperand doesn't try to fix up tied operand indexes as it goes, so
5347 // it asserts. Untie the operands for now and retie them afterwards.
5348 if (NewVDstIn != -1) {
5349 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
5350 Inst.untieRegOperand(OldVDstIn);
5351 }
5352
5353 Inst.RemoveOperand(OldVAddrIdx);
5354
5355 if (NewVDstIn != -1) {
5356 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
5357 Inst.tieOperands(NewVDst, NewVDstIn);
5358 }
5359 }
5360 }
5361
5362 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
5363 VAddrDef->eraseFromParent();
5364
5365 return true;
5366}
5367
5368// FIXME: Remove this when SelectionDAG is obsoleted.
5369void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
5370 MachineInstr &MI) const {
5371 if (!isSegmentSpecificFLAT(MI))
5372 return;
5373
5374 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
5375 // thinks they are uniform, so a readfirstlane should be valid.
5376 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
5377 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
5378 return;
5379
5380 if (moveFlatAddrToVGPR(MI))
5381 return;
5382
5383 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
5384 SAddr->setReg(ToSGPR);
5385}
5386
5387void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
5388 MachineBasicBlock::iterator I,
5389 const TargetRegisterClass *DstRC,
5390 MachineOperand &Op,
5391 MachineRegisterInfo &MRI,
5392 const DebugLoc &DL) const {
5393 Register OpReg = Op.getReg();
5394 unsigned OpSubReg = Op.getSubReg();
5395
5396 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
5397 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
5398
5399 // Check if operand is already the correct register class.
5400 if (DstRC == OpRC)
5401 return;
5402
5403 Register DstReg = MRI.createVirtualRegister(DstRC);
5404 auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
5405
5406 Op.setReg(DstReg);
5407 Op.setSubReg(0);
5408
5409 MachineInstr *Def = MRI.getVRegDef(OpReg);
5410 if (!Def)
5411 return;
5412
5413 // Try to eliminate the copy if it is copying an immediate value.
5414 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
5415 FoldImmediate(*Copy, *Def, OpReg, &MRI);
5416
5417 bool ImpDef = Def->isImplicitDef();
5418 while (!ImpDef && Def && Def->isCopy()) {
5419 if (Def->getOperand(1).getReg().isPhysical())
5420 break;
5421 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
5422 ImpDef = Def && Def->isImplicitDef();
5423 }
5424 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
5425 !ImpDef)
5426 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
5427}
5428
5429// Emit the actual waterfall loop, executing the wrapped instruction for each
5430// unique value of \p Rsrc across all lanes. In the best case we execute 1
5431// iteration, in the worst case we execute 64 (once per lane).
5432static void
5433emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
5434 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5435 const DebugLoc &DL, MachineOperand &Rsrc) {
5436 MachineFunction &MF = *OrigBB.getParent();
5437 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5438 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5439 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5440 unsigned SaveExecOpc =
5441 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
5442 unsigned XorTermOpc =
5443 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
5444 unsigned AndOpc =
5445 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5446 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5447
5448 MachineBasicBlock::iterator I = LoopBB.begin();
5449
5450 SmallVector<Register, 8> ReadlanePieces;
5451 Register CondReg = AMDGPU::NoRegister;
5452
5453 Register VRsrc = Rsrc.getReg();
5454 unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
5455
5456 unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI);
5457 unsigned NumSubRegs = RegSize / 32;
5458 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size");
5459
5460 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
5461
5462 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5463 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5464
5465 // Read the next variant <- also loop target.
5466 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
5467 .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx));
5468
5469 // Read the next variant <- also loop target.
5470 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
5471 .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1));
5472
5473 ReadlanePieces.push_back(CurRegLo);
5474 ReadlanePieces.push_back(CurRegHi);
5475
5476 // Comparison is to be done as 64-bit.
5477 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
5478 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
5479 .addReg(CurRegLo)
5480 .addImm(AMDGPU::sub0)
5481 .addReg(CurRegHi)
5482 .addImm(AMDGPU::sub1);
5483
5484 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
5485 auto Cmp =
5486 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg)
5487 .addReg(CurReg);
5488 if (NumSubRegs <= 2)
5489 Cmp.addReg(VRsrc);
5490 else
5491 Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2));
5492
5493 // Combine the comparison results with AND.
5494 if (CondReg == AMDGPU::NoRegister) // First.
5495 CondReg = NewCondReg;
5496 else { // If not the first, we create an AND.
5497 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
5498 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
5499 .addReg(CondReg)
5500 .addReg(NewCondReg);
5501 CondReg = AndReg;
5502 }
5503 } // End for loop.
5504
5505 auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc));
5506 Register SRsrc = MRI.createVirtualRegister(SRsrcRC);
5507
5508 // Build scalar Rsrc.
5509 auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc);
5510 unsigned Channel = 0;
5511 for (Register Piece : ReadlanePieces) {
5512 Merge.addReg(Piece)
5513 .addImm(TRI->getSubRegFromChannel(Channel++));
5514 }
5515
5516 // Update Rsrc operand to use the SGPR Rsrc.
5517 Rsrc.setReg(SRsrc);
5518 Rsrc.setIsKill(true);
5519
5520 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5521 MRI.setSimpleHint(SaveExec, CondReg);
5522
5523 // Update EXEC to matching lanes, saving original to SaveExec.
5524 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
5525 .addReg(CondReg, RegState::Kill);
5526
5527 // The original instruction is here; we insert the terminators after it.
5528 I = LoopBB.end();
5529
5530 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5531 BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
5532 .addReg(Exec)
5533 .addReg(SaveExec);
5534
5535 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
5536}
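// A self-contained model (plain C++, illustrative names) of how many trips
// the waterfall loop above takes: one per distinct Rsrc value among the
// currently active lanes, so a single iteration in the uniform case and up
// to the wave size when every active lane holds a different value.
#include <cstdint>
#include <set>
#include <vector>

static unsigned waterfallIterationsSketch(const std::vector<uint64_t> &LaneRsrc,
                                          uint64_t ExecMask) {
  std::set<uint64_t> Unique;
  for (size_t Lane = 0; Lane < LaneRsrc.size() && Lane < 64; ++Lane)
    if (ExecMask & (uint64_t(1) << Lane))
      Unique.insert(LaneRsrc[Lane]);
  return static_cast<unsigned>(Unique.size());
}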
5537
5538// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
5539// with SGPRs by iterating over all unique values across all lanes.
5540// Returns the loop basic block that now contains \p MI.
5541static MachineBasicBlock *
5542loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
5543 MachineOperand &Rsrc, MachineDominatorTree *MDT,
5544 MachineBasicBlock::iterator Begin = nullptr,
5545 MachineBasicBlock::iterator End = nullptr) {
5546 MachineBasicBlock &MBB = *MI.getParent();
5547 MachineFunction &MF = *MBB.getParent();
5548 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5549 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5550 MachineRegisterInfo &MRI = MF.getRegInfo();
5551 if (!Begin.isValid())
5552 Begin = &MI;
5553 if (!End.isValid()) {
5554 End = &MI;
5555 ++End;
5556 }
5557 const DebugLoc &DL = MI.getDebugLoc();
5558 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5559 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5560 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5561
5562 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5563
5564 // Save the EXEC mask
5565 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
5566
5567 // Killed uses in the instruction we are waterfalling around will be
5568 // incorrect due to the added control-flow.
5569 MachineBasicBlock::iterator AfterMI = MI;
5570 ++AfterMI;
5571 for (auto I = Begin; I != AfterMI; I++) {
5572 for (auto &MO : I->uses()) {
5573 if (MO.isReg() && MO.isUse()) {
5574 MRI.clearKillFlags(MO.getReg());
5575 }
5576 }
5577 }
5578
5579 // To insert the loop we need to split the block. Move everything after this
5580 // point to a new block, and insert a new empty block between the two.
5581 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
5582 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
5583 MachineFunction::iterator MBBI(MBB);
5584 ++MBBI;
5585
5586 MF.insert(MBBI, LoopBB);
5587 MF.insert(MBBI, RemainderBB);
5588
5589 LoopBB->addSuccessor(LoopBB);
5590 LoopBB->addSuccessor(RemainderBB);
5591
5593 // Move the range from Begin to MI into LoopBB, and the remainder of the
5594 // block to RemainderBB.
5594 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
5595 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
5596 LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end());
5597
5598 MBB.addSuccessor(LoopBB);
5599
5600 // Update dominators. We know that MBB immediately dominates LoopBB, that
5601 // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
5602 // dominates all of the successors transferred to it from MBB that MBB used
5603 // to properly dominate.
5604 if (MDT) {
5605 MDT->addNewBlock(LoopBB, &MBB);
5606 MDT->addNewBlock(RemainderBB, LoopBB);
5607 for (auto &Succ : RemainderBB->successors()) {
5608 if (MDT->properlyDominates(&MBB, Succ)) {
5609 MDT->changeImmediateDominator(Succ, RemainderBB);
5610 }
5611 }
5612 }
5613
5614 emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
5615
5616 // Restore the EXEC mask
5617 MachineBasicBlock::iterator First = RemainderBB->begin();
5618 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
5619 return LoopBB;
5620}
5621
5622// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
5623static std::tuple<unsigned, unsigned>
5624extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
5625 MachineBasicBlock &MBB = *MI.getParent();
5626 MachineFunction &MF = *MBB.getParent();
5627 MachineRegisterInfo &MRI = MF.getRegInfo();
5628
5629 // Extract the ptr from the resource descriptor.
5630 unsigned RsrcPtr =
5631 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
5632 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
5633
5634 // Create an empty resource descriptor
5635 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5636 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5637 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5638 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5639 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
5640
5641 // Zero64 = 0
5642 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
5643 .addImm(0);
5644
5645 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
5646 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
5647 .addImm(RsrcDataFormat & 0xFFFFFFFF);
5648
5649 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
5650 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
5651 .addImm(RsrcDataFormat >> 32);
5652
5653 // NewSRsrc = {Zero64, SRsrcFormat}
5654 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
5655 .addReg(Zero64)
5656 .addImm(AMDGPU::sub0_sub1)
5657 .addReg(SRsrcFormatLo)
5658 .addImm(AMDGPU::sub2)
5659 .addReg(SRsrcFormatHi)
5660 .addImm(AMDGPU::sub3);
5661
5662 return std::make_tuple(RsrcPtr, NewSRsrc);
5663}
5664
5665MachineBasicBlock *
5666SIInstrInfo::legalizeOperands(MachineInstr &MI,
5667 MachineDominatorTree *MDT) const {
5668 MachineFunction &MF = *MI.getParent()->getParent();
5669 MachineRegisterInfo &MRI = MF.getRegInfo();
5670 MachineBasicBlock *CreatedBB = nullptr;
5671
5672 // Legalize VOP2
5673 if (isVOP2(MI) || isVOPC(MI)) {
5674 legalizeOperandsVOP2(MRI, MI);
5675 return CreatedBB;
5676 }
5677
5678 // Legalize VOP3
5679 if (isVOP3(MI)) {
5680 legalizeOperandsVOP3(MRI, MI);
5681 return CreatedBB;
5682 }
5683
5684 // Legalize SMRD
5685 if (isSMRD(MI)) {
5686 legalizeOperandsSMRD(MRI, MI);
5687 return CreatedBB;
5688 }
5689
5690 // Legalize FLAT
5691 if (isFLAT(MI)) {
5692 legalizeOperandsFLAT(MRI, MI);
5693 return CreatedBB;
5694 }
5695
5696 // Legalize REG_SEQUENCE and PHI
5697 // The register class of the operands must match the register class of the
5698 // output.
5699 if (MI.getOpcode() == AMDGPU::PHI) {
5700 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
5701 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
5702 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
5703 continue;
5704 const TargetRegisterClass *OpRC =
5705 MRI.getRegClass(MI.getOperand(i).getReg());
5706 if (RI.hasVectorRegisters(OpRC)) {
5707 VRC = OpRC;
5708 } else {
5709 SRC = OpRC;
5710 }
5711 }
5712
5713 // If any of the operands are VGPR registers, then they all must be VGPRs;
5714 // otherwise we will create illegal VGPR->SGPR copies when legalizing
5715 // them.
5716 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
5717 if (!VRC) {
5718 assert(SRC);
5719 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
5720 VRC = &AMDGPU::VReg_1RegClass;
5721 } else
5722 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
5723 ? RI.getEquivalentAGPRClass(SRC)
5724 : RI.getEquivalentVGPRClass(SRC);
5725 } else {
5726 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
5727 ? RI.getEquivalentAGPRClass(VRC)
5728 : RI.getEquivalentVGPRClass(VRC);
5729 }
5730 RC = VRC;
5731 } else {
5732 RC = SRC;
5733 }
5734
5735 // Update all the operands so they have the same type.
5736 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5737 MachineOperand &Op = MI.getOperand(I);
5738 if (!Op.isReg() || !Op.getReg().isVirtual())
5739 continue;
5740
5741 // MI is a PHI instruction.
5742 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
5743 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
5744
5745 // Avoid creating no-op copies with the same src and dst reg class. These
5746 // confuse some of the machine passes.
5747 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
5748 }
5749 }
5750
5751 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
5752 // VGPR dest type and SGPR sources, insert copies so all operands are
5753 // VGPRs. This seems to help operand folding / the register coalescer.
5754 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
5755 MachineBasicBlock *MBB = MI.getParent();
5756 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
5757 if (RI.hasVGPRs(DstRC)) {
5758 // Update all the operands so they are VGPR register classes. These may
5759 // not be the same register class because REG_SEQUENCE supports mixing
5760 // subregister index types e.g. sub0_sub1 + sub2 + sub3
5761 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5762 MachineOperand &Op = MI.getOperand(I);
5763 if (!Op.isReg() || !Op.getReg().isVirtual())
5764 continue;
5765
5766 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
5767 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
5768 if (VRC == OpRC)
5769 continue;
5770
5771 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
5772 Op.setIsKill();
5773 }
5774 }
5775
5776 return CreatedBB;
5777 }
5778
5779 // Legalize INSERT_SUBREG
5780 // src0 must have the same register class as dst
5781 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
5782 Register Dst = MI.getOperand(0).getReg();
5783 Register Src0 = MI.getOperand(1).getReg();
5784 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
5785 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
5786 if (DstRC != Src0RC) {
5787 MachineBasicBlock *MBB = MI.getParent();
5788 MachineOperand &Op = MI.getOperand(1);
5789 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
5790 }
5791 return CreatedBB;
5792 }
5793
5794 // Legalize SI_INIT_M0
5795 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
5796 MachineOperand &Src = MI.getOperand(0);
5797 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
5798 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
5799 return CreatedBB;
5800 }
5801
5802 // Legalize MIMG and MUBUF/MTBUF for shaders.
5803 //
5804 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
5805 // scratch memory access. In both cases, the legalization never involves
5806 // conversion to the addr64 form.
5807 if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
5808 (isMUBUF(MI) || isMTBUF(MI)))) {
5809 MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
5810 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
5811 CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);
5812
5813 MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
5814 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
5815 CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT);
5816
5817 return CreatedBB;
5818 }
5819
5820 // Legalize SI_CALL
5821 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
5822 MachineOperand *Dest = &MI.getOperand(0);
5823 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
5824 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, along with
5825 // the following copies, into the loop block; copies from and to physical
5826 // registers also need to move there.
5827 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
5828 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
5829
5830 // Also move the copies to physical registers into the loop block
5831 MachineBasicBlock &MBB = *MI.getParent();
5832 MachineBasicBlock::iterator Start(&MI);
5833 while (Start->getOpcode() != FrameSetupOpcode)
5834 --Start;
5835 MachineBasicBlock::iterator End(&MI);
5836 while (End->getOpcode() != FrameDestroyOpcode)
5837 ++End;
5838 // Also include following copies of the return value
5839 ++End;
5840 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
5841 MI.definesRegister(End->getOperand(1).getReg()))
5842 ++End;
5843 CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End);
5844 }
5845 }
5846
5847 // Legalize MUBUF* instructions.
5848 int RsrcIdx =
5849 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
5850 if (RsrcIdx != -1) {
5851 // We have an MUBUF instruction
5852 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
5853 unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
5854 if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
5855 RI.getRegClass(RsrcRC))) {
5856 // The operands are legal.
5857 // FIXME: We may need to legalize operands besides srsrc.
5858 return CreatedBB;
5859 }
5860
5861 // Legalize a VGPR Rsrc.
5862 //
5863 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
5864 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
5865 // a zero-value SRsrc.
5866 //
5867 // If the instruction is _OFFSET (both idxen and offen disabled), and we
5868 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
5869 // above.
5870 //
5871 // Otherwise we are on non-ADDR64 hardware, and/or we have
5872 // idxen/offen/bothen and we fall back to a waterfall loop.
5873
5874 MachineBasicBlock &MBB = *MI.getParent();
5875
5876 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
5877 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
5878 // This is already an ADDR64 instruction so we need to add the pointer
5879 // extracted from the resource descriptor to the current value of VAddr.
5880 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5881 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5882 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
5883
5884 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5885 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
5886 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
5887
5888 unsigned RsrcPtr, NewSRsrc;
5889 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
5890
5891 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
5892 const DebugLoc &DL = MI.getDebugLoc();
5893 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
5894 .addDef(CondReg0)
5895 .addReg(RsrcPtr, 0, AMDGPU::sub0)
5896 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
5897 .addImm(0);
5898
5899 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
5900 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
5901 .addDef(CondReg1, RegState::Dead)
5902 .addReg(RsrcPtr, 0, AMDGPU::sub1)
5903 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
5904 .addReg(CondReg0, RegState::Kill)
5905 .addImm(0);
5906
5907 // NewVaddr = {NewVaddrHi, NewVaddrLo}
5908 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
5909 .addReg(NewVAddrLo)
5910 .addImm(AMDGPU::sub0)
5911 .addReg(NewVAddrHi)
5912 .addImm(AMDGPU::sub1);
5913
5914 VAddr->setReg(NewVAddr);
5915 Rsrc->setReg(NewSRsrc);
5916 } else if (!VAddr && ST.hasAddr64()) {
5917 // This instruction is the _OFFSET variant, so we need to convert it to
5918 // ADDR64.
5919 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
5920        "FIXME: Need to emit flat atomics here");
5921
5922 unsigned RsrcPtr, NewSRsrc;
5923 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
5924
5925 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
5926 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
5927 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5928 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
5929 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
5930
5931 // Atomics with return have an additional tied operand and are
5932 // missing some of the special bits.
5933 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
5934 MachineInstr *Addr64;
5935
5936 if (!VDataIn) {
5937 // Regular buffer load / store.
5938 MachineInstrBuilder MIB =
5939 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
5940 .add(*VData)
5941 .addReg(NewVAddr)
5942 .addReg(NewSRsrc)
5943 .add(*SOffset)
5944 .add(*Offset);
5945
5946 if (const MachineOperand *CPol =
5947 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5948 MIB.addImm(CPol->getImm());
5949 }
5950
5951 if (const MachineOperand *TFE =
5952 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
5953 MIB.addImm(TFE->getImm());
5954 }
5955
5956 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
5957
5958 MIB.cloneMemRefs(MI);
5959 Addr64 = MIB;
5960 } else {
5961 // Atomics with return.
5962 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
5963 .add(*VData)
5964 .add(*VDataIn)
5965 .addReg(NewVAddr)
5966 .addReg(NewSRsrc)
5967 .add(*SOffset)
5968 .add(*Offset)
5969 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
5970 .cloneMemRefs(MI);
5971 }
5972
5973 MI.removeFromParent();
5974
5975 // NewVaddr = {NewVaddrHi, NewVaddrLo}
5976 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
5977 NewVAddr)
5978 .addReg(RsrcPtr, 0, AMDGPU::sub0)
5979 .addImm(AMDGPU::sub0)
5980 .addReg(RsrcPtr, 0, AMDGPU::sub1)
5981 .addImm(AMDGPU::sub1);
5982 } else {
5983 // This is another variant; legalize Rsrc with waterfall loop from VGPRs
5984 // to SGPRs.
5985 CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
5986 return CreatedBB;
5987 }
5988 }
5989 return CreatedBB;
5990}
5991
5992MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
5993 MachineDominatorTree *MDT) const {
5994 SetVectorType Worklist;
5995 Worklist.insert(&TopInst);
5996 MachineBasicBlock *CreatedBB = nullptr;
5997 MachineBasicBlock *CreatedBBTmp = nullptr;
5998
5999 while (!Worklist.empty()) {
6000 MachineInstr &Inst = *Worklist.pop_back_val();
6001 MachineBasicBlock *MBB = Inst.getParent();
6002 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6003
6004 unsigned Opcode = Inst.getOpcode();
6005 unsigned NewOpcode = getVALUOp(Inst);
6006
6007 // Handle some special cases
6008 switch (Opcode) {
6009 default:
6010 break;
6011 case AMDGPU::S_ADD_U64_PSEUDO:
6012 case AMDGPU::S_SUB_U64_PSEUDO:
6013 splitScalar64BitAddSub(Worklist, Inst, MDT);
6014 Inst.eraseFromParent();
6015 continue;
6016 case AMDGPU::S_ADD_I32:
6017 case AMDGPU::S_SUB_I32: {
6018 // FIXME: The u32 versions currently selected use the carry.
6019 bool Changed;
6020 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
6021 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
6022 CreatedBB = CreatedBBTmp;
6023 if (Changed)
6024 continue;
6025
6026 // Default handling
6027 break;
6028 }
6029 case AMDGPU::S_AND_B64:
6030 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
6031 Inst.eraseFromParent();
6032 continue;
6033
6034 case AMDGPU::S_OR_B64:
6035 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
6036 Inst.eraseFromParent();
6037 continue;
6038
6039 case AMDGPU::S_XOR_B64:
6040 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
6041 Inst.eraseFromParent();
6042 continue;
6043
6044 case AMDGPU::S_NAND_B64:
6045 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
6046 Inst.eraseFromParent();
6047 continue;
6048
6049 case AMDGPU::S_NOR_B64:
6050 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
6051 Inst.eraseFromParent();
6052 continue;
6053
6054 case AMDGPU::S_XNOR_B64:
6055 if (ST.hasDLInsts())
6056 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
6057 else
6058 splitScalar64BitXnor(Worklist, Inst, MDT);
6059 Inst.eraseFromParent();
6060 continue;
6061
6062 case AMDGPU::S_ANDN2_B64:
6063 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
6064 Inst.eraseFromParent();
6065 continue;
6066
6067 case AMDGPU::S_ORN2_B64:
6068 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
6069 Inst.eraseFromParent();
6070 continue;
6071
6072 case AMDGPU::S_BREV_B64:
6073 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
6074 Inst.eraseFromParent();
6075 continue;
6076
6077 case AMDGPU::S_NOT_B64:
6078 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
6079 Inst.eraseFromParent();
6080 continue;
6081
6082 case AMDGPU::S_BCNT1_I32_B64:
6083 splitScalar64BitBCNT(Worklist, Inst);
6084 Inst.eraseFromParent();
6085 continue;
6086
6087 case AMDGPU::S_BFE_I64:
6088 splitScalar64BitBFE(Worklist, Inst);
6089 Inst.eraseFromParent();
6090 continue;
6091
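    // Subtargets that only have the "REV" VALU shift forms take the shift
    // amount in src0, so switch to the V_*REV_* opcode and swap the operands.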
6092 case AMDGPU::S_LSHL_B32:
6093 if (ST.hasOnlyRevVALUShifts()) {
6094 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
6095 swapOperands(Inst);
6096 }
6097 break;
6098 case AMDGPU::S_ASHR_I32:
6099 if (ST.hasOnlyRevVALUShifts()) {
6100 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
6101 swapOperands(Inst);
6102 }
6103 break;
6104 case AMDGPU::S_LSHR_B32:
6105 if (ST.hasOnlyRevVALUShifts()) {
6106 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
6107 swapOperands(Inst);
6108 }
6109 break;
6110 case AMDGPU::S_LSHL_B64:
6111 if (ST.hasOnlyRevVALUShifts()) {
6112 NewOpcode = AMDGPU::V_LSHLREV_B64_e64;
6113 swapOperands(Inst);
6114 }
6115 break;
6116 case AMDGPU::S_ASHR_I64:
6117 if (ST.hasOnlyRevVALUShifts()) {
6118 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
6119 swapOperands(Inst);
6120 }
6121 break;
6122 case AMDGPU::S_LSHR_B64:
6123 if (ST.hasOnlyRevVALUShifts()) {
6124 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
6125 swapOperands(Inst);
6126 }
6127 break;
6128
6129 case AMDGPU::S_ABS_I32:
6130 lowerScalarAbs(Worklist, Inst);
6131 Inst.eraseFromParent();
6132 continue;
6133
6134 case AMDGPU::S_CBRANCH_SCC0:
6135 case AMDGPU::S_CBRANCH_SCC1: {
6136 // Clear unused bits of vcc
6137 Register CondReg = Inst.getOperand(1).getReg();
6138 bool IsSCC = CondReg == AMDGPU::SCC;
6139 Register VCC = RI.getVCC();
6140 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6141 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6142 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
6143 .addReg(EXEC)
6144 .addReg(IsSCC ? VCC : CondReg);
6145 Inst.RemoveOperand(1);
6146 }
6147 break;
6148
6149 case AMDGPU::S_BFE_U64:
6150 case AMDGPU::S_BFM_B64:
6151 llvm_unreachable("Moving this op to VALU not implemented");
6152
6153 case AMDGPU::S_PACK_LL_B32_B16:
6154 case AMDGPU::S_PACK_LH_B32_B16:
6155 case AMDGPU::S_PACK_HH_B32_B16:
6156 movePackToVALU(Worklist, MRI, Inst);
6157 Inst.eraseFromParent();
6158 continue;
6159
6160 case AMDGPU::S_XNOR_B32:
6161 lowerScalarXnor(Worklist, Inst);
6162 Inst.eraseFromParent();
6163 continue;
6164
6165 case AMDGPU::S_NAND_B32:
6166 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
6167 Inst.eraseFromParent();
6168 continue;
6169
6170 case AMDGPU::S_NOR_B32:
6171 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
6172 Inst.eraseFromParent();
6173 continue;
6174
6175 case AMDGPU::S_ANDN2_B32:
6176 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
6177 Inst.eraseFromParent();
6178 continue;
6179
6180 case AMDGPU::S_ORN2_B32:
6181 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
6182 Inst.eraseFromParent();
6183 continue;
6184
6185 // TODO: remove as soon as everything is ready
6186 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
6187 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
6188 // can only be selected from the uniform SDNode.
6189 case AMDGPU::S_ADD_CO_PSEUDO:
6190 case AMDGPU::S_SUB_CO_PSEUDO: {
6191 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6192 ? AMDGPU::V_ADDC_U32_e64
6193 : AMDGPU::V_SUBB_U32_e64;
6194 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6195
6196 Register CarryInReg = Inst.getOperand(4).getReg();
6197 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
6198 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
6199 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
6200 .addReg(CarryInReg);
6201 }
6202
6203 Register CarryOutReg = Inst.getOperand(1).getReg();
6204
6205 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
6206 MRI.getRegClass(Inst.getOperand(0).getReg())));
6207 MachineInstr *CarryOp =
6208 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
6209 .addReg(CarryOutReg, RegState::Define)
6210 .add(Inst.getOperand(2))
6211 .add(Inst.getOperand(3))
6212 .addReg(CarryInReg)
6213 .addImm(0);
6214 CreatedBBTmp = legalizeOperands(*CarryOp);
6215 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
6216 CreatedBB = CreatedBBTmp;
6217 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
6218 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
6219 Inst.eraseFromParent();
6220 }
6221 continue;
6222 case AMDGPU::S_UADDO_PSEUDO:
6223 case AMDGPU::S_USUBO_PSEUDO: {
6224 const DebugLoc &DL = Inst.getDebugLoc();
6225 MachineOperand &Dest0 = Inst.getOperand(0);
6226 MachineOperand &Dest1 = Inst.getOperand(1);
6227 MachineOperand &Src0 = Inst.getOperand(2);
6228 MachineOperand &Src1 = Inst.getOperand(3);
6229
6230 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6231 ? AMDGPU::V_ADD_CO_U32_e64
6232 : AMDGPU::V_SUB_CO_U32_e64;
6233 const TargetRegisterClass *NewRC =
6234 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
6235 Register DestReg = MRI.createVirtualRegister(NewRC);
6236 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
6237 .addReg(Dest1.getReg(), RegState::Define)
6238 .add(Src0)
6239 .add(Src1)
6240 .addImm(0); // clamp bit
6241
6242 CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
6243 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
6244 CreatedBB = CreatedBBTmp;
6245
6246 MRI.replaceRegWith(Dest0.getReg(), DestReg);
6247 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
6248 Worklist);
6249 Inst.eraseFromParent();
6250 }
6251 continue;
6252
6253 case AMDGPU::S_CSELECT_B32:
6254 case AMDGPU::S_CSELECT_B64:
6255 lowerSelect(Worklist, Inst, MDT);
6256 Inst.eraseFromParent();
6257 continue;
6258 case AMDGPU::S_CMP_EQ_I32:
6259 case AMDGPU::S_CMP_LG_I32:
6260 case AMDGPU::S_CMP_GT_I32:
6261 case AMDGPU::S_CMP_GE_I32:
6262 case AMDGPU::S_CMP_LT_I32:
6263 case AMDGPU::S_CMP_LE_I32:
6264 case AMDGPU::S_CMP_EQ_U32:
6265 case AMDGPU::S_CMP_LG_U32:
6266 case AMDGPU::S_CMP_GT_U32:
6267 case AMDGPU::S_CMP_GE_U32:
6268 case AMDGPU::S_CMP_LT_U32:
6269 case AMDGPU::S_CMP_LE_U32:
6270 case AMDGPU::S_CMP_EQ_U64:
6271 case AMDGPU::S_CMP_LG_U64: {
6272 const MCInstrDesc &NewDesc = get(NewOpcode);
6273 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
6274 MachineInstr *NewInstr =
6275 BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
6276 .add(Inst.getOperand(0))
6277 .add(Inst.getOperand(1));
6278 legalizeOperands(*NewInstr, MDT);
6279 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
6280 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
6281 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
6282 Inst.eraseFromParent();
6283 }
6284 continue;
6285 }
6286
6287
6288 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
6289 // We cannot move this instruction to the VALU, so we should try to
6290 // legalize its operands instead.
6291 CreatedBBTmp = legalizeOperands(Inst, MDT);
6292 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
6293 CreatedBB = CreatedBBTmp;
6294 continue;
6295 }
6296
6297 // Use the new VALU Opcode.
6298 const MCInstrDesc &NewDesc = get(NewOpcode);
6299 Inst.setDesc(NewDesc);
6300
6301 // Remove any references to SCC. Vector instructions can't read from it,
6302 // and we're just about to add the implicit uses / defs of VCC; we don't
6303 // want both.
6304 for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
6305 MachineOperand &Op = Inst.getOperand(i);
6306 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
6307 // Only propagate through live-def of SCC.
6308 if (Op.isDef() && !Op.isDead())
6309 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
6310 if (Op.isUse())
6311 addSCCDefsToVALUWorklist(Op, Worklist);
6312 Inst.RemoveOperand(i);
6313 }
6314 }
6315
6316 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
6317 // We are converting these to a BFE, so we need to add the missing
6318 // operands for the size and offset.
6319 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
6320 Inst.addOperand(MachineOperand::CreateImm(0));
6321 Inst.addOperand(MachineOperand::CreateImm(Size));
6322
6323 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
6324 // The VALU version adds the second operand to the result, so insert an
6325 // extra 0 operand.
6326 Inst.addOperand(MachineOperand::CreateImm(0));
6327 }
6328
6329 Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
6330 fixImplicitOperands(Inst);
6331
6332 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
6333 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
6334 // If we need to move this to VGPRs, we need to unpack the second operand
6335 // back into the 2 separate ones for bit offset and width.
6336 assert(OffsetWidthOp.isImm() &&
6337        "Scalar BFE is only implemented for constant width and offset");
6338 uint32_t Imm = OffsetWidthOp.getImm();
6339
6340 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
6341 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
6342 Inst.RemoveOperand(2); // Remove old immediate.
6343 Inst.addOperand(MachineOperand::CreateImm(Offset));
6344 Inst.addOperand(MachineOperand::CreateImm(BitWidth));
6345 }
6346
6347 bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
6348 unsigned NewDstReg = AMDGPU::NoRegister;
6349 if (HasDst) {
6350 Register DstReg = Inst.getOperand(0).getReg();
6351 if (DstReg.isPhysical())
6352 continue;
6353
6354 // Update the destination register class.
6355 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
6356 if (!NewDstRC)
6357 continue;
6358
6359 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
6360 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
6361 // Instead of creating a copy where src and dst are the same register
6362 // class, we just replace all uses of dst with src. These kinds of
6363 // copies interfere with the heuristics MachineSink uses to decide
6364 // whether or not to split a critical edge, since the pass assumes
6365 // that copies will end up as machine instructions and not be
6366 // eliminated.
6367 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
6368 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
6369 MRI.clearKillFlags(Inst.getOperand(1).getReg());
6370 Inst.getOperand(0).setReg(DstReg);
6371
6372 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
6373 // these are deleted later, but at -O0 it would leave a suspicious
6374 // looking illegal copy of an undef register.
6375 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
6376 Inst.RemoveOperand(I);
6377 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
6378 continue;
6379 }
6380
6381 NewDstReg = MRI.createVirtualRegister(NewDstRC);
6382 MRI.replaceRegWith(DstReg, NewDstReg);
6383 }
6384
6385 // Legalize the operands
6386 CreatedBBTmp = legalizeOperands(Inst, MDT);
6387 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
6388 CreatedBB = CreatedBBTmp;
6389
6390 if (HasDst)
6391 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
6392 }
6393 return CreatedBB;
6394}
6395
6396// Add/sub require special handling to deal with carry outs.
6397std::pair<bool, MachineBasicBlock *>
6398SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
6399 MachineDominatorTree *MDT) const {
6400 if (ST.hasAddNoCarry()) {
6401 // Assume there is no user of scc since we don't select this in that case.
6402 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
6403 // is used.
6404
6405 MachineBasicBlock &MBB = *Inst.getParent();
6406 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6407
6408 Register OldDstReg = Inst.getOperand(0).getReg();
6409 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6410
6411 unsigned Opc = Inst.getOpcode();
6412 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
6413
6414 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
6415 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
6416
6417 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
6418 Inst.RemoveOperand(3);
6419
6420 Inst.setDesc(get(NewOpc));
6421 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
6422 Inst.addImplicitDefUseOperands(*MBB.getParent());
6423 MRI.replaceRegWith(OldDstReg, ResultReg);
6424 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
6425
6426 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6427 return std::make_pair(true, NewBB);
6428 }
6429
6430 return std::make_pair(false, nullptr);
6431}
6432
6433void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
6434 MachineDominatorTree *MDT) const {
6435
6436 MachineBasicBlock &MBB = *Inst.getParent();
6437 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6438 MachineBasicBlock::iterator MII = Inst;
6439 DebugLoc DL = Inst.getDebugLoc();
6440
6441 MachineOperand &Dest = Inst.getOperand(0);
6442 MachineOperand &Src0 = Inst.getOperand(1);
6443 MachineOperand &Src1 = Inst.getOperand(2);
6444 MachineOperand &Cond = Inst.getOperand(3);
6445
6446 Register SCCSource = Cond.getReg();
6447 bool IsSCC = (SCCSource == AMDGPU::SCC);
6448
6449 // If this is a trivial select where the condition is effectively not SCC
6450 // (SCCSource is a source of a copy to SCC), then the select is semantically
6451 // equivalent to copying SCCSource. Hence, there is no need to create a
6452 // V_CNDMASK; we can just use SCCSource directly and bail out.
6453 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
6454 (Src1.getImm() == 0)) {
6455 MRI.replaceRegWith(Dest.getReg(), SCCSource);
6456 return;
6457 }
6458
6459 const TargetRegisterClass *TC =
6460 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6461
6462 Register CopySCC = MRI.createVirtualRegister(TC);
6463
6464 if (IsSCC) {
6465 // Now look for the closest SCC def; if it is a copy, replace SCCSource
6466 // with the COPY's source register.
6467 bool CopyFound = false;
6468 for (MachineInstr &CandI :
6469 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
6470 Inst.getParent()->rend())) {
6471 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
6472 -1) {
6473 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
6474 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC)
6475 .addReg(CandI.getOperand(1).getReg());
6476 CopyFound = true;
6477 }
6478 break;
6479 }
6480 }
6481 if (!CopyFound) {
6482 // SCC def is not a copy
6483 // Insert a trivial select instead of creating a copy, because a copy from
6484 // SCC would semantically mean just copying a single bit, but we may need
6485 // the result to be a vector condition mask that needs preserving.
6486 unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
6487 : AMDGPU::S_CSELECT_B32;
6488 auto NewSelect =
6489 BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
6490 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
6491 }
6492 }
6493
6494 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6495
6496 auto UpdatedInst =
6497 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
6498 .addImm(0)
6499 .add(Src1) // False
6500 .addImm(0)
6501 .add(Src0) // True
6502 .addReg(IsSCC ? CopySCC : SCCSource);
6503
6504 MRI.replaceRegWith(Dest.getReg(), ResultReg);
6505 legalizeOperands(*UpdatedInst, MDT);
6506 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6507}
6508
6509void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
6510 MachineInstr &Inst) const {
6511 MachineBasicBlock &MBB = *Inst.getParent();
6512 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6513 MachineBasicBlock::iterator MII = Inst;
6514 DebugLoc DL = Inst.getDebugLoc();
6515
6516 MachineOperand &Dest = Inst.getOperand(0);
6517 MachineOperand &Src = Inst.getOperand(1);
6518 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6519 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6520
6521 unsigned SubOp = ST.hasAddNoCarry() ?
6522 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
6523
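  // abs(x) is computed as max(x, 0 - x): a VALU subtract produces the negated
  // value, and V_MAX_I32 selects the larger of the two.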
6524 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
6525 .addImm(0)
6526 .addReg(Src.getReg());
6527
6528 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
6529 .addReg(Src.getReg())
6530 .addReg(TmpReg);
6531
6532 MRI.replaceRegWith(Dest.getReg(), ResultReg);
6533 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6534}
6535
6536void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
6537 MachineInstr &Inst) const {
6538 MachineBasicBlock &MBB = *Inst.getParent();
6539 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6540 MachineBasicBlock::iterator MII = Inst;
6541 const DebugLoc &DL = Inst.getDebugLoc();
6542
6543 MachineOperand &Dest = Inst.getOperand(0);
6544 MachineOperand &Src0 = Inst.getOperand(1);
6545 MachineOperand &Src1 = Inst.getOperand(2);
6546
6547 if (ST.hasDLInsts()) {
6548 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6549 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
6550 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
6551
6552 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
6553 .add(Src0)
6554 .add(Src1);
6555
6556 MRI.replaceRegWith(Dest.getReg(), NewDest);
6557 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
6558 } else {
6559 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
6560 // invert either source and then perform the XOR. If either source is a
6561 // scalar register, then we can leave the inversion on the scalar unit to
6562 // achieve a better distribution of scalar and vector instructions.
6563 bool Src0IsSGPR = Src0.isReg() &&
6564 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
6565 bool Src1IsSGPR = Src1.isReg() &&
6566 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
6567 MachineInstr *Xor;
6568 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6569 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6570
6571 // Build a pair of scalar instructions and add them to the work list.
6572 // The next iteration over the work list will lower these to the vector
6573 // unit as necessary.
6574 if (Src0IsSGPR) {
6575 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
6576 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
6577 .addReg(Temp)
6578 .add(Src1);
6579 } else if (Src1IsSGPR) {
6580 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
6581 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
6582 .add(Src0)
6583 .addReg(Temp);
6584 } else {
6585 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
6586 .add(Src0)
6587 .add(Src1);
6588 MachineInstr *Not =
6589 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
6590 Worklist.insert(Not);
6591 }
6592
6593 MRI.replaceRegWith(Dest.getReg(), NewDest);
6594
6595 Worklist.insert(Xor);
6596
6597 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
6598 }
6599}
6600
6601void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
6602 MachineInstr &Inst,
6603 unsigned Opcode) const {
6604 MachineBasicBlock &MBB = *Inst.getParent();
6605 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6606 MachineBasicBlock::iterator MII = Inst;
6607 const DebugLoc &DL = Inst.getDebugLoc();
6608
6609 MachineOperand &Dest = Inst.getOperand(0);
6610 MachineOperand &Src0 = Inst.getOperand(1);
6611 MachineOperand &Src1 = Inst.getOperand(2);
6612
6613 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6614 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6615
6616 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
6617 .add(Src0)
6618 .add(Src1);
6619
6620 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
6621 .addReg(Interm);
6622
6623 Worklist.insert(&Op);
6624 Worklist.insert(&Not);
6625
6626 MRI.replaceRegWith(Dest.getReg(), NewDest);
6627 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
6628}
6629
6630void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
6631 MachineInstr &Inst,
6632 unsigned Opcode) const {
6633 MachineBasicBlock &MBB = *Inst.getParent();
6634 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6635 MachineBasicBlock::iterator MII = Inst;
6636 const DebugLoc &DL = Inst.getDebugLoc();
6637
6638 MachineOperand &Dest = Inst.getOperand(0);
6639 MachineOperand &Src0 = Inst.getOperand(1);
6640 MachineOperand &Src1 = Inst.getOperand(2);
6641
6642 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6643 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6644
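  // The N2 forms compute op(src0, ~src1): emit S_NOT_B32 on src1 first, then
  // the base scalar op, and queue both for further VALU lowering if needed.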
6645 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
6646 .add(Src1);
6647
6648 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
6649 .add(Src0)
6650 .addReg(Interm);
6651
6652 Worklist.insert(&Not);
6653 Worklist.insert(&Op);
6654
6655 MRI.replaceRegWith(Dest.getReg(), NewDest);
6656 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
6657}
6658
6659void SIInstrInfo::splitScalar64BitUnaryOp(
6660 SetVectorType &Worklist, MachineInstr &Inst,
6661 unsigned Opcode, bool Swap) const {
6662 MachineBasicBlock &MBB = *Inst.getParent();
6663 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6664
6665 MachineOperand &Dest = Inst.getOperand(0);
6666 MachineOperand &Src0 = Inst.getOperand(1);
6667 DebugLoc DL = Inst.getDebugLoc();
6668
6669 MachineBasicBlock::iterator MII = Inst;
6670
6671 const MCInstrDesc &InstDesc = get(Opcode);
6672 const TargetRegisterClass *Src0RC = Src0.isReg() ?
6673 MRI.getRegClass(Src0.getReg()) :
6674 &AMDGPU::SGPR_32RegClass;
6675
6676 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
6677
6678 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
6679 AMDGPU::sub0, Src0SubRC);
6680
6681 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
6682 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
6683 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
6684
6685 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
6686 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
6687
6688 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
6689 AMDGPU::sub1, Src0SubRC);
6690
6691 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
6692 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
6693
6694 if (Swap)
6695 std::swap(DestSub0, DestSub1);
6696
6697 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
6698 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
6699 .addReg(DestSub0)
6700 .addImm(AMDGPU::sub0)
6701 .addReg(DestSub1)
6702 .addImm(AMDGPU::sub1);
6703
6704 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
6705
6706 Worklist.insert(&LoHalf);
6707 Worklist.insert(&HiHalf);
6708
6709 // We don't need to legalizeOperands here because for a single operand, src0
6710 // will support any kind of input.
6711
6712 // Move all users of this moved value.
6713 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
6714}
6715
6716void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
6717 MachineInstr &Inst,
6718 MachineDominatorTree *MDT) const {
6719 bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
6720
6721 MachineBasicBlock &MBB = *Inst.getParent();
6722 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6723 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6724
6725 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6726 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6727 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6728
6729 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6730 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6731
6732 MachineOperand &Dest = Inst.getOperand(0);
6733 MachineOperand &Src0 = Inst.getOperand(1);
6734 MachineOperand &Src1 = Inst.getOperand(2);
6735 const DebugLoc &DL = Inst.getDebugLoc();
6736 MachineBasicBlock::iterator MII = Inst;
6737
6738 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
6739 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
6740 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
6741 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
6742
6743 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
6744 AMDGPU::sub0, Src0SubRC);
6745 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
6746 AMDGPU::sub0, Src1SubRC);
6747
6748
6749 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
6750 AMDGPU::sub1, Src0SubRC);
6751 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
6752 AMDGPU::sub1, Src1SubRC);
6753
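  // Add/sub the low halves first; the carry out feeds the V_ADDC/V_SUBB on
  // the high halves, and the two 32-bit results are recombined with
  // REG_SEQUENCE.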
6754 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6755 MachineInstr *LoHalf =
6756 BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
6757 .addReg(CarryReg, RegState::Define)
6758 .add(SrcReg0Sub0)
6759 .add(SrcReg1Sub0)
6760 .addImm(0); // clamp bit
6761
6762 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6763 MachineInstr *HiHalf =
6764 BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
6765 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6766 .add(SrcReg0Sub1)
6767 .add(SrcReg1Sub1)
6768 .addReg(CarryReg, RegState::Kill)
6769 .addImm(0); // clamp bit
6770
6771 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
6772 .addReg(DestSub0)
6773 .addImm(AMDGPU::sub0)
6774 .addReg(DestSub1)
6775 .addImm(AMDGPU::sub1);
6776
6777 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
6778
6779 // Try to legalize the operands in case we need to swap the order to keep it
6780 // valid.
6781 legalizeOperands(*LoHalf, MDT);
6782 legalizeOperands(*HiHalf, MDT);
6783
6784 // Move all users of this moved value.
6785 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
6786}
6787
6788void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
6789 MachineInstr &Inst, unsigned Opcode,
6790 MachineDominatorTree *MDT) const {
6791 MachineBasicBlock &MBB = *Inst.getParent();
6792 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6793
6794 MachineOperand &Dest = Inst.getOperand(0);
6795 MachineOperand &Src0 = Inst.getOperand(1);
6796 MachineOperand &Src1 = Inst.getOperand(2);
6797 DebugLoc DL = Inst.getDebugLoc();
6798
6799 MachineBasicBlock::iterator MII = Inst;
6800
6801 const MCInstrDesc &InstDesc = get(Opcode);
6802 const TargetRegisterClass *Src0RC = Src0.isReg() ?
6803 MRI.getRegClass(Src0.getReg()) :
6804 &AMDGPU::SGPR_32RegClass;
6805
6806 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
6807 const TargetRegisterClass *Src1RC = Src1.isReg() ?
6808 MRI.getRegClass(Src1.getReg()) :
6809 &AMDGPU::SGPR_32RegClass;
6810
6811 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
6812
6813 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
6814 AMDGPU::sub0, Src0SubRC);
6815 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
6816 AMDGPU::sub0, Src1SubRC);
6817 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
6818 AMDGPU::sub1, Src0SubRC);
6819 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
6820 AMDGPU::sub1, Src1SubRC);
6821
6822 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
6823 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
6824 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
6825
6826 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
6827 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
6828 .add(SrcReg0Sub0)
6829 .add(SrcReg1Sub0);
6830
6831 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
6832 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
6833 .add(SrcReg0Sub1)
6834 .add(SrcReg1Sub1);
6835
6836 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
6837 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
6838 .addReg(DestSub0)
6839 .addImm(AMDGPU::sub0)
6840 .addReg(DestSub1)
6841 .addImm(AMDGPU::sub1);
6842
6843 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
6844
6845 Worklist.insert(&LoHalf);
6846 Worklist.insert(&HiHalf);
6847
6848 // Move all users of this moved value.
6849 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
6850}
6851
6852void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
6853 MachineInstr &Inst,
6854 MachineDominatorTree *MDT) const {
6855 MachineBasicBlock &MBB = *Inst.getParent();
6856 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6857
6858 MachineOperand &Dest = Inst.getOperand(0);
6859 MachineOperand &Src0 = Inst.getOperand(1);
6860 MachineOperand &Src1 = Inst.getOperand(2);
6861 const DebugLoc &DL = Inst.getDebugLoc();
6862
6863 MachineBasicBlock::iterator MII = Inst;
6864
6865 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
6866
6867 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6868
6869 MachineOperand* Op0;
6870 MachineOperand* Op1;
6871
6872 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
6873 Op0 = &Src0;
6874 Op1 = &Src1;
6875 } else {
6876 Op0 = &Src1;
6877 Op1 = &Src0;
6878 }
6879
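  // Lower xnor(x, y) as xor(not(x), y), inverting the SGPR operand when there
  // is one so the S_NOT can remain on the scalar unit.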
6880 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
6881 .add(*Op0);
6882
6883 Register NewDest = MRI.createVirtualRegister(DestRC);
6884
6885 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
6886 .addReg(Interm)
6887 .add(*Op1);
6888
6889 MRI.replaceRegWith(Dest.getReg(), NewDest);
6890
6891 Worklist.insert(&Xor);
6892}
6893
6894void SIInstrInfo::splitScalar64BitBCNT(
6895 SetVectorType &Worklist, MachineInstr &Inst) const {
6896 MachineBasicBlock &MBB = *Inst.getParent();
6897 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6898
6899 MachineBasicBlock::iterator MII = Inst;
6900 const DebugLoc &DL = Inst.getDebugLoc();
6901
6902 MachineOperand &Dest = Inst.getOperand(0);
6903 MachineOperand &Src = Inst.getOperand(1);
6904
6905 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
6906 const TargetRegisterClass *SrcRC = Src.isReg() ?
6907 MRI.getRegClass(Src.getReg()) :
6908 &AMDGPU::SGPR_32RegClass;
6909
6910 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6911 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6912
6913 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
6914
6915 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
6916 AMDGPU::sub0, SrcSubRC);
6917 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
6918 AMDGPU::sub1, SrcSubRC);
6919
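  // A 64-bit popcount is bcnt(hi) + bcnt(lo): the first V_BCNT counts the low
  // half (plus 0), and the second counts the high half while accumulating the
  // intermediate result through its second operand.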
6920 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
6921
6922 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
6923
6924 MRI.replaceRegWith(Dest.getReg(), ResultReg);
6925
6926 // We don't need to legalize operands here. src0 for either instruction can be
6927 // an SGPR, and the second input is unused or determined here.
6928 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6929}
6930
6931void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
6932 MachineInstr &Inst) const {
6933 MachineBasicBlock &MBB = *Inst.getParent();
6934 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6935 MachineBasicBlock::iterator MII = Inst;
6936 const DebugLoc &DL = Inst.getDebugLoc();
6937
6938 MachineOperand &Dest = Inst.getOperand(0);
6939 uint32_t Imm = Inst.getOperand(2).getImm();
6940 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
6941 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
6942
6943 (void) Offset;
6944
6945 // Only sext_inreg cases handled.
6946 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
6947        Offset == 0 && "Not implemented");
6948
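  // For widths below 32 bits, sign-extend the low half with V_BFE_I32 and
  // replicate its sign bit into the high half with an arithmetic shift right
  // by 31.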
6949 if (BitWidth < 32) {
6950 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6951 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6952 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6953
6954 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
6955 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
6956 .addImm(0)
6957 .addImm(BitWidth);
6958
6959 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
6960 .addImm(31)
6961 .addReg(MidRegLo);
6962
6963 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
6964 .addReg(MidRegLo)
6965 .addImm(AMDGPU::sub0)
6966 .addReg(MidRegHi)
6967 .addImm(AMDGPU::sub1);
6968
6969 MRI.replaceRegWith(Dest.getReg(), ResultReg);
6970 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6971 return;
6972 }
6973
6974 MachineOperand &Src = Inst.getOperand(1);
6975 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6976 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6977
6978 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
6979 .addImm(31)
6980 .addReg(Src.getReg(), 0, AMDGPU::sub0);
6981
6982 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
6983 .addReg(Src.getReg(), 0, AMDGPU::sub0)
6984 .addImm(AMDGPU::sub0)
6985 .addReg(TmpReg)
6986 .addImm(AMDGPU::sub1);
6987
6988 MRI.replaceRegWith(Dest.getReg(), ResultReg);
6989 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6990}
6991
6992void SIInstrInfo::addUsersToMoveToVALUWorklist(
6993 Register DstReg,
6994 MachineRegisterInfo &MRI,
6995 SetVectorType &Worklist) const {
6996 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
6997 E = MRI.use_end(); I != E;) {
6998 MachineInstr &UseMI = *I->getParent();
6999
7000 unsigned OpNo = 0;
7001
7002 switch (UseMI.getOpcode()) {
7003 case AMDGPU::COPY:
7004 case AMDGPU::WQM:
7005 case AMDGPU::SOFT_WQM:
7006 case AMDGPU::STRICT_WWM:
7007 case AMDGPU::STRICT_WQM:
7008 case AMDGPU::REG_SEQUENCE:
7009 case AMDGPU::PHI:
7010 case AMDGPU::INSERT_SUBREG:
7011 break;
7012 default:
7013 OpNo = I.getOperandNo();
7014 break;
7015 }
7016
7017 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
7018 Worklist.insert(&UseMI);
7019
7020 do {
7021 ++I;
7022 } while (I != E && I->getParent() == &UseMI);
7023 } else {
7024 ++I;
7025 }
7026 }
7027}
7028
7029void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
7030 MachineRegisterInfo &MRI,
7031 MachineInstr &Inst) const {
7032 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7033 MachineBasicBlock *MBB = Inst.getParent();
7034 MachineOperand &Src0 = Inst.getOperand(1);
7035 MachineOperand &Src1 = Inst.getOperand(2);
7036 const DebugLoc &DL = Inst.getDebugLoc();
7037
7038 switch (Inst.getOpcode()) {
7039 case AMDGPU::S_PACK_LL_B32_B16: {
7040 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7041 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7042
7043 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
7044 // 0.
7045 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
7046 .addImm(0xffff);
7047
7048 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
7049 .addReg(ImmReg, RegState::Kill)
7050 .add(Src0);
7051
7052 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
7053 .add(Src1)
7054 .addImm(16)
7055 .addReg(TmpReg, RegState::Kill);
7056 break;
7057 }
7058 case AMDGPU::S_PACK_LH_B32_B16: {
7059 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7060 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
7061 .addImm(0xffff);
7062 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
7063 .addReg(ImmReg, RegState::Kill)
7064 .add(Src0)
7065 .add(Src1);
7066 break;
7067 }
7068 case AMDGPU::S_PACK_HH_B32_B16: {
7069 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7070 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7071 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7072 .addImm(16)
7073 .add(Src0);
7074 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
7075 .addImm(0xffff0000);
7076 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
7077 .add(Src1)
7078 .addReg(ImmReg, RegState::Kill)
7079 .addReg(TmpReg, RegState::Kill);
7080 break;
7081 }
7082 default:
7083    llvm_unreachable("unhandled s_pack_* instruction");
7084 }
7085
7086 MachineOperand &Dest = Inst.getOperand(0);
7087 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7088 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7089}
7090
7091void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
7092 MachineInstr &SCCDefInst,
7093 SetVectorType &Worklist,
7094 Register NewCond) const {
7095
7096 // Ensure that def inst defines SCC, which is still live.
7097  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
7098         !Op.isDead() && Op.getParent() == &SCCDefInst);
7099 SmallVector<MachineInstr *, 4> CopyToDelete;
7100 // This assumes that all the users of SCC are in the same block
7101 // as the SCC def.
7102 for (MachineInstr &MI : // Skip the def inst itself.
7103 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
7104 SCCDefInst.getParent()->end())) {
7105 // Check if SCC is used first.
7106 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI);
7107 if (SCCIdx != -1) {
7108 if (MI.isCopy()) {
7109 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7110 Register DestReg = MI.getOperand(0).getReg();
7111
7112 MRI.replaceRegWith(DestReg, NewCond);
7113 CopyToDelete.push_back(&MI);
7114 } else {
7115
7116 if (NewCond.isValid())
7117 MI.getOperand(SCCIdx).setReg(NewCond);
7118
7119 Worklist.insert(&MI);
7120 }
7121 }
7122 // Exit if we find another SCC def.
7123 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
7124 break;
7125 }
7126 for (auto &Copy : CopyToDelete)
7127 Copy->eraseFromParent();
7128}
7129
7130// Instructions that use SCC may be converted to VALU instructions. When that
7131// happens, the SCC register is changed to VCC_LO. The instruction that defines
7132// SCC must be changed to an instruction that defines VCC. This function makes
7133// sure that the instruction that defines SCC is added to the moveToVALU
7134// worklist.
7135void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op,
7136 SetVectorType &Worklist) const {
7137  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse());
7138
7139 MachineInstr *SCCUseInst = Op.getParent();
7140 // Look for a preceding instruction that either defines VCC or SCC. If VCC
7141 // then there is nothing to do because the defining instruction has been
7142 // converted to a VALU already. If SCC then that instruction needs to be
7143 // converted to a VALU.
7144 for (MachineInstr &MI :
7145 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
7146 SCCUseInst->getParent()->rend())) {
7147 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
7148 break;
7149 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
7150 Worklist.insert(&MI);
7151 break;
7152 }
7153 }
7154}
7155
7156const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
7157 const MachineInstr &Inst) const {
7158 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
7159
7160 switch (Inst.getOpcode()) {
7161 // For target instructions, getOpRegClass just returns the virtual register
7162 // class associated with the operand, so we need to find an equivalent VGPR
7163 // register class in order to move the instruction to the VALU.
7164 case AMDGPU::COPY:
7165 case AMDGPU::PHI:
7166 case AMDGPU::REG_SEQUENCE:
7167 case AMDGPU::INSERT_SUBREG:
7168 case AMDGPU::WQM:
7169 case AMDGPU::SOFT_WQM:
7170 case AMDGPU::STRICT_WWM:
7171 case AMDGPU::STRICT_WQM: {
7172 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
7173 if (RI.isAGPRClass(SrcRC)) {
7174 if (RI.isAGPRClass(NewDstRC))
7175 return nullptr;
7176
7177 switch (Inst.getOpcode()) {
7178 case AMDGPU::PHI:
7179 case AMDGPU::REG_SEQUENCE:
7180 case AMDGPU::INSERT_SUBREG:
7181 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
7182 break;
7183 default:
7184 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
7185 }
7186
7187 if (!NewDstRC)
7188 return nullptr;
7189 } else {
7190 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
7191 return nullptr;
7192
7193 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
7194 if (!NewDstRC)
7195 return nullptr;
7196 }
7197
7198 return NewDstRC;
7199 }
7200 default:
7201 return NewDstRC;
7202 }
7203}
7204
7205// Find the one SGPR operand we are allowed to use.
7206Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
7207 int OpIndices[3]) const {
7208 const MCInstrDesc &Desc = MI.getDesc();
7209
7210 // Find the one SGPR operand we are allowed to use.
7211 //
7212 // First we need to consider the instruction's operand requirements before
7213 // legalizing. Some operands are required to be SGPRs, such as implicit uses
7214 // of VCC, but we are still bound by the constant bus requirement to only use
7215 // one.
7216 //
7217 // If the operand's class is an SGPR, we can never move it.
7218
7219 Register SGPRReg = findImplicitSGPRRead(MI);
7220 if (SGPRReg != AMDGPU::NoRegister)
7221 return SGPRReg;
7222
7223 Register UsedSGPRs[3] = { AMDGPU::NoRegister };
7224 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7225
7226 for (unsigned i = 0; i < 3; ++i) {
7227 int Idx = OpIndices[i];
7228 if (Idx == -1)
7229 break;
7230
7231 const MachineOperand &MO = MI.getOperand(Idx);
7232 if (!MO.isReg())
7233 continue;
7234
7235 // Is this operand statically required to be an SGPR based on the operand
7236 // constraints?
7237 const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
7238 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
7239 if (IsRequiredSGPR)
7240 return MO.getReg();
7241
7242    // If this could be a VGPR or an SGPR, check the dynamic register class.
7243 Register Reg = MO.getReg();
7244 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
7245 if (RI.isSGPRClass(RegRC))
7246 UsedSGPRs[i] = Reg;
7247 }
7248
7249 // We don't have a required SGPR operand, so we have a bit more freedom in
7250 // selecting operands to move.
7251
7252 // Try to select the most used SGPR. If an SGPR is equal to one of the
7253 // others, we choose that.
7254 //
7255 // e.g.
7256 // V_FMA_F32 v0, s0, s0, s0 -> No moves
7257 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
7258
7259 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
7260 // prefer those.
7261
7262 if (UsedSGPRs[0] != AMDGPU::NoRegister) {
7263 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
7264 SGPRReg = UsedSGPRs[0];
7265 }
7266
7267 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
7268 if (UsedSGPRs[1] == UsedSGPRs[2])
7269 SGPRReg = UsedSGPRs[1];
7270 }
7271
7272 return SGPRReg;
7273}
7274
7275MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
7276 unsigned OperandName) const {
7277 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
7278 if (Idx == -1)
7279 return nullptr;
7280
7281 return &MI.getOperand(Idx);
7282}
7283
7284uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
7285 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
7286 return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) |
7287 (1ULL << 56) | // RESOURCE_LEVEL = 1
7288 (3ULL << 60); // OOB_SELECT = 3
7289 }
7290
7291 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
7292 if (ST.isAmdHsaOS()) {
7293 // Set ATC = 1. GFX9 doesn't have this bit.
7294 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
7295 RsrcDataFormat |= (1ULL << 56);
7296
7297 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
7298 // BTW, it disables TC L2 and therefore decreases performance.
7299 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
7300 RsrcDataFormat |= (2ULL << 59);
7301 }
7302
7303 return RsrcDataFormat;
7304}
7305
7306uint64_t SIInstrInfo::getScratchRsrcWords23() const {
7307 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
7308 AMDGPU::RSRC_TID_ENABLE |
7309 0xffffffff; // Size;
7310
7311 // GFX9 doesn't have ELEMENT_SIZE.
7312 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
7313 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
7314 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
7315 }
7316
7317 // IndexStride = 64 / 32.
7318 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
7319 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
7320
7321 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
7322 // Clear them unless we want a huge stride.
7323 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7324 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
7325 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
7326
7327 return Rsrc23;
7328}
7329
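For readers of the report, the following standalone sketch recomputes the ELEMENT_SIZE and INDEX_STRIDE fields built by getScratchRsrcWords23() above. The shift amounts and the 16-byte element size are illustrative assumptions, not the real AMDGPU::RSRC_* constants.

// Minimal sketch of the Rsrc23 field arithmetic (assumed shift values).
#include <cstdint>
#include <cstdio>

int main() {
  const unsigned ElementSizeShift = 59; // stand-in for RSRC_ELEMENT_SIZE_SHIFT
  const unsigned IndexStrideShift = 61; // stand-in for RSRC_INDEX_STRIDE_SHIFT

  uint64_t Rsrc23 = 0xffffffffu;        // Size field, as in the source above
  uint64_t EltSizeValue = 4 - 1;        // Log2(16-byte max element) - 1 == 3
  uint64_t IndexStride = 3;             // wavefront size 64 -> 3, 32 -> 2

  Rsrc23 |= EltSizeValue << ElementSizeShift;
  Rsrc23 |= IndexStride << IndexStrideShift;
  std::printf("Rsrc23 = 0x%016llx\n", (unsigned long long)Rsrc23);
  return 0;
}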
7330bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
7331 unsigned Opc = MI.getOpcode();
7332
7333 return isSMRD(Opc);
7334}
7335
7336bool SIInstrInfo::isHighLatencyDef(int Opc) const {
7337 return get(Opc).mayLoad() &&
7338 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
7339}
7340
7341unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
7342 int &FrameIndex) const {
7343 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7344 if (!Addr || !Addr->isFI())
7345 return AMDGPU::NoRegister;
7346
7347  assert(!MI.memoperands_empty() &&
7348         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
7349
7350 FrameIndex = Addr->getIndex();
7351 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
7352}
7353
7354unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
7355 int &FrameIndex) const {
7356 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
7357  assert(Addr && Addr->isFI());
7358 FrameIndex = Addr->getIndex();
7359 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
7360}
7361
7362unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
7363 int &FrameIndex) const {
7364 if (!MI.mayLoad())
7365 return AMDGPU::NoRegister;
7366
7367 if (isMUBUF(MI) || isVGPRSpill(MI))
7368 return isStackAccess(MI, FrameIndex);
7369
7370 if (isSGPRSpill(MI))
7371 return isSGPRStackAccess(MI, FrameIndex);
7372
7373 return AMDGPU::NoRegister;
7374}
7375
7376unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
7377 int &FrameIndex) const {
7378 if (!MI.mayStore())
7379 return AMDGPU::NoRegister;
7380
7381 if (isMUBUF(MI) || isVGPRSpill(MI))
7382 return isStackAccess(MI, FrameIndex);
7383
7384 if (isSGPRSpill(MI))
7385 return isSGPRStackAccess(MI, FrameIndex);
7386
7387 return AMDGPU::NoRegister;
7388}
7389
7390unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
7391 unsigned Size = 0;
7392 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
7393 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
7394 while (++I != E && I->isInsideBundle()) {
7395    assert(!I->isBundle() && "No nested bundle!");
7396 Size += getInstSizeInBytes(*I);
7397 }
7398
7399 return Size;
7400}
7401
7402unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
7403 unsigned Opc = MI.getOpcode();
7404 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
7405 unsigned DescSize = Desc.getSize();
7406
7407 // If we have a definitive size, we can use it. Otherwise we need to inspect
7408 // the operands to know the size.
7409 if (isFixedSize(MI)) {
7410 unsigned Size = DescSize;
7411
7412 // If we hit the buggy offset, an extra nop will be inserted in MC so
7413 // estimate the worst case.
7414 if (MI.isBranch() && ST.hasOffset3fBug())
7415 Size += 4;
7416
7417 return Size;
7418 }
7419
7420 // Instructions may have a 32-bit literal encoded after them. Check
7421 // operands that could ever be literals.
7422 if (isVALU(MI) || isSALU(MI)) {
7423 if (isDPP(MI))
7424 return DescSize;
7425 bool HasLiteral = false;
7426 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
7427 if (isLiteralConstant(MI, I)) {
7428 HasLiteral = true;
7429 break;
7430 }
7431 }
7432 return HasLiteral ? DescSize + 4 : DescSize;
7433 }
7434
7435 // Check whether we have extra NSA words.
7436 if (isMIMG(MI)) {
7437 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
7438 if (VAddr0Idx < 0)
7439 return 8;
7440
7441 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
7442 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
7443 }
7444
7445 switch (Opc) {
7446 case TargetOpcode::BUNDLE:
7447 return getInstBundleSize(MI);
7448 case TargetOpcode::INLINEASM:
7449 case TargetOpcode::INLINEASM_BR: {
7450 const MachineFunction *MF = MI.getParent()->getParent();
7451 const char *AsmStr = MI.getOperand(0).getSymbolName();
7452 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
7453 }
7454 default:
7455 if (MI.isMetaInstruction())
7456 return 0;
7457 return DescSize;
7458 }
7459}
7460
7461bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
7462 if (!isFLAT(MI))
7463 return false;
7464
7465 if (MI.memoperands_empty())
7466 return true;
7467
7468 for (const MachineMemOperand *MMO : MI.memoperands()) {
7469 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
7470 return true;
7471 }
7472 return false;
7473}
7474
7475bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
7476 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
7477}
7478
7479void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
7480 MachineBasicBlock *IfEnd) const {
7481 MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
7482  assert(TI != IfEntry->end());
7483
7484 MachineInstr *Branch = &(*TI);
7485 MachineFunction *MF = IfEntry->getParent();
7486 MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
7487
7488 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
7489 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
7490 MachineInstr *SIIF =
7491 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
7492 .add(Branch->getOperand(0))
7493 .add(Branch->getOperand(1));
7494 MachineInstr *SIEND =
7495 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
7496 .addReg(DstReg);
7497
7498 IfEntry->erase(TI);
7499 IfEntry->insert(IfEntry->end(), SIIF);
7500 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
7501 }
7502}
7503
7504void SIInstrInfo::convertNonUniformLoopRegion(
7505 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
7506 MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
7507 // We expect 2 terminators, one conditional and one unconditional.
7508  assert(TI != LoopEnd->end());
7509
7510 MachineInstr *Branch = &(*TI);
7511 MachineFunction *MF = LoopEnd->getParent();
7512 MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
7513
7514 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
7515
7516 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
7517 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
7518 MachineInstrBuilder HeaderPHIBuilder =
7519 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
7520 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) {
7521 if (PMBB == LoopEnd) {
7522 HeaderPHIBuilder.addReg(BackEdgeReg);
7523 } else {
7524 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
7525 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
7526 ZeroReg, 0);
7527 HeaderPHIBuilder.addReg(ZeroReg);
7528 }
7529 HeaderPHIBuilder.addMBB(PMBB);
7530 }
7531 MachineInstr *HeaderPhi = HeaderPHIBuilder;
7532 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
7533 get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
7534 .addReg(DstReg)
7535 .add(Branch->getOperand(0));
7536 MachineInstr *SILOOP =
7537 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
7538 .addReg(BackEdgeReg)
7539 .addMBB(LoopEntry);
7540
7541 LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
7542 LoopEnd->erase(TI);
7543 LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
7544 LoopEnd->insert(LoopEnd->end(), SILOOP);
7545 }
7546}
7547
7548ArrayRef<std::pair<int, const char *>>
7549SIInstrInfo::getSerializableTargetIndices() const {
7550 static const std::pair<int, const char *> TargetIndices[] = {
7551 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
7552 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
7553 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
7554 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
7555 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
7556 return makeArrayRef(TargetIndices);
7557}
7558
7559/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
7560/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
7561ScheduleHazardRecognizer *
7562SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
7563 const ScheduleDAG *DAG) const {
7564 return new GCNHazardRecognizer(DAG->MF);
7565}
7566
7567/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
7568/// pass.
7569ScheduleHazardRecognizer *
7570SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
7571 return new GCNHazardRecognizer(MF);
7572}
7573
7574// Called during:
7575// - pre-RA scheduling and post-RA scheduling
7576ScheduleHazardRecognizer *
7577SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
7578 const ScheduleDAGMI *DAG) const {
7579 // Borrowed from Arm Target
7580 // We would like to restrict this hazard recognizer to only
7581 // post-RA scheduling; we can tell that we're post-RA because we don't
7582 // track VRegLiveness.
7583 if (!DAG->hasVRegLiveness())
7584 return new GCNHazardRecognizer(DAG->MF);
7585 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
7586}
7587
7588std::pair<unsigned, unsigned>
7589SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
7590 return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
7591}
7592
7593ArrayRef<std::pair<unsigned, const char *>>
7594SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
7595 static const std::pair<unsigned, const char *> TargetFlags[] = {
7596 { MO_GOTPCREL, "amdgpu-gotprel" },
7597 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
7598 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
7599 { MO_REL32_LO, "amdgpu-rel32-lo" },
7600 { MO_REL32_HI, "amdgpu-rel32-hi" },
7601 { MO_ABS32_LO, "amdgpu-abs32-lo" },
7602 { MO_ABS32_HI, "amdgpu-abs32-hi" },
7603 };
7604
7605 return makeArrayRef(TargetFlags);
7606}
7607
7608ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
7609SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
7610 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
7611 {
7612 {MONoClobber, "amdgpu-noclobber"},
7613 };
7614
7615 return makeArrayRef(TargetFlags);
7616}
7617
7618bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
7619 return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
7620 MI.modifiesRegister(AMDGPU::EXEC, &RI);
7621}
7622
7623MachineInstrBuilder
7624SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
7625 MachineBasicBlock::iterator I,
7626 const DebugLoc &DL,
7627 Register DestReg) const {
7628 if (ST.hasAddNoCarry())
7629 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
7630
7631 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7632 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
7633 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
7634
7635 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
7636 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
7637}
7638
7639MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
7640 MachineBasicBlock::iterator I,
7641 const DebugLoc &DL,
7642 Register DestReg,
7643 RegScavenger &RS) const {
7644 if (ST.hasAddNoCarry())
7645 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
7646
7647 // If available, prefer to use vcc.
7648 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
7649 ? Register(RI.getVCC())
7650 : RS.scavengeRegister(RI.getBoolRC(), I, 0, false);
7651
7652 // TODO: Users need to deal with this.
7653 if (!UnusedCarry.isValid())
7654 return MachineInstrBuilder();
7655
7656 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
7657 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
7658}
7659
7660bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
7661 switch (Opcode) {
7662 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
7663 case AMDGPU::SI_KILL_I1_TERMINATOR:
7664 return true;
7665 default:
7666 return false;
7667 }
7668}
7669
7670const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
7671 switch (Opcode) {
7672 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
7673 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
7674 case AMDGPU::SI_KILL_I1_PSEUDO:
7675 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
7676 default:
7677    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
7678 }
7679}
7680
7681void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
7682 if (!ST.isWave32())
7683 return;
7684
7685 for (auto &Op : MI.implicit_operands()) {
7686 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
7687 Op.setReg(AMDGPU::VCC_LO);
7688 }
7689}
7690
7691bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
7692 if (!isSMRD(MI))
7693 return false;
7694
7695 // Check that it is using a buffer resource.
7696 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
7697 if (Idx == -1) // e.g. s_memtime
7698 return false;
7699
7700 const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
7701 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
7702}
7703
7704// Depending on the used address space and instructions, some immediate offsets
7705// are allowed and some are not.
7706// In general, flat instruction offsets can only be non-negative, global and
7707// scratch instruction offsets can also be negative.
7708//
7709// There are several bugs related to these offsets:
7710// On gfx10.1, flat instructions that go into the global address space cannot
7711// use an offset.
7712//
7713// For scratch instructions, the address can be either an SGPR or a VGPR.
7714// The following offsets can be used, depending on the architecture (x means
7715// cannot be used):
7716// +----------------------------+------+------+
7717// | Address-Mode | SGPR | VGPR |
7718// +----------------------------+------+------+
7719// | gfx9 | | |
7720// | negative, 4-aligned offset | x | ok |
7721// | negative, unaligned offset | x | ok |
7722// +----------------------------+------+------+
7723// | gfx10 | | |
7724// | negative, 4-aligned offset | ok | ok |
7725// | negative, unaligned offset | ok | x |
7726// +----------------------------+------+------+
7727// | gfx10.3 | | |
7728// | negative, 4-aligned offset | ok | ok |
7729// | negative, unaligned offset | ok | ok |
7730// +----------------------------+------+------+
7731//
7732// This function ignores the addressing mode, so if an offset cannot be used in
7733// one addressing mode, it is considered illegal.
7734bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
7735 uint64_t FlatVariant) const {
7736 // TODO: Should 0 be special cased?
7737 if (!ST.hasFlatInstOffsets())
7738 return false;
7739
7740 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
7741 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
7742 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
7743 return false;
7744
7745 bool Signed = FlatVariant != SIInstrFlags::FLAT;
7746 if (ST.hasNegativeScratchOffsetBug() &&
7747 FlatVariant == SIInstrFlags::FlatScratch)
7748 Signed = false;
7749 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
7750 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
7751 (Offset % 4) != 0) {
7752 return false;
7753 }
7754
7755 unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed);
7756 return Signed ? isIntN(N, Offset) : isUIntN(N, Offset);
7757}
7758
7759// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
7760std::pair<int64_t, int64_t>
7761SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
7762 uint64_t FlatVariant) const {
7763 int64_t RemainderOffset = COffsetVal;
7764 int64_t ImmField = 0;
7765 bool Signed = FlatVariant != SIInstrFlags::FLAT;
7766 if (ST.hasNegativeScratchOffsetBug() &&
7767 FlatVariant == SIInstrFlags::FlatScratch)
7768 Signed = false;
7769
7770 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed);
7771 if (Signed) {
7772 // Use signed division by a power of two to truncate towards 0.
7773 int64_t D = 1LL << (NumBits - 1);
7774 RemainderOffset = (COffsetVal / D) * D;
7775 ImmField = COffsetVal - RemainderOffset;
7776
7777 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
7778 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
7779 (ImmField % 4) != 0) {
7780 // Make ImmField a multiple of 4
7781 RemainderOffset += ImmField % 4;
7782 ImmField -= ImmField % 4;
7783 }
7784 } else if (COffsetVal >= 0) {
7785 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
7786 RemainderOffset = COffsetVal - ImmField;
7787 }
7788
7789  assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
7790  assert(RemainderOffset + ImmField == COffsetVal);
7791 return {ImmField, RemainderOffset};
7792}
7793
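As a sanity check on the split above, here is a self-contained sketch of the signed case: dividing by D = 1 << (NumBits - 1) truncates toward zero, so the immediate field keeps the sign of the original offset and the remainder stays reconstructible. NumBits = 13 is an assumed width for illustration, not a real subtarget value.

// Minimal sketch of the signed splitFlatOffset() arithmetic (assumed width).
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const unsigned NumBits = 13;        // hypothetical signed immediate width
  int64_t COffsetVal = -10000;

  int64_t D = 1LL << (NumBits - 1);                 // 4096
  int64_t RemainderOffset = (COffsetVal / D) * D;   // truncates toward 0: -8192
  int64_t ImmField = COffsetVal - RemainderOffset;  // -1808, fits in 13 signed bits

  assert(RemainderOffset + ImmField == COffsetVal);
  std::printf("imm = %lld, remainder = %lld\n",
              (long long)ImmField, (long long)RemainderOffset);
  return 0;
}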
7794// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
7795enum SIEncodingFamily {
7796 SI = 0,
7797 VI = 1,
7798 SDWA = 2,
7799 SDWA9 = 3,
7800 GFX80 = 4,
7801 GFX9 = 5,
7802 GFX10 = 6,
7803 SDWA10 = 7,
7804 GFX90A = 8,
7805 GFX940 = 9
7806};
7807
7808static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
7809 switch (ST.getGeneration()) {
7810 default:
7811 break;
7812 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
7813 case AMDGPUSubtarget::SEA_ISLANDS:
7814 return SIEncodingFamily::SI;
7815 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
7816 case AMDGPUSubtarget::GFX9:
7817 return SIEncodingFamily::VI;
7818 case AMDGPUSubtarget::GFX10:
7819 return SIEncodingFamily::GFX10;
7820 }
7821  llvm_unreachable("Unknown subtarget generation!");
7822}
7823
7824bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
7825 switch(MCOp) {
7826 // These opcodes use indirect register addressing so
7827 // they need special handling by codegen (currently missing).
7828 // Therefore it is too risky to allow these opcodes
7829 // to be selected by dpp combiner or sdwa peepholer.
7830 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
7831 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
7832 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
7833 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
7834 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
7835 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
7836 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
7837 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
7838 return true;
7839 default:
7840 return false;
7841 }
7842}
7843
7844int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
7845 SIEncodingFamily Gen = subtargetEncodingFamily(ST);
7846
7847 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
7848 ST.getGeneration() == AMDGPUSubtarget::GFX9)
7849 Gen = SIEncodingFamily::GFX9;
7850
7851 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
7852 // subtarget has UnpackedD16VMem feature.
7853 // TODO: remove this when we discard GFX80 encoding.
7854 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
7855 Gen = SIEncodingFamily::GFX80;
7856
7857 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
7858 switch (ST.getGeneration()) {
7859 default:
7860 Gen = SIEncodingFamily::SDWA;
7861 break;
7862 case AMDGPUSubtarget::GFX9:
7863 Gen = SIEncodingFamily::SDWA9;
7864 break;
7865 case AMDGPUSubtarget::GFX10:
7866 Gen = SIEncodingFamily::SDWA10;
7867 break;
7868 }
7869 }
7870
7871 if (isMAI(Opcode)) {
7872 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
7873 if (MFMAOp != -1)
7874 Opcode = MFMAOp;
7875 }
7876
7877 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
7878
7879 // -1 means that Opcode is already a native instruction.
7880 if (MCOp == -1)
7881 return Opcode;
7882
7883 if (ST.hasGFX90AInsts()) {
7884 uint16_t NMCOp = (uint16_t)-1;
7885 if (ST.hasGFX940Insts())
7886 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
7887 if (NMCOp == (uint16_t)-1)
7888 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
7889 if (NMCOp == (uint16_t)-1)
7890 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
7891 if (NMCOp != (uint16_t)-1)
7892 MCOp = NMCOp;
7893 }
7894
7895 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
7896 // no encoding in the given subtarget generation.
7897 if (MCOp == (uint16_t)-1)
7898 return -1;
7899
7900 if (isAsmOnlyOpcode(MCOp))
7901 return -1;
7902
7903 return MCOp;
7904}
7905
7906static
7907TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
7908  assert(RegOpnd.isReg());
7909 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
7910 getRegSubRegPair(RegOpnd);
7911}
7912
7913TargetInstrInfo::RegSubRegPair
7914llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
7915  assert(MI.isRegSequence());
7916 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
7917 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
7918 auto &RegOp = MI.getOperand(1 + 2 * I);
7919 return getRegOrUndef(RegOp);
7920 }
7921 return TargetInstrInfo::RegSubRegPair();
7922}
7923
7924// Try to find the definition of reg:subreg in subreg-manipulation pseudos
7925// Following a subreg of reg:subreg isn't supported
7926static bool followSubRegDef(MachineInstr &MI,
7927 TargetInstrInfo::RegSubRegPair &RSR) {
7928 if (!RSR.SubReg)
7929 return false;
7930 switch (MI.getOpcode()) {
7931 default: break;
7932 case AMDGPU::REG_SEQUENCE:
7933 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
7934 return true;
7935  // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
7936 case AMDGPU::INSERT_SUBREG:
7937 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
7938 // inserted the subreg we're looking for
7939 RSR = getRegOrUndef(MI.getOperand(2));
7940 else { // the subreg in the rest of the reg
7941 auto R1 = getRegOrUndef(MI.getOperand(1));
7942 if (R1.SubReg) // subreg of subreg isn't supported
7943 return false;
7944 RSR.Reg = R1.Reg;
7945 }
7946 return true;
7947 }
7948 return false;
7949}
7950
7951MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
7952 MachineRegisterInfo &MRI) {
7953  assert(MRI.isSSA());
7954 if (!P.Reg.isVirtual())
7955 return nullptr;
7956
7957 auto RSR = P;
7958 auto *DefInst = MRI.getVRegDef(RSR.Reg);
7959 while (auto *MI = DefInst) {
7960 DefInst = nullptr;
7961 switch (MI->getOpcode()) {
7962 case AMDGPU::COPY:
7963 case AMDGPU::V_MOV_B32_e32: {
7964 auto &Op1 = MI->getOperand(1);
7965 if (Op1.isReg() && Op1.getReg().isVirtual()) {
7966 if (Op1.isUndef())
7967 return nullptr;
7968 RSR = getRegSubRegPair(Op1);
7969 DefInst = MRI.getVRegDef(RSR.Reg);
7970 }
7971 break;
7972 }
7973 default:
7974 if (followSubRegDef(*MI, RSR)) {
7975 if (!RSR.Reg)
7976 return nullptr;
7977 DefInst = MRI.getVRegDef(RSR.Reg);
7978 }
7979 }
7980 if (!DefInst)
7981 return MI;
7982 }
7983 return nullptr;
7984}
7985
7986bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
7987 Register VReg,
7988 const MachineInstr &DefMI,
7989 const MachineInstr &UseMI) {
7990  assert(MRI.isSSA() && "Must be run on SSA");
7991
7992 auto *TRI = MRI.getTargetRegisterInfo();
7993 auto *DefBB = DefMI.getParent();
7994
7995 // Don't bother searching between blocks, although it is possible this block
7996 // doesn't modify exec.
7997 if (UseMI.getParent() != DefBB)
7998 return true;
7999
8000 const int MaxInstScan = 20;
8001 int NumInst = 0;
8002
8003 // Stop scan at the use.
8004 auto E = UseMI.getIterator();
8005 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
8006 if (I->isDebugInstr())
8007 continue;
8008
8009 if (++NumInst > MaxInstScan)
8010 return true;
8011
8012 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
8013 return true;
8014 }
8015
8016 return false;
8017}
8018
8019bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
8020 Register VReg,
8021 const MachineInstr &DefMI) {
8022  assert(MRI.isSSA() && "Must be run on SSA");
8023
8024 auto *TRI = MRI.getTargetRegisterInfo();
8025 auto *DefBB = DefMI.getParent();
8026
8027 const int MaxUseScan = 10;
8028 int NumUse = 0;
8029
8030 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
8031 auto &UseInst = *Use.getParent();
8032 // Don't bother searching between blocks, although it is possible this block
8033 // doesn't modify exec.
8034 if (UseInst.getParent() != DefBB)
8035 return true;
8036
8037 if (++NumUse > MaxUseScan)
8038 return true;
8039 }
8040
8041 if (NumUse == 0)
8042 return false;
8043
8044 const int MaxInstScan = 20;
8045 int NumInst = 0;
8046
8047 // Stop scan when we have seen all the uses.
8048 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
8049    assert(I != DefBB->end());
8050
8051 if (I->isDebugInstr())
8052 continue;
8053
8054 if (++NumInst > MaxInstScan)
8055 return true;
8056
8057 for (const MachineOperand &Op : I->operands()) {
8058 // We don't check reg masks here as they're used only on calls:
8059 // 1. EXEC is only considered const within one BB
8060 // 2. Call should be a terminator instruction if present in a BB
8061
8062 if (!Op.isReg())
8063 continue;
8064
8065 Register Reg = Op.getReg();
8066 if (Op.isUse()) {
8067 if (Reg == VReg && --NumUse == 0)
8068 return false;
8069 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
8070 return true;
8071 }
8072 }
8073}
8074
8075MachineInstr *SIInstrInfo::createPHIDestinationCopy(
8076 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
8077 const DebugLoc &DL, Register Src, Register Dst) const {
8078 auto Cur = MBB.begin();
8079 if (Cur != MBB.end())
8080 do {
8081 if (!Cur->isPHI() && Cur->readsRegister(Dst))
8082 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
8083 ++Cur;
8084 } while (Cur != MBB.end() && Cur != LastPHIIt);
8085
8086 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
8087 Dst);
8088}
8089
8090MachineInstr *SIInstrInfo::createPHISourceCopy(
8091 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
8092 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
8093 if (InsPt != MBB.end() &&
8094 (InsPt->getOpcode() == AMDGPU::SI_IF ||
8095 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
8096 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
8097 InsPt->definesRegister(Src)) {
8098 InsPt++;
8099 return BuildMI(MBB, InsPt, DL,
8100 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
8101 : AMDGPU::S_MOV_B64_term),
8102 Dst)
8103 .addReg(Src, 0, SrcSubReg)
8104 .addReg(AMDGPU::EXEC, RegState::Implicit);
8105 }
8106 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
8107 Dst);
8108}
8109
8110bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
8111
8112MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
8113 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
8114 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
8115 VirtRegMap *VRM) const {
8116 // This is a bit of a hack (copied from AArch64). Consider this instruction:
8117 //
8118 // %0:sreg_32 = COPY $m0
8119 //
8120 // We explicitly chose SReg_32 for the virtual register so such a copy might
8121 // be eliminated by RegisterCoalescer. However, that may not be possible, and
8122 // %0 may even spill. We can't spill $m0 normally (it would require copying to
8123 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
8124 // TargetInstrInfo::foldMemoryOperand() is going to try.
8125 // A similar issue also exists with spilling and reloading $exec registers.
8126 //
8127 // To prevent that, constrain the %0 register class here.
8128 if (MI.isFullCopy()) {
8129 Register DstReg = MI.getOperand(0).getReg();
8130 Register SrcReg = MI.getOperand(1).getReg();
8131 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
8132 (DstReg.isVirtual() != SrcReg.isVirtual())) {
8133 MachineRegisterInfo &MRI = MF.getRegInfo();
8134 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
8135 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
8136 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
8137 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
8138 return nullptr;
8139 } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
8140 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
8141 return nullptr;
8142 }
8143 }
8144 }
8145
8146 return nullptr;
8147}
8148
8149unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
8150 const MachineInstr &MI,
8151 unsigned *PredCost) const {
8152 if (MI.isBundle()) {
8153 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
8154 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
8155 unsigned Lat = 0, Count = 0;
8156 for (++I; I != E && I->isBundledWithPred(); ++I) {
8157 ++Count;
8158 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
8159 }
8160 return Lat + Count - 1;
8161 }
8162
8163 return SchedModel.computeInstrLatency(&MI);
8164}
8165
8166unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
8167 switch (MF.getFunction().getCallingConv()) {
8168 case CallingConv::AMDGPU_PS:
8169 return 1;
8170 case CallingConv::AMDGPU_VS:
8171 return 2;
8172 case CallingConv::AMDGPU_GS:
8173 return 3;
8174 case CallingConv::AMDGPU_HS:
8175 case CallingConv::AMDGPU_LS:
8176 case CallingConv::AMDGPU_ES:
8177 report_fatal_error("ds_ordered_count unsupported for this calling conv");
8178 case CallingConv::AMDGPU_CS:
8179 case CallingConv::AMDGPU_KERNEL:
8180 case CallingConv::C:
8181 case CallingConv::Fast:
8182 default:
8183 // Assume other calling conventions are various compute callable functions
8184 return 0;
8185 }
8186}
8187
8188bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
8189 Register &SrcReg2, int64_t &CmpMask,
8190 int64_t &CmpValue) const {
8191 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
8192 return false;
8193
8194 switch (MI.getOpcode()) {
8195 default:
8196 break;
8197 case AMDGPU::S_CMP_EQ_U32:
8198 case AMDGPU::S_CMP_EQ_I32:
8199 case AMDGPU::S_CMP_LG_U32:
8200 case AMDGPU::S_CMP_LG_I32:
8201 case AMDGPU::S_CMP_LT_U32:
8202 case AMDGPU::S_CMP_LT_I32:
8203 case AMDGPU::S_CMP_GT_U32:
8204 case AMDGPU::S_CMP_GT_I32:
8205 case AMDGPU::S_CMP_LE_U32:
8206 case AMDGPU::S_CMP_LE_I32:
8207 case AMDGPU::S_CMP_GE_U32:
8208 case AMDGPU::S_CMP_GE_I32:
8209 case AMDGPU::S_CMP_EQ_U64:
8210 case AMDGPU::S_CMP_LG_U64:
8211 SrcReg = MI.getOperand(0).getReg();
8212 if (MI.getOperand(1).isReg()) {
8213 if (MI.getOperand(1).getSubReg())
8214 return false;
8215 SrcReg2 = MI.getOperand(1).getReg();
8216 CmpValue = 0;
8217 } else if (MI.getOperand(1).isImm()) {
8218 SrcReg2 = Register();
8219 CmpValue = MI.getOperand(1).getImm();
8220 } else {
8221 return false;
8222 }
8223 CmpMask = ~0;
8224 return true;
8225 case AMDGPU::S_CMPK_EQ_U32:
8226 case AMDGPU::S_CMPK_EQ_I32:
8227 case AMDGPU::S_CMPK_LG_U32:
8228 case AMDGPU::S_CMPK_LG_I32:
8229 case AMDGPU::S_CMPK_LT_U32:
8230 case AMDGPU::S_CMPK_LT_I32:
8231 case AMDGPU::S_CMPK_GT_U32:
8232 case AMDGPU::S_CMPK_GT_I32:
8233 case AMDGPU::S_CMPK_LE_U32:
8234 case AMDGPU::S_CMPK_LE_I32:
8235 case AMDGPU::S_CMPK_GE_U32:
8236 case AMDGPU::S_CMPK_GE_I32:
8237 SrcReg = MI.getOperand(0).getReg();
8238 SrcReg2 = Register();
8239 CmpValue = MI.getOperand(1).getImm();
8240 CmpMask = ~0;
8241 return true;
8242 }
8243
8244 return false;
8245}
8246
8247bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
8248 Register SrcReg2, int64_t CmpMask,
8249 int64_t CmpValue,
8250 const MachineRegisterInfo *MRI) const {
8251 if (!SrcReg || SrcReg.isPhysical())
8252 return false;
8253
8254 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
8255 return false;
8256
8257 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
8258 this](int64_t ExpectedValue, unsigned SrcSize,
8259 bool IsReversible, bool IsSigned) -> bool {
8260 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
8261 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
8262 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
8263 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
8264 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
8265 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
8266 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
8267 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
8268 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
8269 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
8270 //
8271 // Signed ge/gt are not used for the sign bit.
8272 //
8273 // If result of the AND is unused except in the compare:
8274 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
8275 //
8276 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
8277 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
8278 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
8279 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
8280 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
8281 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
8282
8283 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
8284 if (!Def || Def->getParent() != CmpInstr.getParent())
8285 return false;
8286
8287 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
8288 Def->getOpcode() != AMDGPU::S_AND_B64)
8289 return false;
8290
8291 int64_t Mask;
8292 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
8293 if (MO->isImm())
8294 Mask = MO->getImm();
8295 else if (!getFoldableImm(MO, Mask))
8296 return false;
8297 Mask &= maxUIntN(SrcSize);
8298 return isPowerOf2_64(Mask);
8299 };
8300
8301 MachineOperand *SrcOp = &Def->getOperand(1);
8302 if (isMask(SrcOp))
8303 SrcOp = &Def->getOperand(2);
8304 else if (isMask(&Def->getOperand(2)))
8305 SrcOp = &Def->getOperand(1);
8306 else
8307 return false;
8308
8309 unsigned BitNo = countTrailingZeros((uint64_t)Mask);
8310 if (IsSigned && BitNo == SrcSize - 1)
8311 return false;
8312
8313 ExpectedValue <<= BitNo;
8314
8315 bool IsReversedCC = false;
8316 if (CmpValue != ExpectedValue) {
8317 if (!IsReversible)
8318 return false;
8319 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
8320 if (!IsReversedCC)
8321 return false;
8322 }
8323
8324 Register DefReg = Def->getOperand(0).getReg();
8325 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
8326 return false;
8327
8328 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
8329 I != E; ++I) {
8330 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
8331 I->killsRegister(AMDGPU::SCC, &RI))
8332 return false;
8333 }
8334
8335 MachineOperand *SccDef = Def->findRegisterDefOperand(AMDGPU::SCC);
8336 SccDef->setIsDead(false);
8337 CmpInstr.eraseFromParent();
8338
8339 if (!MRI->use_nodbg_empty(DefReg)) {
8340      assert(!IsReversedCC);
8341 return true;
8342 }
8343
8344 // Replace AND with unused result with a S_BITCMP.
8345 MachineBasicBlock *MBB = Def->getParent();
8346
8347 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
8348 : AMDGPU::S_BITCMP1_B32
8349 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
8350 : AMDGPU::S_BITCMP1_B64;
8351
8352 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
8353 .add(*SrcOp)
8354 .addImm(BitNo);
8355 Def->eraseFromParent();
8356
8357 return true;
8358 };
8359
8360 switch (CmpInstr.getOpcode()) {
8361 default:
8362 break;
8363 case AMDGPU::S_CMP_EQ_U32:
8364 case AMDGPU::S_CMP_EQ_I32:
8365 case AMDGPU::S_CMPK_EQ_U32:
8366 case AMDGPU::S_CMPK_EQ_I32:
8367 return optimizeCmpAnd(1, 32, true, false);
8368 case AMDGPU::S_CMP_GE_U32:
8369 case AMDGPU::S_CMPK_GE_U32:
8370 return optimizeCmpAnd(1, 32, false, false);
8371 case AMDGPU::S_CMP_GE_I32:
8372 case AMDGPU::S_CMPK_GE_I32:
8373 return optimizeCmpAnd(1, 32, false, true);
8374 case AMDGPU::S_CMP_EQ_U64:
8375 return optimizeCmpAnd(1, 64, true, false);
8376 case AMDGPU::S_CMP_LG_U32:
8377 case AMDGPU::S_CMP_LG_I32:
8378 case AMDGPU::S_CMPK_LG_U32:
8379 case AMDGPU::S_CMPK_LG_I32:
8380 return optimizeCmpAnd(0, 32, true, false);
8381 case AMDGPU::S_CMP_GT_U32:
8382 case AMDGPU::S_CMPK_GT_U32:
8383 return optimizeCmpAnd(0, 32, false, false);
8384 case AMDGPU::S_CMP_GT_I32:
8385 case AMDGPU::S_CMPK_GT_I32:
8386 return optimizeCmpAnd(0, 32, false, true);
8387 case AMDGPU::S_CMP_LG_U64:
8388 return optimizeCmpAnd(0, 64, true, false);
8389 }
8390
8391 return false;
8392}
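To make the pattern comments inside optimizeCmpAnd easier to follow, here is a standalone sketch of the arithmetic only (no MIR involved): with a power-of-two mask the AND result is either 0 or the mask itself, so an equality compare against either value is a single bit test, and comparing against the "other" value simply reverses the condition. The constants below are illustrative.

// Minimal sketch of the bit-test reasoning behind optimizeCmpAnd (assumed values).
#include <cstdint>
#include <cstdio>

static bool isPowerOfTwo(uint64_t V) { return V && !(V & (V - 1)); }

int main() {
  uint64_t Mask = 1ull << 5;   // the "1 << n" operand of the s_and
  int64_t ExpectedValue = 1;   // pattern expects (and, 1 << n) == 1 << n
  int64_t CmpValue = 0;        // what the s_cmp actually compares against

  if (!isPowerOfTwo(Mask))
    return 0;

  unsigned BitNo = 0;          // bit index a s_bitcmp would test
  while (!((Mask >> BitNo) & 1))
    ++BitNo;
  ExpectedValue <<= BitNo;     // now equal to the mask, 1 << n

  bool IsReversedCC = CmpValue != ExpectedValue &&
                      CmpValue == (ExpectedValue ^ (int64_t)Mask); // compared vs 0
  std::printf("bit %u, reversed condition: %d\n", BitNo, IsReversedCC);
  return 0;
}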

/build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/llvm/include/llvm/CodeGen/Register.h

1//===-- llvm/CodeGen/Register.h ---------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_CODEGEN_REGISTER_H
10#define LLVM_CODEGEN_REGISTER_H
11
12#include "llvm/MC/MCRegister.h"
13#include <cassert>
14
15namespace llvm {
16
17/// Wrapper class representing virtual and physical registers. Should be passed
18/// by value.
19class Register {
20 unsigned Reg;
21
22public:
23 constexpr Register(unsigned Val = 0): Reg(Val) {}
24 constexpr Register(MCRegister Val): Reg(Val) {}
25
26 // Register numbers can represent physical registers, virtual registers, and
27 // sometimes stack slots. The unsigned values are divided into these ranges:
28 //
29 // 0 Not a register, can be used as a sentinel.
30 // [1;2^30) Physical registers assigned by TableGen.
31 // [2^30;2^31) Stack slots. (Rarely used.)
32 // [2^31;2^32) Virtual registers assigned by MachineRegisterInfo.
33 //
34 // Further sentinels can be allocated from the small negative integers.
35 // DenseMapInfo<unsigned> uses -1u and -2u.
36 static_assert(std::numeric_limits<decltype(Reg)>::max() >= 0xFFFFFFFF,
37 "Reg isn't large enough to hold full range.");
38
39 /// isStackSlot - Sometimes it is useful to be able to store a non-negative
40 /// frame index in a variable that normally holds a register. isStackSlot()
41 /// returns true if Reg is in the range used for stack slots.
42 ///
43 /// FIXME: remove in favor of member.
44 static bool isStackSlot(unsigned Reg) {
45 return MCRegister::isStackSlot(Reg);
46 }
47
48 /// Return true if this is a stack slot.
49 bool isStack() const { return MCRegister::isStackSlot(Reg); }
50
51 /// Compute the frame index from a register value representing a stack slot.
52 static int stackSlot2Index(Register Reg) {
53    assert(Reg.isStack() && "Not a stack slot");
54 return int(Reg - MCRegister::FirstStackSlot);
55 }
56
57 /// Convert a non-negative frame index to a stack slot register value.
58 static Register index2StackSlot(int FI) {
59    assert(FI >= 0 && "Cannot hold a negative frame index.");
60 return Register(FI + MCRegister::FirstStackSlot);
61 }
62
63 /// Return true if the specified register number is in
64 /// the physical register namespace.
65 static bool isPhysicalRegister(unsigned Reg) {
66 return MCRegister::isPhysicalRegister(Reg);
67 }
68
69 /// Return true if the specified register number is in
70 /// the virtual register namespace.
71 static bool isVirtualRegister(unsigned Reg) {
72 return Reg & MCRegister::VirtualRegFlag && !isStackSlot(Reg);
7. Assuming the condition is true
8. Returning the value 1, which participates in a condition later
73 }
74
75 /// Convert a virtual register number to a 0-based index.
76 /// The first virtual register in a function will get the index 0.
77 static unsigned virtReg2Index(Register Reg) {
78    assert(isVirtualRegister(Reg) && "Not a virtual register");
79 return Reg & ~MCRegister::VirtualRegFlag;
80 }
81
82 /// Convert a 0-based index to a virtual register number.
83 /// This is the inverse operation of VirtReg2IndexFunctor below.
84 static Register index2VirtReg(unsigned Index) {
85    assert(Index < (1u << 31) && "Index too large for virtual register range.");
86 return Index | MCRegister::VirtualRegFlag;
87 }
88
89 /// Return true if the specified register number is in the virtual register
90 /// namespace.
91 bool isVirtual() const {
92 return isVirtualRegister(Reg);
6. Calling 'Register::isVirtualRegister'
9. Returning from 'Register::isVirtualRegister'
10. Returning the value 1, which participates in a condition later
93 }
94
95 /// Return true if the specified register number is in the physical register
96 /// namespace.
97 bool isPhysical() const {
98 return isPhysicalRegister(Reg);
99 }
100
101 /// Convert a virtual register number to a 0-based index. The first virtual
102 /// register in a function will get the index 0.
103 unsigned virtRegIndex() const {
104 return virtReg2Index(Reg);
105 }
106
107 constexpr operator unsigned() const {
108 return Reg;
109 }
110
111 unsigned id() const { return Reg; }
112
113 operator MCRegister() const {
114 return MCRegister(Reg);
115 }
116
117 /// Utility to check-convert this value to a MCRegister. The caller is
118 /// expected to have already validated that this Register is, indeed,
119 /// physical.
120 MCRegister asMCReg() const {
121    assert(Reg == MCRegister::NoRegister ||
122           MCRegister::isPhysicalRegister(Reg));
123 return MCRegister(Reg);
124 }
125
126 bool isValid() const { return Reg != MCRegister::NoRegister; }
127
128 /// Comparisons between register objects
129 bool operator==(const Register &Other) const { return Reg == Other.Reg; }
130 bool operator!=(const Register &Other) const { return Reg != Other.Reg; }
131 bool operator==(const MCRegister &Other) const { return Reg == Other.id(); }
132 bool operator!=(const MCRegister &Other) const { return Reg != Other.id(); }
133
134 /// Comparisons against register constants. E.g.
135 /// * R == AArch64::WZR
136 /// * R == 0
137 /// * R == VirtRegMap::NO_PHYS_REG
138 bool operator==(unsigned Other) const { return Reg == Other; }
139 bool operator!=(unsigned Other) const { return Reg != Other; }
140 bool operator==(int Other) const { return Reg == unsigned(Other); }
141 bool operator!=(int Other) const { return Reg != unsigned(Other); }
142 // MSVC requires that we explicitly declare these two as well.
143 bool operator==(MCPhysReg Other) const { return Reg == unsigned(Other); }
144 bool operator!=(MCPhysReg Other) const { return Reg != unsigned(Other); }
145};
146
147// Provide DenseMapInfo for Register
148template<> struct DenseMapInfo<Register> {
149 static inline unsigned getEmptyKey() {
150 return DenseMapInfo<unsigned>::getEmptyKey();
151 }
152 static inline unsigned getTombstoneKey() {
153 return DenseMapInfo<unsigned>::getTombstoneKey();
154 }
155 static unsigned getHashValue(const Register &Val) {
156 return DenseMapInfo<unsigned>::getHashValue(Val.id());
157 }
158 static bool isEqual(const Register &LHS, const Register &RHS) {
159 return DenseMapInfo<unsigned>::isEqual(LHS.id(), RHS.id());
160 }
161};
162
163}
164
165#endif // LLVM_CODEGEN_REGISTER_H
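The analyzer path above (steps 6-10) hinges on isVirtualRegister() returning true, which is purely a question of the top-bit encoding documented in the class comment. The sketch below demonstrates that round trip in isolation; VirtualRegFlag here is an assumed stand-in for MCRegister::VirtualRegFlag, not the library symbol.

// Minimal sketch of the virtual-register number encoding (assumed flag value).
#include <cassert>
#include <cstdio>

int main() {
  const unsigned VirtualRegFlag = 1u << 31; // assumed value of the flag bit

  unsigned Index = 42;                      // 0-based virtual register index
  unsigned Reg = Index | VirtualRegFlag;    // what index2VirtReg would produce
  assert(Reg & VirtualRegFlag);             // the isVirtualRegister check
  assert((Reg & ~VirtualRegFlag) == Index); // virtReg2Index round-trips

  std::printf("%%%u encodes as 0x%08x\n", Index, Reg);
  return 0;
}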