Line data Source code
1 : //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : /// \file
11 : /// SI Implementation of TargetInstrInfo.
12 : //
13 : //===----------------------------------------------------------------------===//
14 :
15 : #include "SIInstrInfo.h"
16 : #include "AMDGPU.h"
17 : #include "AMDGPUIntrinsicInfo.h"
18 : #include "AMDGPUSubtarget.h"
19 : #include "GCNHazardRecognizer.h"
20 : #include "SIDefines.h"
21 : #include "SIMachineFunctionInfo.h"
22 : #include "SIRegisterInfo.h"
23 : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24 : #include "Utils/AMDGPUBaseInfo.h"
25 : #include "llvm/ADT/APInt.h"
26 : #include "llvm/ADT/ArrayRef.h"
27 : #include "llvm/ADT/SmallVector.h"
28 : #include "llvm/ADT/StringRef.h"
29 : #include "llvm/ADT/iterator_range.h"
30 : #include "llvm/Analysis/AliasAnalysis.h"
31 : #include "llvm/Analysis/MemoryLocation.h"
32 : #include "llvm/Analysis/ValueTracking.h"
33 : #include "llvm/CodeGen/MachineBasicBlock.h"
34 : #include "llvm/CodeGen/MachineDominators.h"
35 : #include "llvm/CodeGen/MachineFrameInfo.h"
36 : #include "llvm/CodeGen/MachineFunction.h"
37 : #include "llvm/CodeGen/MachineInstr.h"
38 : #include "llvm/CodeGen/MachineInstrBuilder.h"
39 : #include "llvm/CodeGen/MachineInstrBundle.h"
40 : #include "llvm/CodeGen/MachineMemOperand.h"
41 : #include "llvm/CodeGen/MachineOperand.h"
42 : #include "llvm/CodeGen/MachineRegisterInfo.h"
43 : #include "llvm/CodeGen/RegisterScavenging.h"
44 : #include "llvm/CodeGen/ScheduleDAG.h"
45 : #include "llvm/CodeGen/SelectionDAGNodes.h"
46 : #include "llvm/CodeGen/TargetOpcodes.h"
47 : #include "llvm/CodeGen/TargetRegisterInfo.h"
48 : #include "llvm/IR/DebugLoc.h"
49 : #include "llvm/IR/DiagnosticInfo.h"
50 : #include "llvm/IR/Function.h"
51 : #include "llvm/IR/InlineAsm.h"
52 : #include "llvm/IR/LLVMContext.h"
53 : #include "llvm/MC/MCInstrDesc.h"
54 : #include "llvm/Support/Casting.h"
55 : #include "llvm/Support/CommandLine.h"
56 : #include "llvm/Support/Compiler.h"
57 : #include "llvm/Support/ErrorHandling.h"
58 : #include "llvm/Support/MachineValueType.h"
59 : #include "llvm/Support/MathExtras.h"
60 : #include "llvm/Target/TargetMachine.h"
61 : #include <cassert>
62 : #include <cstdint>
63 : #include <iterator>
64 : #include <utility>
65 :
66 : using namespace llvm;
67 :
68 : #define GET_INSTRINFO_CTOR_DTOR
69 : #include "AMDGPUGenInstrInfo.inc"
70 :
71 : namespace llvm {
72 : namespace AMDGPU {
73 : #define GET_D16ImageDimIntrinsics_IMPL
74 : #define GET_ImageDimIntrinsicTable_IMPL
75 : #define GET_RsrcIntrinsics_IMPL
76 : #include "AMDGPUGenSearchableTables.inc"
77 : }
78 : }
79 :
80 :
81 : // Must be at least 4 to be able to branch over minimum unconditional branch
82 : // code. This is only for making it possible to write reasonably small tests for
83 : // long branches.
84 : static cl::opt<unsigned>
85 : BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
86 : cl::desc("Restrict range of branch instructions (DEBUG)"));
87 :
88 2492 : SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
89 : : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
90 2492 : RI(ST), ST(ST) {}
91 :
92 : //===----------------------------------------------------------------------===//
93 : // TargetInstrInfo callbacks
94 : //===----------------------------------------------------------------------===//
95 :
96 : static unsigned getNumOperandsNoGlue(SDNode *Node) {
97 7246 : unsigned N = Node->getNumOperands();
98 471212 : while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
99 : --N;
100 : return N;
101 : }
102 :
103 428134 : static SDValue findChainOperand(SDNode *Load) {
104 856268 : SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
105 : assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
106 428134 : return LastOp;
107 : }
108 :
109 : /// Returns true if both nodes have the same value for the given
110 : /// operand \p OpName, or if both nodes do not have this operand.
111 600570 : static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
112 600570 : unsigned Opc0 = N0->getMachineOpcode();
113 600570 : unsigned Opc1 = N1->getMachineOpcode();
114 :
115 600570 : int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
116 600570 : int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
117 :
118 600570 : if (Op0Idx == -1 && Op1Idx == -1)
119 : return true;
120 :
121 :
122 595783 : if ((Op0Idx == -1 && Op1Idx != -1) ||
123 : (Op1Idx == -1 && Op0Idx != -1))
124 : return false;
125 :
126 : // getNamedOperandIdx returns the index for the MachineInstr's operands,
127 : // which includes the result as the first operand. We are indexing into the
128 : // MachineSDNode's operands, so we need to skip the result operand to get
129 : // the real index.
130 595757 : --Op0Idx;
131 595757 : --Op1Idx;
132 :
133 1787271 : return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
134 : }
135 :
136 20938 : bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
137 : AliasAnalysis *AA) const {
138 : // TODO: The generic check fails for VALU instructions that should be
139 : // rematerializable due to implicit reads of exec. We really want all of the
140 : // generic logic here except for the exec check.
141 20938 : switch (MI.getOpcode()) {
142 : case AMDGPU::V_MOV_B32_e32:
143 : case AMDGPU::V_MOV_B32_e64:
144 : case AMDGPU::V_MOV_B64_PSEUDO:
145 : return true;
146 : default:
147 : return false;
148 : }
149 : }
150 :
151 415754 : bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
152 : int64_t &Offset0,
153 : int64_t &Offset1) const {
154 415754 : if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
155 : return false;
156 :
157 : unsigned Opc0 = Load0->getMachineOpcode();
158 : unsigned Opc1 = Load1->getMachineOpcode();
159 :
160 : // Make sure both are actually loads.
161 1048011 : if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
162 : return false;
163 :
164 629476 : if (isDS(Opc0) && isDS(Opc1)) {
165 :
166 : // FIXME: Handle this case:
167 3623 : if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
168 : return false;
169 :
170 : // Check base reg.
171 3607 : if (Load0->getOperand(1) != Load1->getOperand(1))
172 : return false;
173 :
174 : // Check chain.
175 16 : if (findChainOperand(Load0) != findChainOperand(Load1))
176 0 : return false;
177 :
178 : // Skip read2 / write2 variants for simplicity.
179 : // TODO: We should report true if the used offsets are adjacent (excluded
180 : // st64 versions).
181 16 : if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
182 16 : AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
183 : return false;
184 :
185 32 : Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
186 32 : Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
187 16 : return true;
188 : }
189 :
190 311115 : if (isSMRD(Opc0) && isSMRD(Opc1)) {
191 : // Skip time and cache invalidation instructions.
192 25256 : if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
193 25249 : AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
194 : return false;
195 :
196 : assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
197 :
198 : // Check base reg.
199 25242 : if (Load0->getOperand(0) != Load1->getOperand(0))
200 : return false;
201 :
202 : const ConstantSDNode *Load0Offset =
203 : dyn_cast<ConstantSDNode>(Load0->getOperand(1));
204 : const ConstantSDNode *Load1Offset =
205 : dyn_cast<ConstantSDNode>(Load1->getOperand(1));
206 :
207 18471 : if (!Load0Offset || !Load1Offset)
208 : return false;
209 :
210 : // Check chain.
211 18459 : if (findChainOperand(Load0) != findChainOperand(Load1))
212 0 : return false;
213 :
214 18459 : Offset0 = Load0Offset->getZExtValue();
215 18459 : Offset1 = Load1Offset->getZExtValue();
216 18459 : return true;
217 : }
218 :
219 : // MUBUF and MTBUF can access the same addresses.
220 285859 : if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
221 :
222 : // MUBUF and MTBUF have vaddr at different indices.
223 218278 : if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
224 195592 : findChainOperand(Load0) != findChainOperand(Load1) ||
225 413870 : !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
226 186700 : !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
227 33012 : return false;
228 :
229 185266 : int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
230 185266 : int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
231 :
232 185266 : if (OffIdx0 == -1 || OffIdx1 == -1)
233 : return false;
234 :
235 : // getNamedOperandIdx returns the index for MachineInstrs. Since they
236 : // include the output in the operand list, but SDNodes don't, we need to
237 : // subtract the index by one.
238 185266 : --OffIdx0;
239 185266 : --OffIdx1;
240 :
241 185266 : SDValue Off0 = Load0->getOperand(OffIdx0);
242 370532 : SDValue Off1 = Load1->getOperand(OffIdx1);
243 :
244 : // The offset might be a FrameIndexSDNode.
245 : if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
246 : return false;
247 :
248 185266 : Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
249 185266 : Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
250 185266 : return true;
251 : }
252 :
253 : return false;
254 : }
255 :
256 : static bool isStride64(unsigned Opc) {
257 : switch (Opc) {
258 : case AMDGPU::DS_READ2ST64_B32:
259 : case AMDGPU::DS_READ2ST64_B64:
260 : case AMDGPU::DS_WRITE2ST64_B32:
261 : case AMDGPU::DS_WRITE2ST64_B64:
262 : return true;
263 : default:
264 : return false;
265 : }
266 : }
267 :
268 1050942 : bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
269 : int64_t &Offset,
270 : const TargetRegisterInfo *TRI) const {
271 1050942 : unsigned Opc = LdSt.getOpcode();
272 :
273 1050942 : if (isDS(LdSt)) {
274 : const MachineOperand *OffsetImm =
275 129747 : getNamedOperand(LdSt, AMDGPU::OpName::offset);
276 129747 : if (OffsetImm) {
277 : // Normal, single offset LDS instruction.
278 : const MachineOperand *AddrReg =
279 63825 : getNamedOperand(LdSt, AMDGPU::OpName::addr);
280 :
281 63825 : BaseReg = AddrReg->getReg();
282 63825 : Offset = OffsetImm->getImm();
283 63825 : return true;
284 : }
285 :
286 : // The 2 offset instructions use offset0 and offset1 instead. We can treat
287 : // these as a load with a single offset if the 2 offsets are consecutive. We
288 : // will use this for some partially aligned loads.
289 : const MachineOperand *Offset0Imm =
290 65922 : getNamedOperand(LdSt, AMDGPU::OpName::offset0);
291 : const MachineOperand *Offset1Imm =
292 65922 : getNamedOperand(LdSt, AMDGPU::OpName::offset1);
293 :
294 65922 : uint8_t Offset0 = Offset0Imm->getImm();
295 65922 : uint8_t Offset1 = Offset1Imm->getImm();
296 :
297 65922 : if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
298 : // Each of these offsets is in element-sized units, so we need to convert
299 : // them to the byte size of the individual reads.
300 :
301 : unsigned EltSize;
302 51854 : if (LdSt.mayLoad())
303 13340 : EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
304 : else {
305 : assert(LdSt.mayStore());
306 38514 : int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
307 38514 : EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
308 : }
309 :
310 : if (isStride64(Opc))
311 5 : EltSize *= 64;
312 :
313 : const MachineOperand *AddrReg =
314 51854 : getNamedOperand(LdSt, AMDGPU::OpName::addr);
315 51854 : BaseReg = AddrReg->getReg();
316 51854 : Offset = EltSize * Offset0;
317 51854 : return true;
318 : }
319 :
320 : return false;
321 : }
322 :
323 921195 : if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
324 811514 : const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
325 811514 : if (SOffset && SOffset->isReg())
326 : return false;
327 :
328 : const MachineOperand *AddrReg =
329 95031 : getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
330 95031 : if (!AddrReg)
331 : return false;
332 :
333 : const MachineOperand *OffsetImm =
334 6176 : getNamedOperand(LdSt, AMDGPU::OpName::offset);
335 6176 : BaseReg = AddrReg->getReg();
336 6176 : Offset = OffsetImm->getImm();
337 :
338 6176 : if (SOffset) // soffset can be an inline immediate.
339 6176 : Offset += SOffset->getImm();
340 :
341 6176 : return true;
342 : }
343 :
344 109681 : if (isSMRD(LdSt)) {
345 : const MachineOperand *OffsetImm =
346 27118 : getNamedOperand(LdSt, AMDGPU::OpName::offset);
347 27118 : if (!OffsetImm)
348 : return false;
349 :
350 : const MachineOperand *SBaseReg =
351 27033 : getNamedOperand(LdSt, AMDGPU::OpName::sbase);
352 27033 : BaseReg = SBaseReg->getReg();
353 27033 : Offset = OffsetImm->getImm();
354 27033 : return true;
355 : }
356 :
357 82563 : if (isFLAT(LdSt)) {
358 80558 : const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
359 80558 : if (VAddr) {
360 : // Can't analyze 2 offsets.
361 80558 : if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
362 : return false;
363 :
364 80558 : BaseReg = VAddr->getReg();
365 : } else {
366 : // scratch instructions have either vaddr or saddr.
367 0 : BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg();
368 : }
369 :
370 80558 : Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
371 80558 : return true;
372 : }
373 :
374 : return false;
375 : }
376 :
377 23662 : static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
378 : const MachineInstr &MI2, unsigned BaseReg2) {
379 23662 : if (BaseReg1 == BaseReg2)
380 : return true;
381 :
382 20100 : if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
383 263 : return false;
384 :
385 9788 : auto MO1 = *MI1.memoperands_begin();
386 9788 : auto MO2 = *MI2.memoperands_begin();
387 9788 : if (MO1->getAddrSpace() != MO2->getAddrSpace())
388 : return false;
389 :
390 : auto Base1 = MO1->getValue();
391 : auto Base2 = MO2->getValue();
392 3168 : if (!Base1 || !Base2)
393 : return false;
394 3112 : const MachineFunction &MF = *MI1.getParent()->getParent();
395 3112 : const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
396 : Base1 = GetUnderlyingObject(Base1, DL);
397 : Base2 = GetUnderlyingObject(Base2, DL);
398 :
399 3112 : if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
400 : return false;
401 :
402 3100 : return Base1 == Base2;
403 : }
404 :
405 23662 : bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
406 : unsigned BaseReg1,
407 : MachineInstr &SecondLdSt,
408 : unsigned BaseReg2,
409 : unsigned NumLoads) const {
410 23662 : if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
411 : return false;
412 :
413 : const MachineOperand *FirstDst = nullptr;
414 : const MachineOperand *SecondDst = nullptr;
415 :
416 16711 : if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
417 33151 : (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
418 2122 : (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
419 : const unsigned MaxGlobalLoadCluster = 6;
420 2398 : if (NumLoads > MaxGlobalLoadCluster)
421 : return false;
422 :
423 2398 : FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
424 2398 : if (!FirstDst)
425 826 : FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
426 2398 : SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
427 2398 : if (!SecondDst)
428 826 : SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
429 14313 : } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
430 10964 : FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
431 10964 : SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
432 3349 : } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
433 3204 : FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
434 3204 : SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
435 : }
436 :
437 16711 : if (!FirstDst || !SecondDst)
438 : return false;
439 :
440 : // Try to limit clustering based on the total number of bytes loaded
441 : // rather than the number of instructions. This is done to help reduce
442 : // register pressure. The method used is somewhat inexact, though,
443 : // because it assumes that all loads in the cluster will load the
444 : // same number of bytes as FirstLdSt.
445 :
446 : // The unit of this value is bytes.
447 : // FIXME: This needs finer tuning.
448 : unsigned LoadClusterThreshold = 16;
449 :
450 : const MachineRegisterInfo &MRI =
451 15167 : FirstLdSt.getParent()->getParent()->getRegInfo();
452 15167 : const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
453 :
454 15167 : return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
455 : }
456 :
457 : // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
458 : // the first 16 loads will be interleaved with the stores, and the next 16 will
459 : // be clustered as expected. It should really split into two batches of 16 stores.
460 : //
461 : // Loads are clustered until this returns false, rather than trying to schedule
462 : // groups of stores. This also means we have to decide whether loads from
463 : // different address spaces should be clustered, and which ones might cause
464 : // bank conflicts.
465 : //
466 : // This might be deprecated so it might not be worth that much effort to fix.
467 31587 : bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
468 : int64_t Offset0, int64_t Offset1,
469 : unsigned NumLoads) const {
470 : assert(Offset1 > Offset0 &&
471 : "Second offset should be larger than first offset!");
472 : // If we have 16 or fewer loads in a row and the offsets are within 64
473 : // bytes, then schedule together.
474 :
475 : // A cacheline is 64 bytes (for global memory).
476 31587 : return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
477 : }
478 :
479 10 : static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
480 : MachineBasicBlock::iterator MI,
481 : const DebugLoc &DL, unsigned DestReg,
482 : unsigned SrcReg, bool KillSrc) {
483 10 : MachineFunction *MF = MBB.getParent();
484 : DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
485 : "illegal SGPR to VGPR copy",
486 10 : DL, DS_Error);
487 10 : LLVMContext &C = MF->getFunction().getContext();
488 10 : C.diagnose(IllegalCopy);
489 :
490 20 : BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
491 10 : .addReg(SrcReg, getKillRegState(KillSrc));
492 10 : }
493 :
494 59284 : void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
495 : MachineBasicBlock::iterator MI,
496 : const DebugLoc &DL, unsigned DestReg,
497 : unsigned SrcReg, bool KillSrc) const {
498 59284 : const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
499 :
500 59284 : if (RC == &AMDGPU::VGPR_32RegClass) {
501 : assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
502 : AMDGPU::SReg_32RegClass.contains(SrcReg));
503 64470 : BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
504 32235 : .addReg(SrcReg, getKillRegState(KillSrc));
505 32235 : return;
506 : }
507 :
508 27049 : if (RC == &AMDGPU::SReg_32_XM0RegClass ||
509 : RC == &AMDGPU::SReg_32RegClass) {
510 19591 : if (SrcReg == AMDGPU::SCC) {
511 0 : BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
512 : .addImm(-1)
513 : .addImm(0);
514 0 : return;
515 : }
516 :
517 19591 : if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
518 2 : reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
519 2 : return;
520 : }
521 :
522 39178 : BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
523 19589 : .addReg(SrcReg, getKillRegState(KillSrc));
524 19589 : return;
525 : }
526 :
527 7458 : if (RC == &AMDGPU::SReg_64RegClass) {
528 2236 : if (DestReg == AMDGPU::VCC) {
529 25 : if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
530 72 : BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
531 24 : .addReg(SrcReg, getKillRegState(KillSrc));
532 : } else {
533 : // FIXME: Hack until VReg_1 removed.
534 : assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
535 3 : BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
536 : .addImm(0)
537 1 : .addReg(SrcReg, getKillRegState(KillSrc));
538 : }
539 :
540 25 : return;
541 : }
542 :
543 2211 : if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
544 2 : reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
545 2 : return;
546 : }
547 :
548 4418 : BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
549 2209 : .addReg(SrcReg, getKillRegState(KillSrc));
550 2209 : return;
551 : }
552 :
553 5222 : if (DestReg == AMDGPU::SCC) {
554 : assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
555 0 : BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
556 0 : .addReg(SrcReg, getKillRegState(KillSrc))
557 : .addImm(0);
558 0 : return;
559 : }
560 :
561 : unsigned EltSize = 4;
562 : unsigned Opcode = AMDGPU::V_MOV_B32_e32;
563 5222 : if (RI.isSGPRClass(RC)) {
564 198 : if (RI.getRegSizeInBits(*RC) > 32) {
565 : Opcode = AMDGPU::S_MOV_B64;
566 : EltSize = 8;
567 : } else {
568 : Opcode = AMDGPU::S_MOV_B32;
569 : EltSize = 4;
570 : }
571 :
572 396 : if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
573 6 : reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
574 6 : return;
575 : }
576 : }
577 :
578 5216 : ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
579 : bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
580 :
581 16018 : for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
582 : unsigned SubIdx;
583 10802 : if (Forward)
584 6988 : SubIdx = SubIndices[Idx];
585 : else
586 7628 : SubIdx = SubIndices[SubIndices.size() - Idx - 1];
587 :
588 : MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
589 10802 : get(Opcode), RI.getSubReg(DestReg, SubIdx));
590 :
591 10802 : Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
592 :
593 10802 : if (Idx == 0)
594 5216 : Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
595 :
596 10802 : bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
597 10802 : Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
598 : }
599 : }
600 :
601 320216 : int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
602 : int NewOpc;
603 :
604 : // Try to map original to commuted opcode
605 320216 : NewOpc = AMDGPU::getCommuteRev(Opcode);
606 320216 : if (NewOpc != -1)
607 : // Check if the commuted (REV) opcode exists on the target.
608 19589 : return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
609 :
610 : // Try to map commuted to original opcode
611 300627 : NewOpc = AMDGPU::getCommuteOrig(Opcode);
612 300627 : if (NewOpc != -1)
613 : // Check if the original (non-REV) opcode exists on the target.
614 45164 : return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
615 :
616 255463 : return Opcode;
617 : }
618 :
619 0 : void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
620 : MachineBasicBlock::iterator MI,
621 : const DebugLoc &DL, unsigned DestReg,
622 : int64_t Value) const {
623 0 : MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
624 : const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
625 0 : if (RegClass == &AMDGPU::SReg_32RegClass ||
626 0 : RegClass == &AMDGPU::SGPR_32RegClass ||
627 0 : RegClass == &AMDGPU::SReg_32_XM0RegClass ||
628 : RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
629 0 : BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
630 : .addImm(Value);
631 0 : return;
632 : }
633 :
634 0 : if (RegClass == &AMDGPU::SReg_64RegClass ||
635 0 : RegClass == &AMDGPU::SGPR_64RegClass ||
636 : RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
637 0 : BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
638 : .addImm(Value);
639 0 : return;
640 : }
641 :
642 0 : if (RegClass == &AMDGPU::VGPR_32RegClass) {
643 0 : BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
644 : .addImm(Value);
645 0 : return;
646 : }
647 0 : if (RegClass == &AMDGPU::VReg_64RegClass) {
648 0 : BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
649 : .addImm(Value);
650 0 : return;
651 : }
652 :
653 : unsigned EltSize = 4;
654 : unsigned Opcode = AMDGPU::V_MOV_B32_e32;
655 0 : if (RI.isSGPRClass(RegClass)) {
656 0 : if (RI.getRegSizeInBits(*RegClass) > 32) {
657 : Opcode = AMDGPU::S_MOV_B64;
658 : EltSize = 8;
659 : } else {
660 : Opcode = AMDGPU::S_MOV_B32;
661 : EltSize = 4;
662 : }
663 : }
664 :
665 0 : ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
666 0 : for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
667 0 : int64_t IdxValue = Idx == 0 ? Value : 0;
668 :
669 : MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
670 0 : get(Opcode), RI.getSubReg(DestReg, Idx));
671 : Builder.addImm(IdxValue);
672 : }
673 : }
674 :
675 : const TargetRegisterClass *
676 0 : SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
677 0 : return &AMDGPU::VGPR_32RegClass;
678 : }
679 :
680 0 : void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
681 : MachineBasicBlock::iterator I,
682 : const DebugLoc &DL, unsigned DstReg,
683 : ArrayRef<MachineOperand> Cond,
684 : unsigned TrueReg,
685 : unsigned FalseReg) const {
686 0 : MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
687 : assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
688 : "Not a VGPR32 reg");
689 :
690 0 : if (Cond.size() == 1) {
691 0 : unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
692 0 : BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
693 : .add(Cond[0]);
694 0 : BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
695 0 : .addReg(FalseReg)
696 0 : .addReg(TrueReg)
697 0 : .addReg(SReg);
698 0 : } else if (Cond.size() == 2) {
699 : assert(Cond[0].isImm() && "Cond[0] is not an immediate");
700 0 : switch (Cond[0].getImm()) {
701 : case SIInstrInfo::SCC_TRUE: {
702 0 : unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
703 0 : BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
704 : .addImm(-1)
705 : .addImm(0);
706 0 : BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
707 0 : .addReg(FalseReg)
708 0 : .addReg(TrueReg)
709 0 : .addReg(SReg);
710 0 : break;
711 : }
712 : case SIInstrInfo::SCC_FALSE: {
713 0 : unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
714 0 : BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
715 : .addImm(0)
716 : .addImm(-1);
717 0 : BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
718 0 : .addReg(FalseReg)
719 0 : .addReg(TrueReg)
720 0 : .addReg(SReg);
721 0 : break;
722 : }
723 0 : case SIInstrInfo::VCCNZ: {
724 0 : MachineOperand RegOp = Cond[1];
725 : RegOp.setImplicit(false);
726 0 : unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
727 0 : BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
728 : .add(RegOp);
729 0 : BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
730 0 : .addReg(FalseReg)
731 0 : .addReg(TrueReg)
732 0 : .addReg(SReg);
733 : break;
734 : }
735 0 : case SIInstrInfo::VCCZ: {
736 0 : MachineOperand RegOp = Cond[1];
737 : RegOp.setImplicit(false);
738 0 : unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
739 0 : BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
740 : .add(RegOp);
741 0 : BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
742 0 : .addReg(TrueReg)
743 0 : .addReg(FalseReg)
744 0 : .addReg(SReg);
745 : break;
746 : }
747 : case SIInstrInfo::EXECNZ: {
748 0 : unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
749 0 : unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
750 0 : BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
751 : .addImm(0);
752 0 : BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
753 : .addImm(-1)
754 : .addImm(0);
755 0 : BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
756 0 : .addReg(FalseReg)
757 0 : .addReg(TrueReg)
758 0 : .addReg(SReg);
759 0 : break;
760 : }
761 : case SIInstrInfo::EXECZ: {
762 0 : unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
763 0 : unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
764 0 : BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
765 : .addImm(0);
766 0 : BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
767 : .addImm(0)
768 : .addImm(-1);
769 0 : BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
770 0 : .addReg(FalseReg)
771 0 : .addReg(TrueReg)
772 0 : .addReg(SReg);
773 0 : llvm_unreachable("Unhandled branch predicate EXECZ");
774 : break;
775 : }
776 0 : default:
777 0 : llvm_unreachable("invalid branch predicate");
778 : }
779 : } else {
780 0 : llvm_unreachable("Can only handle Cond size 1 or 2");
781 : }
782 0 : }
783 :
784 0 : unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
785 : MachineBasicBlock::iterator I,
786 : const DebugLoc &DL,
787 : unsigned SrcReg, int Value) const {
788 0 : MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
789 0 : unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
790 0 : BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
791 0 : .addImm(Value)
792 0 : .addReg(SrcReg);
793 :
794 0 : return Reg;
795 : }
796 :
797 0 : unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
798 : MachineBasicBlock::iterator I,
799 : const DebugLoc &DL,
800 : unsigned SrcReg, int Value) const {
801 0 : MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
802 0 : unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
803 0 : BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
804 0 : .addImm(Value)
805 0 : .addReg(SrcReg);
806 :
807 0 : return Reg;
808 : }
809 :
810 11269 : unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
811 :
812 11269 : if (RI.getRegSizeInBits(*DstRC) == 32) {
813 10835 : return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
814 434 : } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
815 : return AMDGPU::S_MOV_B64;
816 428 : } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
817 428 : return AMDGPU::V_MOV_B64_PSEUDO;
818 : }
819 : return AMDGPU::COPY;
820 : }
821 :
822 : static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
823 702 : switch (Size) {
824 : case 4:
825 : return AMDGPU::SI_SPILL_S32_SAVE;
826 104 : case 8:
827 : return AMDGPU::SI_SPILL_S64_SAVE;
828 60 : case 16:
829 : return AMDGPU::SI_SPILL_S128_SAVE;
830 33 : case 32:
831 : return AMDGPU::SI_SPILL_S256_SAVE;
832 8 : case 64:
833 : return AMDGPU::SI_SPILL_S512_SAVE;
834 0 : default:
835 0 : llvm_unreachable("unknown register size");
836 : }
837 : }
838 :
839 : static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
840 1240 : switch (Size) {
841 : case 4:
842 : return AMDGPU::SI_SPILL_V32_SAVE;
843 9 : case 8:
844 : return AMDGPU::SI_SPILL_V64_SAVE;
845 0 : case 12:
846 : return AMDGPU::SI_SPILL_V96_SAVE;
847 669 : case 16:
848 : return AMDGPU::SI_SPILL_V128_SAVE;
849 0 : case 32:
850 : return AMDGPU::SI_SPILL_V256_SAVE;
851 0 : case 64:
852 : return AMDGPU::SI_SPILL_V512_SAVE;
853 0 : default:
854 0 : llvm_unreachable("unknown register size");
855 : }
856 : }
857 :
858 1942 : void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
859 : MachineBasicBlock::iterator MI,
860 : unsigned SrcReg, bool isKill,
861 : int FrameIndex,
862 : const TargetRegisterClass *RC,
863 : const TargetRegisterInfo *TRI) const {
864 1942 : MachineFunction *MF = MBB.getParent();
865 1942 : SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
866 1942 : MachineFrameInfo &FrameInfo = MF->getFrameInfo();
867 : DebugLoc DL = MBB.findDebugLoc(MI);
868 :
869 : unsigned Size = FrameInfo.getObjectSize(FrameIndex);
870 : unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
871 : MachinePointerInfo PtrInfo
872 1942 : = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
873 : MachineMemOperand *MMO
874 1942 : = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
875 : Size, Align);
876 : unsigned SpillSize = TRI->getSpillSize(*RC);
877 :
878 1942 : if (RI.isSGPRClass(RC)) {
879 : MFI->setHasSpilledSGPRs();
880 :
881 : // We are only allowed to create one new instruction when spilling
882 : // registers, so we need to use a pseudo instruction for spilling SGPRs.
883 702 : const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
884 :
885 : // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
886 : // to make sure we are using the correct register class.
887 702 : if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
888 23 : MachineRegisterInfo &MRI = MF->getRegInfo();
889 23 : MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
890 : }
891 :
892 702 : MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
893 702 : .addReg(SrcReg, getKillRegState(isKill)) // data
894 : .addFrameIndex(FrameIndex) // addr
895 : .addMemOperand(MMO)
896 702 : .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
897 702 : .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
898 : // Add the scratch resource registers as implicit uses because we may end up
899 : // needing them, and need to ensure that the reserved registers are
900 : // correctly handled.
901 :
902 : FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
903 702 : if (ST.hasScalarStores()) {
904 : // m0 is used for the offset of scalar stores if they are used to spill.
905 370 : Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
906 : }
907 :
908 : return;
909 : }
910 :
911 1240 : if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
912 0 : LLVMContext &Ctx = MF->getFunction().getContext();
913 0 : Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
914 : " spill register");
915 0 : BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
916 0 : .addReg(SrcReg);
917 :
918 0 : return;
919 : }
920 :
921 : assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
922 :
923 : unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
924 : MFI->setHasSpilledVGPRs();
925 3720 : BuildMI(MBB, MI, DL, get(Opcode))
926 1240 : .addReg(SrcReg, getKillRegState(isKill)) // data
927 : .addFrameIndex(FrameIndex) // addr
928 1240 : .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
929 1240 : .addReg(MFI->getFrameOffsetReg()) // scratch_offset
930 : .addImm(0) // offset
931 : .addMemOperand(MMO);
932 : }
933 :
934 : static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
935 693 : switch (Size) {
936 : case 4:
937 : return AMDGPU::SI_SPILL_S32_RESTORE;
938 101 : case 8:
939 : return AMDGPU::SI_SPILL_S64_RESTORE;
940 59 : case 16:
941 : return AMDGPU::SI_SPILL_S128_RESTORE;
942 33 : case 32:
943 : return AMDGPU::SI_SPILL_S256_RESTORE;
944 8 : case 64:
945 : return AMDGPU::SI_SPILL_S512_RESTORE;
946 0 : default:
947 0 : llvm_unreachable("unknown register size");
948 : }
949 : }
950 :
951 : static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
952 1152 : switch (Size) {
953 : case 4:
954 : return AMDGPU::SI_SPILL_V32_RESTORE;
955 9 : case 8:
956 : return AMDGPU::SI_SPILL_V64_RESTORE;
957 0 : case 12:
958 : return AMDGPU::SI_SPILL_V96_RESTORE;
959 672 : case 16:
960 : return AMDGPU::SI_SPILL_V128_RESTORE;
961 0 : case 32:
962 : return AMDGPU::SI_SPILL_V256_RESTORE;
963 0 : case 64:
964 : return AMDGPU::SI_SPILL_V512_RESTORE;
965 0 : default:
966 0 : llvm_unreachable("unknown register size");
967 : }
968 : }
969 :
970 1845 : void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
971 : MachineBasicBlock::iterator MI,
972 : unsigned DestReg, int FrameIndex,
973 : const TargetRegisterClass *RC,
974 : const TargetRegisterInfo *TRI) const {
975 1845 : MachineFunction *MF = MBB.getParent();
976 1845 : const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
977 1845 : MachineFrameInfo &FrameInfo = MF->getFrameInfo();
978 : DebugLoc DL = MBB.findDebugLoc(MI);
979 : unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
980 : unsigned Size = FrameInfo.getObjectSize(FrameIndex);
981 : unsigned SpillSize = TRI->getSpillSize(*RC);
982 :
983 : MachinePointerInfo PtrInfo
984 1845 : = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
985 :
986 1845 : MachineMemOperand *MMO = MF->getMachineMemOperand(
987 : PtrInfo, MachineMemOperand::MOLoad, Size, Align);
988 :
989 1845 : if (RI.isSGPRClass(RC)) {
990 : // FIXME: Maybe this should not include a memoperand because it will be
991 : // lowered to non-memory instructions.
992 693 : const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
993 693 : if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
994 23 : MachineRegisterInfo &MRI = MF->getRegInfo();
995 23 : MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
996 : }
997 :
998 : FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
999 693 : MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
1000 : .addFrameIndex(FrameIndex) // addr
1001 : .addMemOperand(MMO)
1002 693 : .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
1003 693 : .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
1004 :
1005 693 : if (ST.hasScalarStores()) {
1006 : // m0 is used for the offset of scalar stores if they are used to spill.
1007 368 : Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
1008 : }
1009 :
1010 : return;
1011 : }
1012 :
1013 1152 : if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
1014 0 : LLVMContext &Ctx = MF->getFunction().getContext();
1015 0 : Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
1016 : " restore register");
1017 0 : BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
1018 :
1019 0 : return;
1020 : }
1021 :
1022 : assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
1023 :
1024 : unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
1025 3456 : BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1026 : .addFrameIndex(FrameIndex) // vaddr
1027 1152 : .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
1028 1152 : .addReg(MFI->getFrameOffsetReg()) // scratch_offset
1029 : .addImm(0) // offset
1030 : .addMemOperand(MMO);
1031 : }
1032 :
1033 : /// \param FrameOffset Offset in bytes of the FrameIndex being spilled
1034 0 : unsigned SIInstrInfo::calculateLDSSpillAddress(
1035 : MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
1036 : unsigned FrameOffset, unsigned Size) const {
1037 0 : MachineFunction *MF = MBB.getParent();
1038 0 : SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1039 0 : const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1040 : DebugLoc DL = MBB.findDebugLoc(MI);
1041 0 : unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
1042 0 : unsigned WavefrontSize = ST.getWavefrontSize();
1043 :
1044 0 : unsigned TIDReg = MFI->getTIDReg();
1045 0 : if (!MFI->hasCalculatedTID()) {
1046 0 : MachineBasicBlock &Entry = MBB.getParent()->front();
1047 : MachineBasicBlock::iterator Insert = Entry.front();
1048 : DebugLoc DL = Insert->getDebugLoc();
1049 :
1050 0 : TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
1051 : *MF);
1052 0 : if (TIDReg == AMDGPU::NoRegister)
1053 : return TIDReg;
1054 :
1055 0 : if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
1056 : WorkGroupSize > WavefrontSize) {
1057 : unsigned TIDIGXReg
1058 : = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
1059 : unsigned TIDIGYReg
1060 : = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
1061 : unsigned TIDIGZReg
1062 : = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
1063 : unsigned InputPtrReg =
1064 : MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1065 0 : for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
1066 0 : if (!Entry.isLiveIn(Reg))
1067 : Entry.addLiveIn(Reg);
1068 : }
1069 :
1070 0 : RS->enterBasicBlock(Entry);
1071 : // FIXME: Can we scavenge an SReg_64 and access the subregs?
1072 : unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1073 : unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1074 0 : BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
1075 0 : .addReg(InputPtrReg)
1076 : .addImm(SI::KernelInputOffsets::NGROUPS_Z);
1077 0 : BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
1078 0 : .addReg(InputPtrReg)
1079 : .addImm(SI::KernelInputOffsets::NGROUPS_Y);
1080 :
1081 : // NGROUPS.X * NGROUPS.Y
1082 0 : BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
1083 0 : .addReg(STmp1)
1084 0 : .addReg(STmp0);
1085 : // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
1086 0 : BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
1087 0 : .addReg(STmp1)
1088 0 : .addReg(TIDIGXReg);
1089 : // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
1090 0 : BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
1091 0 : .addReg(STmp0)
1092 0 : .addReg(TIDIGYReg)
1093 0 : .addReg(TIDReg);
1094 : // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z
1095 0 : getAddNoCarry(Entry, Insert, DL, TIDReg)
1096 0 : .addReg(TIDReg)
1097 0 : .addReg(TIDIGZReg);
1098 : } else {
1099 : // Get the wave id
1100 : BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
1101 0 : TIDReg)
1102 : .addImm(-1)
1103 : .addImm(0);
1104 :
1105 0 : BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
1106 0 : TIDReg)
1107 : .addImm(-1)
1108 0 : .addReg(TIDReg);
1109 : }
1110 :
1111 0 : BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
1112 0 : TIDReg)
1113 : .addImm(2)
1114 0 : .addReg(TIDReg);
1115 : MFI->setTIDReg(TIDReg);
1116 : }
1117 :
1118 : // Add FrameIndex to LDS offset
1119 0 : unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
1120 0 : getAddNoCarry(MBB, MI, DL, TmpReg)
1121 0 : .addImm(LDSOffset)
1122 0 : .addReg(TIDReg);
1123 :
1124 0 : return TmpReg;
1125 : }
1126 :
1127 1774 : void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
1128 : MachineBasicBlock::iterator MI,
1129 : int Count) const {
1130 : DebugLoc DL = MBB.findDebugLoc(MI);
1131 3548 : while (Count > 0) {
1132 : int Arg;
1133 1774 : if (Count >= 8)
1134 : Arg = 7;
1135 : else
1136 1774 : Arg = Count - 1;
1137 1774 : Count -= 8;
1138 5322 : BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
1139 1774 : .addImm(Arg);
1140 : }
1141 1774 : }
1142 :
1143 1774 : void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1144 : MachineBasicBlock::iterator MI) const {
1145 1774 : insertWaitStates(MBB, MI, 1);
1146 1774 : }
1147 :
1148 0 : void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1149 0 : auto MF = MBB.getParent();
1150 0 : SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1151 :
1152 : assert(Info->isEntryFunction());
1153 :
1154 0 : if (MBB.succ_empty()) {
1155 0 : bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1156 0 : if (HasNoTerminator)
1157 0 : BuildMI(MBB, MBB.end(), DebugLoc(),
1158 0 : get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
1159 : }
1160 0 : }
1161 :
1162 573761 : unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
1163 1147522 : switch (MI.getOpcode()) {
1164 : default: return 1; // FIXME: Do wait states equal cycles?
1165 :
1166 891 : case AMDGPU::S_NOP:
1167 891 : return MI.getOperand(0).getImm() + 1;
1168 : }
1169 : }
1170 :
1171 288198 : bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1172 288198 : MachineBasicBlock &MBB = *MI.getParent();
1173 : DebugLoc DL = MBB.findDebugLoc(MI);
1174 576396 : switch (MI.getOpcode()) {
1175 : default: return TargetInstrInfo::expandPostRAPseudo(MI);
1176 1 : case AMDGPU::S_MOV_B64_term:
1177 : // This is only a terminator to get the correct spill code placement during
1178 : // register allocation.
1179 1 : MI.setDesc(get(AMDGPU::S_MOV_B64));
1180 : break;
1181 :
1182 0 : case AMDGPU::S_XOR_B64_term:
1183 : // This is only a terminator to get the correct spill code placement during
1184 : // register allocation.
1185 0 : MI.setDesc(get(AMDGPU::S_XOR_B64));
1186 : break;
1187 :
1188 0 : case AMDGPU::S_ANDN2_B64_term:
1189 : // This is only a terminator to get the correct spill code placement during
1190 : // register allocation.
1191 0 : MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1192 : break;
1193 :
1194 336 : case AMDGPU::V_MOV_B64_PSEUDO: {
1195 336 : unsigned Dst = MI.getOperand(0).getReg();
1196 336 : unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1197 336 : unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1198 :
1199 336 : const MachineOperand &SrcOp = MI.getOperand(1);
1200 : // FIXME: Will this work for 64-bit floating point immediates?
1201 : assert(!SrcOp.isFPImm());
1202 336 : if (SrcOp.isImm()) {
1203 336 : APInt Imm(64, SrcOp.getImm());
1204 672 : BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1205 672 : .addImm(Imm.getLoBits(32).getZExtValue())
1206 336 : .addReg(Dst, RegState::Implicit | RegState::Define);
1207 672 : BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1208 1008 : .addImm(Imm.getHiBits(32).getZExtValue())
1209 336 : .addReg(Dst, RegState::Implicit | RegState::Define);
1210 : } else {
1211 : assert(SrcOp.isReg());
1212 0 : BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1213 0 : .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1214 0 : .addReg(Dst, RegState::Implicit | RegState::Define);
1215 0 : BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1216 0 : .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1217 0 : .addReg(Dst, RegState::Implicit | RegState::Define);
1218 : }
1219 336 : MI.eraseFromParent();
1220 336 : break;
1221 : }
1222 28 : case AMDGPU::V_SET_INACTIVE_B32: {
1223 56 : BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1224 28 : .addReg(AMDGPU::EXEC);
1225 56 : BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1226 28 : .add(MI.getOperand(2));
1227 56 : BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1228 28 : .addReg(AMDGPU::EXEC);
1229 28 : MI.eraseFromParent();
1230 28 : break;
1231 : }
1232 2 : case AMDGPU::V_SET_INACTIVE_B64: {
1233 4 : BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1234 2 : .addReg(AMDGPU::EXEC);
1235 2 : MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1236 4 : MI.getOperand(0).getReg())
1237 2 : .add(MI.getOperand(2));
1238 2 : expandPostRAPseudo(*Copy);
1239 4 : BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1240 2 : .addReg(AMDGPU::EXEC);
1241 2 : MI.eraseFromParent();
1242 2 : break;
1243 : }
1244 66 : case AMDGPU::V_MOVRELD_B32_V1:
1245 : case AMDGPU::V_MOVRELD_B32_V2:
1246 : case AMDGPU::V_MOVRELD_B32_V4:
1247 : case AMDGPU::V_MOVRELD_B32_V8:
1248 : case AMDGPU::V_MOVRELD_B32_V16: {
1249 66 : const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
1250 66 : unsigned VecReg = MI.getOperand(0).getReg();
1251 : bool IsUndef = MI.getOperand(1).isUndef();
1252 66 : unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
1253 : assert(VecReg == MI.getOperand(1).getReg());
1254 :
1255 : MachineInstr *MovRel =
1256 66 : BuildMI(MBB, MI, DL, MovRelDesc)
1257 66 : .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1258 66 : .add(MI.getOperand(2))
1259 66 : .addReg(VecReg, RegState::ImplicitDefine)
1260 : .addReg(VecReg,
1261 130 : RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1262 :
1263 : const int ImpDefIdx =
1264 132 : MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
1265 66 : const int ImpUseIdx = ImpDefIdx + 1;
1266 66 : MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
1267 :
1268 66 : MI.eraseFromParent();
1269 66 : break;
1270 : }
1271 611 : case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1272 611 : MachineFunction &MF = *MBB.getParent();
1273 611 : unsigned Reg = MI.getOperand(0).getReg();
1274 611 : unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1275 611 : unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1276 :
1277 : // Create a bundle so these instructions won't be re-ordered by the
1278 : // post-RA scheduler.
1279 : MIBundleBuilder Bundler(MBB, MI);
1280 1222 : Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1281 :
1282 : // Add 32-bit offset from this instruction to the start of the
1283 : // constant data.
1284 1222 : Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1285 611 : .addReg(RegLo)
1286 611 : .add(MI.getOperand(1)));
1287 :
1288 1222 : MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1289 611 : .addReg(RegHi);
1290 611 : if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
1291 : MIB.addImm(0);
1292 : else
1293 : MIB.add(MI.getOperand(2));
1294 :
1295 : Bundler.append(MIB);
1296 611 : finalizeBundle(MBB, Bundler.begin());
1297 :
1298 611 : MI.eraseFromParent();
1299 : break;
1300 : }
1301 44 : case AMDGPU::EXIT_WWM: {
1302 : // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
1303 : // is exited.
1304 44 : MI.setDesc(get(AMDGPU::S_MOV_B64));
1305 : break;
1306 : }
1307 37 : case TargetOpcode::BUNDLE: {
1308 37 : if (!MI.mayLoad())
1309 : return false;
1310 :
1311 : // If it is a load it must be a memory clause
1312 37 : for (MachineBasicBlock::instr_iterator I = MI.getIterator();
1313 145 : I->isBundledWithSucc(); ++I) {
1314 108 : I->unbundleFromSucc();
1315 647 : for (MachineOperand &MO : I->operands())
1316 539 : if (MO.isReg())
1317 : MO.setIsInternalRead(false);
1318 : }
1319 :
1320 37 : MI.eraseFromParent();
1321 37 : break;
1322 : }
1323 : }
1324 : return true;
1325 : }
1326 :
1327 263258 : bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
1328 : MachineOperand &Src0,
1329 : unsigned Src0OpName,
1330 : MachineOperand &Src1,
1331 : unsigned Src1OpName) const {
1332 263258 : MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
1333 263258 : if (!Src0Mods)
1334 : return false;
1335 :
1336 53463 : MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
1337 : assert(Src1Mods &&
1338 : "All commutable instructions have both src0 and src1 modifiers");
1339 :
1340 53463 : int Src0ModsVal = Src0Mods->getImm();
1341 53463 : int Src1ModsVal = Src1Mods->getImm();
1342 :
1343 53463 : Src1Mods->setImm(Src0ModsVal);
1344 53463 : Src0Mods->setImm(Src1ModsVal);
1345 53463 : return true;
1346 : }
1347 :
1348 42577 : static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
1349 : MachineOperand &RegOp,
1350 : MachineOperand &NonRegOp) {
1351 42577 : unsigned Reg = RegOp.getReg();
1352 : unsigned SubReg = RegOp.getSubReg();
1353 : bool IsKill = RegOp.isKill();
1354 : bool IsDead = RegOp.isDead();
1355 : bool IsUndef = RegOp.isUndef();
1356 : bool IsDebug = RegOp.isDebug();
1357 :
1358 42577 : if (NonRegOp.isImm())
1359 42577 : RegOp.ChangeToImmediate(NonRegOp.getImm());
1360 0 : else if (NonRegOp.isFI())
1361 0 : RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
1362 : else
1363 : return nullptr;
1364 :
1365 42577 : NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
1366 : NonRegOp.setSubReg(SubReg);
1367 :
1368 42577 : return &MI;
1369 : }
1370 :
1371 315266 : MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1372 : unsigned Src0Idx,
1373 : unsigned Src1Idx) const {
1374 : assert(!NewMI && "this should never be used");
1375 :
1376 315266 : unsigned Opc = MI.getOpcode();
1377 315266 : int CommutedOpcode = commuteOpcode(Opc);
1378 315266 : if (CommutedOpcode == -1)
1379 : return nullptr;
1380 :
1381 : assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
1382 : static_cast<int>(Src0Idx) &&
1383 : AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
1384 : static_cast<int>(Src1Idx) &&
1385 : "inconsistency with findCommutedOpIndices");
1386 :
1387 294230 : MachineOperand &Src0 = MI.getOperand(Src0Idx);
1388 : MachineOperand &Src1 = MI.getOperand(Src1Idx);
1389 :
1390 : MachineInstr *CommutedMI = nullptr;
1391 294230 : if (Src0.isReg() && Src1.isReg()) {
1392 235900 : if (isOperandLegal(MI, Src1Idx, &Src0)) {
1393 : // Be sure to copy the source modifiers to the right place.
1394 : CommutedMI
1395 220681 : = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
1396 : }
1397 :
1398 58330 : } else if (Src0.isReg() && !Src1.isReg()) {
1399 : // src0 should always be able to support any operand type, so no need to
1400 : // check operand legality.
1401 25063 : CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
1402 33267 : } else if (!Src0.isReg() && Src1.isReg()) {
1403 33256 : if (isOperandLegal(MI, Src1Idx, &Src0))
1404 17514 : CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
1405 : } else {
1406 : // FIXME: Found two non registers to commute. This does happen.
1407 : return nullptr;
1408 : }
1409 :
1410 263258 : if (CommutedMI) {
1411 263258 : swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
1412 : Src1, AMDGPU::OpName::src1_modifiers);
1413 :
1414 263258 : CommutedMI->setDesc(get(CommutedOpcode));
1415 : }
1416 :
1417 : return CommutedMI;
1418 : }
1419 :
1420 : // This needs to be implemented because the source modifiers may be inserted
1421 : // between the true commutable operands, and the base
1422 : // TargetInstrInfo::commuteInstruction uses it.
1423 361965 : bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
1424 : unsigned &SrcOpIdx1) const {
1425 361965 : return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
1426 : }
1427 :
1428 363029 : bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
1429 : unsigned &SrcOpIdx1) const {
1430 726058 : if (!Desc.isCommutable())
1431 : return false;
1432 :
1433 311837 : unsigned Opc = Desc.getOpcode();
1434 311837 : int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1435 311837 : if (Src0Idx == -1)
1436 : return false;
1437 :
1438 311837 : int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1439 311837 : if (Src1Idx == -1)
1440 : return false;
1441 :
1442 311837 : return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1443 : }
1444 :
1445 1081 : bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
1446 : int64_t BrOffset) const {
1447 : // BranchRelaxation should never have to check s_setpc_b64 because its dest
1448 : // block is unanalyzable.
1449 : assert(BranchOp != AMDGPU::S_SETPC_B64);
1450 :
1451 : // Convert to dwords.
1452 1081 : BrOffset /= 4;
1453 :
1454 : // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
1455 : // from the next instruction.
1456 1081 : BrOffset -= 1;
1457 :
1458 1081 : return isIntN(BranchOffsetBits, BrOffset);
1459 : }
1460 :
1461 1115 : MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
1462 : const MachineInstr &MI) const {
1463 2230 : if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
1464 : // This would be a difficult analysis to perform, but can always be legal so
1465 : // there's no need to analyze it.
1466 : return nullptr;
1467 : }
1468 :
1469 1115 : return MI.getOperand(0).getMBB();
1470 : }
1471 :
1472 34 : unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
1473 : MachineBasicBlock &DestBB,
1474 : const DebugLoc &DL,
1475 : int64_t BrOffset,
1476 : RegScavenger *RS) const {
1477 : assert(RS && "RegScavenger required for long branching");
1478 : assert(MBB.empty() &&
1479 : "new block should be inserted for expanding unconditional branch");
1480 : assert(MBB.pred_size() == 1);
1481 :
1482 34 : MachineFunction *MF = MBB.getParent();
1483 34 : MachineRegisterInfo &MRI = MF->getRegInfo();
1484 :
1485 : // FIXME: Virtual register workaround for RegScavenger not working with empty
1486 : // blocks.
1487 34 : unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1488 :
1489 34 : auto I = MBB.end();
1490 :
1491 : // We need to compute the offset relative to the instruction immediately after
1492 : // s_getpc_b64. Insert the pc arithmetic code before the last terminator.
1493 68 : MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1494 :
1495 : // TODO: Handle > 32-bit block address.
1496 34 : if (BrOffset >= 0) {
1497 52 : BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1498 26 : .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1499 26 : .addReg(PCReg, 0, AMDGPU::sub0)
1500 : .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
1501 78 : BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1502 26 : .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1503 26 : .addReg(PCReg, 0, AMDGPU::sub1)
1504 : .addImm(0);
1505 : } else {
1506 : // Backwards branch.
1507 16 : BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1508 8 : .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1509 8 : .addReg(PCReg, 0, AMDGPU::sub0)
1510 : .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
1511 24 : BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1512 8 : .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1513 8 : .addReg(PCReg, 0, AMDGPU::sub1)
1514 : .addImm(0);
1515 : }
1516 :
1517 : // Insert the indirect branch after the other terminator.
1518 34 : BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1519 34 : .addReg(PCReg);
1520 :
1521 : // FIXME: If spilling is necessary, this will fail because this scavenger has
1522 : // no emergency stack slots. It is non-trivial to spill in this situation,
1523 : // because the restore code needs to be specially placed after the
1524 : // jump. BranchRelaxation then needs to be made aware of the newly inserted
1525 : // block.
1526 : //
1527 : // If a spill is needed for the pc register pair, we need to insert a spill
1528 : // restore block right before the destination block, and insert a short branch
1529 : // into the old destination block's fallthrough predecessor.
1530 : // e.g.:
1531 : //
1532 : // s_cbranch_scc0 skip_long_branch:
1533 : //
1534 : // long_branch_bb:
1535 : // spill s[8:9]
1536 : // s_getpc_b64 s[8:9]
1537 : // s_add_u32 s8, s8, restore_bb
1538 : // s_addc_u32 s9, s9, 0
1539 : // s_setpc_b64 s[8:9]
1540 : //
1541 : // skip_long_branch:
1542 : // foo;
1543 : //
1544 : // .....
1545 : //
1546 : // dest_bb_fallthrough_predecessor:
1547 : // bar;
1548 : // s_branch dest_bb
1549 : //
1550 : // restore_bb:
1551 : // restore s[8:9]
1552 : // fallthrough dest_bb
1553 : //
1554 : // dest_bb:
1555 : // buzz;
1556 :
1557 34 : RS->enterBasicBlockEnd(MBB);
1558 34 : unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
1559 : MachineBasicBlock::iterator(GetPC), 0);
1560 33 : MRI.replaceRegWith(PCReg, Scav);
1561 33 : MRI.clearVirtRegs();
1562 33 : RS->setRegUsed(Scav);
1563 :
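 : // Size of the expansion above: s_getpc_b64 (4 bytes), s_add_u32/s_sub_u32
 : // with a 32-bit literal (8 bytes), s_addc_u32/s_subb_u32 with an inline 0
 : // (4 bytes), and s_setpc_b64 (4 bytes), i.e. 20 bytes in total.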
1564 33 : return 4 + 8 + 4 + 4;
1565 : }
1566 :
1567 1579 : unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1568 1579 : switch (Cond) {
1569 : case SIInstrInfo::SCC_TRUE:
1570 : return AMDGPU::S_CBRANCH_SCC1;
1571 416 : case SIInstrInfo::SCC_FALSE:
1572 416 : return AMDGPU::S_CBRANCH_SCC0;
1573 266 : case SIInstrInfo::VCCNZ:
1574 266 : return AMDGPU::S_CBRANCH_VCCNZ;
1575 243 : case SIInstrInfo::VCCZ:
1576 243 : return AMDGPU::S_CBRANCH_VCCZ;
1577 138 : case SIInstrInfo::EXECNZ:
1578 138 : return AMDGPU::S_CBRANCH_EXECNZ;
1579 93 : case SIInstrInfo::EXECZ:
1580 93 : return AMDGPU::S_CBRANCH_EXECZ;
1581 0 : default:
1582 0 : llvm_unreachable("invalid branch predicate");
1583 : }
1584 : }
1585 :
1586 949677 : SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1587 : switch (Opcode) {
1588 : case AMDGPU::S_CBRANCH_SCC0:
1589 : return SCC_FALSE;
1590 : case AMDGPU::S_CBRANCH_SCC1:
1591 : return SCC_TRUE;
1592 : case AMDGPU::S_CBRANCH_VCCNZ:
1593 : return VCCNZ;
1594 : case AMDGPU::S_CBRANCH_VCCZ:
1595 : return VCCZ;
1596 : case AMDGPU::S_CBRANCH_EXECNZ:
1597 : return EXECNZ;
1598 : case AMDGPU::S_CBRANCH_EXECZ:
1599 : return EXECZ;
1600 : default:
1601 : return INVALID_BR;
1602 : }
1603 : }
1604 :
1605 981361 : bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
1606 : MachineBasicBlock::iterator I,
1607 : MachineBasicBlock *&TBB,
1608 : MachineBasicBlock *&FBB,
1609 : SmallVectorImpl<MachineOperand> &Cond,
1610 : bool AllowModify) const {
1611 1962722 : if (I->getOpcode() == AMDGPU::S_BRANCH) {
1612 : // Unconditional Branch
1613 31684 : TBB = I->getOperand(0).getMBB();
1614 31684 : return false;
1615 : }
1616 :
1617 : MachineBasicBlock *CondBB = nullptr;
1618 :
1619 949677 : if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
1620 0 : CondBB = I->getOperand(1).getMBB();
1621 0 : Cond.push_back(I->getOperand(0));
1622 : } else {
1623 949677 : BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1624 949677 : if (Pred == INVALID_BR)
1625 : return true;
1626 :
1627 35095 : CondBB = I->getOperand(0).getMBB();
1628 70190 : Cond.push_back(MachineOperand::CreateImm(Pred));
1629 70190 : Cond.push_back(I->getOperand(1)); // Save the branch register.
1630 : }
1631 : ++I;
1632 :
1633 35095 : if (I == MBB.end()) {
1634 : // Conditional branch followed by fall-through.
1635 17841 : TBB = CondBB;
1636 17841 : return false;
1637 : }
1638 :
1639 34508 : if (I->getOpcode() == AMDGPU::S_BRANCH) {
1640 17251 : TBB = CondBB;
1641 17251 : FBB = I->getOperand(0).getMBB();
1642 17251 : return false;
1643 : }
1644 :
1645 : return true;
1646 : }
1647 :
1648 1036569 : bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
1649 : MachineBasicBlock *&FBB,
1650 : SmallVectorImpl<MachineOperand> &Cond,
1651 : bool AllowModify) const {
1652 1036569 : MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1653 1036569 : if (I == MBB.end())
1654 : return false;
1655 :
1656 1965166 : if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1657 965576 : return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1658 :
1659 : ++I;
1660 :
1661 : // TODO: Should be able to treat as fallthrough?
1662 17007 : if (I == MBB.end())
1663 : return true;
1664 :
1665 15785 : if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1666 : return true;
1667 :
1668 15785 : MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1669 :
1670 : // Specifically handle the case where the conditional branch is to the same
1671 : // destination as the mask branch. e.g.
1672 : //
1673 : // si_mask_branch BB8
1674 : // s_cbranch_execz BB8
1675 : // s_cbranch BB9
1676 : //
1677 : // This is required to understand divergent loops which may need the branches
1678 : // to be relaxed.
1679 15785 : if (TBB != MaskBrDest || Cond.empty())
1680 : return true;
1681 :
1682 457 : auto Pred = Cond[0].getImm();
1683 457 : return (Pred != EXECZ && Pred != EXECNZ);
1684 : }
1685 :
1686 2516 : unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
1687 : int *BytesRemoved) const {
1688 2516 : MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1689 :
1690 : unsigned Count = 0;
1691 : unsigned RemovedSize = 0;
1692 5676 : while (I != MBB.end()) {
1693 3160 : MachineBasicBlock::iterator Next = std::next(I);
1694 6320 : if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1695 : I = Next;
1696 : continue;
1697 : }
1698 :
1699 3152 : RemovedSize += getInstSizeInBytes(*I);
1700 3152 : I->eraseFromParent();
1701 3152 : ++Count;
1702 : I = Next;
1703 : }
1704 :
1705 2516 : if (BytesRemoved)
1706 32 : *BytesRemoved = RemovedSize;
1707 :
1708 2516 : return Count;
1709 : }
1710 :
1711 : // Copy the flags onto the implicit condition register operand.
1712 : static void preserveCondRegFlags(MachineOperand &CondReg,
1713 : const MachineOperand &OrigCond) {
1714 : CondReg.setIsUndef(OrigCond.isUndef());
1715 : CondReg.setIsKill(OrigCond.isKill());
1716 : }
1717 :
1718 2249 : unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
1719 : MachineBasicBlock *TBB,
1720 : MachineBasicBlock *FBB,
1721 : ArrayRef<MachineOperand> Cond,
1722 : const DebugLoc &DL,
1723 : int *BytesAdded) const {
1724 2249 : if (!FBB && Cond.empty()) {
1725 670 : BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1726 : .addMBB(TBB);
1727 670 : if (BytesAdded)
1728 0 : *BytesAdded = 4;
1729 670 : return 1;
1730 : }
1731 :
1732 1579 : if(Cond.size() == 1 && Cond[0].isReg()) {
1733 0 : BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
1734 : .add(Cond[0])
1735 : .addMBB(TBB);
1736 0 : return 1;
1737 : }
1738 :
1739 : assert(TBB && Cond[0].isImm());
1740 :
1741 : unsigned Opcode
1742 1579 : = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1743 :
1744 1579 : if (!FBB) {
1745 : Cond[1].isUndef();
1746 : MachineInstr *CondBr =
1747 1481 : BuildMI(&MBB, DL, get(Opcode))
1748 : .addMBB(TBB);
1749 :
1750 : // Copy the flags onto the implicit condition register operand.
1751 1481 : preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1752 :
1753 1481 : if (BytesAdded)
1754 0 : *BytesAdded = 4;
1755 1481 : return 1;
1756 : }
1757 :
1758 : assert(TBB && FBB);
1759 :
1760 : MachineInstr *CondBr =
1761 98 : BuildMI(&MBB, DL, get(Opcode))
1762 : .addMBB(TBB);
1763 98 : BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1764 : .addMBB(FBB);
1765 :
1766 98 : MachineOperand &CondReg = CondBr->getOperand(1);
1767 : CondReg.setIsUndef(Cond[1].isUndef());
1768 : CondReg.setIsKill(Cond[1].isKill());
1769 :
1770 98 : if (BytesAdded)
1771 32 : *BytesAdded = 8;
1772 :
1773 : return 2;
1774 : }
1775 :
1776 1296 : bool SIInstrInfo::reverseBranchCondition(
1777 : SmallVectorImpl<MachineOperand> &Cond) const {
1778 1296 : if (Cond.size() != 2) {
1779 : return true;
1780 : }
1781 :
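 : // The BranchPredicate values appear to be chosen so that each predicate and
 : // its inverse are negatives of each other (e.g. SCC_TRUE vs. SCC_FALSE), so
 : // negating the immediate reverses the condition.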
1782 1296 : if (Cond[0].isImm()) {
1783 1296 : Cond[0].setImm(-Cond[0].getImm());
1784 1296 : return false;
1785 : }
1786 :
1787 : return true;
1788 : }
1789 :
1790 22 : bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
1791 : ArrayRef<MachineOperand> Cond,
1792 : unsigned TrueReg, unsigned FalseReg,
1793 : int &CondCycles,
1794 : int &TrueCycles, int &FalseCycles) const {
1795 22 : switch (Cond[0].getImm()) {
1796 15 : case VCCNZ:
1797 : case VCCZ: {
1798 15 : const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1799 : const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1800 : assert(MRI.getRegClass(FalseReg) == RC);
1801 :
1802 30 : int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1803 15 : CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1804 :
1805 : // Limit to equal cost for branch vs. N v_cndmask_b32s.
1806 15 : return !RI.isSGPRClass(RC) && NumInsts <= 6;
1807 : }
1808 7 : case SCC_TRUE:
1809 : case SCC_FALSE: {
1810 : // FIXME: We could insert for VGPRs if we could replace the original compare
1811 : // with a vector one.
1812 7 : const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1813 : const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1814 : assert(MRI.getRegClass(FalseReg) == RC);
1815 :
1816 14 : int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1817 :
1818 : // Multiples of 8 can do s_cselect_b64
1819 7 : if (NumInsts % 2 == 0)
1820 3 : NumInsts /= 2;
1821 :
1822 7 : CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1823 7 : return RI.isSGPRClass(RC);
1824 : }
1825 : default:
1826 : return false;
1827 : }
1828 : }
1829 :
1830 16 : void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
1831 : MachineBasicBlock::iterator I, const DebugLoc &DL,
1832 : unsigned DstReg, ArrayRef<MachineOperand> Cond,
1833 : unsigned TrueReg, unsigned FalseReg) const {
1834 16 : BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
1835 16 : if (Pred == VCCZ || Pred == SCC_FALSE) {
1836 0 : Pred = static_cast<BranchPredicate>(-Pred);
1837 : std::swap(TrueReg, FalseReg);
1838 : }
1839 :
1840 16 : MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1841 : const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
1842 : unsigned DstSize = RI.getRegSizeInBits(*DstRC);
1843 :
1844 16 : if (DstSize == 32) {
1845 9 : unsigned SelOp = Pred == SCC_TRUE ?
1846 : AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1847 :
1848 : // Instruction's operands are backwards from what is expected.
1849 : MachineInstr *Select =
1850 18 : BuildMI(MBB, I, DL, get(SelOp), DstReg)
1851 9 : .addReg(FalseReg)
1852 9 : .addReg(TrueReg);
1853 :
1854 9 : preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1855 10 : return;
1856 : }
1857 :
1858 7 : if (DstSize == 64 && Pred == SCC_TRUE) {
1859 : MachineInstr *Select =
1860 2 : BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
1861 1 : .addReg(FalseReg)
1862 1 : .addReg(TrueReg);
1863 :
1864 1 : preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1865 1 : return;
1866 : }
1867 :
1868 : static const int16_t Sub0_15[] = {
1869 : AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1870 : AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1871 : AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1872 : AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1873 : };
1874 :
1875 : static const int16_t Sub0_15_64[] = {
1876 : AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1877 : AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1878 : AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1879 : AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1880 : };
1881 :
1882 : unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
1883 : const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
1884 : const int16_t *SubIndices = Sub0_15;
1885 6 : int NElts = DstSize / 32;
1886 :
1887 : // 64-bit select is only avaialble for SALU.
1888 6 : if (Pred == SCC_TRUE) {
1889 : SelOp = AMDGPU::S_CSELECT_B64;
1890 : EltRC = &AMDGPU::SGPR_64RegClass;
1891 : SubIndices = Sub0_15_64;
1892 :
1893 : assert(NElts % 2 == 0);
1894 2 : NElts /= 2;
1895 : }
1896 :
1897 : MachineInstrBuilder MIB = BuildMI(
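 : // Wider selects are split into one select per 32-bit (or, for SALU, 64-bit)
 : // sub-register, and the pieces are reassembled with a REG_SEQUENCE.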
1898 12 : MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
1899 :
1900 6 : I = MIB->getIterator();
1901 :
1902 : SmallVector<unsigned, 8> Regs;
1903 22 : for (int Idx = 0; Idx != NElts; ++Idx) {
1904 16 : unsigned DstElt = MRI.createVirtualRegister(EltRC);
1905 16 : Regs.push_back(DstElt);
1906 :
1907 16 : unsigned SubIdx = SubIndices[Idx];
1908 :
1909 : MachineInstr *Select =
1910 32 : BuildMI(MBB, I, DL, get(SelOp), DstElt)
1911 16 : .addReg(FalseReg, 0, SubIdx)
1912 16 : .addReg(TrueReg, 0, SubIdx);
1913 16 : preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1914 :
1915 16 : MIB.addReg(DstElt)
1916 16 : .addImm(SubIdx);
1917 : }
1918 : }
1919 :
1920 964557 : bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
1921 1929114 : switch (MI.getOpcode()) {
1922 25142 : case AMDGPU::V_MOV_B32_e32:
1923 : case AMDGPU::V_MOV_B32_e64:
1924 : case AMDGPU::V_MOV_B64_PSEUDO: {
1925 : // If there are additional implicit register operands, this may be used for
1926 : // register indexing so the source register operand isn't simply copied.
1927 25142 : unsigned NumOps = MI.getDesc().getNumOperands() +
1928 25142 : MI.getDesc().getNumImplicitUses();
1929 :
1930 25142 : return MI.getNumOperands() == NumOps;
1931 : }
1932 : case AMDGPU::S_MOV_B32:
1933 : case AMDGPU::S_MOV_B64:
1934 : case AMDGPU::COPY:
1935 : return true;
1936 533556 : default:
1937 533556 : return false;
1938 : }
1939 : }
1940 :
1941 86955 : unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
1942 : unsigned Kind) const {
1943 : switch(Kind) {
1944 : case PseudoSourceValue::Stack:
1945 : case PseudoSourceValue::FixedStack:
1946 : return AMDGPUAS::PRIVATE_ADDRESS;
1947 : case PseudoSourceValue::ConstantPool:
1948 : case PseudoSourceValue::GOT:
1949 : case PseudoSourceValue::JumpTable:
1950 : case PseudoSourceValue::GlobalValueCallEntry:
1951 : case PseudoSourceValue::ExternalSymbolCallEntry:
1952 : case PseudoSourceValue::TargetCustom:
1953 : return AMDGPUAS::CONSTANT_ADDRESS;
1954 : }
1955 : return AMDGPUAS::FLAT_ADDRESS;
1956 : }
1957 :
1958 38 : static void removeModOperands(MachineInstr &MI) {
1959 38 : unsigned Opc = MI.getOpcode();
1960 38 : int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1961 : AMDGPU::OpName::src0_modifiers);
1962 38 : int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1963 : AMDGPU::OpName::src1_modifiers);
1964 38 : int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1965 : AMDGPU::OpName::src2_modifiers);
1966 :
1967 38 : MI.RemoveOperand(Src2ModIdx);
1968 38 : MI.RemoveOperand(Src1ModIdx);
1969 38 : MI.RemoveOperand(Src0ModIdx);
1970 38 : }
1971 :
1972 57238 : bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
1973 : unsigned Reg, MachineRegisterInfo *MRI) const {
1974 57238 : if (!MRI->hasOneNonDBGUse(Reg))
1975 : return false;
1976 :
1977 45780 : switch (DefMI.getOpcode()) {
1978 : default:
1979 : return false;
1980 : case AMDGPU::S_MOV_B64:
1981 : // TODO: We could fold 64-bit immediates, but this gets complicated
1982 : // when there are sub-registers.
1983 : return false;
1984 :
1985 : case AMDGPU::V_MOV_B32_e32:
1986 : case AMDGPU::S_MOV_B32:
1987 : break;
1988 : }
1989 :
1990 22313 : const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
1991 : assert(ImmOp);
1992 : // FIXME: We could handle FrameIndex values here.
1993 22313 : if (!ImmOp->isImm())
1994 : return false;
1995 :
1996 21939 : unsigned Opc = UseMI.getOpcode();
1997 21939 : if (Opc == AMDGPU::COPY) {
1998 5813 : bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
1999 5813 : unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2000 5813 : UseMI.setDesc(get(NewOpc));
2001 11626 : UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
2002 5813 : UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
2003 5813 : return true;
2004 : }
2005 :
2006 16126 : if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
2007 15920 : Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
2008 : // Don't fold if we are using source or output modifiers. The new VOP2
2009 : // instructions don't have them.
2010 241 : if (hasAnyModifiersSet(UseMI))
2011 : return false;
2012 :
2013 : // If this is a free constant, there's no reason to do this.
2014 : // TODO: We could fold this here instead of letting SIFoldOperands do it
2015 : // later.
2016 207 : MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
2017 :
2018 : // Any src operand can be used for the legality check.
2019 207 : if (isInlineConstant(UseMI, *Src0, *ImmOp))
2020 : return false;
2021 :
2022 : bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
2023 106 : MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
2024 106 : MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
2025 :
2026 : // Multiplied part is the constant: Use v_madmk_{f16, f32}.
2027 : // We should only expect these to be on src0 due to canonicalizations.
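 : // Illustrative shape of the fold: with %k defined by v_mov_b32 %k, <literal>,
 : // v_mad_f32 %d, %k, %b, %c becomes v_madmk_f32 %d, %b, <literal>, %c.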
2028 106 : if (Src0->isReg() && Src0->getReg() == Reg) {
2029 7 : if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2030 0 : return false;
2031 :
2032 7 : if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
2033 0 : return false;
2034 :
2035 : // We need to swap operands 0 and 1 since madmk constant is at operand 1.
2036 :
2037 7 : const int64_t Imm = ImmOp->getImm();
2038 :
2039 : // FIXME: This would be a lot easier if we could return a new instruction
2040 : // instead of having to modify in place.
2041 :
2042 : // Remove these first since they are at the end.
2043 7 : UseMI.RemoveOperand(
2044 7 : AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2045 7 : UseMI.RemoveOperand(
2046 7 : AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2047 :
2048 7 : unsigned Src1Reg = Src1->getReg();
2049 : unsigned Src1SubReg = Src1->getSubReg();
2050 7 : Src0->setReg(Src1Reg);
2051 : Src0->setSubReg(Src1SubReg);
2052 : Src0->setIsKill(Src1->isKill());
2053 :
2054 7 : if (Opc == AMDGPU::V_MAC_F32_e64 ||
2055 7 : Opc == AMDGPU::V_MAC_F16_e64)
2056 0 : UseMI.untieRegOperand(
2057 0 : AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2058 :
2059 7 : Src1->ChangeToImmediate(Imm);
2060 :
2061 7 : removeModOperands(UseMI);
2062 10 : UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
2063 :
2064 7 : bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2065 7 : if (DeleteDef)
2066 0 : DefMI.eraseFromParent();
2067 :
2068 7 : return true;
2069 : }
2070 :
2071 : // Added part is the constant: Use v_madak_{f16, f32}.
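 : // Illustrative shape of the fold: with %k defined by v_mov_b32 %k, <literal>,
 : // v_mad_f32 %d, %a, %b, %k becomes v_madak_f32 %d, %a, %b, <literal>.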
2072 99 : if (Src2->isReg() && Src2->getReg() == Reg) {
2073 : // Not allowed to use constant bus for another operand.
2074 : // We can however allow an inline immediate as src0.
2075 : bool Src0Inlined = false;
2076 44 : if (Src0->isReg()) {
2077 : // Try to inline constant if possible.
2078 : // If the def is a move of an immediate and this is its only use,
2079 : // folding it here saves a VGPR.
2080 44 : MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
2081 46 : if (Def && Def->isMoveImmediate() &&
2082 53 : isInlineConstant(Def->getOperand(1)) &&
2083 3 : MRI->hasOneUse(Src0->getReg())) {
2084 3 : Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2085 : Src0Inlined = true;
2086 43 : } else if ((RI.isPhysicalRegister(Src0->getReg()) &&
2087 42 : RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) ||
2088 79 : (RI.isVirtualRegister(Src0->getReg()) &&
2089 39 : RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
2090 7 : return false;
2091 : // VGPR is okay as Src0 - fallthrough
2092 : }
2093 :
2094 37 : if (Src1->isReg() && !Src0Inlined ) {
2095 : // We still have one slot for an inline constant - try to fill it
2096 34 : MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
2097 49 : if (Def && Def->isMoveImmediate() &&
2098 46 : isInlineConstant(Def->getOperand(1)) &&
2099 60 : MRI->hasOneUse(Src1->getReg()) &&
2100 12 : commuteInstruction(UseMI)) {
2101 12 : Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2102 23 : } else if ((RI.isPhysicalRegister(Src1->getReg()) &&
2103 22 : RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
2104 42 : (RI.isVirtualRegister(Src1->getReg()) &&
2105 21 : RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
2106 6 : return false;
2107 : // VGPR is okay as Src1 - fallthrough
2108 : }
2109 :
2110 31 : const int64_t Imm = ImmOp->getImm();
2111 :
2112 : // FIXME: This would be a lot easier if we could return a new instruction
2113 : // instead of having to modify in place.
2114 :
2115 : // Remove these first since they are at the end.
2116 31 : UseMI.RemoveOperand(
2117 31 : AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2118 31 : UseMI.RemoveOperand(
2119 31 : AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2120 :
2121 31 : if (Opc == AMDGPU::V_MAC_F32_e64 ||
2122 31 : Opc == AMDGPU::V_MAC_F16_e64)
2123 28 : UseMI.untieRegOperand(
2124 28 : AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2125 :
2126 : // ChangingToImmediate adds Src2 back to the instruction.
2127 31 : Src2->ChangeToImmediate(Imm);
2128 :
2129 : // These come before src2.
2130 31 : removeModOperands(UseMI);
2131 35 : UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
2132 :
2133 31 : bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2134 31 : if (DeleteDef)
2135 0 : DefMI.eraseFromParent();
2136 :
2137 31 : return true;
2138 : }
2139 : }
2140 :
2141 : return false;
2142 : }
2143 :
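 : // Returns true if the byte ranges [OffsetA, OffsetA + WidthA) and
 : // [OffsetB, OffsetB + WidthB) are disjoint. For example, a 4-byte access at
 : // offset 0 and an 8-byte access at offset 4 do not overlap, while an 8-byte
 : // access at offset 0 and a 4-byte access at offset 4 do.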
2144 : static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
2145 : int WidthB, int OffsetB) {
2146 23856 : int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
2147 23856 : int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
2148 23856 : int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
2149 23856 : return LowOffset + LowWidth <= HighOffset;
2150 : }
2151 :
2152 874733 : bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
2153 : MachineInstr &MIb) const {
2154 : unsigned BaseReg0, BaseReg1;
2155 : int64_t Offset0, Offset1;
2156 :
2157 964574 : if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
2158 89841 : getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
2159 :
2160 148347 : if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2161 : // FIXME: Handle ds_read2 / ds_write2.
2162 23814 : return false;
2163 : }
2164 61403 : unsigned Width0 = (*MIa.memoperands_begin())->getSize();
2165 61403 : unsigned Width1 = (*MIb.memoperands_begin())->getSize();
2166 85259 : if (BaseReg0 == BaseReg1 &&
2167 23856 : offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
2168 18845 : return true;
2169 : }
2170 : }
2171 :
2172 : return false;
2173 : }
2174 :
2175 959878 : bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
2176 : MachineInstr &MIb,
2177 : AliasAnalysis *AA) const {
2178 : assert((MIa.mayLoad() || MIa.mayStore()) &&
2179 : "MIa must load from or modify a memory location");
2180 : assert((MIb.mayLoad() || MIb.mayStore()) &&
2181 : "MIb must load from or modify a memory location");
2182 :
2183 959878 : if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
2184 0 : return false;
2185 :
2186 : // XXX - Can we relax this between address spaces?
2187 959878 : if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2188 208 : return false;
2189 :
2190 964990 : if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
2191 2622 : const MachineMemOperand *MMOa = *MIa.memoperands_begin();
2192 2622 : const MachineMemOperand *MMOb = *MIb.memoperands_begin();
2193 4461 : if (MMOa->getValue() && MMOb->getValue()) {
2194 1666 : MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
2195 1666 : MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
2196 1666 : if (!AA->alias(LocA, LocB))
2197 1172 : return true;
2198 : }
2199 : }
2200 :
2201 : // TODO: Should we check the address space from the MachineMemOperand? That
2202 : // would allow us to distinguish objects we know don't alias based on the
2203 : // underlying address space, even if it was lowered to a different one,
2204 : // e.g. private accesses lowered to use MUBUF instructions on a scratch
2205 : // buffer.
2206 958498 : if (isDS(MIa)) {
2207 118542 : if (isDS(MIb))
2208 64311 : return checkInstOffsetsDoNotOverlap(MIa, MIb);
2209 :
2210 54231 : return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
2211 : }
2212 :
2213 839956 : if (isMUBUF(MIa) || isMTBUF(MIa)) {
2214 789645 : if (isMUBUF(MIb) || isMTBUF(MIb))
2215 777403 : return checkInstOffsetsDoNotOverlap(MIa, MIb);
2216 :
2217 12242 : return !isFLAT(MIb) && !isSMRD(MIb);
2218 : }
2219 :
2220 50311 : if (isSMRD(MIa)) {
2221 3689 : if (isSMRD(MIb))
2222 0 : return checkInstOffsetsDoNotOverlap(MIa, MIb);
2223 :
2224 3689 : return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
2225 : }
2226 :
2227 46622 : if (isFLAT(MIa)) {
2228 46563 : if (isFLAT(MIb))
2229 33019 : return checkInstOffsetsDoNotOverlap(MIa, MIb);
2230 :
2231 : return false;
2232 : }
2233 :
2234 : return false;
2235 : }
2236 :
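 : // If MO is a register whose unique definition is a V_MOV_B32 of an immediate,
 : // return that immediate, otherwise 0. Note that both the "return false" and
 : // the "return AMDGPU::NoRegister" paths below yield 0, and callers just test
 : // the result for truthiness, so an immediate of 0 is never folded here.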
2237 768 : static int64_t getFoldableImm(const MachineOperand* MO) {
2238 768 : if (!MO->isReg())
2239 : return false;
2240 766 : const MachineFunction *MF = MO->getParent()->getParent()->getParent();
2241 766 : const MachineRegisterInfo &MRI = MF->getRegInfo();
2242 766 : auto Def = MRI.getUniqueVRegDef(MO->getReg());
2243 766 : if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
2244 17 : Def->getOperand(1).isImm())
2245 17 : return Def->getOperand(1).getImm();
2246 : return AMDGPU::NoRegister;
2247 : }
2248 :
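 : // Convert a two-address MAC/FMAC into its three-address MAD/FMA form, or into
 : // V_MADAK/V_MADMK when one of the sources is a foldable move-immediate.
 : // Illustrative example: v_mac_f32 %d, %a, %b, where %d was initialized by
 : // v_mov_b32 %d, <literal>, can become v_madak_f32 %d, %a, %b, <literal>.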
2249 305 : MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
2250 : MachineInstr &MI,
2251 : LiveVariables *LV) const {
2252 305 : unsigned Opc = MI.getOpcode();
2253 : bool IsF16 = false;
2254 305 : bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
2255 :
2256 305 : switch (Opc) {
2257 : default:
2258 : return nullptr;
2259 0 : case AMDGPU::V_MAC_F16_e64:
2260 : IsF16 = true;
2261 : LLVM_FALLTHROUGH;
2262 : case AMDGPU::V_MAC_F32_e64:
2263 : case AMDGPU::V_FMAC_F32_e64:
2264 : break;
2265 6 : case AMDGPU::V_MAC_F16_e32:
2266 : IsF16 = true;
2267 : LLVM_FALLTHROUGH;
2268 291 : case AMDGPU::V_MAC_F32_e32:
2269 : case AMDGPU::V_FMAC_F32_e32: {
2270 291 : int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
2271 : AMDGPU::OpName::src0);
2272 291 : const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
2273 291 : if (!Src0->isReg() && !Src0->isImm())
2274 : return nullptr;
2275 :
2276 290 : if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
2277 : return nullptr;
2278 :
2279 : break;
2280 : }
2281 : }
2282 :
2283 301 : const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2284 301 : const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
2285 : const MachineOperand *Src0Mods =
2286 301 : getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2287 301 : const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2288 : const MachineOperand *Src1Mods =
2289 301 : getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2290 301 : const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2291 301 : const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2292 301 : const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
2293 :
2294 301 : if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
2295 : // If we have an SGPR input, we will violate the constant bus restriction.
2296 558 : (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
2297 265 : if (auto Imm = getFoldableImm(Src2)) {
2298 12 : return BuildMI(*MBB, MI, MI.getDebugLoc(),
2299 32 : get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
2300 : .add(*Dst)
2301 : .add(*Src0)
2302 : .add(*Src1)
2303 : .addImm(Imm);
2304 : }
2305 253 : if (auto Imm = getFoldableImm(Src1)) {
2306 3 : return BuildMI(*MBB, MI, MI.getDebugLoc(),
2307 7 : get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2308 : .add(*Dst)
2309 : .add(*Src0)
2310 : .addImm(Imm)
2311 : .add(*Src2);
2312 : }
2313 250 : if (auto Imm = getFoldableImm(Src0)) {
2314 2 : if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
2315 : AMDGPU::OpName::src0), Src1))
2316 2 : return BuildMI(*MBB, MI, MI.getDebugLoc(),
2317 4 : get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2318 : .add(*Dst)
2319 : .add(*Src1)
2320 : .addImm(Imm)
2321 : .add(*Src2);
2322 : }
2323 : }
2324 :
2325 : assert((!IsFMA || !IsF16) && "fmac only expected with f32");
2326 284 : unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
2327 : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
2328 568 : return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2329 : .add(*Dst)
2330 284 : .addImm(Src0Mods ? Src0Mods->getImm() : 0)
2331 : .add(*Src0)
2332 284 : .addImm(Src1Mods ? Src1Mods->getImm() : 0)
2333 : .add(*Src1)
2334 : .addImm(0) // Src mods
2335 : .add(*Src2)
2336 284 : .addImm(Clamp ? Clamp->getImm() : 0)
2337 284 : .addImm(Omod ? Omod->getImm() : 0);
2338 : }
2339 :
2340 : // It's not generally safe to move VALU instructions across these, since the
2341 : // moved instruction would then use the register as a base index rather than directly.
2342 : // XXX - Why isn't hasSideEffects sufficient for these?
2343 : static bool changesVGPRIndexingMode(const MachineInstr &MI) {
2344 533351 : switch (MI.getOpcode()) {
2345 : case AMDGPU::S_SET_GPR_IDX_ON:
2346 : case AMDGPU::S_SET_GPR_IDX_MODE:
2347 : case AMDGPU::S_SET_GPR_IDX_OFF:
2348 : return true;
2349 : default:
2350 : return false;
2351 : }
2352 : }
2353 :
2354 576381 : bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
2355 : const MachineBasicBlock *MBB,
2356 : const MachineFunction &MF) const {
2357 : // XXX - Do we want the SP check in the base implementation?
2358 :
2359 : // Target-independent instructions do not have an implicit-use of EXEC, even
2360 : // when they operate on VGPRs. Treating EXEC modifications as scheduling
2361 : // boundaries prevents incorrect movements of such instructions.
2362 1113214 : return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
2363 1070358 : MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
2364 533525 : MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
2365 576381 : MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
2366 576381 : changesVGPRIndexingMode(MI);
2367 : }
2368 :
2369 3260 : bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
2370 3260 : unsigned Opcode = MI.getOpcode();
2371 :
2372 3260 : if (MI.mayStore() && isSMRD(MI))
2373 : return true; // scalar store or atomic
2374 :
2375 : // These instructions cause shader I/O that may cause hardware lockups
2376 : // when executed with an empty EXEC mask.
2377 : //
2378 : // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
2379 : // EXEC = 0, but checking for that case here seems not worth it
2380 : // given the typical code patterns.
2381 6520 : if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
2382 3260 : Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE)
2383 : return true;
2384 :
2385 3248 : if (MI.isInlineAsm())
2386 : return true; // conservative assumption
2387 :
2388 : // These are like SALU instructions in terms of effects, so it's questionable
2389 : // whether we should return true for those.
2390 : //
2391 : // However, executing them with EXEC = 0 causes them to operate on undefined
2392 : // data, which we avoid by returning true here.
2393 3236 : if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
2394 12 : return true;
2395 :
2396 : return false;
2397 : }
2398 :
2399 5823 : bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
2400 5823 : switch (Imm.getBitWidth()) {
2401 156 : case 32:
2402 312 : return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
2403 312 : ST.hasInv2PiInlineImm());
2404 5514 : case 64:
2405 11028 : return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
2406 11028 : ST.hasInv2PiInlineImm());
2407 153 : case 16:
2408 153 : return ST.has16BitInsts() &&
2409 153 : AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
2410 153 : ST.hasInv2PiInlineImm());
2411 0 : default:
2412 0 : llvm_unreachable("invalid bitwidth");
2413 : }
2414 : }
2415 :
2416 4760521 : bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
2417 : uint8_t OperandType) const {
2418 : if (!MO.isImm() ||
2419 4760521 : OperandType < AMDGPU::OPERAND_SRC_FIRST ||
2420 : OperandType > AMDGPU::OPERAND_SRC_LAST)
2421 : return false;
2422 :
2423 : // MachineOperand provides no way to tell the true operand size, since it only
2424 : // records a 64-bit value. We need to know the size to determine if a 32-bit
2425 : // floating point immediate bit pattern is legal for an integer immediate. It
2426 : // would be for any 32-bit integer operand, but would not be for a 64-bit one.
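 : // For example, the bit pattern of 1.0f (0x3f800000) is an inline constant for
 : // a 32-bit operand, but as a 64-bit immediate it is neither a small integer
 : // nor the bit pattern of a supported double, so the operand type must drive
 : // the check below.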
2427 :
2428 4707374 : int64_t Imm = MO.getImm();
2429 4707374 : switch (OperandType) {
2430 4458799 : case AMDGPU::OPERAND_REG_IMM_INT32:
2431 : case AMDGPU::OPERAND_REG_IMM_FP32:
2432 : case AMDGPU::OPERAND_REG_INLINE_C_INT32:
2433 : case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
2434 4458799 : int32_t Trunc = static_cast<int32_t>(Imm);
2435 4458799 : return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
2436 : }
2437 49180 : case AMDGPU::OPERAND_REG_IMM_INT64:
2438 : case AMDGPU::OPERAND_REG_IMM_FP64:
2439 : case AMDGPU::OPERAND_REG_INLINE_C_INT64:
2440 : case AMDGPU::OPERAND_REG_INLINE_C_FP64:
2441 49180 : return AMDGPU::isInlinableLiteral64(MO.getImm(),
2442 49180 : ST.hasInv2PiInlineImm());
2443 : case AMDGPU::OPERAND_REG_IMM_INT16:
2444 : case AMDGPU::OPERAND_REG_IMM_FP16:
2445 : case AMDGPU::OPERAND_REG_INLINE_C_INT16:
2446 : case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
2447 191705 : if (isInt<16>(Imm) || isUInt<16>(Imm)) {
2448 : // A few special case instructions have 16-bit operands on subtargets
2449 : // where 16-bit instructions are not legal.
2450 : // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
2451 : // constants in these cases
2452 : int16_t Trunc = static_cast<int16_t>(Imm);
2453 191582 : return ST.has16BitInsts() &&
2454 191580 : AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
2455 : }
2456 :
2457 : return false;
2458 : }
2459 7690 : case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
2460 : case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
2461 7690 : if (isUInt<16>(Imm)) {
2462 925 : int16_t Trunc = static_cast<int16_t>(Imm);
2463 925 : return ST.has16BitInsts() &&
2464 925 : AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
2465 : }
2466 6765 : if (!(Imm & 0xffff)) {
2467 25 : return ST.has16BitInsts() &&
2468 25 : AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm());
2469 : }
2470 : uint32_t Trunc = static_cast<uint32_t>(Imm);
2471 6740 : return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
2472 : }
2473 0 : default:
2474 0 : llvm_unreachable("invalid bitwidth");
2475 : }
2476 : }
2477 :
2478 640447 : bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
2479 : const MCOperandInfo &OpInfo) const {
2480 : switch (MO.getType()) {
2481 : case MachineOperand::MO_Register:
2482 : return false;
2483 191414 : case MachineOperand::MO_Immediate:
2484 191414 : return !isInlineConstant(MO, OpInfo);
2485 : case MachineOperand::MO_FrameIndex:
2486 : case MachineOperand::MO_MachineBasicBlock:
2487 : case MachineOperand::MO_ExternalSymbol:
2488 : case MachineOperand::MO_GlobalAddress:
2489 : case MachineOperand::MO_MCSymbol:
2490 : return true;
2491 0 : default:
2492 0 : llvm_unreachable("unexpected operand type");
2493 : }
2494 : }
2495 :
2496 : static bool compareMachineOp(const MachineOperand &Op0,
2497 : const MachineOperand &Op1) {
2498 : if (Op0.getType() != Op1.getType())
2499 : return false;
2500 :
2501 : switch (Op0.getType()) {
2502 5892 : case MachineOperand::MO_Register:
2503 17842 : return Op0.getReg() == Op1.getReg();
2504 : case MachineOperand::MO_Immediate:
2505 : return Op0.getImm() == Op1.getImm();
2506 : default:
2507 : llvm_unreachable("Didn't expect to be comparing these operand types");
2508 : }
2509 : }
2510 :
2511 92762 : bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
2512 : const MachineOperand &MO) const {
2513 185524 : const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
2514 :
2515 : assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2516 :
2517 92762 : if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
2518 : return true;
2519 :
2520 92762 : if (OpInfo.RegClass < 0)
2521 : return false;
2522 :
2523 185262 : if (MO.isImm() && isInlineConstant(MO, OpInfo))
2524 128406 : return RI.opCanUseInlineConstant(OpInfo.OperandType);
2525 :
2526 57118 : return RI.opCanUseLiteralConstant(OpInfo.OperandType);
2527 : }
2528 :
2529 777103 : bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
2530 777103 : int Op32 = AMDGPU::getVOPe32(Opcode);
2531 777103 : if (Op32 == -1)
2532 : return false;
2533 :
2534 134545 : return pseudoToMCOpcode(Op32) != -1;
2535 : }
2536 :
2537 0 : bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
2538 : // The src0_modifier operand is present on all instructions
2539 : // that have modifiers.
2540 :
2541 0 : return AMDGPU::getNamedOperandIdx(Opcode,
2542 0 : AMDGPU::OpName::src0_modifiers) != -1;
2543 : }
2544 :
2545 219769 : bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
2546 : unsigned OpName) const {
2547 : const MachineOperand *Mods = getNamedOperand(MI, OpName);
2548 219769 : return Mods && Mods->getImm();
2549 : }
2550 :
2551 241 : bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
2552 473 : return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2553 461 : hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2554 440 : hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
2555 661 : hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
2556 209 : hasModifiersSet(MI, AMDGPU::OpName::omod);
2557 : }
2558 :
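 : // Returns true if this instruction can be rewritten using its 32-bit
 : // (VOP2/VOP1/VOPC) encoding. Roughly: no source, clamp or omod modifiers may
 : // be set, src1 (if present) must be a VGPR, and instructions with a third
 : // source are limited to the carry/cndmask/mac cases handled below.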
2559 88061 : bool SIInstrInfo::canShrink(const MachineInstr &MI,
2560 : const MachineRegisterInfo &MRI) const {
2561 : const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2562 : // Can't shrink instruction with three operands.
2563 : // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
2564 : // a special case for it. It can only be shrunk if the third operand
2565 : // is vcc. We should handle this the same way we handle vopc, by addding
2566 : // a register allocation hint pre-regalloc and then do the shrinking
2567 : // post-regalloc.
2568 88061 : if (Src2) {
2569 50100 : switch (MI.getOpcode()) {
2570 : default: return false;
2571 :
2572 : case AMDGPU::V_ADDC_U32_e64:
2573 : case AMDGPU::V_SUBB_U32_e64:
2574 : case AMDGPU::V_SUBBREV_U32_e64: {
2575 : const MachineOperand *Src1
2576 : = getNamedOperand(MI, AMDGPU::OpName::src1);
2577 14517 : if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
2578 3679 : return false;
2579 : // Additional verification is needed for sdst/src2.
2580 : return true;
2581 : }
2582 : case AMDGPU::V_MAC_F32_e64:
2583 : case AMDGPU::V_MAC_F16_e64:
2584 : case AMDGPU::V_FMAC_F32_e64:
2585 3272 : if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
2586 1636 : hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
2587 0 : return false;
2588 : break;
2589 :
2590 : case AMDGPU::V_CNDMASK_B32_e64:
2591 : break;
2592 : }
2593 : }
2594 :
2595 : const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2596 121738 : if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
2597 48194 : hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
2598 20647 : return false;
2599 :
2600 : // We don't need to check src0, all input types are legal, so just make sure
2601 : // src0 isn't using any modifiers.
2602 52897 : if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
2603 : return false;
2604 :
2605 : // Check output modifiers
2606 103864 : return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
2607 51914 : !hasModifiersSet(MI, AMDGPU::OpName::clamp);
2608 : }
2609 :
2610 : // Set VCC operand with all flags from \p Orig, except for setting it as
2611 : // implicit.
2612 7406 : static void copyFlagsToImplicitVCC(MachineInstr &MI,
2613 : const MachineOperand &Orig) {
2614 :
2615 12554 : for (MachineOperand &Use : MI.implicit_operands()) {
2616 12554 : if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
2617 : Use.setIsUndef(Orig.isUndef());
2618 : Use.setIsKill(Orig.isKill());
2619 7406 : return;
2620 : }
2621 : }
2622 : }
2623 :
2624 42959 : MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
2625 : unsigned Op32) const {
2626 42959 : MachineBasicBlock *MBB = MI.getParent();
2627 : MachineInstrBuilder Inst32 =
2628 85918 : BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
2629 :
2630 : // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
2631 : // For VOPC instructions, this is replaced by an implicit def of vcc.
2632 42959 : int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
2633 42959 : if (Op32DstIdx != -1) {
2634 : // dst
2635 40730 : Inst32.add(MI.getOperand(0));
2636 : } else {
2637 : assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
2638 : "Unexpected case");
2639 : }
2640 :
2641 42959 : Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
2642 :
2643 42959 : const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2644 42959 : if (Src1)
2645 : Inst32.add(*Src1);
2646 :
2647 42959 : const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2648 :
2649 42959 : if (Src2) {
2650 8629 : int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
2651 8629 : if (Op32Src2Idx != -1) {
2652 : Inst32.add(*Src2);
2653 : } else {
2654 : // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
2655 : // replaced with an implicit read of vcc. This was already added
2656 : // during the initial BuildMI, so find it to preserve the flags.
2657 7406 : copyFlagsToImplicitVCC(*Inst32, *Src2);
2658 : }
2659 : }
2660 :
2661 42959 : return Inst32;
2662 : }
2663 :
2664 8785463 : bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
2665 : const MachineOperand &MO,
2666 : const MCOperandInfo &OpInfo) const {
2667 : // Literal constants use the constant bus.
2668 : //if (isLiteralConstantLike(MO, OpInfo))
2669 : // return true;
2670 8785463 : if (MO.isImm())
2671 2234167 : return !isInlineConstant(MO, OpInfo);
2672 :
2673 6551296 : if (!MO.isReg())
2674 : return true; // Misc other operands like FrameIndex
2675 :
2676 6541148 : if (!MO.isUse())
2677 : return false;
2678 :
2679 12703584 : if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2680 3182316 : return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
2681 :
2682 : // FLAT_SCR is just an SGPR pair.
2683 3169476 : if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
2684 : return true;
2685 :
2686 : // EXEC register uses the constant bus.
2687 3169476 : if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
2688 : return true;
2689 :
2690 : // SGPRs use the constant bus
2691 3169476 : return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
2692 788109 : (!MO.isImplicit() &&
2693 5012663 : (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
2694 2166148 : AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
2695 : }
2696 :
2697 4239473 : static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
2698 9203088 : for (const MachineOperand &MO : MI.implicit_operands()) {
2699 : // We only care about reads.
2700 5029705 : if (MO.isDef())
2701 : continue;
2702 :
2703 4735708 : switch (MO.getReg()) {
2704 : case AMDGPU::VCC:
2705 : case AMDGPU::M0:
2706 : case AMDGPU::FLAT_SCR:
2707 : return MO.getReg();
2708 :
2709 : default:
2710 : break;
2711 : }
2712 : }
2713 :
2714 : return AMDGPU::NoRegister;
2715 : }
2716 :
2717 11521906 : static bool shouldReadExec(const MachineInstr &MI) {
2718 11521906 : if (SIInstrInfo::isVALU(MI)) {
2719 4223992 : switch (MI.getOpcode()) {
2720 : case AMDGPU::V_READLANE_B32:
2721 : case AMDGPU::V_READLANE_B32_si:
2722 : case AMDGPU::V_READLANE_B32_vi:
2723 : case AMDGPU::V_WRITELANE_B32:
2724 : case AMDGPU::V_WRITELANE_B32_si:
2725 : case AMDGPU::V_WRITELANE_B32_vi:
2726 : return false;
2727 : }
2728 :
2729 4191418 : return true;
2730 : }
2731 :
2732 7297914 : if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2733 10775007 : SIInstrInfo::isSALU(MI) ||
2734 : SIInstrInfo::isSMRD(MI))
2735 5031784 : return false;
2736 :
2737 : return true;
2738 : }
2739 :
2740 2970 : static bool isSubRegOf(const SIRegisterInfo &TRI,
2741 : const MachineOperand &SuperVec,
2742 : const MachineOperand &SubReg) {
2743 5940 : if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
2744 1906 : return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
2745 :
2746 1064 : return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
2747 1064 : SubReg.getReg() == SuperVec.getReg();
2748 : }
2749 :
2750 17022433 : bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
2751 : StringRef &ErrInfo) const {
2752 17022433 : uint16_t Opcode = MI.getOpcode();
2753 17022433 : if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
2754 : return true;
2755 :
2756 11521906 : const MachineFunction *MF = MI.getParent()->getParent();
2757 11521906 : const MachineRegisterInfo &MRI = MF->getRegInfo();
2758 :
2759 11521906 : int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2760 11521906 : int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2761 11521906 : int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
2762 :
2763 : // Make sure the number of operands is correct.
2764 11521906 : const MCInstrDesc &Desc = get(Opcode);
2765 34515469 : if (!Desc.isVariadic() &&
2766 22943314 : Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2767 0 : ErrInfo = "Instruction has wrong number of operands.";
2768 0 : return false;
2769 : }
2770 :
2771 11521906 : if (MI.isInlineAsm()) {
2772 : // Verify register classes for inlineasm constraints.
2773 0 : for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
2774 0 : I != E; ++I) {
2775 0 : const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
2776 0 : if (!RC)
2777 : continue;
2778 :
2779 0 : const MachineOperand &Op = MI.getOperand(I);
2780 0 : if (!Op.isReg())
2781 : continue;
2782 :
2783 0 : unsigned Reg = Op.getReg();
2784 0 : if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2785 0 : ErrInfo = "inlineasm operand has incorrect register class.";
2786 0 : return false;
2787 : }
2788 : }
2789 :
2790 : return true;
2791 : }
2792 :
2793 : // Make sure the register classes are correct.
2794 52252741 : for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2795 81461670 : if (MI.getOperand(i).isFPImm()) {
2796 0 : ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
2797 : "all fp values to integers.";
2798 0 : return false;
2799 : }
2800 :
2801 40730835 : int RegClass = Desc.OpInfo[i].RegClass;
2802 :
2803 40730835 : switch (Desc.OpInfo[i].OperandType) {
2804 16380505 : case MCOI::OPERAND_REGISTER:
2805 16380505 : if (MI.getOperand(i).isImm()) {
2806 0 : ErrInfo = "Illegal immediate value for operand.";
2807 0 : return false;
2808 : }
2809 : break;
2810 : case AMDGPU::OPERAND_REG_IMM_INT32:
2811 : case AMDGPU::OPERAND_REG_IMM_FP32:
2812 : break;
2813 5529387 : case AMDGPU::OPERAND_REG_INLINE_C_INT32:
2814 : case AMDGPU::OPERAND_REG_INLINE_C_FP32:
2815 : case AMDGPU::OPERAND_REG_INLINE_C_INT64:
2816 : case AMDGPU::OPERAND_REG_INLINE_C_FP64:
2817 : case AMDGPU::OPERAND_REG_INLINE_C_INT16:
2818 : case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
2819 : const MachineOperand &MO = MI.getOperand(i);
2820 7635630 : if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2821 0 : ErrInfo = "Illegal immediate value for operand.";
2822 0 : return false;
2823 : }
2824 : break;
2825 : }
2826 11799723 : case MCOI::OPERAND_IMMEDIATE:
2827 : case AMDGPU::OPERAND_KIMM32:
2828 : // Check if this operand is an immediate.
2829 : // FrameIndex operands will be replaced by immediates, so they are
2830 : // allowed.
2831 11799723 : if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2832 0 : ErrInfo = "Expected immediate, but got non-immediate";
2833 0 : return false;
2834 : }
2835 : LLVM_FALLTHROUGH;
2836 : default:
2837 : continue;
2838 : }
2839 :
2840 53920256 : if (!MI.getOperand(i).isReg())
2841 : continue;
2842 :
2843 21968251 : if (RegClass != -1) {
2844 21968251 : unsigned Reg = MI.getOperand(i).getReg();
2845 21968251 : if (Reg == AMDGPU::NoRegister ||
2846 : TargetRegisterInfo::isVirtualRegister(Reg))
2847 : continue;
2848 :
2849 11927407 : const TargetRegisterClass *RC = RI.getRegClass(RegClass);
2850 11927407 : if (!RC->contains(Reg)) {
2851 0 : ErrInfo = "Operand has incorrect register class.";
2852 0 : return false;
2853 : }
2854 : }
2855 : }
2856 :
2857 : // Verify SDWA
2858 11521906 : if (isSDWA(MI)) {
2859 58235 : if (!ST.hasSDWA()) {
2860 0 : ErrInfo = "SDWA is not supported on this target";
2861 0 : return false;
2862 : }
2863 :
2864 58235 : int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
2865 :
2866 58235 : const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
2867 :
2868 291175 : for (int OpIdx: OpIndicies) {
2869 232940 : if (OpIdx == -1)
2870 : continue;
2871 167034 : const MachineOperand &MO = MI.getOperand(OpIdx);
2872 :
2873 167034 : if (!ST.hasSDWAScalar()) {
2874 : // Only VGPRS on VI
2875 130685 : if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
2876 0 : ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
2877 0 : return false;
2878 : }
2879 : } else {
2880 : // No immediates on GFX9
2881 36349 : if (!MO.isReg()) {
2882 0 : ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
2883 0 : return false;
2884 : }
2885 : }
2886 : }
2887 :
2888 58235 : if (!ST.hasSDWAOmod()) {
2889 : // No omod allowed on VI
2890 : const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2891 45943 : if (OMod != nullptr &&
2892 14742 : (!OMod->isImm() || OMod->getImm() != 0)) {
2893 0 : ErrInfo = "OMod not allowed in SDWA instructions on VI";
2894 0 : return false;
2895 : }
2896 : }
2897 :
2898 58235 : uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
2899 58235 : if (isVOPC(BasicOpcode)) {
2900 63 : if (!ST.hasSDWASdst() && DstIdx != -1) {
2901 : // Only vcc allowed as dst on VI for VOPC
2902 0 : const MachineOperand &Dst = MI.getOperand(DstIdx);
2903 0 : if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
2904 0 : ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
2905 0 : return false;
2906 : }
2907 63 : } else if (!ST.hasSDWAOutModsVOPC()) {
2908 : // No clamp allowed on GFX9 for VOPC
2909 : const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2910 50 : if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
2911 0 : ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
2912 0 : return false;
2913 : }
2914 :
2915 : // No omod allowed on GFX9 for VOPC
2916 : const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2917 50 : if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
2918 0 : ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
2919 0 : return false;
2920 : }
2921 : }
2922 : }
2923 :
2924 : const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
2925 58235 : if (DstUnused && DstUnused->isImm() &&
2926 58172 : DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
2927 236 : const MachineOperand &Dst = MI.getOperand(DstIdx);
2928 236 : if (!Dst.isReg() || !Dst.isTied()) {
2929 0 : ErrInfo = "Dst register should have tied register";
2930 0 : return false;
2931 : }
2932 :
2933 : const MachineOperand &TiedMO =
2934 236 : MI.getOperand(MI.findTiedOperandIdx(DstIdx));
2935 236 : if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
2936 0 : ErrInfo =
2937 : "Dst register should be tied to implicit use of preserved register";
2938 0 : return false;
2939 472 : } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
2940 144 : Dst.getReg() != TiedMO.getReg()) {
2941 0 : ErrInfo = "Dst register should use same physical register as preserved";
2942 0 : return false;
2943 : }
2944 : }
2945 : }
2946 :
2947 : // Verify VOP*. Ignore multiple sgpr operands on writelane.
2948 11521906 : if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
2949 11521906 : && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
2950 : // Only look at the true operands. Only a real operand can use the constant
2951 : // bus, and we don't want to check pseudo-operands like the source modifier
2952 : // flags.
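 : // The count below enforces the constant bus rule: across its source operands
 : // a VOP* instruction may read at most one SGPR or one literal constant
 : // (implicit reads of VCC/M0/FLAT_SCR count against the same budget).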
2953 4166938 : const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
2954 :
2955 : unsigned ConstantBusCount = 0;
2956 : unsigned LiteralCount = 0;
2957 :
2958 4166938 : if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
2959 : ++ConstantBusCount;
2960 :
2961 4166938 : unsigned SGPRUsed = findImplicitSGPRRead(MI);
2962 4166938 : if (SGPRUsed != AMDGPU::NoRegister)
2963 65712 : ++ConstantBusCount;
2964 :
2965 12206927 : for (int OpIdx : OpIndices) {
2966 11338099 : if (OpIdx == -1)
2967 : break;
2968 8039989 : const MachineOperand &MO = MI.getOperand(OpIdx);
2969 8039989 : if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
2970 1834096 : if (MO.isReg()) {
2971 1669597 : if (MO.getReg() != SGPRUsed)
2972 1651032 : ++ConstantBusCount;
2973 : SGPRUsed = MO.getReg();
2974 : } else {
2975 164499 : ++ConstantBusCount;
2976 164499 : ++LiteralCount;
2977 : }
2978 : }
2979 : }
2980 4166938 : if (ConstantBusCount > 1) {
2981 0 : ErrInfo = "VOP* instruction uses the constant bus more than once";
2982 0 : return false;
2983 : }
2984 :
2985 4166938 : if (isVOP3(MI) && LiteralCount) {
2986 0 : ErrInfo = "VOP3 instruction uses literal";
2987 0 : return false;
2988 : }
2989 : }
2990 :
2991 : // Verify misc. restrictions on specific instructions.
2992 23043812 : if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
2993 : Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
2994 12467 : const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2995 12467 : const MachineOperand &Src1 = MI.getOperand(Src1Idx);
2996 12467 : const MachineOperand &Src2 = MI.getOperand(Src2Idx);
2997 12467 : if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
2998 11950 : if (!compareMachineOp(Src0, Src1) &&
2999 : !compareMachineOp(Src0, Src2)) {
3000 0 : ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
3001 0 : return false;
3002 : }
3003 : }
3004 : }
3005 :
3006 11521906 : if (isSOPK(MI)) {
3007 13823 : int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
3008 13823 : if (sopkIsZext(MI)) {
3009 792 : if (!isUInt<16>(Imm)) {
3010 0 : ErrInfo = "invalid immediate for SOPK instruction";
3011 0 : return false;
3012 : }
3013 : } else {
3014 13031 : if (!isInt<16>(Imm)) {
3015 0 : ErrInfo = "invalid immediate for SOPK instruction";
3016 0 : return false;
3017 : }
3018 : }
3019 : }
3020 :
3021 11519707 : if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
3022 11519707 : Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
3023 23040842 : Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3024 : Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
3025 2970 : const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3026 : Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
3027 :
3028 2970 : const unsigned StaticNumOps = Desc.getNumOperands() +
3029 2970 : Desc.getNumImplicitUses();
3030 2970 : const unsigned NumImplicitOps = IsDst ? 2 : 1;
3031 :
3032 : // Allow additional implicit operands. This allows a fixup done by the post
3033 : // RA scheduler where the main implicit operand is killed and implicit-defs
3034 : // are added for sub-registers that remain live after this instruction.
3035 2970 : if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
3036 0 : ErrInfo = "missing implicit register operands";
3037 0 : return false;
3038 : }
3039 :
3040 : const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3041 2970 : if (IsDst) {
3042 771 : if (!Dst->isUse()) {
3043 0 : ErrInfo = "v_movreld_b32 vdst should be a use operand";
3044 0 : return false;
3045 : }
3046 :
3047 : unsigned UseOpIdx;
3048 771 : if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
3049 771 : UseOpIdx != StaticNumOps + 1) {
3050 0 : ErrInfo = "movrel implicit operands should be tied";
3051 0 : return false;
3052 : }
3053 : }
3054 :
3055 2970 : const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3056 : const MachineOperand &ImpUse
3057 2970 : = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
3058 5940 : if (!ImpUse.isReg() || !ImpUse.isUse() ||
3059 5169 : !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
3060 0 : ErrInfo = "src0 should be subreg of implicit vector use";
3061 0 : return false;
3062 : }
3063 : }
3064 :
3065 : // Make sure we aren't losing exec uses in the td files. This mostly requires
3066 : // being careful when using let Uses to try to add other use registers.
3067 11521906 : if (shouldReadExec(MI)) {
3068 6457548 : if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
3069 0 : ErrInfo = "VALU instruction does not implicitly read exec mask";
3070 0 : return false;
3071 : }
3072 : }
3073 :
3074 11521906 : if (isSMRD(MI)) {
3075 1210963 : if (MI.mayStore()) {
3076 : // The register offset form of scalar stores may only use m0 as the
3077 : // soffset register.
3078 : const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
3079 363 : if (Soff && Soff->getReg() != AMDGPU::M0) {
3080 0 : ErrInfo = "scalar stores must use m0 as offset register";
3081 0 : return false;
3082 : }
3083 : }
3084 : }
3085 :
3086 11521906 : if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
3087 : const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3088 393535 : if (Offset->getImm() != 0) {
3089 0 : ErrInfo = "subtarget does not support offsets in flat instructions";
3090 0 : return false;
3091 : }
3092 : }
3093 :
3094 : const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
3095 11521906 : if (DppCt) {
3096 : using namespace AMDGPU::DPP;
3097 :
3098 7256 : unsigned DC = DppCt->getImm();
3099 7256 : if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
3100 7256 : DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
3101 7256 : (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
3102 7256 : (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
3103 7256 : (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
3104 7256 : (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) {
3105 0 : ErrInfo = "Invalid dpp_ctrl value";
3106 0 : return false;
3107 : }
3108 : }
3109 :
3110 : return true;
3111 : }
3112 :
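     : // Return the VALU opcode corresponding to MI's (typically scalar) opcode, or
     : // AMDGPU::INSTRUCTION_LIST_END if there is no direct VALU equivalent.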
3113 95107 : unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
3114 190214 : switch (MI.getOpcode()) {
3115 : default: return AMDGPU::INSTRUCTION_LIST_END;
3116 19533 : case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
3117 44013 : case AMDGPU::COPY: return AMDGPU::COPY;
3118 472 : case AMDGPU::PHI: return AMDGPU::PHI;
3119 7 : case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
3120 4 : case AMDGPU::WQM: return AMDGPU::WQM;
3121 266 : case AMDGPU::WWM: return AMDGPU::WWM;
3122 25 : case AMDGPU::S_MOV_B32:
3123 50 : return MI.getOperand(1).isReg() ?
3124 : AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
3125 1948 : case AMDGPU::S_ADD_I32:
3126 1948 : return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
3127 241 : case AMDGPU::S_ADDC_U32:
3128 241 : return AMDGPU::V_ADDC_U32_e32;
3129 801 : case AMDGPU::S_SUB_I32:
3130 801 : return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
    3131 :   // FIXME: These are not consistently handled, and are selected when the
    3132 :   // carry is used.
3133 121 : case AMDGPU::S_ADD_U32:
3134 121 : return AMDGPU::V_ADD_I32_e32;
3135 0 : case AMDGPU::S_SUB_U32:
3136 0 : return AMDGPU::V_SUB_I32_e32;
3137 0 : case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
3138 1217 : case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
3139 3413 : case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
3140 1964 : case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
3141 237 : case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
3142 24 : case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
3143 0 : case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
3144 29 : case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
3145 3 : case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
3146 2687 : case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
3147 175 : case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
3148 1308 : case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
3149 144 : case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
3150 3402 : case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
3151 109 : case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
3152 588 : case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
3153 985 : case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
3154 1303 : case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
3155 1989 : case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
3156 0 : case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
3157 15 : case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
3158 13 : case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
3159 18 : case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
3160 0 : case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
3161 0 : case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
3162 14 : case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
3163 4 : case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
3164 5 : case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
3165 0 : case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
3166 17 : case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
3167 26 : case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
3168 2 : case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
3169 3 : case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
3170 2 : case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
3171 0 : case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
3172 1 : case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
3173 1 : case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
3174 128 : case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
3175 34 : case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
3176 158 : case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
3177 2 : case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
3178 0 : case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
3179 75 : case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
3180 : }
3181 : }
3182 :
3183 1908046 : const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
3184 : unsigned OpNo) const {
3185 1908046 : const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3186 3816092 : const MCInstrDesc &Desc = get(MI.getOpcode());
3187 1908046 : if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
3188 1473048 : Desc.OpInfo[OpNo].RegClass == -1) {
3189 1595208 : unsigned Reg = MI.getOperand(OpNo).getReg();
3190 :
3191 797604 : if (TargetRegisterInfo::isVirtualRegister(Reg))
3192 423859 : return MRI.getRegClass(Reg);
3193 373745 : return RI.getPhysRegClass(Reg);
3194 : }
3195 :
3196 1110442 : unsigned RCID = Desc.OpInfo[OpNo].RegClass;
3197 2220884 : return RI.getRegClass(RCID);
3198 : }
3199 :
3200 117447 : bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
3201 117447 : switch (MI.getOpcode()) {
3202 75348 : case AMDGPU::COPY:
3203 : case AMDGPU::REG_SEQUENCE:
3204 : case AMDGPU::PHI:
3205 : case AMDGPU::INSERT_SUBREG:
3206 75348 : return RI.hasVGPRs(getOpRegClass(MI, 0));
3207 42099 : default:
3208 42099 : return RI.hasVGPRs(getOpRegClass(MI, OpNo));
3209 : }
3210 : }
3211 :
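     : // Legalize operand OpIdx of MI by materializing its current value (register
     : // or immediate) into a fresh VGPR virtual register and rewriting the operand
     : // to use that register.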
3212 21885 : void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
3213 : MachineBasicBlock::iterator I = MI;
3214 21885 : MachineBasicBlock *MBB = MI.getParent();
3215 21885 : MachineOperand &MO = MI.getOperand(OpIdx);
3216 21885 : MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3217 43770 : unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
3218 21885 : const TargetRegisterClass *RC = RI.getRegClass(RCID);
3219 : unsigned Opcode = AMDGPU::V_MOV_B32_e32;
3220 21885 : if (MO.isReg())
3221 : Opcode = AMDGPU::COPY;
3222 0 : else if (RI.isSGPRClass(RC))
3223 : Opcode = AMDGPU::S_MOV_B32;
3224 :
3225 21885 : const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
3226 21885 : if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
3227 : VRC = &AMDGPU::VReg_64RegClass;
3228 : else
3229 : VRC = &AMDGPU::VGPR_32RegClass;
3230 :
3231 21885 : unsigned Reg = MRI.createVirtualRegister(VRC);
3232 : DebugLoc DL = MBB->findDebugLoc(I);
3233 43770 : BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
3234 21885 : MO.ChangeToRegister(Reg, false);
3235 21885 : }
3236 :
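     : // Copy the SubIdx sub-register of SuperReg into a new virtual register of
     : // class SubRC and return it. If SuperReg itself carries a subreg index, it is
     : // first copied whole into a SuperRC register so the two indices never need to
     : // be composed.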
3237 30646 : unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
3238 : MachineRegisterInfo &MRI,
3239 : MachineOperand &SuperReg,
3240 : const TargetRegisterClass *SuperRC,
3241 : unsigned SubIdx,
3242 : const TargetRegisterClass *SubRC)
3243 : const {
3244 30646 : MachineBasicBlock *MBB = MI->getParent();
3245 : DebugLoc DL = MI->getDebugLoc();
3246 30646 : unsigned SubReg = MRI.createVirtualRegister(SubRC);
3247 :
3248 30646 : if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
3249 61292 : BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3250 30646 : .addReg(SuperReg.getReg(), 0, SubIdx);
3251 30646 : return SubReg;
3252 : }
3253 :
3254 : // Just in case the super register is itself a sub-register, copy it to a new
3255 : // value so we don't need to worry about merging its subreg index with the
3256 : // SubIdx passed to this function. The register coalescer should be able to
3257 : // eliminate this extra copy.
3258 0 : unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
3259 :
3260 0 : BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
3261 0 : .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
3262 :
3263 0 : BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3264 0 : .addReg(NewSuperReg, 0, SubIdx);
3265 :
3266 0 : return SubReg;
3267 : }
3268 :
3269 30640 : MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
3270 : MachineBasicBlock::iterator MII,
3271 : MachineRegisterInfo &MRI,
3272 : MachineOperand &Op,
3273 : const TargetRegisterClass *SuperRC,
3274 : unsigned SubIdx,
3275 : const TargetRegisterClass *SubRC) const {
3276 30640 : if (Op.isImm()) {
3277 0 : if (SubIdx == AMDGPU::sub0)
3278 0 : return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
3279 0 : if (SubIdx == AMDGPU::sub1)
3280 0 : return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
3281 :
3282 0 : llvm_unreachable("Unhandled register index for immediate");
3283 : }
3284 :
3285 30640 : unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
3286 : SubIdx, SubRC);
3287 : return MachineOperand::CreateReg(SubReg, false);
3288 : }
3289 :
3290 : // Change the order of operands from (0, 1, 2) to (0, 2, 1)
3291 4407 : void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
3292 : assert(Inst.getNumExplicitOperands() == 3);
3293 4407 : MachineOperand Op1 = Inst.getOperand(1);
3294 4407 : Inst.RemoveOperand(1);
3295 4407 : Inst.addOperand(Op1);
3296 4407 : }
3297 :
3298 391929 : bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
3299 : const MCOperandInfo &OpInfo,
3300 : const MachineOperand &MO) const {
3301 391929 : if (!MO.isReg())
3302 : return false;
3303 :
3304 391386 : unsigned Reg = MO.getReg();
3305 : const TargetRegisterClass *RC =
3306 391386 : TargetRegisterInfo::isVirtualRegister(Reg) ?
3307 : MRI.getRegClass(Reg) :
3308 11384 : RI.getPhysRegClass(Reg);
3309 :
3310 : const SIRegisterInfo *TRI =
3311 391386 : static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
3312 391386 : RC = TRI->getSubRegClass(RC, MO.getSubReg());
3313 :
3314 : // In order to be legal, the common sub-class must be equal to the
3315 : // class of the current operand. For example:
3316 : //
3317 : // v_mov_b32 s0 ; Operand defined as vsrc_b32
3318 : // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
3319 : //
3320 : // s_sendmsg 0, s0 ; Operand defined as m0reg
3321 : // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
3322 :
3323 782772 : return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
3324 : }
3325 :
3326 0 : bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
3327 : const MCOperandInfo &OpInfo,
3328 : const MachineOperand &MO) const {
3329 0 : if (MO.isReg())
3330 0 : return isLegalRegOperand(MRI, OpInfo, MO);
3331 :
3332 : // Handle non-register types that are treated like immediates.
3333 : assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
3334 : return true;
3335 : }
3336 :
3337 523716 : bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
3338 : const MachineOperand *MO) const {
3339 523716 : const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3340 523716 : const MCInstrDesc &InstDesc = MI.getDesc();
3341 523716 : const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
3342 : const TargetRegisterClass *DefinedRC =
3343 523716 : OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
3344 523716 : if (!MO)
3345 0 : MO = &MI.getOperand(OpIdx);
3346 :
3347 523716 : if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
3348 :
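     :     // MO would occupy the constant bus, so the operand is only legal if no
     :     // other operand (one reading a different SGPR, or a KIMM32 literal) does,
     :     // since the constant bus may be read at most once per instruction.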
3349 : RegSubRegPair SGPRUsed;
3350 141176 : if (MO->isReg())
3351 117120 : SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
3352 :
3353 746675 : for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3354 655857 : if (i == OpIdx)
3355 : continue;
3356 529411 : const MachineOperand &Op = MI.getOperand(i);
3357 529411 : if (Op.isReg()) {
3358 814783 : if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
3359 379432 : usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
3360 : return false;
3361 : }
3362 94060 : } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
3363 : return false;
3364 : }
3365 : }
3366 : }
3367 :
3368 473358 : if (MO->isReg()) {
3369 : assert(DefinedRC);
3370 380919 : return isLegalRegOperand(MRI, OpInfo, *MO);
3371 : }
3372 :
3373 : // Handle non-register types that are treated like immediates.
3374 : assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
3375 :
3376 92439 : if (!DefinedRC) {
3377 : // This operand expects an immediate.
3378 : return true;
3379 : }
3380 :
3381 92439 : return isImmOperandLegal(MI, OpIdx, *MO);
3382 : }
3383 :
3384 6062 : void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
3385 : MachineInstr &MI) const {
3386 6062 : unsigned Opc = MI.getOpcode();
3387 6062 : const MCInstrDesc &InstrDesc = get(Opc);
3388 :
3389 6062 : int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3390 6062 : MachineOperand &Src1 = MI.getOperand(Src1Idx);
3391 :
    3392 :   // If there is an implicit SGPR use, such as the VCC use for
    3393 :   // v_addc_u32/v_subb_u32, we need to only have one constant bus use.
    3394 :   //
    3395 :   // Note we do not need to worry about literal constants here. They are
    3396 :   // disabled in the operand type for these instructions because they would
    3397 :   // always violate the one constant bus use rule.
3398 6062 : bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
3399 6062 : if (HasImplicitSGPR) {
3400 241 : int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3401 241 : MachineOperand &Src0 = MI.getOperand(Src0Idx);
3402 :
3403 241 : if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
3404 52 : legalizeOpWithMove(MI, Src0Idx);
3405 : }
3406 :
3407 : // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
3408 : // both the value to write (src0) and lane select (src1). Fix up non-SGPR
3409 : // src0/src1 with V_READFIRSTLANE.
3410 6062 : if (Opc == AMDGPU::V_WRITELANE_B32) {
3411 2 : int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3412 2 : MachineOperand &Src0 = MI.getOperand(Src0Idx);
3413 : const DebugLoc &DL = MI.getDebugLoc();
3414 2 : if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
3415 0 : unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3416 0 : BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3417 : .add(Src0);
3418 0 : Src0.ChangeToRegister(Reg, false);
3419 : }
3420 2 : if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
3421 2 : unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3422 : const DebugLoc &DL = MI.getDebugLoc();
3423 4 : BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3424 : .add(Src1);
3425 2 : Src1.ChangeToRegister(Reg, false);
3426 : }
3427 2 : return;
3428 : }
3429 :
    3430 :   // VOP2 instructions accept any operand type for src0, so we don't need to
    3431 :   // check its legality. If src1 is already legal, we don't need to do anything.
3432 6060 : if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
3433 : return;
3434 :
3435 : // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
3436 : // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
3437 : // select is uniform.
3438 5141 : if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
3439 1 : RI.isVGPR(MRI, Src1.getReg())) {
3440 1 : unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3441 : const DebugLoc &DL = MI.getDebugLoc();
3442 2 : BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3443 : .add(Src1);
3444 1 : Src1.ChangeToRegister(Reg, false);
3445 1 : return;
3446 : }
3447 :
3448 : // We do not use commuteInstruction here because it is too aggressive and will
3449 : // commute if it is possible. We only want to commute here if it improves
3450 : // legality. This can be called a fairly large number of times so don't waste
3451 : // compile time pointlessly swapping and checking legality again.
3452 10089 : if (HasImplicitSGPR || !MI.isCommutable()) {
3453 189 : legalizeOpWithMove(MI, Src1Idx);
3454 189 : return;
3455 : }
3456 :
3457 4950 : int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3458 4950 : MachineOperand &Src0 = MI.getOperand(Src0Idx);
3459 :
3460 : // If src0 can be used as src1, commuting will make the operands legal.
3461 : // Otherwise we have to give up and insert a move.
3462 : //
3463 : // TODO: Other immediate-like operand kinds could be commuted if there was a
3464 : // MachineOperand::ChangeTo* for them.
3465 9900 : if ((!Src1.isImm() && !Src1.isReg()) ||
3466 4950 : !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
3467 0 : legalizeOpWithMove(MI, Src1Idx);
3468 0 : return;
3469 : }
3470 :
3471 : int CommutedOpc = commuteOpcode(MI);
3472 4950 : if (CommutedOpc == -1) {
3473 0 : legalizeOpWithMove(MI, Src1Idx);
3474 0 : return;
3475 : }
3476 :
3477 4950 : MI.setDesc(get(CommutedOpc));
3478 :
3479 4950 : unsigned Src0Reg = Src0.getReg();
3480 : unsigned Src0SubReg = Src0.getSubReg();
3481 : bool Src0Kill = Src0.isKill();
3482 :
3483 4950 : if (Src1.isImm())
3484 543 : Src0.ChangeToImmediate(Src1.getImm());
3485 4407 : else if (Src1.isReg()) {
3486 4407 : Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
3487 : Src0.setSubReg(Src1.getSubReg());
3488 : } else
3489 0 : llvm_unreachable("Should only have register or immediate operands");
3490 :
3491 4950 : Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
3492 : Src1.setSubReg(Src0SubReg);
3493 : }
3494 :
3495 : // Legalize VOP3 operands. Because all operand types are supported for any
3496 : // operand, and since literal constants are not allowed and should never be
3497 : // seen, we only need to worry about inserting copies if we use multiple SGPR
3498 : // operands.
3499 66473 : void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
3500 : MachineInstr &MI) const {
3501 66473 : unsigned Opc = MI.getOpcode();
3502 :
3503 : int VOP3Idx[3] = {
3504 66473 : AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
3505 66473 : AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
3506 66473 : AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
3507 199419 : };
3508 :
3509 : // Find the one SGPR operand we are allowed to use.
3510 66473 : unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
3511 :
3512 213758 : for (unsigned i = 0; i < 3; ++i) {
3513 193492 : int Idx = VOP3Idx[i];
3514 193492 : if (Idx == -1)
3515 : break;
3516 147285 : MachineOperand &MO = MI.getOperand(Idx);
3517 :
3518 : // We should never see a VOP3 instruction with an illegal immediate operand.
3519 147285 : if (!MO.isReg())
3520 : continue;
3521 :
3522 135045 : if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
3523 : continue; // VGPRs are legal
3524 :
3525 71636 : if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
3526 49992 : SGPRReg = MO.getReg();
3527 : // We can use one SGPR in each VOP3 instruction.
3528 49992 : continue;
3529 : }
3530 :
3531 : // If we make it this far, then the operand is not legal and we must
3532 : // legalize it.
3533 21644 : legalizeOpWithMove(MI, Idx);
3534 : }
3535 66473 : }
3536 :
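     : // Copy a VGPR value into a newly created SGPR of the equivalent register
     : // class, emitting one V_READFIRSTLANE_B32 per 32-bit sub-register and, for
     : // multi-dword values, recombining the pieces with a REG_SEQUENCE. Returns the
     : // new SGPR.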
3537 57 : unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
3538 : MachineRegisterInfo &MRI) const {
3539 : const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
3540 57 : const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
3541 57 : unsigned DstReg = MRI.createVirtualRegister(SRC);
3542 57 : unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
3543 :
3544 57 : if (SubRegs == 1) {
3545 32 : BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3546 32 : get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
3547 16 : .addReg(SrcReg);
3548 16 : return DstReg;
3549 : }
3550 :
3551 : SmallVector<unsigned, 8> SRegs;
3552 155 : for (unsigned i = 0; i < SubRegs; ++i) {
3553 114 : unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3554 228 : BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3555 228 : get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
3556 114 : .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
3557 114 : SRegs.push_back(SGPR);
3558 : }
3559 :
3560 : MachineInstrBuilder MIB =
3561 41 : BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3562 82 : get(AMDGPU::REG_SEQUENCE), DstReg);
3563 155 : for (unsigned i = 0; i < SubRegs; ++i) {
3564 228 : MIB.addReg(SRegs[i]);
3565 114 : MIB.addImm(RI.getSubRegFromChannel(i));
3566 : }
3567 : return DstReg;
3568 : }
3569 :
3570 47 : void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
3571 : MachineInstr &MI) const {
3572 :
    3573 :   // If the pointer is stored in VGPRs, then we need to move it to
    3574 :   // SGPRs using v_readfirstlane. This is safe because we only select
    3575 :   // loads with uniform pointers to SMRD instructions, so we know the
    3576 :   // pointer value is uniform.
3577 47 : MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
3578 47 : if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
3579 33 : unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3580 33 : SBase->setReg(SGPR);
3581 : }
3582 47 : MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
3583 47 : if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
3584 14 : unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
3585 14 : SOff->setReg(SGPR);
3586 : }
3587 47 : }
3588 :
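     : // If Op is not already in register class DstRC, insert a COPY into a fresh
     : // DstRC virtual register before I and rewrite Op to use it; copies of
     : // immediate-defining instructions are folded away again where possible.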
3589 25333 : void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
3590 : MachineBasicBlock::iterator I,
3591 : const TargetRegisterClass *DstRC,
3592 : MachineOperand &Op,
3593 : MachineRegisterInfo &MRI,
3594 : const DebugLoc &DL) const {
3595 25333 : unsigned OpReg = Op.getReg();
3596 : unsigned OpSubReg = Op.getSubReg();
3597 :
3598 25333 : const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
3599 : RI.getRegClassForReg(MRI, OpReg), OpSubReg);
3600 :
3601 : // Check if operand is already the correct register class.
3602 25333 : if (DstRC == OpRC)
3603 : return;
3604 :
3605 24962 : unsigned DstReg = MRI.createVirtualRegister(DstRC);
3606 : MachineInstr *Copy =
3607 49924 : BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
3608 :
3609 24962 : Op.setReg(DstReg);
3610 : Op.setSubReg(0);
3611 :
3612 24962 : MachineInstr *Def = MRI.getVRegDef(OpReg);
3613 24962 : if (!Def)
3614 : return;
3615 :
3616 : // Try to eliminate the copy if it is copying an immediate value.
3617 24962 : if (Def->isMoveImmediate())
3618 6554 : FoldImmediate(*Copy, *Def, OpReg, &MRI);
3619 : }
3620 :
3621 : // Emit the actual waterfall loop, executing the wrapped instruction for each
3622 : // unique value of \p Rsrc across all lanes. In the best case we execute 1
3623 : // iteration, in the worst case we execute 64 (once per lane).
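     : //
     : // Roughly, the loop body built below looks like this (illustrative sketch,
     : // not exact MIR):
     : //
     : //   loop:
     : //     s[0:3]   = v_readfirstlane_b32 vRsrc.sub0..sub3  ; pick one lane's Rsrc
     : //     cond0    = v_cmp_eq_u64 s[0:1], vRsrc.sub0_sub1  ; find matching lanes
     : //     cond1    = v_cmp_eq_u64 s[2:3], vRsrc.sub2_sub3
     : //     cond     = s_and_b64 cond0, cond1
     : //     saveexec = s_and_saveexec_b64 cond               ; run only those lanes
     : //     <the wrapped instruction, now using the SGPR Rsrc>
     : //     exec     = s_xor_b64 exec, saveexec              ; retire handled lanes
     : //     s_cbranch_execnz loop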
3624 : static void
3625 0 : emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
3626 : MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3627 : const DebugLoc &DL, MachineOperand &Rsrc) {
3628 0 : MachineBasicBlock::iterator I = LoopBB.begin();
3629 :
3630 0 : unsigned VRsrc = Rsrc.getReg();
3631 : unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
3632 :
3633 0 : unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3634 0 : unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3635 0 : unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3636 0 : unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3637 0 : unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3638 0 : unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3639 0 : unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3640 0 : unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3641 0 : unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3642 :
3643 : // Beginning of the loop, read the next Rsrc variant.
3644 0 : BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
3645 0 : .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
3646 0 : BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
3647 0 : .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
3648 0 : BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
3649 0 : .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
3650 0 : BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
3651 0 : .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
3652 :
3653 0 : BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
3654 0 : .addReg(SRsrcSub0)
3655 : .addImm(AMDGPU::sub0)
3656 0 : .addReg(SRsrcSub1)
3657 : .addImm(AMDGPU::sub1)
3658 0 : .addReg(SRsrcSub2)
3659 : .addImm(AMDGPU::sub2)
3660 0 : .addReg(SRsrcSub3)
3661 : .addImm(AMDGPU::sub3);
3662 :
3663 : // Update Rsrc operand to use the SGPR Rsrc.
3664 0 : Rsrc.setReg(SRsrc);
3665 : Rsrc.setIsKill(true);
3666 :
3667 : // Identify all lanes with identical Rsrc operands in their VGPRs.
3668 0 : BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
3669 0 : .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
3670 0 : .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
3671 0 : BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
3672 0 : .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
3673 0 : .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
3674 0 : BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond)
3675 0 : .addReg(CondReg0)
3676 0 : .addReg(CondReg1);
3677 :
3678 : MRI.setSimpleHint(SaveExec, AndCond);
3679 :
3680 : // Update EXEC to matching lanes, saving original to SaveExec.
3681 0 : BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec)
3682 0 : .addReg(AndCond, RegState::Kill);
3683 :
3684 : // The original instruction is here; we insert the terminators after it.
3685 0 : I = LoopBB.end();
3686 :
3687 : // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3688 0 : BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
3689 0 : .addReg(AMDGPU::EXEC)
3690 0 : .addReg(SaveExec);
3691 0 : BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
3692 0 : }
3693 :
3694 : // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
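     : // The containing block is split: \p MI is moved into a new LoopBB that
     : // branches back to itself, the remainder of the block goes to RemainderBB,
     : // and EXEC is saved before the loop and restored at the top of RemainderBB.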
3695 : // with SGPRs by iterating over all unique values across all lanes.
3696 17 : static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
3697 : MachineOperand &Rsrc, MachineDominatorTree *MDT) {
3698 17 : MachineBasicBlock &MBB = *MI.getParent();
3699 17 : MachineFunction &MF = *MBB.getParent();
3700 17 : MachineRegisterInfo &MRI = MF.getRegInfo();
3701 : MachineBasicBlock::iterator I(&MI);
3702 : const DebugLoc &DL = MI.getDebugLoc();
3703 :
3704 17 : unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3705 :
3706 : // Save the EXEC mask
3707 51 : BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec)
3708 17 : .addReg(AMDGPU::EXEC);
3709 :
3710 : // Killed uses in the instruction we are waterfalling around will be
3711 : // incorrect due to the added control-flow.
3712 152 : for (auto &MO : MI.uses()) {
3713 135 : if (MO.isReg() && MO.isUse()) {
3714 60 : MRI.clearKillFlags(MO.getReg());
3715 : }
3716 : }
3717 :
3718 : // To insert the loop we need to split the block. Move everything after this
3719 : // point to a new block, and insert a new empty block between the two.
3720 17 : MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
3721 17 : MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
3722 : MachineFunction::iterator MBBI(MBB);
3723 : ++MBBI;
3724 :
3725 : MF.insert(MBBI, LoopBB);
3726 : MF.insert(MBBI, RemainderBB);
3727 :
3728 17 : LoopBB->addSuccessor(LoopBB);
3729 17 : LoopBB->addSuccessor(RemainderBB);
3730 :
3731 : // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
3732 17 : MachineBasicBlock::iterator J = I++;
3733 17 : RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3734 : RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3735 17 : LoopBB->splice(LoopBB->begin(), &MBB, J);
3736 :
3737 17 : MBB.addSuccessor(LoopBB);
3738 :
3739 : // Update dominators. We know that MBB immediately dominates LoopBB, that
3740 : // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
3741 : // dominates all of the successors transferred to it from MBB that MBB used
3742 : // to dominate.
3743 17 : if (MDT) {
3744 : MDT->addNewBlock(LoopBB, &MBB);
3745 : MDT->addNewBlock(RemainderBB, LoopBB);
3746 25 : for (auto &Succ : RemainderBB->successors()) {
3747 16 : if (MDT->dominates(&MBB, Succ)) {
3748 6 : MDT->changeImmediateDominator(Succ, RemainderBB);
3749 : }
3750 : }
3751 : }
3752 :
3753 17 : emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
3754 :
3755 : // Restore the EXEC mask
3756 17 : MachineBasicBlock::iterator First = RemainderBB->begin();
3757 34 : BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
3758 17 : .addReg(SaveExec);
3759 17 : }
3760 :
3761 : // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
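     : // The returned tuple is (RsrcPtr, NewSRsrc): RsrcPtr holds the 64-bit base
     : // pointer taken from Rsrc's sub0_sub1, and NewSRsrc is a descriptor with a
     : // zero base and only the default RSRC_DATA_FORMAT in its upper dwords.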
3762 : static std::tuple<unsigned, unsigned>
3763 6 : extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
3764 6 : MachineBasicBlock &MBB = *MI.getParent();
3765 6 : MachineFunction &MF = *MBB.getParent();
3766 6 : MachineRegisterInfo &MRI = MF.getRegInfo();
3767 :
3768 : // Extract the ptr from the resource descriptor.
3769 : unsigned RsrcPtr =
3770 6 : TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
3771 : AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
3772 :
3773 : // Create an empty resource descriptor
3774 6 : unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3775 6 : unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3776 6 : unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3777 6 : unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3778 6 : uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
3779 :
3780 : // Zero64 = 0
3781 12 : BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
3782 : .addImm(0);
3783 :
3784 : // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
3785 12 : BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
3786 6 : .addImm(RsrcDataFormat & 0xFFFFFFFF);
3787 :
3788 : // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
3789 12 : BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
3790 6 : .addImm(RsrcDataFormat >> 32);
3791 :
3792 : // NewSRsrc = {Zero64, SRsrcFormat}
3793 12 : BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
3794 6 : .addReg(Zero64)
3795 : .addImm(AMDGPU::sub0_sub1)
3796 6 : .addReg(SRsrcFormatLo)
3797 : .addImm(AMDGPU::sub2)
3798 6 : .addReg(SRsrcFormatHi)
3799 : .addImm(AMDGPU::sub3);
3800 :
3801 6 : return std::make_tuple(RsrcPtr, NewSRsrc);
3802 : }
3803 :
3804 82030 : void SIInstrInfo::legalizeOperands(MachineInstr &MI,
3805 : MachineDominatorTree *MDT) const {
3806 82030 : MachineFunction &MF = *MI.getParent()->getParent();
3807 82030 : MachineRegisterInfo &MRI = MF.getRegInfo();
3808 :
3809 : // Legalize VOP2
3810 82030 : if (isVOP2(MI) || isVOPC(MI)) {
3811 6062 : legalizeOperandsVOP2(MRI, MI);
3812 6062 : return;
3813 : }
3814 :
3815 : // Legalize VOP3
3816 75968 : if (isVOP3(MI)) {
3817 27530 : legalizeOperandsVOP3(MRI, MI);
3818 27530 : return;
3819 : }
3820 :
3821 : // Legalize SMRD
3822 48438 : if (isSMRD(MI)) {
3823 47 : legalizeOperandsSMRD(MRI, MI);
3824 47 : return;
3825 : }
3826 :
3827 : // Legalize REG_SEQUENCE and PHI
    3828 :   // The register class of the operands must be the same type as the register
    3829 :   // class of the output.
3830 48391 : if (MI.getOpcode() == AMDGPU::PHI) {
3831 : const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
3832 1412 : for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
3833 1880 : if (!MI.getOperand(i).isReg() ||
3834 940 : !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
3835 : continue;
3836 : const TargetRegisterClass *OpRC =
3837 : MRI.getRegClass(MI.getOperand(i).getReg());
3838 940 : if (RI.hasVGPRs(OpRC)) {
3839 : VRC = OpRC;
3840 : } else {
3841 : SRC = OpRC;
3842 : }
3843 : }
3844 :
    3845 :     // If any of the operands are VGPR registers, then they all must be VGPRs;
    3846 :     // otherwise we will create illegal VGPR->SGPR copies when legalizing
    3847 :     // them.
3848 608 : if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
3849 472 : if (!VRC) {
3850 : assert(SRC);
3851 136 : VRC = RI.getEquivalentVGPRClass(SRC);
3852 : }
3853 : RC = VRC;
3854 : } else {
3855 : RC = SRC;
3856 : }
3857 :
3858 : // Update all the operands so they have the same type.
3859 1412 : for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3860 940 : MachineOperand &Op = MI.getOperand(I);
3861 940 : if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3862 0 : continue;
3863 :
3864 : // MI is a PHI instruction.
3865 940 : MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
3866 940 : MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
3867 :
3868 : // Avoid creating no-op copies with the same src and dst reg class. These
3869 : // confuse some of the machine passes.
3870 940 : legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
3871 : }
3872 : }
3873 :
3874 : // REG_SEQUENCE doesn't really require operand legalization, but if one has a
3875 : // VGPR dest type and SGPR sources, insert copies so all operands are
3876 : // VGPRs. This seems to help operand folding / the register coalescer.
3877 96782 : if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
3878 19533 : MachineBasicBlock *MBB = MI.getParent();
3879 19533 : const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
3880 19533 : if (RI.hasVGPRs(DstRC)) {
3881 : // Update all the operands so they are VGPR register classes. These may
3882 : // not be the same register class because REG_SEQUENCE supports mixing
3883 : // subregister index types e.g. sub0_sub1 + sub2 + sub3
3884 71181 : for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3885 51648 : MachineOperand &Op = MI.getOperand(I);
3886 51648 : if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3887 : continue;
3888 :
3889 : const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
3890 51648 : const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
3891 51648 : if (VRC == OpRC)
3892 : continue;
3893 :
3894 24358 : legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
3895 : Op.setIsKill();
3896 : }
3897 : }
3898 :
3899 19533 : return;
3900 : }
3901 :
3902 : // Legalize INSERT_SUBREG
3903 : // src0 must have the same register class as dst
3904 28858 : if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
3905 7 : unsigned Dst = MI.getOperand(0).getReg();
3906 7 : unsigned Src0 = MI.getOperand(1).getReg();
3907 : const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
3908 : const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
3909 7 : if (DstRC != Src0RC) {
3910 5 : MachineBasicBlock *MBB = MI.getParent();
3911 : MachineOperand &Op = MI.getOperand(1);
3912 5 : legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
3913 : }
3914 7 : return;
3915 : }
3916 :
3917 : // Legalize SI_INIT_M0
3918 28851 : if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
3919 2 : MachineOperand &Src = MI.getOperand(0);
3920 2 : if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
3921 2 : Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
3922 2 : return;
3923 : }
3924 :
3925 : // Legalize MIMG and MUBUF/MTBUF for shaders.
3926 : //
3927 : // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
3928 : // scratch memory access. In both cases, the legalization never involves
3929 : // conversion to the addr64 form.
3930 28849 : if (isMIMG(MI) ||
3931 57690 : (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
3932 488 : (isMUBUF(MI) || isMTBUF(MI)))) {
3933 8 : MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
3934 8 : if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
3935 6 : unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
3936 6 : SRsrc->setReg(SGPR);
3937 : }
3938 :
3939 8 : MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
3940 8 : if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
3941 2 : unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
3942 2 : SSamp->setReg(SGPR);
3943 : }
3944 8 : return;
3945 : }
3946 :
3947 : // Legalize MUBUF* instructions.
3948 : int RsrcIdx =
3949 28841 : AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
3950 28841 : if (RsrcIdx != -1) {
3951 : // We have an MUBUF instruction
3952 23 : MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
3953 23 : unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
3954 69 : if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
3955 : RI.getRegClass(RsrcRC))) {
3956 : // The operands are legal.
    3957 :       // FIXME: We may need to legalize operands besides srsrc.
3958 : return;
3959 : }
3960 :
3961 : // Legalize a VGPR Rsrc.
3962 : //
3963 : // If the instruction is _ADDR64, we can avoid a waterfall by extracting
3964 : // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
3965 : // a zero-value SRsrc.
3966 : //
3967 : // If the instruction is _OFFSET (both idxen and offen disabled), and we
3968 : // support ADDR64 instructions, we can convert to ADDR64 and do the same as
3969 : // above.
3970 : //
3971 : // Otherwise we are on non-ADDR64 hardware, and/or we have
3972 : // idxen/offen/bothen and we fall back to a waterfall loop.
3973 :
3974 23 : MachineBasicBlock &MBB = *MI.getParent();
3975 :
3976 23 : MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
3977 23 : if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
3978 : // This is already an ADDR64 instruction so we need to add the pointer
3979 : // extracted from the resource descriptor to the current value of VAddr.
3980 4 : unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3981 4 : unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3982 4 : unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3983 :
3984 : unsigned RsrcPtr, NewSRsrc;
3985 4 : std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
3986 :
3987 : // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
3988 : DebugLoc DL = MI.getDebugLoc();
3989 8 : BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
3990 4 : .addReg(RsrcPtr, 0, AMDGPU::sub0)
3991 4 : .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
3992 :
3993 : // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
3994 8 : BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
3995 4 : .addReg(RsrcPtr, 0, AMDGPU::sub1)
3996 4 : .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
3997 :
3998 : // NewVaddr = {NewVaddrHi, NewVaddrLo}
3999 8 : BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
4000 4 : .addReg(NewVAddrLo)
4001 : .addImm(AMDGPU::sub0)
4002 4 : .addReg(NewVAddrHi)
4003 : .addImm(AMDGPU::sub1);
4004 :
4005 4 : VAddr->setReg(NewVAddr);
4006 4 : Rsrc->setReg(NewSRsrc);
4007 19 : } else if (!VAddr && ST.hasAddr64()) {
    4008 :       // This instruction is the _OFFSET variant, so we need to convert it to
4009 : // ADDR64.
4010 : assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
4011 : < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
4012 : "FIXME: Need to emit flat atomics here");
4013 :
4014 : unsigned RsrcPtr, NewSRsrc;
4015 2 : std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4016 :
4017 2 : unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4018 2 : MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
4019 2 : MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
4020 2 : MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
4021 2 : unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
4022 :
    4023 :       // Atomics with return have an additional tied operand and are
    4024 :       // missing some of the special bits.
4025 2 : MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
4026 : MachineInstr *Addr64;
4027 :
4028 2 : if (!VDataIn) {
4029 : // Regular buffer load / store.
4030 : MachineInstrBuilder MIB =
4031 4 : BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4032 : .add(*VData)
4033 2 : .addReg(NewVAddr)
4034 2 : .addReg(NewSRsrc)
4035 : .add(*SOffset)
4036 2 : .add(*Offset);
4037 :
4038 : // Atomics do not have this operand.
4039 2 : if (const MachineOperand *GLC =
4040 2 : getNamedOperand(MI, AMDGPU::OpName::glc)) {
4041 2 : MIB.addImm(GLC->getImm());
4042 : }
4043 :
4044 : MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
4045 :
4046 2 : if (const MachineOperand *TFE =
4047 2 : getNamedOperand(MI, AMDGPU::OpName::tfe)) {
4048 2 : MIB.addImm(TFE->getImm());
4049 : }
4050 :
4051 : MIB.cloneMemRefs(MI);
4052 : Addr64 = MIB;
4053 : } else {
4054 : // Atomics with return.
4055 0 : Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4056 : .add(*VData)
4057 : .add(*VDataIn)
4058 0 : .addReg(NewVAddr)
4059 0 : .addReg(NewSRsrc)
4060 : .add(*SOffset)
4061 : .add(*Offset)
4062 : .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
4063 0 : .cloneMemRefs(MI);
4064 : }
4065 :
4066 2 : MI.removeFromParent();
4067 :
4068 : // NewVaddr = {NewVaddrHi, NewVaddrLo}
4069 2 : BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
4070 2 : NewVAddr)
4071 2 : .addReg(RsrcPtr, 0, AMDGPU::sub0)
4072 : .addImm(AMDGPU::sub0)
4073 2 : .addReg(RsrcPtr, 0, AMDGPU::sub1)
4074 : .addImm(AMDGPU::sub1);
4075 : } else {
4076 : // This is another variant; legalize Rsrc with waterfall loop from VGPRs
4077 : // to SGPRs.
4078 17 : loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
4079 : }
4080 : }
4081 : }
4082 :
4083 38259 : void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
4084 : MachineDominatorTree *MDT) const {
4085 : SetVectorType Worklist;
4086 38259 : Worklist.insert(&TopInst);
4087 :
4088 133366 : while (!Worklist.empty()) {
4089 : MachineInstr &Inst = *Worklist.pop_back_val();
4090 95107 : MachineBasicBlock *MBB = Inst.getParent();
4091 95107 : MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
4092 :
4093 95107 : unsigned Opcode = Inst.getOpcode();
4094 95107 : unsigned NewOpcode = getVALUOp(Inst);
4095 :
4096 : // Handle some special cases
4097 95107 : switch (Opcode) {
4098 : default:
4099 : break;
4100 5086 : case AMDGPU::S_ADD_U64_PSEUDO:
4101 : case AMDGPU::S_SUB_U64_PSEUDO:
4102 5086 : splitScalar64BitAddSub(Worklist, Inst, MDT);
4103 5086 : Inst.eraseFromParent();
4104 5086 : continue;
4105 2749 : case AMDGPU::S_ADD_I32:
4106 : case AMDGPU::S_SUB_I32:
4107 : // FIXME: The u32 versions currently selected use the carry.
4108 2749 : if (moveScalarAddSub(Worklist, Inst, MDT))
4109 : continue;
4110 :
4111 : // Default handling
4112 : break;
4113 62 : case AMDGPU::S_AND_B64:
4114 62 : splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64, MDT);
4115 62 : Inst.eraseFromParent();
4116 62 : continue;
4117 :
4118 92 : case AMDGPU::S_OR_B64:
4119 92 : splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64, MDT);
4120 92 : Inst.eraseFromParent();
4121 92 : continue;
4122 :
4123 108 : case AMDGPU::S_XOR_B64:
4124 108 : splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64, MDT);
4125 108 : Inst.eraseFromParent();
4126 108 : continue;
4127 :
4128 18 : case AMDGPU::S_NOT_B64:
4129 18 : splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
4130 18 : Inst.eraseFromParent();
4131 18 : continue;
4132 :
4133 26 : case AMDGPU::S_BCNT1_I32_B64:
4134 26 : splitScalar64BitBCNT(Worklist, Inst);
4135 26 : Inst.eraseFromParent();
4136 26 : continue;
4137 :
4138 1811 : case AMDGPU::S_BFE_I64:
4139 1811 : splitScalar64BitBFE(Worklist, Inst);
4140 1811 : Inst.eraseFromParent();
4141 1811 : continue;
4142 :
4143 1308 : case AMDGPU::S_LSHL_B32:
4144 1308 : if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4145 : NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
4146 614 : swapOperands(Inst);
4147 : }
4148 : break;
4149 2687 : case AMDGPU::S_ASHR_I32:
4150 2687 : if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4151 : NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
4152 1617 : swapOperands(Inst);
4153 : }
4154 : break;
4155 3402 : case AMDGPU::S_LSHR_B32:
4156 3402 : if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4157 : NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
4158 1996 : swapOperands(Inst);
4159 : }
4160 : break;
4161 144 : case AMDGPU::S_LSHL_B64:
4162 144 : if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4163 : NewOpcode = AMDGPU::V_LSHLREV_B64;
4164 68 : swapOperands(Inst);
4165 : }
4166 : break;
4167 175 : case AMDGPU::S_ASHR_I64:
4168 175 : if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4169 : NewOpcode = AMDGPU::V_ASHRREV_I64;
4170 58 : swapOperands(Inst);
4171 : }
4172 : break;
4173 109 : case AMDGPU::S_LSHR_B64:
4174 109 : if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4175 : NewOpcode = AMDGPU::V_LSHRREV_B64;
4176 54 : swapOperands(Inst);
4177 : }
4178 : break;
4179 :
4180 24 : case AMDGPU::S_ABS_I32:
4181 24 : lowerScalarAbs(Worklist, Inst);
4182 24 : Inst.eraseFromParent();
4183 24 : continue;
4184 :
4185 75 : case AMDGPU::S_CBRANCH_SCC0:
4186 : case AMDGPU::S_CBRANCH_SCC1:
4187 : // Clear unused bits of vcc
4188 75 : BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
4189 150 : AMDGPU::VCC)
4190 75 : .addReg(AMDGPU::EXEC)
4191 75 : .addReg(AMDGPU::VCC);
4192 75 : break;
4193 :
4194 : case AMDGPU::S_BFE_U64:
4195 : case AMDGPU::S_BFM_B64:
4196 : llvm_unreachable("Moving this op to VALU not implemented");
4197 :
4198 263 : case AMDGPU::S_PACK_LL_B32_B16:
4199 : case AMDGPU::S_PACK_LH_B32_B16:
4200 : case AMDGPU::S_PACK_HH_B32_B16:
4201 263 : movePackToVALU(Worklist, MRI, Inst);
4202 263 : Inst.eraseFromParent();
4203 263 : continue;
4204 :
4205 15 : case AMDGPU::S_XNOR_B32:
4206 15 : lowerScalarXnor(Worklist, Inst);
4207 15 : Inst.eraseFromParent();
4208 15 : continue;
4209 :
4210 5 : case AMDGPU::S_XNOR_B64:
4211 5 : splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
4212 5 : Inst.eraseFromParent();
4213 5 : continue;
4214 : }
4215 :
4216 82893 : if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
4217 : // We cannot move this instruction to the VALU, so we should try to
4218 : // legalize its operands instead.
4219 89 : legalizeOperands(Inst, MDT);
4220 89 : continue;
4221 : }
4222 :
4223 : // Use the new VALU Opcode.
4224 87211 : const MCInstrDesc &NewDesc = get(NewOpcode);
4225 : Inst.setDesc(NewDesc);
4226 :
4227 : // Remove any references to SCC. Vector instructions can't read from it, and
4228 : // We're just about to add the implicit use / defs of VCC, and we don't want
    4229 :     // we're just about to add the implicit use / defs of VCC, and we don't want
4230 300753 : for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
4231 213542 : MachineOperand &Op = Inst.getOperand(i);
4232 213542 : if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
4233 20133 : Inst.RemoveOperand(i);
4234 20133 : addSCCDefUsersToVALUWorklist(Inst, Worklist);
4235 : }
4236 : }
4237 :
4238 87211 : if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
4239 : // We are converting these to a BFE, so we need to add the missing
4240 : // operands for the size and offset.
4241 1573 : unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
4242 1573 : Inst.addOperand(MachineOperand::CreateImm(0));
4243 3146 : Inst.addOperand(MachineOperand::CreateImm(Size));
4244 :
4245 85638 : } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
4246 : // The VALU version adds the second operand to the result, so insert an
4247 : // extra 0 operand.
4248 128 : Inst.addOperand(MachineOperand::CreateImm(0));
4249 : }
4250 :
4251 87211 : Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
4252 :
4253 87211 : if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
4254 3292 : const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
4255 : // If we need to move this to VGPRs, we need to unpack the second operand
4256 : // back into the 2 separate ones for bit offset and width.
4257 : assert(OffsetWidthOp.isImm() &&
4258 : "Scalar BFE is only implemented for constant width and offset");
4259 3292 : uint32_t Imm = OffsetWidthOp.getImm();
4260 :
4261 3292 : uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4262 3292 : uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4263 3292 : Inst.RemoveOperand(2); // Remove old immediate.
4264 6584 : Inst.addOperand(MachineOperand::CreateImm(Offset));
4265 6584 : Inst.addOperand(MachineOperand::CreateImm(BitWidth));
4266 : }
4267 :
4268 174422 : bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
4269 : unsigned NewDstReg = AMDGPU::NoRegister;
4270 : if (HasDst) {
4271 87061 : unsigned DstReg = Inst.getOperand(0).getReg();
4272 87061 : if (TargetRegisterInfo::isPhysicalRegister(DstReg))
4273 : continue;
4274 :
4275 : // Update the destination register class.
4276 87004 : const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
4277 87004 : if (!NewDstRC)
4278 : continue;
4279 :
4280 43981 : if (Inst.isCopy() &&
4281 130609 : TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
4282 43605 : NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
4283 : // Instead of creating a copy where src and dst are the same register
4284 : // class, we just replace all uses of dst with src. These kinds of
4285 : // copies interfere with the heuristics MachineSink uses to decide
    4286 :         // whether or not to split a critical edge, since the pass assumes
    4287 :         // that copies will end up as machine instructions and not be
    4288 :         // eliminated.
4289 16216 : addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
4290 16216 : MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
4291 16216 : MRI.clearKillFlags(Inst.getOperand(1).getReg());
4292 16216 : Inst.getOperand(0).setReg(DstReg);
4293 :
4294 : // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
4295 : // these are deleted later, but at -O0 it would leave a suspicious
4296 : // looking illegal copy of an undef register.
4297 32432 : for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
4298 16216 : Inst.RemoveOperand(I);
4299 16216 : Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
4300 16216 : continue;
4301 : }
4302 :
4303 70788 : NewDstReg = MRI.createVirtualRegister(NewDstRC);
4304 70788 : MRI.replaceRegWith(DstReg, NewDstReg);
4305 : }
4306 :
4307 : // Legalize the operands
4308 70938 : legalizeOperands(Inst, MDT);
4309 :
4310 70938 : if (HasDst)
4311 70788 : addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
4312 : }
4313 38259 : }
4314 :
4315 : // Add/sub require special handling to deal with carry outs.
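     : // On subtargets with add-no-carry instructions this rewrites S_ADD_I32 /
     : // S_SUB_I32 in place to V_ADD_U32_e64 / V_SUB_U32_e64 and drops the SCC def;
     : // on other subtargets it returns false and the default handling is used.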
4316 2749 : bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
4317 : MachineDominatorTree *MDT) const {
4318 2749 : if (ST.hasAddNoCarry()) {
4319 : // Assume there is no user of scc since we don't select this in that case.
4320 : // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
4321 : // is used.
4322 :
4323 297 : MachineBasicBlock &MBB = *Inst.getParent();
4324 297 : MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4325 :
4326 297 : unsigned OldDstReg = Inst.getOperand(0).getReg();
4327 297 : unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4328 :
4329 297 : unsigned Opc = Inst.getOpcode();
4330 : assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
4331 :
4332 297 : unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
4333 : AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
4334 :
4335 : assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
4336 297 : Inst.RemoveOperand(3);
4337 :
4338 297 : Inst.setDesc(get(NewOpc));
4339 297 : Inst.addImplicitDefUseOperands(*MBB.getParent());
4340 297 : MRI.replaceRegWith(OldDstReg, ResultReg);
4341 297 : legalizeOperands(Inst, MDT);
4342 :
4343 297 : addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4344 297 : return true;
4345 : }
4346 :
4347 : return false;
4348 : }
4349 :
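     : // Expand S_ABS_I32 for the VALU as max(x, 0 - x), using a 32-bit VALU
     : // subtract followed by V_MAX_I32.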
4350 24 : void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
4351 : MachineInstr &Inst) const {
4352 24 : MachineBasicBlock &MBB = *Inst.getParent();
4353 24 : MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4354 : MachineBasicBlock::iterator MII = Inst;
4355 : DebugLoc DL = Inst.getDebugLoc();
4356 :
4357 24 : MachineOperand &Dest = Inst.getOperand(0);
4358 : MachineOperand &Src = Inst.getOperand(1);
4359 24 : unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4360 24 : unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4361 :
4362 24 : unsigned SubOp = ST.hasAddNoCarry() ?
4363 : AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
4364 :
4365 48 : BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
4366 : .addImm(0)
4367 24 : .addReg(Src.getReg());
4368 :
4369 48 : BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
4370 24 : .addReg(Src.getReg())
4371 24 : .addReg(TmpReg);
4372 :
4373 24 : MRI.replaceRegWith(Dest.getReg(), ResultReg);
4374 24 : addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4375 24 : }
4376 :
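     : // Expand S_XNOR_B32 for the VALU: use V_XNOR_B32_e64 directly when the
     : // subtarget has DL instructions, otherwise emit V_XOR_B32 followed by
     : // V_NOT_B32.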
4377 15 : void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
4378 : MachineInstr &Inst) const {
4379 15 : MachineBasicBlock &MBB = *Inst.getParent();
4380 15 : MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4381 : MachineBasicBlock::iterator MII = Inst;
4382 : const DebugLoc &DL = Inst.getDebugLoc();
4383 :
4384 15 : MachineOperand &Dest = Inst.getOperand(0);
4385 : MachineOperand &Src0 = Inst.getOperand(1);
4386 : MachineOperand &Src1 = Inst.getOperand(2);
4387 :
4388 15 : legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
4389 15 : legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
4390 :
4391 15 : unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4392 15 : if (ST.hasDLInsts()) {
4393 6 : BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
4394 : .add(Src0)
4395 : .add(Src1);
4396 : } else {
4397 12 : unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4398 24 : BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
4399 : .add(Src0)
4400 : .add(Src1);
4401 :
4402 36 : BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
4403 12 : .addReg(Xor);
4404 : }
4405 :
4406 15 : MRI.replaceRegWith(Dest.getReg(), NewDest);
4407 15 : addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4408 15 : }
4409 :
4410 18 : void SIInstrInfo::splitScalar64BitUnaryOp(
4411 : SetVectorType &Worklist, MachineInstr &Inst,
4412 : unsigned Opcode) const {
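 : // Split a 64-bit scalar unary op into two independent 32-bit VALU ops, one
 : // per 32-bit half, and recombine the results with a REG_SEQUENCE.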
4413 18 : MachineBasicBlock &MBB = *Inst.getParent();
4414 18 : MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4415 :
4416 18 : MachineOperand &Dest = Inst.getOperand(0);
4417 : MachineOperand &Src0 = Inst.getOperand(1);
4418 : DebugLoc DL = Inst.getDebugLoc();
4419 :
4420 : MachineBasicBlock::iterator MII = Inst;
4421 :
4422 18 : const MCInstrDesc &InstDesc = get(Opcode);
4423 18 : const TargetRegisterClass *Src0RC = Src0.isReg() ?
4424 18 : MRI.getRegClass(Src0.getReg()) :
4425 : &AMDGPU::SGPR_32RegClass;
4426 :
4427 18 : const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4428 :
4429 : MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4430 18 : AMDGPU::sub0, Src0SubRC);
4431 :
4432 18 : const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4433 18 : const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4434 18 : const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4435 :
4436 18 : unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4437 18 : BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
4438 :
4439 : MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4440 18 : AMDGPU::sub1, Src0SubRC);
4441 :
4442 18 : unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4443 36 : BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
4444 :
4445 18 : unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4446 36 : BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4447 18 : .addReg(DestSub0)
4448 : .addImm(AMDGPU::sub0)
4449 18 : .addReg(DestSub1)
4450 : .addImm(AMDGPU::sub1);
4451 :
4452 18 : MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4453 :
4454 : // We don't need to legalizeOperands here because for a single operand, src0
4455 : // will support any kind of input.
4456 :
4457 : // Move all users of this moved value.
4458 18 : addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4459 18 : }
4460 :
4461 5086 : void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
4462 : MachineInstr &Inst,
4463 : MachineDominatorTree *MDT) const {
4464 5086 : bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4465 :
4466 5086 : MachineBasicBlock &MBB = *Inst.getParent();
4467 5086 : MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4468 :
4469 5086 : unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4470 5086 : unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4471 5086 : unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4472 :
4473 5086 : unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4474 5086 : unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4475 :
4476 5086 : MachineOperand &Dest = Inst.getOperand(0);
4477 : MachineOperand &Src0 = Inst.getOperand(1);
4478 : MachineOperand &Src1 = Inst.getOperand(2);
4479 : const DebugLoc &DL = Inst.getDebugLoc();
4480 : MachineBasicBlock::iterator MII = Inst;
4481 :
4482 5086 : const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
4483 5086 : const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
4484 5086 : const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4485 5086 : const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4486 :
4487 : MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4488 5086 : AMDGPU::sub0, Src0SubRC);
4489 : MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4490 5086 : AMDGPU::sub0, Src1SubRC);
4491 :
4492 :
4493 : MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4494 5086 : AMDGPU::sub1, Src0SubRC);
4495 : MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4496 5086 : AMDGPU::sub1, Src1SubRC);
4497 :
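 : // Expand the 64-bit add/sub as a carry chain over the 32-bit halves
 : // (illustrative):
 : //   v_add_i32_e64  dst.lo, carry, src0.lo, src1.lo
 : //   v_addc_u32_e64 dst.hi, dead,  src0.hi, src1.hi, carry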
4498 5086 : unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
4499 : MachineInstr *LoHalf =
4500 10172 : BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
4501 5086 : .addReg(CarryReg, RegState::Define)
4502 : .add(SrcReg0Sub0)
4503 5086 : .add(SrcReg1Sub0);
4504 :
4505 5086 : unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4506 : MachineInstr *HiHalf =
4507 10172 : BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
4508 5086 : .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4509 : .add(SrcReg0Sub1)
4510 : .add(SrcReg1Sub1)
4511 5086 : .addReg(CarryReg, RegState::Kill);
4512 :
4513 10172 : BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4514 5086 : .addReg(DestSub0)
4515 : .addImm(AMDGPU::sub0)
4516 5086 : .addReg(DestSub1)
4517 : .addImm(AMDGPU::sub1);
4518 :
4519 5086 : MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4520 :
4521 : // Try to legalize the operands in case we need to swap the order to keep it
4522 : // valid.
4523 5086 : legalizeOperands(*LoHalf, MDT);
4524 5086 : legalizeOperands(*HiHalf, MDT);
4525 :
4526 : // Move all users of this moved value.
4527 5086 : addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4528 5086 : }
4529 :
4530 267 : void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
4531 : MachineInstr &Inst, unsigned Opcode,
4532 : MachineDominatorTree *MDT) const {
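 : // Split a 64-bit scalar binary op into two independent 32-bit VALU ops (no
 : // carry between halves) and recombine the results with a REG_SEQUENCE.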
4533 267 : MachineBasicBlock &MBB = *Inst.getParent();
4534 267 : MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4535 :
4536 267 : MachineOperand &Dest = Inst.getOperand(0);
4537 : MachineOperand &Src0 = Inst.getOperand(1);
4538 : MachineOperand &Src1 = Inst.getOperand(2);
4539 : DebugLoc DL = Inst.getDebugLoc();
4540 :
4541 : MachineBasicBlock::iterator MII = Inst;
4542 :
4543 267 : const MCInstrDesc &InstDesc = get(Opcode);
4544 267 : const TargetRegisterClass *Src0RC = Src0.isReg() ?
4545 267 : MRI.getRegClass(Src0.getReg()) :
4546 : &AMDGPU::SGPR_32RegClass;
4547 :
4548 267 : const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4549 267 : const TargetRegisterClass *Src1RC = Src1.isReg() ?
4550 267 : MRI.getRegClass(Src1.getReg()) :
4551 : &AMDGPU::SGPR_32RegClass;
4552 :
4553 267 : const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4554 :
4555 : MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4556 267 : AMDGPU::sub0, Src0SubRC);
4557 : MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4558 267 : AMDGPU::sub0, Src1SubRC);
4559 :
4560 267 : const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4561 267 : const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4562 267 : const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4563 :
4564 267 : unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4565 267 : MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
4566 : .add(SrcReg0Sub0)
4567 : .add(SrcReg1Sub0);
4568 :
4569 : MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4570 267 : AMDGPU::sub1, Src0SubRC);
4571 : MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4572 267 : AMDGPU::sub1, Src1SubRC);
4573 :
4574 267 : unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4575 267 : MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
4576 : .add(SrcReg0Sub1)
4577 : .add(SrcReg1Sub1);
4578 :
4579 267 : unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4580 534 : BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4581 267 : .addReg(DestSub0)
4582 : .addImm(AMDGPU::sub0)
4583 267 : .addReg(DestSub1)
4584 : .addImm(AMDGPU::sub1);
4585 :
4586 267 : MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4587 :
4588 : // Try to legalize the operands in case we need to swap the order to keep it
4589 : // valid.
4590 267 : legalizeOperands(LoHalf, MDT);
4591 267 : legalizeOperands(HiHalf, MDT);
4592 :
4593 : // Move all users of this moved value.
4594 267 : addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4595 267 : }
4596 :
4597 26 : void SIInstrInfo::splitScalar64BitBCNT(
4598 : SetVectorType &Worklist, MachineInstr &Inst) const {
4599 26 : MachineBasicBlock &MBB = *Inst.getParent();
4600 26 : MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4601 :
4602 : MachineBasicBlock::iterator MII = Inst;
4603 : DebugLoc DL = Inst.getDebugLoc();
4604 :
4605 26 : MachineOperand &Dest = Inst.getOperand(0);
4606 : MachineOperand &Src = Inst.getOperand(1);
4607 :
4608 26 : const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
4609 26 : const TargetRegisterClass *SrcRC = Src.isReg() ?
4610 26 : MRI.getRegClass(Src.getReg()) :
4611 : &AMDGPU::SGPR_32RegClass;
4612 :
4613 26 : unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4614 26 : unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4615 :
4616 26 : const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
4617 :
4618 : MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4619 26 : AMDGPU::sub0, SrcSubRC);
4620 : MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4621 26 : AMDGPU::sub1, SrcSubRC);
4622 :
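 : // The popcount of the 64-bit source is the sum of the popcounts of its
 : // halves; the second V_BCNT accumulates into its second operand:
 : //   MidReg    = bcnt(src.sub0) + 0
 : //   ResultReg = bcnt(src.sub1) + MidReg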
4623 26 : BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
4624 :
4625 52 : BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
4626 :
4627 26 : MRI.replaceRegWith(Dest.getReg(), ResultReg);
4628 :
4629 : // We don't need to legalize operands here. src0 for either instruction can be
4630 : // an SGPR, and the second input is unused or determined here.
4631 26 : addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4632 26 : }
4633 :
4634 1811 : void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
4635 : MachineInstr &Inst) const {
4636 1811 : MachineBasicBlock &MBB = *Inst.getParent();
4637 1811 : MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4638 : MachineBasicBlock::iterator MII = Inst;
4639 : DebugLoc DL = Inst.getDebugLoc();
4640 :
4641 1811 : MachineOperand &Dest = Inst.getOperand(0);
4642 1811 : uint32_t Imm = Inst.getOperand(2).getImm();
4643 : uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4644 1811 : uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4645 :
4646 : (void) Offset;
4647 :
4648 : // Only sext_inreg cases handled.
4649 : assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
4650 : Offset == 0 && "Not implemented");
4651 :
4652 1811 : if (BitWidth < 32) {
4653 1805 : unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4654 1805 : unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4655 1805 : unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4656 :
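 : // For widths < 32 only the low half matters: sign-extend it with V_BFE_I32,
 : // then fill the high half with copies of its sign bit via an arithmetic
 : // shift right by 31.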
4657 3610 : BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
4658 1805 : .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
4659 : .addImm(0)
4660 1805 : .addImm(BitWidth);
4661 :
4662 3610 : BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
4663 : .addImm(31)
4664 1805 : .addReg(MidRegLo);
4665 :
4666 3610 : BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4667 1805 : .addReg(MidRegLo)
4668 : .addImm(AMDGPU::sub0)
4669 1805 : .addReg(MidRegHi)
4670 : .addImm(AMDGPU::sub1);
4671 :
4672 1805 : MRI.replaceRegWith(Dest.getReg(), ResultReg);
4673 1805 : addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4674 : return;
4675 : }
4676 :
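 : // BitWidth == 32: the low half is already the extended value; only the high
 : // half needs to be produced, by replicating the sign bit of src.sub0.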
4677 : MachineOperand &Src = Inst.getOperand(1);
4678 6 : unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4679 6 : unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4680 :
4681 12 : BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
4682 : .addImm(31)
4683 6 : .addReg(Src.getReg(), 0, AMDGPU::sub0);
4684 :
4685 12 : BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4686 6 : .addReg(Src.getReg(), 0, AMDGPU::sub0)
4687 : .addImm(AMDGPU::sub0)
4688 6 : .addReg(TmpReg)
4689 : .addImm(AMDGPU::sub1);
4690 :
4691 6 : MRI.replaceRegWith(Dest.getReg(), ResultReg);
4692 6 : addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4693 : }
4694 :
4695 94811 : void SIInstrInfo::addUsersToMoveToVALUWorklist(
4696 : unsigned DstReg,
4697 : MachineRegisterInfo &MRI,
4698 : SetVectorType &Worklist) const {
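 : // Queue every user of the new VGPR result that cannot read a VGPR in that
 : // operand position. The inner loop skips past the remaining uses belonging
 : // to the same instruction so it is only inserted once.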
4699 94811 : for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
4700 212258 : E = MRI.use_end(); I != E;) {
4701 117447 : MachineInstr &UseMI = *I->getParent();
4702 117447 : if (!canReadVGPR(UseMI, I.getOperandNo())) {
4703 56780 : Worklist.insert(&UseMI);
4704 :
4705 : do {
4706 : ++I;
4707 56889 : } while (I != E && I->getParent() == &UseMI);
4708 : } else {
4709 : ++I;
4710 : }
4711 : }
4712 94811 : }
4713 :
4714 263 : void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
4715 : MachineRegisterInfo &MRI,
4716 : MachineInstr &Inst) const {
4717 263 : unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4718 263 : MachineBasicBlock *MBB = Inst.getParent();
4719 263 : MachineOperand &Src0 = Inst.getOperand(1);
4720 : MachineOperand &Src1 = Inst.getOperand(2);
4721 : const DebugLoc &DL = Inst.getDebugLoc();
4722 :
4723 526 : switch (Inst.getOpcode()) {
4724 : case AMDGPU::S_PACK_LL_B32_B16: {
4725 254 : unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4726 254 : unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4727 :
4728 : // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
4729 : // 0.
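 : // Expansion sketch (virtual registers as created above):
 : //   v_and_b32     Tmp, 0xffff, Src0
 : //   v_lshl_or_b32 Result, Src1, 16, Tmp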
4730 508 : BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4731 : .addImm(0xffff);
4732 :
4733 508 : BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
4734 254 : .addReg(ImmReg, RegState::Kill)
4735 : .add(Src0);
4736 :
4737 508 : BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
4738 : .add(Src1)
4739 : .addImm(16)
4740 254 : .addReg(TmpReg, RegState::Kill);
4741 254 : break;
4742 : }
4743 : case AMDGPU::S_PACK_LH_B32_B16: {
4744 6 : unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4745 12 : BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4746 : .addImm(0xffff);
4747 12 : BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
4748 6 : .addReg(ImmReg, RegState::Kill)
4749 : .add(Src0)
4750 : .add(Src1);
4751 6 : break;
4752 : }
4753 : case AMDGPU::S_PACK_HH_B32_B16: {
4754 3 : unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4755 3 : unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4756 6 : BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
4757 : .addImm(16)
4758 : .add(Src0);
4759 6 : BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4760 : .addImm(0xffff0000);
4761 6 : BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
4762 : .add(Src1)
4763 3 : .addReg(ImmReg, RegState::Kill)
4764 3 : .addReg(TmpReg, RegState::Kill);
4765 3 : break;
4766 : }
4767 0 : default:
4768 0 : llvm_unreachable("unhandled s_pack_* instruction");
4769 : }
4770 :
4771 263 : MachineOperand &Dest = Inst.getOperand(0);
4772 263 : MRI.replaceRegWith(Dest.getReg(), ResultReg);
4773 263 : addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4774 263 : }
4775 :
4776 20133 : void SIInstrInfo::addSCCDefUsersToVALUWorklist(
4777 : MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
4778 : // This assumes that all the users of SCC are in the same block
4779 : // as the SCC def.
4780 : for (MachineInstr &MI :
4781 : make_range(MachineBasicBlock::iterator(SCCDefInst),
4782 582809 : SCCDefInst.getParent()->end())) {
4783 : // Exit if we find another SCC def.
4784 576470 : if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
4785 : return;
4786 :
4787 562676 : if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
4788 75 : Worklist.insert(&MI);
4789 : }
4790 : }
4791 :
4792 87004 : const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
4793 : const MachineInstr &Inst) const {
4794 87004 : const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
4795 :
4796 174008 : switch (Inst.getOpcode()) {
4797 : // For target instructions, getOpRegClass just returns the virtual register
4798 : // class associated with the operand, so we need to find an equivalent VGPR
4799 : // register class in order to move the instruction to the VALU.
4800 64263 : case AMDGPU::COPY:
4801 : case AMDGPU::PHI:
4802 : case AMDGPU::REG_SEQUENCE:
4803 : case AMDGPU::INSERT_SUBREG:
4804 : case AMDGPU::WQM:
4805 : case AMDGPU::WWM:
4806 64263 : if (RI.hasVGPRs(NewDstRC))
4807 : return nullptr;
4808 :
4809 64263 : NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
4810 64263 : if (!NewDstRC)
4811 0 : return nullptr;
4812 : return NewDstRC;
4813 : default:
4814 : return NewDstRC;
4815 : }
4816 : }
4817 :
4818 : // Find the one SGPR operand we are allowed to use.
4819 66473 : unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
4820 : int OpIndices[3]) const {
4821 66473 : const MCInstrDesc &Desc = MI.getDesc();
4822 :
4823 : // Find the one SGPR operand we are allowed to use.
4824 : //
4825 : // First we need to consider the instruction's operand requirements before
4826 : // legalizing. Some operands are required to be SGPRs, such as implicit uses
4827 : // of VCC, but we are still bound by the constant bus requirement to only use
4828 : // one.
4829 : //
4830 : // If the operand's class is an SGPR, we can never move it.
4831 :
4832 66473 : unsigned SGPRReg = findImplicitSGPRRead(MI);
4833 66473 : if (SGPRReg != AMDGPU::NoRegister)
4834 : return SGPRReg;
4835 :
4836 66336 : unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
4837 66336 : const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4838 :
4839 203415 : for (unsigned i = 0; i < 3; ++i) {
4840 193081 : int Idx = OpIndices[i];
4841 193081 : if (Idx == -1)
4842 : break;
4843 :
4844 146874 : const MachineOperand &MO = MI.getOperand(Idx);
4845 146874 : if (!MO.isReg())
4846 : continue;
4847 :
4848 : // Is this operand statically required to be an SGPR based on the operand
4849 : // constraints?
4850 134634 : const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
4851 134634 : bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
4852 134634 : if (IsRequiredSGPR)
4853 9795 : return MO.getReg();
4854 :
4855 : // If this could be a VGPR or an SGPR, check the dynamic register class.
4856 124839 : unsigned Reg = MO.getReg();
4857 : const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
4858 124839 : if (RI.isSGPRClass(RegRC))
4859 61817 : UsedSGPRs[i] = Reg;
4860 : }
4861 :
4862 : // We don't have a required SGPR operand, so we have a bit more freedom in
4863 : // selecting operands to move.
4864 :
4865 : // Try to select the most used SGPR. If an SGPR is equal to one of the
4866 : // others, we choose that.
4867 : //
4868 : // e.g.
4869 : // V_FMA_F32 v0, s0, s0, s0 -> No moves
4870 : // V_FMA_F32 v0, s0, s1, s0 -> Move s1
4871 :
4872 : // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
4873 : // prefer those.
4874 :
4875 56541 : if (UsedSGPRs[0] != AMDGPU::NoRegister) {
4876 27468 : if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
4877 : SGPRReg = UsedSGPRs[0];
4878 : }
4879 :
4880 56541 : if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
4881 21375 : if (UsedSGPRs[1] == UsedSGPRs[2])
4882 : SGPRReg = UsedSGPRs[1];
4883 : }
4884 :
4885 : return SGPRReg;
4886 : }
4887 :
4888 14888322 : MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
4889 : unsigned OperandName) const {
4890 14888322 : int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
4891 14888322 : if (Idx == -1)
4892 : return nullptr;
4893 :
4894 5043442 : return &MI.getOperand(Idx);
4895 : }
4896 :
4897 21399 : uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
4898 : uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
4899 42798 : if (ST.isAmdHsaOS()) {
4900 : // Set ATC = 1. GFX9 doesn't have this bit.
4901 688 : if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4902 : RsrcDataFormat |= (1ULL << 56);
4903 :
4904 : // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
4905 : // Note that this disables TC L2 and therefore decreases performance.
4906 688 : if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
4907 351 : RsrcDataFormat |= (2ULL << 59);
4908 : }
4909 :
4910 21399 : return RsrcDataFormat;
4911 : }
4912 :
4913 485 : uint64_t SIInstrInfo::getScratchRsrcWords23() const {
4914 485 : uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
4915 : AMDGPU::RSRC_TID_ENABLE |
4916 485 : 0xffffffff; // Size;
4917 :
4918 : // GFX9 doesn't have ELEMENT_SIZE.
4919 485 : if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4920 401 : uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
4921 401 : Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
4922 : }
4923 :
4924 : // IndexStride = 64.
4925 485 : Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
4926 :
4927 : // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
4928 : // Clear them unless we want a huge stride.
4929 485 : if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4930 255 : Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
4931 :
4932 485 : return Rsrc23;
4933 : }
4934 :
4935 60 : bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
4936 60 : unsigned Opc = MI.getOpcode();
4937 :
4938 60 : return isSMRD(Opc);
4939 : }
4940 :
4941 14 : bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
4942 14 : unsigned Opc = MI.getOpcode();
4943 :
4944 14 : return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
4945 : }
4946 :
4947 2829 : unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
4948 : int &FrameIndex) const {
4949 : const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4950 2829 : if (!Addr || !Addr->isFI())
4951 : return AMDGPU::NoRegister;
4952 :
4953 : assert(!MI.memoperands_empty() &&
4954 : (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
4955 :
4956 2074 : FrameIndex = Addr->getIndex();
4957 2074 : return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
4958 : }
4959 :
4960 28 : unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
4961 : int &FrameIndex) const {
4962 : const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
4963 : assert(Addr && Addr->isFI());
4964 28 : FrameIndex = Addr->getIndex();
4965 28 : return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
4966 : }
4967 :
4968 18319 : unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
4969 : int &FrameIndex) const {
4970 18319 : if (!MI.mayLoad())
4971 : return AMDGPU::NoRegister;
4972 :
4973 2167 : if (isMUBUF(MI) || isVGPRSpill(MI))
4974 1425 : return isStackAccess(MI, FrameIndex);
4975 :
4976 742 : if (isSGPRSpill(MI))
4977 27 : return isSGPRStackAccess(MI, FrameIndex);
4978 :
4979 : return AMDGPU::NoRegister;
4980 : }
4981 :
4982 9084 : unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
4983 : int &FrameIndex) const {
4984 9084 : if (!MI.mayStore())
4985 : return AMDGPU::NoRegister;
4986 :
4987 1896 : if (isMUBUF(MI) || isVGPRSpill(MI))
4988 1404 : return isStackAccess(MI, FrameIndex);
4989 :
4990 492 : if (isSGPRSpill(MI))
4991 1 : return isSGPRStackAccess(MI, FrameIndex);
4992 :
4993 : return AMDGPU::NoRegister;
4994 : }
4995 :
4996 1216 : unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
4997 : unsigned Size = 0;
4998 1216 : MachineBasicBlock::const_instr_iterator I = MI.getIterator();
4999 1216 : MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
5000 4864 : while (++I != E && I->isInsideBundle()) {
5001 : assert(!I->isBundle() && "No nested bundle!");
5002 3648 : Size += getInstSizeInBytes(*I);
5003 : }
5004 :
5005 1216 : return Size;
5006 : }
5007 :
5008 732495 : unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
5009 732495 : unsigned Opc = MI.getOpcode();
5010 : const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
5011 732495 : unsigned DescSize = Desc.getSize();
5012 :
5013 : // If we have a definitive size, we can use it. Otherwise we need to inspect
5014 : // the operands to know the size.
5015 732495 : if (isFixedSize(MI))
5016 : return DescSize;
5017 :
5018 : // 4-byte instructions may have a 32-bit literal encoded after them. Check
5019 : // operands that coud ever be literals.
5020 728896 : if (isVALU(MI) || isSALU(MI)) {
5021 547189 : int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5022 547189 : if (Src0Idx == -1)
5023 : return DescSize; // No operands.
5024 :
5025 838802 : if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
5026 31104 : return DescSize + 4;
5027 :
5028 388297 : int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5029 388297 : if (Src1Idx == -1)
5030 : return DescSize;
5031 :
5032 384986 : if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
5033 13375 : return DescSize + 4;
5034 :
5035 179118 : int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5036 179118 : if (Src2Idx == -1)
5037 : return DescSize;
5038 :
5039 57106 : if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
5040 0 : return DescSize + 4;
5041 :
5042 : return DescSize;
5043 : }
5044 :
5045 : switch (Opc) {
5046 : case TargetOpcode::IMPLICIT_DEF:
5047 : case TargetOpcode::KILL:
5048 : case TargetOpcode::DBG_VALUE:
5049 : case TargetOpcode::EH_LABEL:
5050 : return 0;
5051 1216 : case TargetOpcode::BUNDLE:
5052 1216 : return getInstBundleSize(MI);
5053 2902 : case TargetOpcode::INLINEASM: {
5054 2902 : const MachineFunction *MF = MI.getParent()->getParent();
5055 2902 : const char *AsmStr = MI.getOperand(0).getSymbolName();
5056 5804 : return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
5057 : }
5058 : default:
5059 : return DescSize;
5060 : }
5061 : }
5062 :
5063 0 : bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
5064 0 : if (!isFLAT(MI))
5065 : return false;
5066 :
5067 0 : if (MI.memoperands_empty())
5068 : return true;
5069 :
5070 0 : for (const MachineMemOperand *MMO : MI.memoperands()) {
5071 0 : if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
5072 : return true;
5073 : }
5074 : return false;
5075 : }
5076 :
5077 0 : bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
5078 0 : return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
5079 : }
5080 :
5081 0 : void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
5082 : MachineBasicBlock *IfEnd) const {
5083 0 : MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
5084 : assert(TI != IfEntry->end());
5085 :
5086 : MachineInstr *Branch = &(*TI);
5087 0 : MachineFunction *MF = IfEntry->getParent();
5088 0 : MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
5089 :
5090 0 : if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
5091 0 : unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5092 : MachineInstr *SIIF =
5093 0 : BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
5094 0 : .add(Branch->getOperand(0))
5095 0 : .add(Branch->getOperand(1));
5096 : MachineInstr *SIEND =
5097 0 : BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
5098 0 : .addReg(DstReg);
5099 :
5100 0 : IfEntry->erase(TI);
5101 : IfEntry->insert(IfEntry->end(), SIIF);
5102 0 : IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
5103 : }
5104 0 : }
5105 :
5106 0 : void SIInstrInfo::convertNonUniformLoopRegion(
5107 : MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
5108 0 : MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
5109 : // We expect 2 terminators, one conditional and one unconditional.
5110 : assert(TI != LoopEnd->end());
5111 :
5112 : MachineInstr *Branch = &(*TI);
5113 0 : MachineFunction *MF = LoopEnd->getParent();
5114 0 : MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
5115 :
5116 0 : if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
5117 :
5118 0 : unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5119 0 : unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5120 : MachineInstrBuilder HeaderPHIBuilder =
5121 0 : BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
5122 : for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
5123 : E = LoopEntry->pred_end();
5124 0 : PI != E; ++PI) {
5125 0 : if (*PI == LoopEnd) {
5126 0 : HeaderPHIBuilder.addReg(BackEdgeReg);
5127 : } else {
5128 : MachineBasicBlock *PMBB = *PI;
5129 0 : unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5130 0 : materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
5131 : ZeroReg, 0);
5132 0 : HeaderPHIBuilder.addReg(ZeroReg);
5133 : }
5134 0 : HeaderPHIBuilder.addMBB(*PI);
5135 : }
5136 0 : MachineInstr *HeaderPhi = HeaderPHIBuilder;
5137 0 : MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
5138 0 : get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
5139 0 : .addReg(DstReg)
5140 0 : .add(Branch->getOperand(0));
5141 : MachineInstr *SILOOP =
5142 0 : BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
5143 0 : .addReg(BackEdgeReg)
5144 0 : .addMBB(LoopEntry);
5145 :
5146 : LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
5147 0 : LoopEnd->erase(TI);
5148 : LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
5149 : LoopEnd->insert(LoopEnd->end(), SILOOP);
5150 : }
5151 0 : }
5152 :
5153 : ArrayRef<std::pair<int, const char *>>
5154 5 : SIInstrInfo::getSerializableTargetIndices() const {
5155 : static const std::pair<int, const char *> TargetIndices[] = {
5156 : {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
5157 : {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
5158 : {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
5159 : {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
5160 : {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
5161 5 : return makeArrayRef(TargetIndices);
5162 : }
5163 :
5164 : /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
5165 : /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
5166 : ScheduleHazardRecognizer *
5167 15844 : SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
5168 : const ScheduleDAG *DAG) const {
5169 15844 : return new GCNHazardRecognizer(DAG->MF);
5170 : }
5171 :
5172 : /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
5173 : /// pass.
5174 : ScheduleHazardRecognizer *
5175 19910 : SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
5176 19910 : return new GCNHazardRecognizer(MF);
5177 : }
5178 :
5179 : std::pair<unsigned, unsigned>
5180 32 : SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
5181 32 : return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
5182 : }
5183 :
5184 : ArrayRef<std::pair<unsigned, const char *>>
5185 45 : SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
5186 : static const std::pair<unsigned, const char *> TargetFlags[] = {
5187 : { MO_GOTPCREL, "amdgpu-gotprel" },
5188 : { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
5189 : { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
5190 : { MO_REL32_LO, "amdgpu-rel32-lo" },
5191 : { MO_REL32_HI, "amdgpu-rel32-hi" }
5192 : };
5193 :
5194 45 : return makeArrayRef(TargetFlags);
5195 : }
5196 :
5197 17320 : bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
5198 31356 : return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
5199 14036 : MI.modifiesRegister(AMDGPU::EXEC, &RI);
5200 : }
5201 :
5202 : MachineInstrBuilder
5203 97 : SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
5204 : MachineBasicBlock::iterator I,
5205 : const DebugLoc &DL,
5206 : unsigned DestReg) const {
5207 97 : if (ST.hasAddNoCarry())
5208 78 : return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
5209 :
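 : // Without the no-carry form we must use V_ADD_I32_e64, which also defines a
 : // carry-out; create a throwaway carry register and hint it toward VCC.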
5210 58 : MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5211 58 : unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5212 58 : MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC);
5213 :
5214 116 : return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
5215 58 : .addReg(UnusedCarry, RegState::Define | RegState::Dead);
5216 : }
5217 :
5218 127 : bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
5219 127 : switch (Opcode) {
5220 : case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
5221 : case AMDGPU::SI_KILL_I1_TERMINATOR:
5222 : return true;
5223 122 : default:
5224 122 : return false;
5225 : }
5226 : }
5227 :
5228 84 : const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
5229 84 : switch (Opcode) {
5230 52 : case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5231 104 : return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
5232 32 : case AMDGPU::SI_KILL_I1_PSEUDO:
5233 64 : return get(AMDGPU::SI_KILL_I1_TERMINATOR);
5234 0 : default:
5235 0 : llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
5236 : }
5237 : }
5238 :
5239 13962 : bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
5240 13962 : if (!isSMRD(MI))
5241 : return false;
5242 :
5243 : // Check that it is using a buffer resource.
5244 13962 : int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
5245 13962 : if (Idx == -1) // e.g. s_memtime
5246 : return false;
5247 :
5248 13951 : const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
5249 13951 : return RCID == AMDGPU::SReg_128RegClassID;
5250 : }
5251 :
5252 : // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
5253 : enum SIEncodingFamily {
5254 : SI = 0,
5255 : VI = 1,
5256 : SDWA = 2,
5257 : SDWA9 = 3,
5258 : GFX80 = 4,
5259 : GFX9 = 5
5260 : };
5261 :
5262 : static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
5263 1292382 : switch (ST.getGeneration()) {
5264 : default:
5265 : break;
5266 : case AMDGPUSubtarget::SOUTHERN_ISLANDS:
5267 : case AMDGPUSubtarget::SEA_ISLANDS:
5268 : return SIEncodingFamily::SI;
5269 704185 : case AMDGPUSubtarget::VOLCANIC_ISLANDS:
5270 : case AMDGPUSubtarget::GFX9:
5271 : return SIEncodingFamily::VI;
5272 : }
5273 0 : llvm_unreachable("Unknown subtarget generation!");
5274 : }
5275 :
5276 1292382 : int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
5277 1292382 : SIEncodingFamily Gen = subtargetEncodingFamily(ST);
5278 :
5279 2584764 : if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
5280 96220 : ST.getGeneration() >= AMDGPUSubtarget::GFX9)
5281 : Gen = SIEncodingFamily::GFX9;
5282 :
5283 1292382 : if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
5284 5417 : Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
5285 : : SIEncodingFamily::SDWA;
5286 : // Adjust the encoding family to GFX80 for D16 buffer instructions when the
5287 : // subtarget has UnpackedD16VMem feature.
5288 : // subtarget has the UnpackedD16VMem feature.
5289 1292382 : if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
5290 : Gen = SIEncodingFamily::GFX80;
5291 :
5292 1292382 : int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
5293 :
5294 : // -1 means that Opcode is already a native instruction.
5295 1292382 : if (MCOp == -1)
5296 : return Opcode;
5297 :
5298 : // (uint16_t)-1 means that Opcode is a pseudo instruction that has
5299 : // no encoding in the given subtarget generation.
5300 1081655 : if (MCOp == (uint16_t)-1)
5301 23140 : return -1;
5302 :
5303 : return MCOp;
5304 : }
|