LLVM 23.0.0git
SIInstrInfo.cpp
Go to the documentation of this file.
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/MC/MCContext.h"
38
39using namespace llvm;
40
41#define DEBUG_TYPE "si-instr-info"
42
43#define GET_INSTRINFO_CTOR_DTOR
44#include "AMDGPUGenInstrInfo.inc"
45
46namespace llvm::AMDGPU {
47#define GET_D16ImageDimIntrinsics_IMPL
48#define GET_ImageDimIntrinsicTable_IMPL
49#define GET_RsrcIntrinsics_IMPL
50#include "AMDGPUGenSearchableTables.inc"
51} // namespace llvm::AMDGPU
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
65
67 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
68 AMDGPU::ADJCALLSTACKDOWN),
69 RI(ST), ST(ST) {
70 SchedModel.init(&ST);
71}
72
73//===----------------------------------------------------------------------===//
74// TargetInstrInfo callbacks
75//===----------------------------------------------------------------------===//
76
77static unsigned getNumOperandsNoGlue(SDNode *Node) {
78 unsigned N = Node->getNumOperands();
79 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
80 --N;
81 return N;
82}
83
84/// Returns true if both nodes have the same value for the given
85/// operand \p Op, or if both nodes do not have this operand.
87 AMDGPU::OpName OpName) {
88 unsigned Opc0 = N0->getMachineOpcode();
89 unsigned Opc1 = N1->getMachineOpcode();
90
91 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
92 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
93
94 if (Op0Idx == -1 && Op1Idx == -1)
95 return true;
96
97
98 if ((Op0Idx == -1 && Op1Idx != -1) ||
99 (Op1Idx == -1 && Op0Idx != -1))
100 return false;
101
102 // getNamedOperandIdx returns the index for the MachineInstr's operands,
103 // which includes the result as the first operand. We are indexing into the
104 // MachineSDNode's operands, so we need to skip the result operand to get
105 // the real index.
106 --Op0Idx;
107 --Op1Idx;
108
109 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
110}
111
112static bool canRemat(const MachineInstr &MI) {
113
117 return true;
118
119 if (SIInstrInfo::isSMRD(MI)) {
120 return !MI.memoperands_empty() &&
121 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
122 return MMO->isLoad() && MMO->isInvariant();
123 });
124 }
125
126 return false;
127}
128
130 const MachineInstr &MI) const {
131
132 if (canRemat(MI)) {
133 // Normally VALU use of exec would block the rematerialization, but that
134 // is OK in this case to have an implicit exec read as all VALU do.
135 // We really want all of the generic logic for this except for this.
136
137 // Another potential implicit use is mode register. The core logic of
138 // the RA will not attempt rematerialization if mode is set anywhere
139 // in the function, otherwise it is safe since mode is not changed.
140
141 // There is difference to generic method which does not allow
142 // rematerialization if there are virtual register uses. We allow this,
143 // therefore this method includes SOP instructions as well.
144 if (!MI.hasImplicitDef() &&
145 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
146 !MI.mayRaiseFPException())
147 return true;
148 }
149
151}
152
153// Returns true if the scalar result of a VALU instruction depends on exec.
154bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
155 // Ignore comparisons which are only used masked with exec.
156 // This allows some hoisting/sinking of VALU comparisons.
157 if (MI.isCompare()) {
158 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
159 if (!Dst)
160 return true;
161
162 Register DstReg = Dst->getReg();
163 if (!DstReg.isVirtual())
164 return true;
165
166 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
167 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
168 switch (Use.getOpcode()) {
169 case AMDGPU::S_AND_SAVEEXEC_B32:
170 case AMDGPU::S_AND_SAVEEXEC_B64:
171 break;
172 case AMDGPU::S_AND_B32:
173 case AMDGPU::S_AND_B64:
174 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
175 return true;
176 break;
177 default:
178 return true;
179 }
180 }
181 return false;
182 }
183
184 // If it is not convergent it does not depend on EXEC.
185 if (!MI.isConvergent())
186 return false;
187
188 switch (MI.getOpcode()) {
189 default:
190 break;
191 case AMDGPU::V_READFIRSTLANE_B32:
192 return true;
193 }
194
195 return false;
196}
197
199 // Any implicit use of exec by VALU is not a real register read.
200 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
201 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
202}
203
205 MachineBasicBlock *SuccToSinkTo,
206 MachineCycleInfo *CI) const {
207 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
208 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
209 return true;
210
211 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
212 // Check if sinking of MI would create temporal divergent use.
213 for (auto Op : MI.uses()) {
214 if (Op.isReg() && Op.getReg().isVirtual() &&
215 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
216 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
217
218 // SgprDef defined inside cycle
219 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
220 if (FromCycle == nullptr)
221 continue;
222
223 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
224 // Check if there is a FromCycle that contains SgprDef's basic block but
225 // does not contain SuccToSinkTo and also has divergent exit condition.
226 while (FromCycle && !FromCycle->contains(ToCycle)) {
228 FromCycle->getExitingBlocks(ExitingBlocks);
229
230 // FromCycle has divergent exit condition.
231 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
232 if (hasDivergentBranch(ExitingBlock))
233 return false;
234 }
235
236 FromCycle = FromCycle->getParentCycle();
237 }
238 }
239 }
240
241 return true;
242}
243
245 int64_t &Offset0,
246 int64_t &Offset1) const {
247 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
248 return false;
249
250 unsigned Opc0 = Load0->getMachineOpcode();
251 unsigned Opc1 = Load1->getMachineOpcode();
252
253 // Make sure both are actually loads.
254 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
255 return false;
256
257 // A mayLoad instruction without a def is not a load. Likely a prefetch.
258 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
259 return false;
260
261 if (isDS(Opc0) && isDS(Opc1)) {
262
263 // FIXME: Handle this case:
264 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
265 return false;
266
267 // Check base reg.
268 if (Load0->getOperand(0) != Load1->getOperand(0))
269 return false;
270
271 // Skip read2 / write2 variants for simplicity.
272 // TODO: We should report true if the used offsets are adjacent (excluded
273 // st64 versions).
274 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
275 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
276 if (Offset0Idx == -1 || Offset1Idx == -1)
277 return false;
278
279 // XXX - be careful of dataless loads
280 // getNamedOperandIdx returns the index for MachineInstrs. Since they
281 // include the output in the operand list, but SDNodes don't, we need to
282 // subtract the index by one.
283 Offset0Idx -= get(Opc0).NumDefs;
284 Offset1Idx -= get(Opc1).NumDefs;
285 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
286 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
287 return true;
288 }
289
290 if (isSMRD(Opc0) && isSMRD(Opc1)) {
291 // Skip time and cache invalidation instructions.
292 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
293 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
294 return false;
295
296 unsigned NumOps = getNumOperandsNoGlue(Load0);
297 if (NumOps != getNumOperandsNoGlue(Load1))
298 return false;
299
300 // Check base reg.
301 if (Load0->getOperand(0) != Load1->getOperand(0))
302 return false;
303
304 // Match register offsets, if both register and immediate offsets present.
305 assert(NumOps == 4 || NumOps == 5);
306 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
307 return false;
308
309 const ConstantSDNode *Load0Offset =
311 const ConstantSDNode *Load1Offset =
313
314 if (!Load0Offset || !Load1Offset)
315 return false;
316
317 Offset0 = Load0Offset->getZExtValue();
318 Offset1 = Load1Offset->getZExtValue();
319 return true;
320 }
321
322 // MUBUF and MTBUF can access the same addresses.
323 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
324
325 // MUBUF and MTBUF have vaddr at different indices.
326 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
327 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
328 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
329 return false;
330
331 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
332 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
333
334 if (OffIdx0 == -1 || OffIdx1 == -1)
335 return false;
336
337 // getNamedOperandIdx returns the index for MachineInstrs. Since they
338 // include the output in the operand list, but SDNodes don't, we need to
339 // subtract the index by one.
340 OffIdx0 -= get(Opc0).NumDefs;
341 OffIdx1 -= get(Opc1).NumDefs;
342
343 SDValue Off0 = Load0->getOperand(OffIdx0);
344 SDValue Off1 = Load1->getOperand(OffIdx1);
345
346 // The offset might be a FrameIndexSDNode.
347 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
348 return false;
349
350 Offset0 = Off0->getAsZExtVal();
351 Offset1 = Off1->getAsZExtVal();
352 return true;
353 }
354
355 return false;
356}
357
358static bool isStride64(unsigned Opc) {
359 switch (Opc) {
360 case AMDGPU::DS_READ2ST64_B32:
361 case AMDGPU::DS_READ2ST64_B64:
362 case AMDGPU::DS_WRITE2ST64_B32:
363 case AMDGPU::DS_WRITE2ST64_B64:
364 return true;
365 default:
366 return false;
367 }
368}
369
372 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
373 const TargetRegisterInfo *TRI) const {
374 if (!LdSt.mayLoadOrStore())
375 return false;
376
377 unsigned Opc = LdSt.getOpcode();
378 OffsetIsScalable = false;
379 const MachineOperand *BaseOp, *OffsetOp;
380 int DataOpIdx;
381
382 if (isDS(LdSt)) {
383 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
384 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
385 if (OffsetOp) {
386 // Normal, single offset LDS instruction.
387 if (!BaseOp) {
388 // DS_CONSUME/DS_APPEND use M0 for the base address.
389 // TODO: find the implicit use operand for M0 and use that as BaseOp?
390 return false;
391 }
392 BaseOps.push_back(BaseOp);
393 Offset = OffsetOp->getImm();
394 // Get appropriate operand, and compute width accordingly.
395 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
396 if (DataOpIdx == -1)
397 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
398 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
399 Width = LocationSize::precise(64);
400 else
401 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
402 } else {
403 // The 2 offset instructions use offset0 and offset1 instead. We can treat
404 // these as a load with a single offset if the 2 offsets are consecutive.
405 // We will use this for some partially aligned loads.
406 const MachineOperand *Offset0Op =
407 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
408 const MachineOperand *Offset1Op =
409 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
410
411 unsigned Offset0 = Offset0Op->getImm() & 0xff;
412 unsigned Offset1 = Offset1Op->getImm() & 0xff;
413 if (Offset0 + 1 != Offset1)
414 return false;
415
416 // Each of these offsets is in element sized units, so we need to convert
417 // to bytes of the individual reads.
418
419 unsigned EltSize;
420 if (LdSt.mayLoad())
421 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
422 else {
423 assert(LdSt.mayStore());
424 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
425 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
426 }
427
428 if (isStride64(Opc))
429 EltSize *= 64;
430
431 BaseOps.push_back(BaseOp);
432 Offset = EltSize * Offset0;
433 // Get appropriate operand(s), and compute width accordingly.
434 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
435 if (DataOpIdx == -1) {
436 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
437 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
438 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
439 Width = LocationSize::precise(
440 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
441 } else {
442 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
443 }
444 }
445 return true;
446 }
447
448 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
449 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
450 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
451 return false;
452 BaseOps.push_back(RSrc);
453 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
454 if (BaseOp && !BaseOp->isFI())
455 BaseOps.push_back(BaseOp);
456 const MachineOperand *OffsetImm =
457 getNamedOperand(LdSt, AMDGPU::OpName::offset);
458 Offset = OffsetImm->getImm();
459 const MachineOperand *SOffset =
460 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
461 if (SOffset) {
462 if (SOffset->isReg())
463 BaseOps.push_back(SOffset);
464 else
465 Offset += SOffset->getImm();
466 }
467 // Get appropriate operand, and compute width accordingly.
468 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
469 if (DataOpIdx == -1)
470 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
471 if (DataOpIdx == -1) // LDS DMA
472 return false;
473 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
474 return true;
475 }
476
477 if (isImage(LdSt)) {
478 auto RsrcOpName =
479 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
480 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
481 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
482 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
483 if (VAddr0Idx >= 0) {
484 // GFX10 possible NSA encoding.
485 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
486 BaseOps.push_back(&LdSt.getOperand(I));
487 } else {
488 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
489 }
490 Offset = 0;
491 // Get appropriate operand, and compute width accordingly.
492 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
493 if (DataOpIdx == -1)
494 return false; // no return sampler
495 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
496 return true;
497 }
498
499 if (isSMRD(LdSt)) {
500 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
501 if (!BaseOp) // e.g. S_MEMTIME
502 return false;
503 BaseOps.push_back(BaseOp);
504 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
505 Offset = OffsetOp ? OffsetOp->getImm() : 0;
506 // Get appropriate operand, and compute width accordingly.
507 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
508 if (DataOpIdx == -1)
509 return false;
510 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
511 return true;
512 }
513
514 if (isFLAT(LdSt)) {
515 // Instructions have either vaddr or saddr or both or none.
516 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
517 if (BaseOp)
518 BaseOps.push_back(BaseOp);
519 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
520 if (BaseOp)
521 BaseOps.push_back(BaseOp);
522 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
523 // Get appropriate operand, and compute width accordingly.
524 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
525 if (DataOpIdx == -1)
526 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
527 if (DataOpIdx == -1) // LDS DMA
528 return false;
529 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
530 return true;
531 }
532
533 return false;
534}
535
536static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
538 const MachineInstr &MI2,
540 // Only examine the first "base" operand of each instruction, on the
541 // assumption that it represents the real base address of the memory access.
542 // Other operands are typically offsets or indices from this base address.
543 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
544 return true;
545
546 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
547 return false;
548
549 auto *MO1 = *MI1.memoperands_begin();
550 auto *MO2 = *MI2.memoperands_begin();
551 if (MO1->getAddrSpace() != MO2->getAddrSpace())
552 return false;
553
554 const auto *Base1 = MO1->getValue();
555 const auto *Base2 = MO2->getValue();
556 if (!Base1 || !Base2)
557 return false;
558 Base1 = getUnderlyingObject(Base1);
559 Base2 = getUnderlyingObject(Base2);
560
561 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
562 return false;
563
564 return Base1 == Base2;
565}
566
568 int64_t Offset1, bool OffsetIsScalable1,
570 int64_t Offset2, bool OffsetIsScalable2,
571 unsigned ClusterSize,
572 unsigned NumBytes) const {
573 // If the mem ops (to be clustered) do not have the same base ptr, then they
574 // should not be clustered
575 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
576 if (!BaseOps1.empty() && !BaseOps2.empty()) {
577 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
578 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
579 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
580 return false;
581
582 const SIMachineFunctionInfo *MFI =
583 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
584 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
585 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
586 // If only one base op is empty, they do not have the same base ptr
587 return false;
588 }
589
590 // In order to avoid register pressure, on an average, the number of DWORDS
591 // loaded together by all clustered mem ops should not exceed
592 // MaxMemoryClusterDWords. This is an empirical value based on certain
593 // observations and performance related experiments.
594 // The good thing about this heuristic is - it avoids clustering of too many
595 // sub-word loads, and also avoids clustering of wide loads. Below is the
596 // brief summary of how the heuristic behaves for various `LoadSize` when
597 // MaxMemoryClusterDWords is 8.
598 //
599 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
600 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
601 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
602 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
603 // (5) LoadSize >= 17: do not cluster
604 const unsigned LoadSize = NumBytes / ClusterSize;
605 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
606 return NumDWords <= MaxMemoryClusterDWords;
607}
608
609// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
610// the first 16 loads will be interleaved with the stores, and the next 16 will
611// be clustered as expected. It should really split into 2 16 store batches.
612//
613// Loads are clustered until this returns false, rather than trying to schedule
614// groups of stores. This also means we have to deal with saying different
615// address space loads should be clustered, and ones which might cause bank
616// conflicts.
617//
618// This might be deprecated so it might not be worth that much effort to fix.
620 int64_t Offset0, int64_t Offset1,
621 unsigned NumLoads) const {
622 assert(Offset1 > Offset0 &&
623 "Second offset should be larger than first offset!");
624 // If we have less than 16 loads in a row, and the offsets are within 64
625 // bytes, then schedule together.
626
627 // A cacheline is 64 bytes (for global memory).
628 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
629}
630
633 const DebugLoc &DL, MCRegister DestReg,
634 MCRegister SrcReg, bool KillSrc,
635 const char *Msg = "illegal VGPR to SGPR copy") {
636 MachineFunction *MF = MBB.getParent();
637
639 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
640
641 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
642 .addReg(SrcReg, getKillRegState(KillSrc));
643}
644
645/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
646/// possible to have a direct copy in these cases on GFX908, so an intermediate
647/// VGPR copy is required.
651 const DebugLoc &DL, MCRegister DestReg,
652 MCRegister SrcReg, bool KillSrc,
653 RegScavenger &RS, bool RegsOverlap,
654 Register ImpDefSuperReg = Register(),
655 Register ImpUseSuperReg = Register()) {
656 assert((TII.getSubtarget().hasMAIInsts() &&
657 !TII.getSubtarget().hasGFX90AInsts()) &&
658 "Expected GFX908 subtarget.");
659
660 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
661 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
662 "Source register of the copy should be either an SGPR or an AGPR.");
663
664 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
665 "Destination register of the copy should be an AGPR.");
666
667 const SIRegisterInfo &RI = TII.getRegisterInfo();
668
669 // First try to find defining accvgpr_write to avoid temporary registers.
670 // In the case of copies of overlapping AGPRs, we conservatively do not
671 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
672 // an accvgpr_write used for this same copy due to implicit-defs
673 if (!RegsOverlap) {
674 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
675 --Def;
676
677 if (!Def->modifiesRegister(SrcReg, &RI))
678 continue;
679
680 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
681 Def->getOperand(0).getReg() != SrcReg)
682 break;
683
684 MachineOperand &DefOp = Def->getOperand(1);
685 assert(DefOp.isReg() || DefOp.isImm());
686
687 if (DefOp.isReg()) {
688 bool SafeToPropagate = true;
689 // Check that register source operand is not clobbered before MI.
690 // Immediate operands are always safe to propagate.
691 for (auto I = Def; I != MI && SafeToPropagate; ++I)
692 if (I->modifiesRegister(DefOp.getReg(), &RI))
693 SafeToPropagate = false;
694
695 if (!SafeToPropagate)
696 break;
697
698 for (auto I = Def; I != MI; ++I)
699 I->clearRegisterKills(DefOp.getReg(), &RI);
700 }
701
702 MachineInstrBuilder Builder =
703 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
704 .add(DefOp);
705 if (ImpDefSuperReg)
706 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
707
708 if (ImpUseSuperReg) {
709 Builder.addReg(ImpUseSuperReg,
711 }
712
713 return;
714 }
715 }
716
717 RS.enterBasicBlockEnd(MBB);
718 RS.backward(std::next(MI));
719
720 // Ideally we want to have three registers for a long reg_sequence copy
721 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
722 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
723 *MBB.getParent());
724
725 // Registers in the sequence are allocated contiguously so we can just
726 // use register number to pick one of three round-robin temps.
727 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
728 Register Tmp =
729 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
730 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
731 "VGPR used for an intermediate copy should have been reserved.");
732
733 // Only loop through if there are any free registers left. We don't want to
734 // spill.
735 while (RegNo--) {
736 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
737 /* RestoreAfter */ false, 0,
738 /* AllowSpill */ false);
739 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
740 break;
741 Tmp = Tmp2;
742 RS.setRegUsed(Tmp);
743 }
744
745 // Insert copy to temporary VGPR.
746 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
747 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
748 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
749 } else {
750 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
751 }
752
753 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
754 .addReg(SrcReg, getKillRegState(KillSrc));
755 if (ImpUseSuperReg) {
756 UseBuilder.addReg(ImpUseSuperReg,
758 }
759
760 MachineInstrBuilder DefBuilder
761 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
762 .addReg(Tmp, RegState::Kill);
763
764 if (ImpDefSuperReg)
765 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
766}
767
770 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
771 const TargetRegisterClass *RC, bool Forward) {
772 const SIRegisterInfo &RI = TII.getRegisterInfo();
773 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
775 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
776
777 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
778 int16_t SubIdx = BaseIndices[Idx];
779 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
780 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
781 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
782 unsigned Opcode = AMDGPU::S_MOV_B32;
783
784 // Is SGPR aligned? If so try to combine with next.
785 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
786 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
787 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
788 // Can use SGPR64 copy
789 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
790 SubIdx = RI.getSubRegFromChannel(Channel, 2);
791 DestSubReg = RI.getSubReg(DestReg, SubIdx);
792 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
793 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
794 Opcode = AMDGPU::S_MOV_B64;
795 Idx++;
796 }
797
798 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
799 .addReg(SrcSubReg)
800 .addReg(SrcReg, RegState::Implicit);
801
802 if (!FirstMI)
803 FirstMI = LastMI;
804
805 if (!Forward)
806 I--;
807 }
808
809 assert(FirstMI && LastMI);
810 if (!Forward)
811 std::swap(FirstMI, LastMI);
812
813 FirstMI->addOperand(
814 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
815
816 if (KillSrc)
817 LastMI->addRegisterKilled(SrcReg, &RI);
818}
819
822 const DebugLoc &DL, Register DestReg,
823 Register SrcReg, bool KillSrc, bool RenamableDest,
824 bool RenamableSrc) const {
825 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
826 unsigned Size = RI.getRegSizeInBits(*RC);
827 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
828 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
829
830 // The rest of copyPhysReg assumes Src and Dst size are the same size.
831 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
832 // we remove Fix16BitCopies and this code block?
833 if (Fix16BitCopies) {
834 if (((Size == 16) != (SrcSize == 16))) {
835 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
836 assert(ST.useRealTrue16Insts());
837 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
838 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
839 RegToFix = SubReg;
840
841 if (DestReg == SrcReg) {
842 // Identity copy. Insert empty bundle since ExpandPostRA expects an
843 // instruction here.
844 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
845 return;
846 }
847 RC = RI.getPhysRegBaseClass(DestReg);
848 Size = RI.getRegSizeInBits(*RC);
849 SrcRC = RI.getPhysRegBaseClass(SrcReg);
850 SrcSize = RI.getRegSizeInBits(*SrcRC);
851 }
852 }
853
854 if (RC == &AMDGPU::VGPR_32RegClass) {
855 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
856 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
857 AMDGPU::AGPR_32RegClass.contains(SrcReg));
858 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
859 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
860 BuildMI(MBB, MI, DL, get(Opc), DestReg)
861 .addReg(SrcReg, getKillRegState(KillSrc));
862 return;
863 }
864
865 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
866 RC == &AMDGPU::SReg_32RegClass) {
867 if (SrcReg == AMDGPU::SCC) {
868 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
869 .addImm(1)
870 .addImm(0);
871 return;
872 }
873
874 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
875 if (DestReg == AMDGPU::VCC_LO) {
876 // FIXME: Hack until VReg_1 removed.
877 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
878 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
879 .addImm(0)
880 .addReg(SrcReg, getKillRegState(KillSrc));
881 return;
882 }
883
884 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
885 return;
886 }
887
888 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
889 .addReg(SrcReg, getKillRegState(KillSrc));
890 return;
891 }
892
893 if (RC == &AMDGPU::SReg_64RegClass) {
894 if (SrcReg == AMDGPU::SCC) {
895 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
896 .addImm(1)
897 .addImm(0);
898 return;
899 }
900
901 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
902 if (DestReg == AMDGPU::VCC) {
903 // FIXME: Hack until VReg_1 removed.
904 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
905 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
906 .addImm(0)
907 .addReg(SrcReg, getKillRegState(KillSrc));
908 return;
909 }
910
911 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
912 return;
913 }
914
915 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
916 .addReg(SrcReg, getKillRegState(KillSrc));
917 return;
918 }
919
920 if (DestReg == AMDGPU::SCC) {
921 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
922 // but SelectionDAG emits such copies for i1 sources.
923 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
924 // This copy can only be produced by patterns
925 // with explicit SCC, which are known to be enabled
926 // only for subtargets with S_CMP_LG_U64 present.
927 assert(ST.hasScalarCompareEq64());
928 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
929 .addReg(SrcReg, getKillRegState(KillSrc))
930 .addImm(0);
931 } else {
932 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
933 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
934 .addReg(SrcReg, getKillRegState(KillSrc))
935 .addImm(0);
936 }
937
938 return;
939 }
940
941 if (RC == &AMDGPU::AGPR_32RegClass) {
942 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
943 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
944 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
945 .addReg(SrcReg, getKillRegState(KillSrc));
946 return;
947 }
948
949 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
950 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
951 .addReg(SrcReg, getKillRegState(KillSrc));
952 return;
953 }
954
955 // FIXME: Pass should maintain scavenger to avoid scan through the block on
956 // every AGPR spill.
957 RegScavenger RS;
958 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
959 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
960 return;
961 }
962
963 if (Size == 16) {
964 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
965 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
966 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
967
968 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
969 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
970 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
971 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
972 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
973 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
974 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
975 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
976
977 if (IsSGPRDst) {
978 if (!IsSGPRSrc) {
979 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
980 return;
981 }
982
983 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
984 .addReg(NewSrcReg, getKillRegState(KillSrc));
985 return;
986 }
987
988 if (IsAGPRDst || IsAGPRSrc) {
989 if (!DstLow || !SrcLow) {
990 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
991 "Cannot use hi16 subreg with an AGPR!");
992 }
993
994 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
995 return;
996 }
997
998 if (ST.useRealTrue16Insts()) {
999 if (IsSGPRSrc) {
1000 assert(SrcLow);
1001 SrcReg = NewSrcReg;
1002 }
1003 // Use the smaller instruction encoding if possible.
1004 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1005 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1006 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1007 .addReg(SrcReg);
1008 } else {
1009 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1010 .addImm(0) // src0_modifiers
1011 .addReg(SrcReg)
1012 .addImm(0); // op_sel
1013 }
1014 return;
1015 }
1016
1017 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1018 if (!DstLow || !SrcLow) {
1019 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1020 "Cannot use hi16 subreg on VI!");
1021 }
1022
1023 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1024 .addReg(NewSrcReg, getKillRegState(KillSrc));
1025 return;
1026 }
1027
1028 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1029 .addImm(0) // src0_modifiers
1030 .addReg(NewSrcReg)
1031 .addImm(0) // clamp
1038 // First implicit operand is $exec.
1039 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1040 return;
1041 }
1042
1043 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1044 if (ST.hasMovB64()) {
1045 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1046 .addReg(SrcReg, getKillRegState(KillSrc));
1047 return;
1048 }
1049 if (ST.hasPkMovB32()) {
1050 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1052 .addReg(SrcReg)
1054 .addReg(SrcReg)
1055 .addImm(0) // op_sel_lo
1056 .addImm(0) // op_sel_hi
1057 .addImm(0) // neg_lo
1058 .addImm(0) // neg_hi
1059 .addImm(0) // clamp
1060 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1061 return;
1062 }
1063 }
1064
1065 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1066 if (RI.isSGPRClass(RC)) {
1067 if (!RI.isSGPRClass(SrcRC)) {
1068 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1069 return;
1070 }
1071 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1072 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1073 Forward);
1074 return;
1075 }
1076
1077 unsigned EltSize = 4;
1078 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1079 if (RI.isAGPRClass(RC)) {
1080 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1081 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1082 else if (RI.hasVGPRs(SrcRC) ||
1083 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1084 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1085 else
1086 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1087 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1088 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1089 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1090 (RI.isProperlyAlignedRC(*RC) &&
1091 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1092 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1093 if (ST.hasMovB64()) {
1094 Opcode = AMDGPU::V_MOV_B64_e32;
1095 EltSize = 8;
1096 } else if (ST.hasPkMovB32()) {
1097 Opcode = AMDGPU::V_PK_MOV_B32;
1098 EltSize = 8;
1099 }
1100 }
1101
1102 // For the cases where we need an intermediate instruction/temporary register
1103 // (destination is an AGPR), we need a scavenger.
1104 //
1105 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1106 // whole block for every handled copy.
1107 std::unique_ptr<RegScavenger> RS;
1108 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1109 RS = std::make_unique<RegScavenger>();
1110
1111 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1112
1113 // If there is an overlap, we can't kill the super-register on the last
1114 // instruction, since it will also kill the components made live by this def.
1115 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1116 const bool CanKillSuperReg = KillSrc && !Overlap;
1117
1118 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1119 unsigned SubIdx;
1120 if (Forward)
1121 SubIdx = SubIndices[Idx];
1122 else
1123 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1124 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1125 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1126 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1127
1128 bool IsFirstSubreg = Idx == 0;
1129 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1130
1131 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1132 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1133 Register ImpUseSuper = SrcReg;
1134 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1135 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1136 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1138 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1140 .addReg(SrcSubReg)
1142 .addReg(SrcSubReg)
1143 .addImm(0) // op_sel_lo
1144 .addImm(0) // op_sel_hi
1145 .addImm(0) // neg_lo
1146 .addImm(0) // neg_hi
1147 .addImm(0) // clamp
1148 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1149 if (IsFirstSubreg)
1151 } else {
1152 MachineInstrBuilder Builder =
1153 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1154 if (IsFirstSubreg)
1155 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1156
1157 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1158 }
1159 }
1160}
1161
1162int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1163 int32_t NewOpc;
1164
1165 // Try to map original to commuted opcode
1166 NewOpc = AMDGPU::getCommuteRev(Opcode);
1167 if (NewOpc != -1)
1168 // Check if the commuted (REV) opcode exists on the target.
1169 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1170
1171 // Try to map commuted to original opcode
1172 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1173 if (NewOpc != -1)
1174 // Check if the original (non-REV) opcode exists on the target.
1175 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1176
1177 return Opcode;
1178}
1179
// Preferred register class for select results: always a plain 32-bit VGPR.
// NOTE(review): the declarator line naming this function was dropped by the
// extraction (upstream this appears to be
// SIInstrInfo::getPreferredSelectRegClass) -- confirm against full source.
const TargetRegisterClass *
 return &AMDGPU::VGPR_32RegClass;
}
1184
// Emit a per-lane vector select: DstReg = Cond ? TrueReg : FalseReg, using
// V_CNDMASK_B32_e64 with a lane mask derived from the branch condition
// encoded in Cond (SCC_TRUE/SCC_FALSE/VCCNZ/VCCZ/EXECNZ/EXECZ predicates).
// NOTE(review): the extraction dropped several lines here (the declarator
// naming the function, the ArrayRef<MachineOperand> Cond parameter line,
// and the declaration of LMC) -- confirm against the full source.
 1187                                       const DebugLoc &DL, Register DstReg,
 1189                                       Register TrueReg,
 1190                                       Register FalseReg) const {
 1191  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 1192  const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
 1194  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
 1195         "Not a VGPR32 reg");
 1196
  // Single-operand condition: the operand is already a lane mask; copy it
  // into a fresh wave-mask register and select on it directly.
 1197  if (Cond.size() == 1) {
 1198    Register SReg = MRI.createVirtualRegister(BoolXExecRC);
 1199    BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
 1200      .add(Cond[0]);
 1201    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
 1202      .addImm(0)
 1203      .addReg(FalseReg)
 1204      .addImm(0)
 1205      .addReg(TrueReg)
 1206      .addReg(SReg);
 1207  } else if (Cond.size() == 2) {
 1208    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
 1209    switch (Cond[0].getImm()) {
    // SCC_TRUE: materialize an all-lanes mask from SCC via conditional
    // select (1 when SCC set, 0 otherwise), then select on it.
 1210    case SIInstrInfo::SCC_TRUE: {
 1211      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
 1212      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
 1213      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
 1214        .addImm(0)
 1215        .addReg(FalseReg)
 1216        .addImm(0)
 1217        .addReg(TrueReg)
 1218        .addReg(SReg);
 1219      break;
 1220    }
    // SCC_FALSE: same as SCC_TRUE with the select immediates inverted.
 1221    case SIInstrInfo::SCC_FALSE: {
 1222      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
 1223      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
 1224      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
 1225        .addImm(0)
 1226        .addReg(FalseReg)
 1227        .addImm(0)
 1228        .addReg(TrueReg)
 1229        .addReg(SReg);
 1230      break;
 1231    }
    // VCCNZ: the condition register already is the lane mask; strip the
    // implicit flag and copy it off.
 1232    case SIInstrInfo::VCCNZ: {
 1233      MachineOperand RegOp = Cond[1];
 1234      RegOp.setImplicit(false);
 1235      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
 1236      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
 1237        .add(RegOp);
 1238      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
 1239        .addImm(0)
 1240        .addReg(FalseReg)
 1241        .addImm(0)
 1242        .addReg(TrueReg)
 1243        .addReg(SReg);
 1244      break;
 1245    }
    // VCCZ: identical to VCCNZ but with True/False operand positions
    // swapped, which inverts the select sense.
 1246    case SIInstrInfo::VCCZ: {
 1247      MachineOperand RegOp = Cond[1];
 1248      RegOp.setImplicit(false);
 1249      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
 1250      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
 1251        .add(RegOp);
 1252      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
 1253        .addImm(0)
 1254        .addReg(TrueReg)
 1255        .addImm(0)
 1256        .addReg(FalseReg)
 1257        .addReg(SReg);
 1258      break;
 1259    }
    // EXECNZ: or-save-exec with 0 preserves exec and (per ISA) sets SCC
    // from the result -- presumably SCC = (exec != 0); then select on SCC.
    // TODO confirm against the ISA manual.
 1260    case SIInstrInfo::EXECNZ: {
 1261      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
 1262      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
 1263      BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
 1264      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
 1265      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
 1266        .addImm(0)
 1267        .addReg(FalseReg)
 1268        .addImm(0)
 1269        .addReg(TrueReg)
 1270        .addReg(SReg);
 1271      break;
 1272    }
    // EXECZ: the built sequence is dead code -- the llvm_unreachable below
    // fires before it could matter; this predicate is not expected here.
 1273    case SIInstrInfo::EXECZ: {
 1274      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
 1275      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
 1276      BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
 1277      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
 1278      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
 1279        .addImm(0)
 1280        .addReg(FalseReg)
 1281        .addImm(0)
 1282        .addReg(TrueReg)
 1283        .addReg(SReg);
 1284      llvm_unreachable("Unhandled branch predicate EXECZ");
 1285      break;
 1286    }
 1287    default:
 1288      llvm_unreachable("invalid branch predicate");
 1289    }
 1290  } else {
 1291    llvm_unreachable("Can only handle Cond size 1 or 2");
 1292  }
 1293}
1294
// Emit V_CMP_EQ_I32 comparing SrcReg against the immediate Value; returns a
// fresh bool-RC (lane mask) virtual register holding the per-lane result.
// NOTE(review): the declarator line naming this function was dropped by the
// extraction (upstream: SIInstrInfo::insertEQ) -- confirm.
 1297                               const DebugLoc &DL,
 1298                               Register SrcReg, int Value) const {
 1299  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
 1300  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  // VOP3 compare: immediate in src0, register in src1.
 1301  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
 1302    .addImm(Value)
 1303    .addReg(SrcReg);
 1304
 1305  return Reg;
 1306}
1307
// Emit V_CMP_NE_I32 comparing SrcReg against the immediate Value; returns a
// fresh bool-RC (lane mask) virtual register holding the per-lane result.
// NOTE(review): the declarator line naming this function was dropped by the
// extraction (upstream: SIInstrInfo::insertNE) -- confirm.
 1310                               const DebugLoc &DL,
 1311                               Register SrcReg, int Value) const {
 1312  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
 1313  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  // VOP3 compare: immediate in src0, register in src1.
 1314  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
 1315    .addImm(Value)
 1316    .addReg(SrcReg);
 1317
 1318  return Reg;
 1319}
1320
// Returns true if MI materializes a compile-time constant into Reg, storing
// the constant in ImmVal. Handles plain immediate moves plus bit-reverse and
// bitwise-not instructions whose source is an immediate (folded at 32 bits).
// NOTE(review): the declarator line naming this function was dropped by the
// extraction (upstream: SIInstrInfo::getConstValDefinedInReg) -- confirm.
 1322                                          const Register Reg,
 1323                                          int64_t &ImmVal) const {
 1324  switch (MI.getOpcode()) {
  // Plain mov-immediate forms: dst is operand 0, src is operand 1.
 1325  case AMDGPU::V_MOV_B32_e32:
 1326  case AMDGPU::S_MOV_B32:
 1327  case AMDGPU::S_MOVK_I32:
 1328  case AMDGPU::S_MOV_B64:
 1329  case AMDGPU::V_MOV_B64_e32:
 1330  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
 1331  case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
 1332  case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
 1333  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
 1334  case AMDGPU::V_MOV_B64_PSEUDO:
 1335  case AMDGPU::V_MOV_B16_t16_e32: {
 1336    const MachineOperand &Src0 = MI.getOperand(1);
 1337    if (Src0.isImm()) {
 1338      ImmVal = Src0.getImm();
      // Only a match if the def actually targets the register we care about.
 1339      return MI.getOperand(0).getReg() == Reg;
 1340    }
 1341
 1342    return false;
 1343  }
  // VOP3 true16 mov: src0 is operand 2; operand 1 is src0_modifiers and
  // must be zero for the value to be the plain immediate.
 1344  case AMDGPU::V_MOV_B16_t16_e64: {
 1345    const MachineOperand &Src0 = MI.getOperand(2);
 1346    if (Src0.isImm() && !MI.getOperand(1).getImm()) {
 1347      ImmVal = Src0.getImm();
 1348      return MI.getOperand(0).getReg() == Reg;
 1349    }
 1350
 1351    return false;
 1352  }
  // Bit-reverse of an immediate: fold the reversal into the constant.
 1353  case AMDGPU::S_BREV_B32:
 1354  case AMDGPU::V_BFREV_B32_e32:
 1355  case AMDGPU::V_BFREV_B32_e64: {
 1356    const MachineOperand &Src0 = MI.getOperand(1);
 1357    if (Src0.isImm()) {
 1358      ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
 1359      return MI.getOperand(0).getReg() == Reg;
 1360    }
 1361
 1362    return false;
 1363  }
  // Bitwise-not of an immediate: fold the complement at 32-bit width.
 1364  case AMDGPU::S_NOT_B32:
 1365  case AMDGPU::V_NOT_B32_e32:
 1366  case AMDGPU::V_NOT_B32_e64: {
 1367    const MachineOperand &Src0 = MI.getOperand(1);
 1368    if (Src0.isImm()) {
 1369      ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
 1370      return MI.getOperand(0).getReg() == Reg;
 1371    }
 1372
 1373    return false;
 1374  }
 1375  default:
 1376    return false;
 1377  }
 1378}
1379
// If Op is an immediate operand, or a virtual register whose defining
// instruction is a move-immediate, return the constant (adjusted for any
// subregister use); otherwise return std::nullopt.
// NOTE(review): the declarator line naming this function was dropped by the
// extraction -- confirm the name against the full source.
1380std::optional<int64_t>
 1382  if (Op.isImm())
 1383    return Op.getImm();
 1384
  // Only virtual registers have a single, queryable def here.
 1385  if (!Op.isReg() || !Op.getReg().isVirtual())
 1386    return std::nullopt;
 1387  MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
 1388  const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
 1389  if (Def && Def->isMoveImmediate()) {
 1390    const MachineOperand &ImmSrc = Def->getOperand(1);
 1391    if (ImmSrc.isImm())
      // Account for Op reading only a subregister of the materialized value.
 1392      return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
 1393  }
 1394
 1395  return std::nullopt;
 1396}
1397
// Pick the move opcode for a simple reg-to-reg copy into DstRC, keyed on the
// destination class's bank (SGPR vs. VGPR) and bit width. AGPR destinations
// and unhandled widths fall back to a generic COPY.
// NOTE(review): the declarator line naming this function was dropped by the
// extraction (upstream: SIInstrInfo::getMovOpcode) -- confirm.
1399
 1400  if (RI.isAGPRClass(DstRC))
 1401    return AMDGPU::COPY;
 1402  if (RI.getRegSizeInBits(*DstRC) == 16) {
 1403    // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
 1404    // before RA.
 1405    return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
 1406  }
 1407  if (RI.getRegSizeInBits(*DstRC) == 32)
 1408    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
 1409  if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
 1410    return AMDGPU::S_MOV_B64;
  // 64-bit vector destination: use the pseudo, expanded later.
 1411  if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
 1412    return AMDGPU::V_MOV_B64_PSEUDO;
 1413  return AMDGPU::COPY;
 1414}
1415
// Select the V_INDIRECT_REG_{READ,WRITE}_GPR_IDX pseudo wide enough for a
// vector of VecSize bits; each _Vn variant covers n x 32-bit lanes (byte
// sizes noted inline). Sizes above 1024 bits are unsupported.
// NOTE(review): the declarator line naming this function was dropped by the
// extraction (upstream: SIInstrInfo::getIndirectGPRIDXPseudo) -- confirm.
1416const MCInstrDesc &
 1418                                       bool IsIndirectSrc) const {
  // Indirect-source (read) variants.
 1419  if (IsIndirectSrc) {
 1420    if (VecSize <= 32) // 4 bytes
 1421      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
 1422    if (VecSize <= 64) // 8 bytes
 1423      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
 1424    if (VecSize <= 96) // 12 bytes
 1425      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
 1426    if (VecSize <= 128) // 16 bytes
 1427      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
 1428    if (VecSize <= 160) // 20 bytes
 1429      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
 1430    if (VecSize <= 192) // 24 bytes
 1431      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
 1432    if (VecSize <= 224) // 28 bytes
 1433      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
 1434    if (VecSize <= 256) // 32 bytes
 1435      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
 1436    if (VecSize <= 288) // 36 bytes
 1437      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
 1438    if (VecSize <= 320) // 40 bytes
 1439      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
 1440    if (VecSize <= 352) // 44 bytes
 1441      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
 1442    if (VecSize <= 384) // 48 bytes
 1443      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
 1444    if (VecSize <= 512) // 64 bytes
 1445      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
 1446    if (VecSize <= 1024) // 128 bytes
 1447      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
 1448
 1449    llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
 1450  }
 1451
  // Indirect-destination (write) variants.
 1452  if (VecSize <= 32) // 4 bytes
 1453    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
 1454  if (VecSize <= 64) // 8 bytes
 1455    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
 1456  if (VecSize <= 96) // 12 bytes
 1457    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
 1458  if (VecSize <= 128) // 16 bytes
 1459    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
 1460  if (VecSize <= 160) // 20 bytes
 1461    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
 1462  if (VecSize <= 192) // 24 bytes
 1463    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
 1464  if (VecSize <= 224) // 28 bytes
 1465    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
 1466  if (VecSize <= 256) // 32 bytes
 1467    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
 1468  if (VecSize <= 288) // 36 bytes
 1469    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
 1470  if (VecSize <= 320) // 40 bytes
 1471    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
 1472  if (VecSize <= 352) // 44 bytes
 1473    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
 1474  if (VecSize <= 384) // 48 bytes
 1475    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
 1476  if (VecSize <= 512) // 64 bytes
 1477    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
 1478  if (VecSize <= 1024) // 128 bytes
 1479    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
 1480
 1481  llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
 1482}
1483
1484static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1485 if (VecSize <= 32) // 4 bytes
1486 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1487 if (VecSize <= 64) // 8 bytes
1488 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1489 if (VecSize <= 96) // 12 bytes
1490 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1491 if (VecSize <= 128) // 16 bytes
1492 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1493 if (VecSize <= 160) // 20 bytes
1494 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1495 if (VecSize <= 192) // 24 bytes
1496 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1497 if (VecSize <= 224) // 28 bytes
1498 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1499 if (VecSize <= 256) // 32 bytes
1500 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1501 if (VecSize <= 288) // 36 bytes
1502 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1503 if (VecSize <= 320) // 40 bytes
1504 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1505 if (VecSize <= 352) // 44 bytes
1506 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1507 if (VecSize <= 384) // 48 bytes
1508 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1509 if (VecSize <= 512) // 64 bytes
1510 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1511 if (VecSize <= 1024) // 128 bytes
1512 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1513
1514 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1515}
1516
1517static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1518 if (VecSize <= 32) // 4 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1520 if (VecSize <= 64) // 8 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1522 if (VecSize <= 96) // 12 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1524 if (VecSize <= 128) // 16 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1526 if (VecSize <= 160) // 20 bytes
1527 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1528 if (VecSize <= 192) // 24 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1530 if (VecSize <= 224) // 28 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1532 if (VecSize <= 256) // 32 bytes
1533 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1534 if (VecSize <= 288) // 36 bytes
1535 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1536 if (VecSize <= 320) // 40 bytes
1537 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1538 if (VecSize <= 352) // 44 bytes
1539 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1540 if (VecSize <= 384) // 48 bytes
1541 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1542 if (VecSize <= 512) // 64 bytes
1543 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1544 if (VecSize <= 1024) // 128 bytes
1545 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1546
1547 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1548}
1549
1550static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1551 if (VecSize <= 64) // 8 bytes
1552 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1553 if (VecSize <= 128) // 16 bytes
1554 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1555 if (VecSize <= 256) // 32 bytes
1556 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1557 if (VecSize <= 512) // 64 bytes
1558 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1559 if (VecSize <= 1024) // 128 bytes
1560 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1561
1562 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1563}
1564
// Select the MOVREL-style indirect-write pseudo for the given vector size
// and element size: SGPR writes support 32- and 64-bit elements; the VGPR
// path requires 32-bit elements.
// NOTE(review): the extraction dropped the final return statement (the VGPR
// path, upstream line 1580) -- the visible code is incomplete here.
1565const MCInstrDesc &
1566SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
 1567                                             bool IsSGPR) const {
 1568  if (IsSGPR) {
 1569    switch (EltSize) {
 1570    case 32:
 1571      return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
 1572    case 64:
 1573      return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
 1574    default:
 1575      llvm_unreachable("invalid reg indexing elt size");
 1576    }
 1577  }
 1578
 1579  assert(EltSize == 32 && "invalid reg indexing elt size");
 1581}
1582
1583static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1584 switch (Size) {
1585 case 4:
1586 return AMDGPU::SI_SPILL_S32_SAVE;
1587 case 8:
1588 return AMDGPU::SI_SPILL_S64_SAVE;
1589 case 12:
1590 return AMDGPU::SI_SPILL_S96_SAVE;
1591 case 16:
1592 return AMDGPU::SI_SPILL_S128_SAVE;
1593 case 20:
1594 return AMDGPU::SI_SPILL_S160_SAVE;
1595 case 24:
1596 return AMDGPU::SI_SPILL_S192_SAVE;
1597 case 28:
1598 return AMDGPU::SI_SPILL_S224_SAVE;
1599 case 32:
1600 return AMDGPU::SI_SPILL_S256_SAVE;
1601 case 36:
1602 return AMDGPU::SI_SPILL_S288_SAVE;
1603 case 40:
1604 return AMDGPU::SI_SPILL_S320_SAVE;
1605 case 44:
1606 return AMDGPU::SI_SPILL_S352_SAVE;
1607 case 48:
1608 return AMDGPU::SI_SPILL_S384_SAVE;
1609 case 64:
1610 return AMDGPU::SI_SPILL_S512_SAVE;
1611 case 128:
1612 return AMDGPU::SI_SPILL_S1024_SAVE;
1613 default:
1614 llvm_unreachable("unknown register size");
1615 }
1616}
1617
1618static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1619 switch (Size) {
1620 case 2:
1621 return AMDGPU::SI_SPILL_V16_SAVE;
1622 case 4:
1623 return AMDGPU::SI_SPILL_V32_SAVE;
1624 case 8:
1625 return AMDGPU::SI_SPILL_V64_SAVE;
1626 case 12:
1627 return AMDGPU::SI_SPILL_V96_SAVE;
1628 case 16:
1629 return AMDGPU::SI_SPILL_V128_SAVE;
1630 case 20:
1631 return AMDGPU::SI_SPILL_V160_SAVE;
1632 case 24:
1633 return AMDGPU::SI_SPILL_V192_SAVE;
1634 case 28:
1635 return AMDGPU::SI_SPILL_V224_SAVE;
1636 case 32:
1637 return AMDGPU::SI_SPILL_V256_SAVE;
1638 case 36:
1639 return AMDGPU::SI_SPILL_V288_SAVE;
1640 case 40:
1641 return AMDGPU::SI_SPILL_V320_SAVE;
1642 case 44:
1643 return AMDGPU::SI_SPILL_V352_SAVE;
1644 case 48:
1645 return AMDGPU::SI_SPILL_V384_SAVE;
1646 case 64:
1647 return AMDGPU::SI_SPILL_V512_SAVE;
1648 case 128:
1649 return AMDGPU::SI_SPILL_V1024_SAVE;
1650 default:
1651 llvm_unreachable("unknown register size");
1652 }
1653}
1654
1655static unsigned getAVSpillSaveOpcode(unsigned Size) {
1656 switch (Size) {
1657 case 4:
1658 return AMDGPU::SI_SPILL_AV32_SAVE;
1659 case 8:
1660 return AMDGPU::SI_SPILL_AV64_SAVE;
1661 case 12:
1662 return AMDGPU::SI_SPILL_AV96_SAVE;
1663 case 16:
1664 return AMDGPU::SI_SPILL_AV128_SAVE;
1665 case 20:
1666 return AMDGPU::SI_SPILL_AV160_SAVE;
1667 case 24:
1668 return AMDGPU::SI_SPILL_AV192_SAVE;
1669 case 28:
1670 return AMDGPU::SI_SPILL_AV224_SAVE;
1671 case 32:
1672 return AMDGPU::SI_SPILL_AV256_SAVE;
1673 case 36:
1674 return AMDGPU::SI_SPILL_AV288_SAVE;
1675 case 40:
1676 return AMDGPU::SI_SPILL_AV320_SAVE;
1677 case 44:
1678 return AMDGPU::SI_SPILL_AV352_SAVE;
1679 case 48:
1680 return AMDGPU::SI_SPILL_AV384_SAVE;
1681 case 64:
1682 return AMDGPU::SI_SPILL_AV512_SAVE;
1683 case 128:
1684 return AMDGPU::SI_SPILL_AV1024_SAVE;
1685 default:
1686 llvm_unreachable("unknown register size");
1687 }
1688}
1689
1690static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1691 bool IsVectorSuperClass) {
1692 // Currently, there is only 32-bit WWM register spills needed.
1693 if (Size != 4)
1694 llvm_unreachable("unknown wwm register spill size");
1695
1696 if (IsVectorSuperClass)
1697 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1698
1699 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1700}
1701
// Choose the spill-save pseudo for a vector register: WWM registers get the
// WWM variants, targets with MAI instructions get the AV variants, and the
// remainder fall back to plain VGPR saves.
// NOTE(review): the extraction dropped the declarator line naming the
// function, the WWM-flag condition (upstream line 1708), and the trailing
// VGPR fallback return (upstream line 1715) -- confirm against full source.
 1703    Register Reg, const TargetRegisterClass *RC, unsigned Size,
 1704    const SIMachineFunctionInfo &MFI) const {
 1705  bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
 1706
 1707  // Choose the right opcode if spilling a WWM register.
 1709    return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
 1710
 1711  // TODO: Check if AGPRs are available
 1712  if (ST.hasMAIInsts())
 1713    return getAVSpillSaveOpcode(Size);
 1714
 1716}
1717
// Spill SrcReg to the stack slot at FrameIndex. SGPRs go through dedicated
// spill pseudos (possibly lowered to VGPR lanes rather than memory); vector
// registers use the scratch spill pseudos with a memory operand.
// NOTE(review): the extraction dropped several lines (the declarator naming
// the function, the MFI declaration near upstream line 1723, the
// MachineMemOperand creation line 1729, and the trailing flags operand at
// line 1755) -- confirm against the full source.
 1720    bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
 1721    MachineInstr::MIFlag Flags) const {
 1722  MachineFunction *MF = MBB.getParent();
 1724  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
 1725  const DebugLoc &DL = MBB.findDebugLoc(MI);
 1726
 1727  MachinePointerInfo PtrInfo
 1728    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
 1730      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
 1731      FrameInfo.getObjectAlign(FrameIndex));
 1732  unsigned SpillSize = RI.getSpillSize(*RC);
 1733
 1734  MachineRegisterInfo &MRI = MF->getRegInfo();
 1735  if (RI.isSGPRClass(RC)) {
 1736    MFI->setHasSpilledSGPRs();
 1737    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
 1738    assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
 1739           SrcReg != AMDGPU::EXEC && "exec should not be spilled");
 1740
 1741    // We are only allowed to create one new instruction when spilling
 1742    // registers, so we need to use pseudo instruction for spilling SGPRs.
 1743    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
 1744
 1745    // The SGPR spill/restore instructions only work on number sgprs, so we need
 1746    // to make sure we are using the correct register class.
 1747    if (SrcReg.isVirtual() && SpillSize == 4) {
 1748      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
 1749    }
 1750
 1751    BuildMI(MBB, MI, DL, OpDesc)
 1752      .addReg(SrcReg, getKillRegState(isKill)) // data
 1753      .addFrameIndex(FrameIndex)               // addr
 1754      .addMemOperand(MMO)
 1756
    // SGPR-to-VGPR spilling is lowered later; mark the slot accordingly.
 1757    if (RI.spillSGPRToVGPR())
 1758      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
 1759    return;
 1760  }
 1761
 1762  unsigned Opcode =
 1763      getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
 1764  MFI->setHasSpilledVGPRs();
 1765
 1766  BuildMI(MBB, MI, DL, get(Opcode))
 1767    .addReg(SrcReg, getKillRegState(isKill)) // data
 1768    .addFrameIndex(FrameIndex)               // addr
 1769    .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
 1770    .addImm(0)                               // offset
 1771    .addMemOperand(MMO);
 1772}
1773
1774static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1775 switch (Size) {
1776 case 4:
1777 return AMDGPU::SI_SPILL_S32_RESTORE;
1778 case 8:
1779 return AMDGPU::SI_SPILL_S64_RESTORE;
1780 case 12:
1781 return AMDGPU::SI_SPILL_S96_RESTORE;
1782 case 16:
1783 return AMDGPU::SI_SPILL_S128_RESTORE;
1784 case 20:
1785 return AMDGPU::SI_SPILL_S160_RESTORE;
1786 case 24:
1787 return AMDGPU::SI_SPILL_S192_RESTORE;
1788 case 28:
1789 return AMDGPU::SI_SPILL_S224_RESTORE;
1790 case 32:
1791 return AMDGPU::SI_SPILL_S256_RESTORE;
1792 case 36:
1793 return AMDGPU::SI_SPILL_S288_RESTORE;
1794 case 40:
1795 return AMDGPU::SI_SPILL_S320_RESTORE;
1796 case 44:
1797 return AMDGPU::SI_SPILL_S352_RESTORE;
1798 case 48:
1799 return AMDGPU::SI_SPILL_S384_RESTORE;
1800 case 64:
1801 return AMDGPU::SI_SPILL_S512_RESTORE;
1802 case 128:
1803 return AMDGPU::SI_SPILL_S1024_RESTORE;
1804 default:
1805 llvm_unreachable("unknown register size");
1806 }
1807}
1808
1809static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1810 switch (Size) {
1811 case 2:
1812 return AMDGPU::SI_SPILL_V16_RESTORE;
1813 case 4:
1814 return AMDGPU::SI_SPILL_V32_RESTORE;
1815 case 8:
1816 return AMDGPU::SI_SPILL_V64_RESTORE;
1817 case 12:
1818 return AMDGPU::SI_SPILL_V96_RESTORE;
1819 case 16:
1820 return AMDGPU::SI_SPILL_V128_RESTORE;
1821 case 20:
1822 return AMDGPU::SI_SPILL_V160_RESTORE;
1823 case 24:
1824 return AMDGPU::SI_SPILL_V192_RESTORE;
1825 case 28:
1826 return AMDGPU::SI_SPILL_V224_RESTORE;
1827 case 32:
1828 return AMDGPU::SI_SPILL_V256_RESTORE;
1829 case 36:
1830 return AMDGPU::SI_SPILL_V288_RESTORE;
1831 case 40:
1832 return AMDGPU::SI_SPILL_V320_RESTORE;
1833 case 44:
1834 return AMDGPU::SI_SPILL_V352_RESTORE;
1835 case 48:
1836 return AMDGPU::SI_SPILL_V384_RESTORE;
1837 case 64:
1838 return AMDGPU::SI_SPILL_V512_RESTORE;
1839 case 128:
1840 return AMDGPU::SI_SPILL_V1024_RESTORE;
1841 default:
1842 llvm_unreachable("unknown register size");
1843 }
1844}
1845
1846static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1847 switch (Size) {
1848 case 4:
1849 return AMDGPU::SI_SPILL_AV32_RESTORE;
1850 case 8:
1851 return AMDGPU::SI_SPILL_AV64_RESTORE;
1852 case 12:
1853 return AMDGPU::SI_SPILL_AV96_RESTORE;
1854 case 16:
1855 return AMDGPU::SI_SPILL_AV128_RESTORE;
1856 case 20:
1857 return AMDGPU::SI_SPILL_AV160_RESTORE;
1858 case 24:
1859 return AMDGPU::SI_SPILL_AV192_RESTORE;
1860 case 28:
1861 return AMDGPU::SI_SPILL_AV224_RESTORE;
1862 case 32:
1863 return AMDGPU::SI_SPILL_AV256_RESTORE;
1864 case 36:
1865 return AMDGPU::SI_SPILL_AV288_RESTORE;
1866 case 40:
1867 return AMDGPU::SI_SPILL_AV320_RESTORE;
1868 case 44:
1869 return AMDGPU::SI_SPILL_AV352_RESTORE;
1870 case 48:
1871 return AMDGPU::SI_SPILL_AV384_RESTORE;
1872 case 64:
1873 return AMDGPU::SI_SPILL_AV512_RESTORE;
1874 case 128:
1875 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1876 default:
1877 llvm_unreachable("unknown register size");
1878 }
1879}
1880
1881static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1882 bool IsVectorSuperClass) {
1883 // Currently, there is only 32-bit WWM register spills needed.
1884 if (Size != 4)
1885 llvm_unreachable("unknown wwm register spill size");
1886
1887 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1888 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1889
1890 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1891}
1892
// Choose the spill-restore pseudo for a vector register: WWM registers get
// the WWM variants, targets with MAI instructions get the AV variants, and
// the remainder (asserted non-AGPR) fall back to plain VGPR restores.
// NOTE(review): the extraction dropped the declarator line naming the
// function, the WWM-flag condition (upstream line 1899), the AV return
// (line 1904), and the trailing VGPR fallback return (line 1907) -- confirm
// against the full source.
 1894    Register Reg, const TargetRegisterClass *RC, unsigned Size,
 1895    const SIMachineFunctionInfo &MFI) const {
 1896  bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
 1897
 1898  // Choose the right opcode if restoring a WWM register.
 1900    return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
 1901
 1902  // TODO: Check if AGPRs are available
 1903  if (ST.hasMAIInsts())
 1905
 1906  assert(!RI.isAGPRClass(RC));
 1908}
1909
// Reload DestReg from the stack slot at FrameIndex. SGPRs go through
// dedicated restore pseudos (possibly lowered to VGPR lanes rather than
// memory); vector registers use the scratch restore pseudos with a memory
// operand.
// NOTE(review): the extraction dropped several lines (the declarator naming
// the function, the MFI declaration near upstream line 1917, the
// MachineMemOperand creation line 1925, and the trailing flags operand at
// line 1948) -- confirm against the full source.
 1912                                       Register DestReg, int FrameIndex,
 1913                                       const TargetRegisterClass *RC,
 1914                                       Register VReg, unsigned SubReg,
 1915                                       MachineInstr::MIFlag Flags) const {
 1916  MachineFunction *MF = MBB.getParent();
 1918  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
 1919  const DebugLoc &DL = MBB.findDebugLoc(MI);
 1920  unsigned SpillSize = RI.getSpillSize(*RC);
 1921
 1922  MachinePointerInfo PtrInfo
 1923    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
 1924
 1926      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
 1927      FrameInfo.getObjectAlign(FrameIndex));
 1928
 1929  if (RI.isSGPRClass(RC)) {
 1930    MFI->setHasSpilledSGPRs();
 1931    assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
 1932    assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
 1933           DestReg != AMDGPU::EXEC && "exec should not be spilled");
 1934
 1935    // FIXME: Maybe this should not include a memoperand because it will be
 1936    // lowered to non-memory instructions.
 1937    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
    // 32-bit restores require a numbered SGPR; constrain the class to match.
 1938    if (DestReg.isVirtual() && SpillSize == 4) {
 1939      MachineRegisterInfo &MRI = MF->getRegInfo();
 1940      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
 1941    }
 1942
    // SGPR-to-VGPR spilling is lowered later; mark the slot accordingly.
 1943    if (RI.spillSGPRToVGPR())
 1944      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
 1945    BuildMI(MBB, MI, DL, OpDesc, DestReg)
 1946      .addFrameIndex(FrameIndex) // addr
 1947      .addMemOperand(MMO)
 1949
 1950    return;
 1951  }
 1952
 1953  unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
 1954                                                   SpillSize, *MFI);
 1955  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
 1956    .addFrameIndex(FrameIndex)           // vaddr
 1957    .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
 1958    .addImm(0)                           // offset
 1959    .addMemOperand(MMO);
 1960}
1961
1966
// Emits `Quantity` wait states as S_NOP instructions. Each S_NOP immediate
// encodes (count - 1) extra wait states and can cover at most
// (1 << ST.getSNopBits()) states, so larger requests are emitted as a chain.
// NOTE(review): original lines 1967-1968 (the function header) are missing
// from this listing.
1969 unsigned Quantity) const {
1970 DebugLoc DL = MBB.findDebugLoc(MI);
1971 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1972 while (Quantity > 0) {
// Largest chunk a single S_NOP can express.
1973 unsigned Arg = std::min(Quantity, MaxSNopCount);
1974 Quantity -= Arg;
1975 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1976 }
1977}
1978
// Appends a return to an entry-function block that has no successors and no
// terminator: S_ENDPGM 0 for void returns, SI_RETURN_TO_EPILOG otherwise.
// Blocks with successors or an existing terminator are left untouched.
// NOTE(review): original line 1979 (the function header) is missing from
// this listing.
1980 auto *MF = MBB.getParent();
1981 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1982
1983 assert(Info->isEntryFunction());
1984
1985 if (MBB.succ_empty()) {
1986 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1987 if (HasNoTerminator) {
1988 if (Info->returnsVoid()) {
1989 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1990 } else {
1991 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1992 }
1993 }
1994 }
1995}
1996
// Builds the "simulated trap" expansion: a trap block that executes s_trap,
// retrieves the queue doorbell ID, masks it, sets the queue-wave-abort bit,
// sends the resulting message, and then branches into a self-looping
// s_sethalt block. If the trap point is not already at the end of a
// successor-less block, the block is split and the trap sequence is placed in
// a new block reached via S_CBRANCH_EXECNZ. Returns the block following MBB.
// NOTE(review): original lines 1997-1999 (the function header), 2022 (the
// S_SENDMSG_RTN_B32 message-immediate operand), and 2038 (the S_SENDMSG
// immediate operand) are missing from this listing.
2000 const DebugLoc &DL) const {
2001 MachineFunction *MF = MBB.getParent();
2002 constexpr unsigned DoorbellIDMask = 0x3ff;
2003 constexpr unsigned ECQueueWaveAbort = 0x400;
2004
2005 MachineBasicBlock *TrapBB = &MBB;
2006 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2007
// If MI is not the last real instruction of a successor-less block, split
// here and branch to a fresh trap block instead of trapping inline.
2008 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2009 MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2010 TrapBB = MF->CreateMachineBasicBlock();
2011 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2012 MF->push_back(TrapBB);
2013 MBB.addSuccessor(TrapBB);
2014 }
2015 // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
2016 // will be a nop.
2017 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2018 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))</doc_update_marker_removed>;
2019 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2020 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2021 DoorbellReg)
// m0 is preserved in TTMP2 across the sendmsg sequence and restored below.
2023 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2024 .addUse(AMDGPU::M0);
2025 Register DoorbellRegMasked =
2026 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2027 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2028 .addUse(DoorbellReg)
2029 .addImm(DoorbellIDMask);
2030 Register SetWaveAbortBit =
2031 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2032 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2033 .addUse(DoorbellRegMasked)
2034 .addImm(ECQueueWaveAbort);
2035 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2036 .addUse(SetWaveAbortBit);
2037 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2039 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2040 .addUse(AMDGPU::TTMP2);
2041 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2042 TrapBB->addSuccessor(HaltLoopBB);
2043
// Self-looping halt block: the wave parks here after signaling the abort.
2044 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2045 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2046 .addMBB(HaltLoopBB);
2047 MF->push_back(HaltLoopBB);
2048 HaltLoopBB->addSuccessor(HaltLoopBB);
2049
2050 return MBB.getNextNode();
2051}
2052
// Returns the number of wait states an instruction provides: 0 for meta
// instructions, imm+1 for S_NOP, and 1 for everything else.
// NOTE(review): original line 2053 (the function header) is missing from
// this listing.
2054 switch (MI.getOpcode()) {
2055 default:
2056 if (MI.isMetaInstruction())
2057 return 0;
2058 return 1; // FIXME: Do wait states equal cycles?
2059
2060 case AMDGPU::S_NOP:
// S_NOP's immediate encodes (wait states - 1); see insertNoops().
2061 return MI.getOperand(0).getImm() + 1;
2062 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2063 // hazard, even if one exists, won't really be visible. Should we handle it?
2064 }
2065}
2066
// TargetInstrInfo::expandPostRAPseudo override: expands AMDGPU pseudo
// instructions after register allocation. Simple *_term pseudos (which exist
// only so spill code is placed correctly during regalloc) are retargeted to
// their real opcodes via setDesc(); larger cases build a replacement
// instruction sequence and erase the pseudo. Returns true when the
// instruction was handled here (or by the default implementation).
// NOTE(review): this doxygen listing dropped several original lines — 2067
// (function header), 2070 (the lane-mask-constants `LMC` initialization used
// by the WWM/WQM cases), 2241, 2333/2337, 2369/2373, 2414, 2462/2527,
// 2579-2580 — consult SIInstrInfo.cpp for the complete body.
2068 MachineBasicBlock &MBB = *MI.getParent();
2069 DebugLoc DL = MBB.findDebugLoc(MI);
2071 switch (MI.getOpcode()) {
2072 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2073 case AMDGPU::S_MOV_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(AMDGPU::S_MOV_B64));
2077 break;
2078
2079 case AMDGPU::S_MOV_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_MOV_B32));
2083 break;
2084
2085 case AMDGPU::S_XOR_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(AMDGPU::S_XOR_B64));
2089 break;
2090
2091 case AMDGPU::S_XOR_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_XOR_B32));
2095 break;
2096 case AMDGPU::S_OR_B64_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(AMDGPU::S_OR_B64));
2100 break;
2101 case AMDGPU::S_OR_B32_term:
2102 // This is only a terminator to get the correct spill code placement during
2103 // register allocation.
2104 MI.setDesc(get(AMDGPU::S_OR_B32));
2105 break;
2106
2107 case AMDGPU::S_ANDN2_B64_term:
2108 // This is only a terminator to get the correct spill code placement during
2109 // register allocation.
2110 MI.setDesc(get(AMDGPU::S_ANDN2_B64))</doc_update_marker_removed>;
2111 break;
2112
2113 case AMDGPU::S_ANDN2_B32_term:
2114 // This is only a terminator to get the correct spill code placement during
2115 // register allocation.
2116 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2117 break;
2118
2119 case AMDGPU::S_AND_B64_term:
2120 // This is only a terminator to get the correct spill code placement during
2121 // register allocation.
2122 MI.setDesc(get(AMDGPU::S_AND_B64));
2123 break;
2124
2125 case AMDGPU::S_AND_B32_term:
2126 // This is only a terminator to get the correct spill code placement during
2127 // register allocation.
2128 MI.setDesc(get(AMDGPU::S_AND_B32));
2129 break;
2130
2131 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2132 // This is only a terminator to get the correct spill code placement during
2133 // register allocation.
2134 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2135 break;
2136
2137 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2138 // This is only a terminator to get the correct spill code placement during
2139 // register allocation.
2140 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2141 break;
2142
2143 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2144 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2145 break;
2146
2147 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2148 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2149 break;
2150 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
// Pick the AGPR or VGPR 32-bit move depending on the allocated class.
2151 Register Dst = MI.getOperand(0).getReg();
2152 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2153 MI.setDesc(
2154 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2155 break;
2156 }
2157 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
// AGPR destinations are written 32 bits at a time; VGPR destinations fall
// through to the V_MOV_B64_PSEUDO expansion below.
2158 Register Dst = MI.getOperand(0).getReg();
2159 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2160 int64_t Imm = MI.getOperand(1).getImm();
2161
2162 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2163 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2164 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2165 .addImm(SignExtend64<32>(Imm));
2166 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2167 .addImm(SignExtend64<32>(Imm >> 32));
2168 MI.eraseFromParent();
2169 break;
2170 }
2171
2172 [[fallthrough]];
2173 }
2174 case AMDGPU::V_MOV_B64_PSEUDO: {
2175 Register Dst = MI.getOperand(0).getReg();
2176 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2177 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2178
2179 const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
2180 const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
2181
2182 const MachineOperand &SrcOp = MI.getOperand(1);
2183 // FIXME: Will this work for 64-bit floating point immediates?
2184 assert(!SrcOp.isFPImm());
// Prefer a single 64-bit move when the subtarget supports it and the
// operand is encodable (register, inline constant, 32-bit immediate, or a
// 64-bit literal on subtargets that allow them).
2185 if (ST.hasMovB64() && Mov64RC->contains(Dst)) {
2186 MI.setDesc(Mov64Desc);
2187 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2188 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2189 break;
2190 }
2191 if (SrcOp.isImm()) {
2192 APInt Imm(64, SrcOp.getImm());
2193 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2194 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2195 const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
2196 const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
2197
// When both halves are the same inline constant, a single V_PK_MOV_B32
// writes the full 64 bits; otherwise fall back to two 32-bit moves.
2198 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
2199 PkMovRC->contains(Dst)) {
2200 BuildMI(MBB, MI, DL, PkMovDesc, Dst)
2202 .addImm(Lo.getSExtValue())
2204 .addImm(Lo.getSExtValue())
2205 .addImm(0) // op_sel_lo
2206 .addImm(0) // op_sel_hi
2207 .addImm(0) // neg_lo
2208 .addImm(0) // neg_hi
2209 .addImm(0); // clamp
2210 } else {
2211 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2212 .addImm(Lo.getSExtValue());
2213 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2214 .addImm(Hi.getSExtValue());
2215 }
2216 } else {
2217 assert(SrcOp.isReg());
2218 if (ST.hasPkMovB32() &&
2219 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2220 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2221 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2222 .addReg(SrcOp.getReg())
2224 .addReg(SrcOp.getReg())
2225 .addImm(0) // op_sel_lo
2226 .addImm(0) // op_sel_hi
2227 .addImm(0) // neg_lo
2228 .addImm(0) // neg_hi
2229 .addImm(0); // clamp
2230 } else {
2231 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2232 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0));
2233 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2234 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1));
2235 }
2236 }
2237 MI.eraseFromParent();
2238 break;
2239 }
2240 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
// Delegated to expandMovDPP64(); original line 2241 (the call) is missing
// from this listing.
2242 break;
2243 }
2244 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2245 const MachineOperand &SrcOp = MI.getOperand(1);
2246 assert(!SrcOp.isFPImm());
2247
2248 if (ST.has64BitLiterals()) {
2249 MI.setDesc(get(AMDGPU::S_MOV_B64));
2250 break;
2251 }
2252
2253 APInt Imm(64, SrcOp.getImm());
// A 32-bit-representable or inline-constant value is encodable as a plain
// S_MOV_B64; anything else is split into two S_MOV_B32 halves.
2254 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2255 MI.setDesc(get(AMDGPU::S_MOV_B64));
2256 break;
2257 }
2258
2259 Register Dst = MI.getOperand(0).getReg();
2260 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2261 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2262
2263 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2264 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2265 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2266 .addImm(Lo.getSExtValue());
2267 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2268 .addImm(Hi.getSExtValue());
2269 MI.eraseFromParent();
2270 break;
2271 }
2272 case AMDGPU::V_SET_INACTIVE_B32: {
2273 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2274 Register DstReg = MI.getOperand(0).getReg();
2275 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2276 .add(MI.getOperand(3))
2277 .add(MI.getOperand(4))
2278 .add(MI.getOperand(1))
2279 .add(MI.getOperand(2))
2280 .add(MI.getOperand(5));
2281 MI.eraseFromParent();
2282 break;
2283 }
2284 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2285 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2286 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2287 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2288 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2289 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2290 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2291 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2292 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2293 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2294 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2298 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2299 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2300 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2301 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2302 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2303 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2304 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2305 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2306 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
// Indirect write via movrel: select V_MOVRELD / S_MOVRELD by element class
// and width, then tie the vector's implicit def/use so liveness is correct.
2317 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2318
2319 unsigned Opc;
2320 if (RI.hasVGPRs(EltRC)) {
2321 Opc = AMDGPU::V_MOVRELD_B32_e32;
2322 } else {
2323 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2324 : AMDGPU::S_MOVRELD_B32;
2325 }
2326
2327 const MCInstrDesc &OpDesc = get(Opc);
2328 Register VecReg = MI.getOperand(0).getReg();
2329 bool IsUndef = MI.getOperand(1).isUndef();
2330 unsigned SubReg = MI.getOperand(3).getImm();
2331 assert(VecReg == MI.getOperand(1).getReg());
2332
2334 BuildMI(MBB, MI, DL, OpDesc)
2335 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2336 .add(MI.getOperand(2))
2338 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2339
2340 const int ImpDefIdx =
2341 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2342 const int ImpUseIdx = ImpDefIdx + 1;
2343 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2344 MI.eraseFromParent();
2345 break;
2346 }
2347 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2348 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2349 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2350 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2351 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2352 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2353 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2354 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2357 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2358 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2359 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2360 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
// Indirect write via the GPR-index mode: bracket the move with
// S_SET_GPR_IDX_ON / _OFF and bundle so the scheduler cannot split them.
2361 assert(ST.useVGPRIndexMode());
2362 Register VecReg = MI.getOperand(0).getReg();
2363 bool IsUndef = MI.getOperand(1).isUndef();
2364 MachineOperand &Idx = MI.getOperand(3);
2365 Register SubReg = MI.getOperand(4).getImm();
2366
2367 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2368 .add(Idx)
2370 SetOn->getOperand(3).setIsUndef();
2371
2372 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2374 BuildMI(MBB, MI, DL, OpDesc)
2375 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2376 .add(MI.getOperand(2))
2378 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2379
2380 const int ImpDefIdx =
2381 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2382 const int ImpUseIdx = ImpDefIdx + 1;
2383 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2384
2385 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2386
2387 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2388
2389 MI.eraseFromParent();
2390 break;
2391 }
2392 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2393 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2394 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2395 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2396 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2397 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2398 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2399 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2402 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2403 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2404 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2405 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
// Indirect read via the GPR-index mode, bundled like the write case above.
2406 assert(ST.useVGPRIndexMode());
2407 Register Dst = MI.getOperand(0).getReg();
2408 Register VecReg = MI.getOperand(1).getReg();
2409 bool IsUndef = MI.getOperand(1).isUndef();
2410 Register SubReg = MI.getOperand(3).getImm();
2411
2412 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2413 .add(MI.getOperand(2))
2415 SetOn->getOperand(3).setIsUndef();
2416
2417 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2418 .addDef(Dst)
2419 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2420 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2421
2422 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2423
2424 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2425
2426 MI.eraseFromParent();
2427 break;
2428 }
2429 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2430 MachineFunction &MF = *MBB.getParent();
2431 Register Reg = MI.getOperand(0).getReg();
2432 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2433 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2434 MachineOperand OpLo = MI.getOperand(1);
2435 MachineOperand OpHi = MI.getOperand(2);
2436
2437 // Create a bundle so these instructions won't be re-ordered by the
2438 // post-RA scheduler.
2439 MIBundleBuilder Bundler(MBB, MI);
2440 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2441
2442 // What we want here is an offset from the value returned by s_getpc (which
2443 // is the address of the s_add_u32 instruction) to the global variable, but
2444 // since the encoding of $symbol starts 4 bytes after the start of the
2445 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2446 // small. This requires us to add 4 to the global variable offset in order
2447 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2448 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2449 // instruction.
2450
2451 int64_t Adjust = 0;
2452 if (ST.hasGetPCZeroExtension()) {
2453 // Fix up hardware that does not sign-extend the 48-bit PC value by
2454 // inserting: s_sext_i32_i16 reghi, reghi
2455 Bundler.append(
2456 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2457 Adjust += 4;
2458 }
2459
2460 if (OpLo.isGlobal())
2461 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2462 Bundler.append(
2463 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2464
2465 if (OpHi.isGlobal())
2466 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2467 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2468 .addReg(RegHi)
2469 .add(OpHi));
2470
2471 finalizeBundle(MBB, Bundler.begin());
2472
2473 MI.eraseFromParent();
2474 break;
2475 }
2476 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
// 64-bit variant of the PC-relative address computation: a single S_ADD_U64
// with the symbol offset biased by 4 (distance from s_getpc to the symbol
// encoding), again bundled to prevent post-RA rescheduling.
2477 MachineFunction &MF = *MBB.getParent();
2478 Register Reg = MI.getOperand(0).getReg();
2479 MachineOperand Op = MI.getOperand(1);
2480
2481 // Create a bundle so these instructions won't be re-ordered by the
2482 // post-RA scheduler.
2483 MIBundleBuilder Bundler(MBB, MI);
2484 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2485 if (Op.isGlobal())
2486 Op.setOffset(Op.getOffset() + 4);
2487 Bundler.append(
2488 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2489
2490 finalizeBundle(MBB, Bundler.begin());
2491
2492 MI.eraseFromParent();
2493 break;
2494 }
2495 case AMDGPU::ENTER_STRICT_WWM: {
2496 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2497 // Whole Wave Mode is entered.
2498 MI.setDesc(get(LMC.OrSaveExecOpc));
2499 break;
2500 }
2501 case AMDGPU::ENTER_STRICT_WQM: {
2502 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2503 // STRICT_WQM is entered.
2504 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2505 .addReg(LMC.ExecReg)</doc_update_marker_removed>;
2506 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2507
2508 MI.eraseFromParent();
2509 break;
2510 }
2511 case AMDGPU::EXIT_STRICT_WWM:
2512 case AMDGPU::EXIT_STRICT_WQM: {
2513 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2514 // WWM/STRICT_WQM is exited.
2515 MI.setDesc(get(LMC.MovOpc));
2516 break;
2517 }
2518 case AMDGPU::SI_RETURN: {
2519 const MachineFunction *MF = MBB.getParent();
2520 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2521 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2522 // Hiding the return address use with SI_RETURN may lead to extra kills in
2523 // the function and missing live-ins. We are fine in practice because callee
2524 // saved register handling ensures the register value is restored before
2525 // RET, but we need the undef flag here to appease the MachineVerifier
2526 // liveness checks.
2528 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2529 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2530
2531 MIB.copyImplicitOps(MI);
2532 MI.eraseFromParent();
2533 break;
2534 }
2535
2536 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2537 case AMDGPU::S_MUL_I64_I32_PSEUDO:
// Both pseudos lower to the full 64-bit multiply. NOTE(review): presumably
// the operands are known to be zero-/sign-extended 32-bit values so the
// results coincide — confirm against the pseudo's definition.
2538 MI.setDesc(get(AMDGPU::S_MUL_U64));
2539 break;
2540
2541 case AMDGPU::S_GETPC_B64_pseudo:
2542 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2543 if (ST.hasGetPCZeroExtension()) {
2544 Register Dst = MI.getOperand(0).getReg();
2545 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2546 // Fix up hardware that does not sign-extend the 48-bit PC value by
2547 // inserting: s_sext_i32_i16 dsthi, dsthi
2548 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2549 DstHi)
2550 .addReg(DstHi);
2551 }
2552 break;
2553
2554 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
// Lower the scalar bf16 max pseudo onto the packed instruction, selecting
// the high half of each source via OP_SEL.
2555 assert(ST.hasBF16PackedInsts());
2556 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2557 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2558 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2559 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2560 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2561 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2562 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2563 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2564 break;
2565 }
2566
2567 case AMDGPU::GET_STACK_BASE:
2568 // The stack starts at offset 0 unless we need to reserve some space at the
2569 // bottom.
2570 if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
2571 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2572 // some of the VGPRs. The size of the required scratch space has already
2573 // been computed by prolog epilog insertion.
2574 const SIMachineFunctionInfo *MFI =
2575 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2576 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2577 Register DestReg = MI.getOperand(0).getReg();
2578 BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
2581 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2582 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2583 // SCC, so we need to check for 0 manually.
2584 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
2585 // Change the implicit-def of SCC to an explicit use (but first remove
2586 // the dead flag if present).
2587 MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
2588 MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
2589 MI.setDesc(get(AMDGPU::S_CMOVK_I32));
2590 MI.addOperand(MachineOperand::CreateImm(VGPRSize));
2591 } else {
2592 MI.setDesc(get(AMDGPU::S_MOV_B32));
2593 MI.addOperand(MachineOperand::CreateImm(0));
2594 MI.removeOperand(
2595 MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2596 }
2597 break;
2598 }
2599
2600 return true;
2601}
2602
// Rematerializes Orig at (MBB, I), trying to shrink the instruction to only
// the part the current context needs:
// - S_MOV_B64 / S_MOV_B64_IMM_PSEUDO of an immediate where UsedLanes shows
//   only one 32-bit half is live becomes an S_MOV_B32 of that half.
// - S_LOAD_DWORDX16/DWORDX8 whose result feeds exactly one subreg use in the
//   instruction at I is narrowed to a DWORDX8/DWORDX4 load with the offset
//   and memory operands adjusted to the used subrange.
// All other cases fall back to TargetInstrInfo::reMaterialize.
// NOTE(review): this listing is missing original lines 2603-2604 (the
// function header) and 2708 (the NewMMOs SmallVector declaration).
2605 unsigned SubIdx, const MachineInstr &Orig,
2606 LaneBitmask UsedLanes) const {
2607
2608 // Try shrinking the instruction to remat only the part needed for current
2609 // context.
2610 // TODO: Handle more cases.
2611 unsigned Opcode = Orig.getOpcode();
2612 switch (Opcode) {
2613 case AMDGPU::S_MOV_B64:
2614 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2615 if (SubIdx != 0)
2616 break;
2617
2618 if (!Orig.getOperand(1).isImm())
2619 break;
2620
2621 // Shrink S_MOV_B64 to S_MOV_B32 when UsedLanes indicates only a single
2622 // 32-bit lane of the 64-bit value is live at the rematerialization point.
2623 if (UsedLanes.all())
2624 break;
2625
2626 // Determine which half of the 64-bit immediate corresponds to the use.
2627 unsigned OrigSubReg = Orig.getOperand(0).getSubReg();
2628 unsigned LoSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub0);
2629 unsigned HiSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub1);
2630
2631 bool NeedLo = (UsedLanes & RI.getSubRegIndexLaneMask(LoSubReg)).any();
2632 bool NeedHi = (UsedLanes & RI.getSubRegIndexLaneMask(HiSubReg)).any();
2633
// If both halves are live there is nothing to shrink.
2634 if (NeedLo && NeedHi)
2635 break;
2636
2637 int64_t Imm64 = Orig.getOperand(1).getImm();
2638 int32_t Imm32 = NeedLo ? Lo_32(Imm64) : Hi_32(Imm64);
2639
2640 unsigned UseSubReg = NeedLo ? LoSubReg : HiSubReg;
2641
2642 // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
2643 BuildMI(MBB, I, Orig.getDebugLoc(), get(AMDGPU::S_MOV_B32))
2644 .addReg(DestReg, RegState::Define | RegState::Undef, UseSubReg)
2645 .addImm(Imm32);
2646 return;
2647 }
2648
2649 case AMDGPU::S_LOAD_DWORDX16_IMM:
2650 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2651 if (SubIdx != 0)
2652 break;
2653
2654 if (I == MBB.end())
2655 break;
2656
2657 if (I->isBundled())
2658 break;
2659
2660 // Look for a single use of the register that is also a subreg.
2661 Register RegToFind = Orig.getOperand(0).getReg();
2662 MachineOperand *UseMO = nullptr;
2663 for (auto &CandMO : I->operands()) {
2664 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2665 continue;
// A second use disqualifies the narrowing; bail out.
2666 if (UseMO) {
2667 UseMO = nullptr;
2668 break;
2669 }
2670 UseMO = &CandMO;
2671 }
2672 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2673 break;
2674
2675 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2676 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2677
2678 MachineFunction *MF = MBB.getParent();
2679 MachineRegisterInfo &MRI = MF->getRegInfo();
2680 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2681
2682 unsigned NewOpcode = -1;
2683 if (SubregSize == 256)
2684 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2685 else if (SubregSize == 128)
2686 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2687 else
2688 break;
2689
// Re-class DestReg for the narrower load and rewrite the use to read the
// full (now smaller) register.
2690 const MCInstrDesc &TID = get(NewOpcode);
2691 const TargetRegisterClass *NewRC =
2692 RI.getAllocatableClass(getRegClass(TID, 0));
2693 MRI.setRegClass(DestReg, NewRC);
2694
2695 UseMO->setReg(DestReg);
2696 UseMO->setSubReg(AMDGPU::NoSubRegister);
2697
2698 // Use a smaller load with the desired size, possibly with updated offset.
2699 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2700 MI->setDesc(TID);
2701 MI->getOperand(0).setReg(DestReg);
2702 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2703 if (Offset) {
2704 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
// Subreg offsets are in bits; the load offset is in bytes.
2705 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2706 OffsetMO->setImm(FinalOffset);
2707 }
2709 for (const MachineMemOperand *MemOp : Orig.memoperands())
2710 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2711 SubregSize / 8));
2712 MI->setMemRefs(*MF, NewMMOs);
2713
2714 MBB.insert(I, MI);
2715 return;
2716 }
2717
2718 default:
2719 break;
2720 }
2721
2722 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, UsedLanes);
2723}
2724
// Expands V_MOV_B64_DPP_PSEUDO. When the subtarget can execute a real 64-bit
// DPP mov (and the DPP control is legal for 64-bit DPP ALU ops), the pseudo
// is retargeted in place and returned as {&MI, nullptr}. Otherwise it is
// split into two V_MOV_B32_dpp instructions covering sub0/sub1; for virtual
// destinations the halves are recombined with a REG_SEQUENCE. Returns the
// pair of the resulting instructions.
// NOTE(review): original lines 2726 (the function header) and 2730 (part of
// the DPP-control legality check condition) are missing from this listing.
2725std::pair<MachineInstr*, MachineInstr*>
2727 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2728
2729 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2731 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2732 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2733 return std::pair(&MI, nullptr);
2734 }
2735
2736 MachineBasicBlock &MBB = *MI.getParent();
2737 DebugLoc DL = MBB.findDebugLoc(MI);
2738 MachineFunction *MF = MBB.getParent();
2739 MachineRegisterInfo &MRI = MF->getRegInfo();
2740 Register Dst = MI.getOperand(0).getReg();
2741 unsigned Part = 0;
2742 MachineInstr *Split[2];
2743
// Emit one 32-bit DPP mov per 64-bit half.
2744 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2745 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2746 if (Dst.isPhysical()) {
2747 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2748 } else {
2749 assert(MRI.isSSA());
2750 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2751 MovDPP.addDef(Tmp);
2752 }
2753
2754 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2755 const MachineOperand &SrcOp = MI.getOperand(I);
2756 assert(!SrcOp.isFPImm());
2757 if (SrcOp.isImm()) {
// Take the 32-bit slice of the immediate that matches this half.
2758 APInt Imm(64, SrcOp.getImm());
2759 Imm.ashrInPlace(Part * 32);
2760 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2761 } else {
2762 assert(SrcOp.isReg());
2763 Register Src = SrcOp.getReg();
2764 if (Src.isPhysical())
2765 MovDPP.addReg(RI.getSubReg(Src, Sub));
2766 else
2767 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2768 }
2769 }
2770
// Copy the remaining DPP control immediates unchanged.
2771 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2772 MovDPP.addImm(MO.getImm());
2773
2774 Split[Part] = MovDPP;
2775 ++Part;
2776 }
2777
2778 if (Dst.isVirtual())
2779 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2780 .addReg(Split[0]->getOperand(0).getReg())
2781 .addImm(AMDGPU::sub0)
2782 .addReg(Split[1]->getOperand(0).getReg())
2783 .addImm(AMDGPU::sub1);
2784
2785 MI.eraseFromParent();
2786 return std::pair(Split[0], Split[1]);
2787}
2788
// TargetInstrInfo::isCopyInstrImpl hook: reports WWM_COPY as a copy-like
// instruction (destination = operand 0, source = operand 1); returns nullopt
// for everything else handled here.
// NOTE(review): original line 2790 (the rest of the function header) is
// missing from this listing.
2789std::optional<DestSourcePair>
2791 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2792 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2793
2794 return std::nullopt;
2795}
2796
// Exchanges the src0/src1 source-modifier immediates of MI as part of
// commuting its operands. Returns false (doing nothing) when the instruction
// has no src0 modifiers; asserts that src1 modifiers exist whenever src0
// modifiers do.
// NOTE(review): original line 2797 (the function header) is missing from
// this listing.
2798 AMDGPU::OpName Src0OpName,
2799 MachineOperand &Src1,
2800 AMDGPU::OpName Src1OpName) const {
2801 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2802 if (!Src0Mods)
2803 return false;
2804
2805 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2806 assert(Src1Mods &&
2807 "All commutable instructions have both src0 and src1 modifiers");
2808
2809 int Src0ModsVal = Src0Mods->getImm();
2810 int Src1ModsVal = Src1Mods->getImm();
2811
2812 Src1Mods->setImm(Src0ModsVal);
2813 Src0Mods->setImm(Src1ModsVal);
2814 return true;
2815}
2816
// Commute helper: moves the contents of NonRegOp (an immediate, frame index,
// or global address) into RegOp, and turns NonRegOp into the register that
// RegOp previously held, preserving its subreg index and kill/dead/undef/
// debug flags. Returns &MI on success, nullptr for unsupported operand kinds.
// NOTE(review): original line 2817 (the function header) is missing from
// this listing.
2818 MachineOperand &RegOp,
2819 MachineOperand &NonRegOp) {
// Save the register state before RegOp is overwritten below.
2820 Register Reg = RegOp.getReg();
2821 unsigned SubReg = RegOp.getSubReg();
2822 bool IsKill = RegOp.isKill();
2823 bool IsDead = RegOp.isDead();
2824 bool IsUndef = RegOp.isUndef();
2825 bool IsDebug = RegOp.isDebug();
2826
2827 if (NonRegOp.isImm())
2828 RegOp.ChangeToImmediate(NonRegOp.getImm());
2829 else if (NonRegOp.isFI())
2830 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2831 else if (NonRegOp.isGlobal()) {
2832 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2833 NonRegOp.getTargetFlags());
2834 } else
2835 return nullptr;
2836
2837 // Make sure we don't reinterpret a subreg index in the target flags.
2838 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2839
2840 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2841 NonRegOp.setSubReg(SubReg);
2842
2843 return &MI;
2844}
2845
2847 MachineOperand &NonRegOp1,
2848 MachineOperand &NonRegOp2) {
 // Commute two immediate operands: both the immediate values and the target
 // flags are exchanged. Always succeeds and returns &MI.
2849 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2850 int64_t NonRegVal = NonRegOp1.getImm();
2851
2852 NonRegOp1.setImm(NonRegOp2.getImm());
2853 NonRegOp2.setImm(NonRegVal);
2854 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2855 NonRegOp2.setTargetFlags(TargetFlags);
2856 return &MI;
2857}
2858
2859bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2860 unsigned OpIdx1) const {
2861 const MCInstrDesc &InstDesc = MI.getDesc();
2862 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2863 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2864
2865 unsigned Opc = MI.getOpcode();
2866 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2867
2868 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2869 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2870
2871 // Swap doesn't breach constant bus or literal limits
2872 // It may move literal to position other than src0, this is not allowed
2873 // pre-gfx10 However, most test cases need literals in Src0 for VOP
2874 // FIXME: After gfx9, literal can be in place other than Src0
2875 if (isVALU(MI)) {
2876 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2877 !isInlineConstant(MO0, OpInfo1))
2878 return false;
2879 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2880 !isInlineConstant(MO1, OpInfo0))
2881 return false;
2882 }
2883
2884 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2885 if (OpInfo1.RegClass == -1)
2886 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2887 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2888 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2889 }
2890 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2891 if (OpInfo0.RegClass == -1)
2892 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2893 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2894 isLegalRegOperand(MI, OpIdx0, MO1);
2895 }
2896
2897 // No need to check 64-bit literals since swapping does not bring new
2898 // 64-bit literals into current instruction to fold to 32-bit
2899
2900 return isImmOperandLegal(MI, OpIdx1, MO0);
2901}
2902
2904 unsigned Src0Idx,
2905 unsigned Src1Idx) const {
 // Commute src0/src1 of MI in place (NewMI is unsupported). Returns the
 // commuted instruction, or nullptr when the opcode has no commuted form or
 // the swap would be illegal. Also swaps the accompanying modifier/sel
 // operands and installs the commuted opcode's descriptor.
2906 assert(!NewMI && "this should never be used");
2907
2908 unsigned Opc = MI.getOpcode();
2909 int CommutedOpcode = commuteOpcode(Opc);
2910 if (CommutedOpcode == -1)
2911 return nullptr;
2912
 // Normalize so that Src0Idx is the lower index.
2913 if (Src0Idx > Src1Idx)
2914 std::swap(Src0Idx, Src1Idx);
2915
2916 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2917 static_cast<int>(Src0Idx) &&
2918 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2919 static_cast<int>(Src1Idx) &&
2920 "inconsistency with findCommutedOpIndices");
2921
2922 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2923 return nullptr;
2924
 // Dispatch on the operand kinds: reg/reg uses the generic implementation,
 // mixed reg/non-reg and imm/imm use the local helpers above.
2925 MachineInstr *CommutedMI = nullptr;
2926 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2927 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2928 if (Src0.isReg() && Src1.isReg()) {
2929 // Be sure to copy the source modifiers to the right place.
2930 CommutedMI =
2931 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2932 } else if (Src0.isReg() && !Src1.isReg()) {
2933 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2934 } else if (!Src0.isReg() && Src1.isReg()) {
2935 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2936 } else if (Src0.isImm() && Src1.isImm()) {
2937 CommutedMI = swapImmOperands(MI, Src0, Src1);
2938 } else {
2939 // FIXME: Found two non registers to commute. This does happen.
2940 return nullptr;
2941 }
2942
2943 if (CommutedMI) {
2944 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2945 Src1, AMDGPU::OpName::src1_modifiers);
2946
2947 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2948 AMDGPU::OpName::src1_sel);
2949
2950 CommutedMI->setDesc(get(CommutedOpcode));
2951 }
2952
2953 return CommutedMI;
2954}
2955
2956// This needs to be implemented because the source modifiers may be inserted
2957// between the true commutable operands, and the base
2958// TargetInstrInfo::commuteInstruction uses it.
2960 unsigned &SrcOpIdx0,
2961 unsigned &SrcOpIdx1) const {
 // Thin forwarder to the MCInstrDesc-based overload below.
2962 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2963}
2964
2966 unsigned &SrcOpIdx0,
2967 unsigned &SrcOpIdx1) const {
 // Report which operand indices can be commuted for this descriptor:
 // requires a commutable opcode that has both named src0 and src1 operands,
 // then reconciles the caller's requested indices via fixCommutedOpIndices.
2968 if (!Desc.isCommutable())
2969 return false;
2970
2971 unsigned Opc = Desc.getOpcode();
2972 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2973 if (Src0Idx == -1)
2974 return false;
2975
2976 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2977 if (Src1Idx == -1)
2978 return false;
2979
2980 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2981}
2982
2984 int64_t BrOffset) const {
 // Check whether a byte offset fits in the signed SIMM16 (dword-scaled)
 // field of an SOPP/SOPK branch, honoring the amdgpu-s-branch-bits debug
 // override in BranchOffsetBits.
2985 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2986 // because its dest block is unanalyzable.
2987 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2988
2989 // Convert to dwords.
2990 BrOffset /= 4;
2991
2992 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2993 // from the next instruction.
2994 BrOffset -= 1;
2995
2996 return isIntN(BranchOffsetBits, BrOffset);
2997}
2998
 // The branch target is always carried in operand 0 of the branch.
3001 return MI.getOperand(0).getMBB();
3002}
3003
 // Returns true if any terminator of the block is one of the structured
 // control-flow pseudos SI_IF / SI_ELSE / SI_LOOP.
3005 for (const MachineInstr &MI : MBB->terminators()) {
3006 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
3007 MI.getOpcode() == AMDGPU::SI_LOOP)
3008 return true;
3009 }
3010 return false;
3011}
3012
3014 MachineBasicBlock &DestBB,
3015 MachineBasicBlock &RestoreBB,
3016 const DebugLoc &DL, int64_t BrOffset,
3017 RegScavenger *RS) const {
 // Expand an out-of-range unconditional branch into an indirect branch:
 // either a single S_ADD_PC_I64 (when the subtarget supports it) or a
 // S_GETPC_B64 / S_ADD_U32 / S_ADDC_U32 / S_SETPC_B64 sequence whose
 // 64-bit offset is resolved via temporary MC symbols at emission time.
3018 assert(MBB.empty() &&
3019 "new block should be inserted for expanding unconditional branch");
3020 assert(MBB.pred_size() == 1);
3021 assert(RestoreBB.empty() &&
3022 "restore block should be inserted for restoring clobbered registers");
3023
3024 MachineFunction *MF = MBB.getParent();
3025 MachineRegisterInfo &MRI = MF->getRegInfo();
3027 auto I = MBB.end();
3028 auto &MCCtx = MF->getContext();
3029
 // Fast path: S_ADD_PC_I64 adds a 64-bit offset to the PC directly, so no
 // scratch register pair is needed.
3030 if (ST.useAddPC64Inst()) {
3031 MCSymbol *Offset =
3032 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
3033 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
3035 MCSymbol *PostAddPCLabel =
3036 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
3037 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
3038 auto *OffsetExpr = MCBinaryExpr::createSub(
3039 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
3040 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
3041 Offset->setVariableValue(OffsetExpr);
3042 return;
3043 }
3044
3045 assert(RS && "RegScavenger required for long branching");
3046
3047 // FIXME: Virtual register workaround for RegScavenger not working with empty
3048 // blocks.
3049 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3050
3051 // Note: as this is used after hazard recognizer we need to apply some hazard
3052 // workarounds directly.
3053 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
3054 ST.hasVALUReadSGPRHazard();
3055 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
3056 if (FlushSGPRWrites)
3057 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
3059 };
3060
3061 // We need to compute the offset relative to the instruction immediately after
3062 // s_getpc_b64. Insert pc arithmetic code before last terminator.
3063 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
3064 ApplyHazardWorkarounds();
3065
3066 MCSymbol *PostGetPCLabel =
3067 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
3068 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
3069
 // 64-bit add of the (lo, hi) halves of the symbolic offset onto PCReg.
3070 MCSymbol *OffsetLo =
3071 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
3072 MCSymbol *OffsetHi =
3073 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
3074 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
3075 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
3076 .addReg(PCReg, {}, AMDGPU::sub0)
3077 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
3078 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
3079 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
3080 .addReg(PCReg, {}, AMDGPU::sub1)
3081 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
3082 ApplyHazardWorkarounds();
3083
3084 // Insert the indirect branch after the other terminator.
3085 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
3086 .addReg(PCReg);
3087
3088 // If a spill is needed for the pc register pair, we need to insert a spill
3089 // restore block right before the destination block, and insert a short branch
3090 // into the old destination block's fallthrough predecessor.
3091 // e.g.:
3092 //
3093 // s_cbranch_scc0 skip_long_branch:
3094 //
3095 // long_branch_bb:
3096 // spill s[8:9]
3097 // s_getpc_b64 s[8:9]
3098 // s_add_u32 s8, s8, restore_bb
3099 // s_addc_u32 s9, s9, 0
3100 // s_setpc_b64 s[8:9]
3101 //
3102 // skip_long_branch:
3103 // foo;
3104 //
3105 // .....
3106 //
3107 // dest_bb_fallthrough_predecessor:
3108 // bar;
3109 // s_branch dest_bb
3110 //
3111 // restore_bb:
3112 // restore s[8:9]
3113 // fallthrough dest_bb
3114 ///
3115 // dest_bb:
3116 // buzz;
3117
3118 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3119 Register Scav;
3120
3121 // If we've previously reserved a register for long branches
3122 // avoid running the scavenger and just use those registers
3123 if (LongBranchReservedReg) {
3124 RS->enterBasicBlock(MBB);
3125 Scav = LongBranchReservedReg;
3126 } else {
3127 RS->enterBasicBlockEnd(MBB);
3128 Scav = RS->scavengeRegisterBackwards(
3129 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3130 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3131 }
3132 if (Scav) {
3133 RS->setRegUsed(Scav);
3134 MRI.replaceRegWith(PCReg, Scav);
3135 MRI.clearVirtRegs();
3136 } else {
3137 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3138 // SGPR spill.
3139 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3140 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3141 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3142 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3143 MRI.clearVirtRegs();
3144 }
3145
 // When no register could be scavenged we branch to RestoreBB (which
 // restores the spilled pair and falls through to the real destination).
3146 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3147 // Now, the distance could be defined.
3149 MCSymbolRefExpr::create(DestLabel, MCCtx),
3150 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3151 // Add offset assignments.
3152 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3153 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3154 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3155 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3156}
3157
3158unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3159 switch (Cond) {
3160 case SIInstrInfo::SCC_TRUE:
3161 return AMDGPU::S_CBRANCH_SCC1;
3162 case SIInstrInfo::SCC_FALSE:
3163 return AMDGPU::S_CBRANCH_SCC0;
3164 case SIInstrInfo::VCCNZ:
3165 return AMDGPU::S_CBRANCH_VCCNZ;
3166 case SIInstrInfo::VCCZ:
3167 return AMDGPU::S_CBRANCH_VCCZ;
3168 case SIInstrInfo::EXECNZ:
3169 return AMDGPU::S_CBRANCH_EXECNZ;
3170 case SIInstrInfo::EXECZ:
3171 return AMDGPU::S_CBRANCH_EXECZ;
3172 default:
3173 llvm_unreachable("invalid branch predicate");
3174 }
3175}
3176
3177SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3178 switch (Opcode) {
3179 case AMDGPU::S_CBRANCH_SCC0:
3180 return SCC_FALSE;
3181 case AMDGPU::S_CBRANCH_SCC1:
3182 return SCC_TRUE;
3183 case AMDGPU::S_CBRANCH_VCCNZ:
3184 return VCCNZ;
3185 case AMDGPU::S_CBRANCH_VCCZ:
3186 return VCCZ;
3187 case AMDGPU::S_CBRANCH_EXECNZ:
3188 return EXECNZ;
3189 case AMDGPU::S_CBRANCH_EXECZ:
3190 return EXECZ;
3191 default:
3192 return INVALID_BR;
3193 }
3194}
3195
3199 MachineBasicBlock *&FBB,
3201 bool AllowModify) const {
 // Analyze the branch structure starting at terminator I, following the
 // TargetInstrInfo::analyzeBranch contract: returns false (success) with
 // TBB/FBB/Cond filled in, or true if the terminators are not understood.
3202 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3203 // Unconditional Branch
3204 TBB = I->getOperand(0).getMBB();
3205 return false;
3206 }
3207
3208 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3209 if (Pred == INVALID_BR)
3210 return true;
3211
 // Record the predicate and the condition-register operand so that
 // insertBranch can rebuild an equivalent conditional branch later.
3212 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3213 Cond.push_back(MachineOperand::CreateImm(Pred));
3214 Cond.push_back(I->getOperand(1)); // Save the branch register.
3215
3216 ++I;
3217
3218 if (I == MBB.end()) {
3219 // Conditional branch followed by fall-through.
3220 TBB = CondBB;
3221 return false;
3222 }
3223
 // Conditional branch followed by an unconditional branch to FBB.
3224 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3225 TBB = CondBB;
3226 FBB = I->getOperand(0).getMBB();
3227 return false;
3228 }
3229
3230 return true;
3231}
3232
3234 MachineBasicBlock *&FBB,
3236 bool AllowModify) const {
 // Public analyzeBranch hook: skip over the exec-mask manipulation pseudos
 // that are artificially marked as terminators, then defer the real branch
 // analysis to analyzeBranchImpl.
3237 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3238 auto E = MBB.end();
3239 if (I == E)
3240 return false;
3241
3242 // Skip over the instructions that are artificially terminators for special
3243 // exec management.
3244 while (I != E && !I->isBranch() && !I->isReturn()) {
3245 switch (I->getOpcode()) {
3246 case AMDGPU::S_MOV_B64_term:
3247 case AMDGPU::S_XOR_B64_term:
3248 case AMDGPU::S_OR_B64_term:
3249 case AMDGPU::S_ANDN2_B64_term:
3250 case AMDGPU::S_AND_B64_term:
3251 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3252 case AMDGPU::S_MOV_B32_term:
3253 case AMDGPU::S_XOR_B32_term:
3254 case AMDGPU::S_OR_B32_term:
3255 case AMDGPU::S_ANDN2_B32_term:
3256 case AMDGPU::S_AND_B32_term:
3257 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3258 break;
3259 case AMDGPU::SI_IF:
3260 case AMDGPU::SI_ELSE:
3261 case AMDGPU::SI_KILL_I1_TERMINATOR:
3262 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3263 // FIXME: It's messy that these need to be considered here at all.
3264 return true;
3265 default:
3266 llvm_unreachable("unexpected non-branch terminator inst");
3267 }
3268
3269 ++I;
3270 }
3271
 // Only exec-management terminators: behaves like a fall-through block.
3272 if (I == E)
3273 return false;
3274
3275 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3276}
3277
3279 int *BytesRemoved) const {
 // Erase the branch/return terminators of MBB, leaving any artificial
 // (exec-management) terminators in place. Returns the number of
 // instructions removed; optionally reports their total size in bytes.
3280 unsigned Count = 0;
3281 unsigned RemovedSize = 0;
3282 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3283 // Skip over artificial terminators when removing instructions.
3284 if (MI.isBranch() || MI.isReturn()) {
3285 RemovedSize += getInstSizeInBytes(MI);
3286 MI.eraseFromParent();
3287 ++Count;
3288 }
3289 }
3290
3291 if (BytesRemoved)
3292 *BytesRemoved = RemovedSize;
3293
3294 return Count;
3295}
3296
3297// Copy the flags onto the implicit condition register operand.
3299 const MachineOperand &OrigCond) {
 // Transfer the undef/kill state recorded during branch analysis onto the
 // newly built instruction's condition-register operand.
3300 CondReg.setIsUndef(OrigCond.isUndef());
3301 CondReg.setIsKill(OrigCond.isKill());
3302}
3303
3306 MachineBasicBlock *FBB,
3308 const DebugLoc &DL,
3309 int *BytesAdded) const {
 // Re-materialize branches described by (TBB, FBB, Cond) per the
 // TargetInstrInfo::insertBranch contract. Returns the number of branch
 // instructions inserted; BytesAdded (if non-null) receives their size,
 // doubled per branch on subtargets with the offset-0x3f hardware bug.
3310 if (!FBB && Cond.empty()) {
3311 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3312 .addMBB(TBB);
3313 if (BytesAdded)
3314 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3315 return 1;
3316 }
3317
3318 assert(TBB && Cond[0].isImm());
3319
3320 unsigned Opcode
3321 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3322
3323 if (!FBB) {
 // Conditional branch with fall-through.
3324 MachineInstr *CondBr =
3325 BuildMI(&MBB, DL, get(Opcode))
3326 .addMBB(TBB);
3327
3328 // Copy the flags onto the implicit condition register operand.
3329 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3330 fixImplicitOperands(*CondBr);
3331
3332 if (BytesAdded)
3333 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3334 return 1;
3335 }
3336
3337 assert(TBB && FBB);
3338
 // Conditional branch to TBB plus unconditional branch to FBB.
3339 MachineInstr *CondBr =
3340 BuildMI(&MBB, DL, get(Opcode))
3341 .addMBB(TBB);
3342 fixImplicitOperands(*CondBr);
3343 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3344 .addMBB(FBB);
3345
3346 MachineOperand &CondReg = CondBr->getOperand(1);
3347 CondReg.setIsUndef(Cond[1].isUndef());
3348 CondReg.setIsKill(Cond[1].isKill());
3349
3350 if (BytesAdded)
3351 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3352
3353 return 2;
3354}
3355
 // Invert an analyzeBranch condition. The BranchPredicate values are
 // arranged so that negating the immediate yields the opposite predicate
 // (e.g. SCC_TRUE <-> SCC_FALSE). Returns false on success, true if the
 // condition is not in the expected two-operand form.
3358 if (Cond.size() != 2) {
3359 return true;
3360 }
3361
3362 if (Cond[0].isImm()) {
3363 Cond[0].setImm(-Cond[0].getImm());
3364 return false;
3365 }
3366
3367 return true;
3368}
3369
3372 Register DstReg, Register TrueReg,
3373 Register FalseReg, int &CondCycles,
3374 int &TrueCycles, int &FalseCycles) const {
 // Report whether if-conversion may replace a branch on this condition by
 // select instructions: VCC predicates require VGPR values (v_cndmask),
 // SCC predicates require SGPR values (s_cselect). Cycle counts are
 // estimated as one instruction per 32 bits of the register class.
3375 switch (Cond[0].getImm()) {
3376 case VCCNZ:
3377 case VCCZ: {
3378 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3379 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
 // Both select inputs must agree on the register class.
3380 if (MRI.getRegClass(FalseReg) != RC)
3381 return false;
3382
3383 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3384 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3385
3386 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3387 return RI.hasVGPRs(RC) && NumInsts <= 6;
3388 }
3389 case SCC_TRUE:
3390 case SCC_FALSE: {
3391 // FIXME: We could insert for VGPRs if we could replace the original compare
3392 // with a vector one.
3393 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3394 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3395 if (MRI.getRegClass(FalseReg) != RC)
3396 return false;
3397
3398 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3399
3400 // Multiples of 8 can do s_cselect_b64
3401 if (NumInsts % 2 == 0)
3402 NumInsts /= 2;
3403
3404 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3405 return RI.isSGPRClass(RC);
3406 }
3407 default:
3408 return false;
3409 }
3410}
3411
3415 Register TrueReg, Register FalseReg) const {
 // Materialize a select of TrueReg/FalseReg into DstReg under the analyzed
 // branch condition. 32-bit values use a single s_cselect_b32/v_cndmask;
 // wider values are split per 32- or 64-bit subregister and reassembled
 // with REG_SEQUENCE.
3416 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
 // Canonicalize inverted predicates by swapping the selected values.
3417 if (Pred == VCCZ || Pred == SCC_FALSE) {
3418 Pred = static_cast<BranchPredicate>(-Pred);
3419 std::swap(TrueReg, FalseReg);
3420 }
3421
3422 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3423 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3424 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3425
3426 if (DstSize == 32) {
3428 if (Pred == SCC_TRUE) {
3429 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3430 .addReg(TrueReg)
3431 .addReg(FalseReg);
3432 } else {
3433 // Instruction's operands are backwards from what is expected.
3434 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3435 .addReg(FalseReg)
3436 .addReg(TrueReg);
3437 }
3438
3439 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3440 return;
3441 }
3442
3443 if (DstSize == 64 && Pred == SCC_TRUE) {
3445 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3446 .addReg(TrueReg)
3447 .addReg(FalseReg);
3448
3449 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3450 return;
3451 }
3452
 // Subregister index tables for splitting wide selects into 32-bit or
 // 64-bit pieces.
3453 static const int16_t Sub0_15[] = {
3454 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3455 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3456 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3457 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3458 };
3459
3460 static const int16_t Sub0_15_64[] = {
3461 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3462 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3463 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3464 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3465 };
3466
3467 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3468 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3469 const int16_t *SubIndices = Sub0_15;
3470 int NElts = DstSize / 32;
3471
3472 // 64-bit select is only available for SALU.
3473 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3474 if (Pred == SCC_TRUE) {
3475 if (NElts % 2) {
3476 SelOp = AMDGPU::S_CSELECT_B32;
3477 EltRC = &AMDGPU::SGPR_32RegClass;
3478 } else {
3479 SelOp = AMDGPU::S_CSELECT_B64;
3480 EltRC = &AMDGPU::SGPR_64RegClass;
3481 SubIndices = Sub0_15_64;
3482 NElts /= 2;
3483 }
3484 }
3485
3487 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3488
 // Insert the per-element selects in front of the REG_SEQUENCE.
3489 I = MIB->getIterator();
3490
3492 for (int Idx = 0; Idx != NElts; ++Idx) {
3493 Register DstElt = MRI.createVirtualRegister(EltRC);
3494 Regs.push_back(DstElt);
3495
3496 unsigned SubIdx = SubIndices[Idx];
3497
3499 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
 // v_cndmask takes (false, true); the scalar selects take (true, false).
3500 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3501 .addReg(FalseReg, {}, SubIdx)
3502 .addReg(TrueReg, {}, SubIdx);
3503 } else {
3504 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3505 .addReg(TrueReg, {}, SubIdx)
3506 .addReg(FalseReg, {}, SubIdx);
3507 }
3508
3509 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3511
3512 MIB.addReg(DstElt)
3513 .addImm(SubIdx);
3514 }
3515}
3516
 // Returns true for opcodes that move a single value (mov / copy /
 // accvgpr read-write and the mov-immediate pseudos) and are therefore
 // candidates for immediate folding.
3518 switch (MI.getOpcode()) {
3519 case AMDGPU::V_MOV_B16_t16_e32:
3520 case AMDGPU::V_MOV_B16_t16_e64:
3521 case AMDGPU::V_MOV_B32_e32:
3522 case AMDGPU::V_MOV_B32_e64:
3523 case AMDGPU::V_MOV_B64_PSEUDO:
3524 case AMDGPU::V_MOV_B64_e32:
3525 case AMDGPU::V_MOV_B64_e64:
3526 case AMDGPU::S_MOV_B32:
3527 case AMDGPU::S_MOV_B64:
3528 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3529 case AMDGPU::COPY:
3530 case AMDGPU::WWM_COPY:
3531 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3532 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3533 case AMDGPU::V_ACCVGPR_MOV_B32:
3534 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3535 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3536 return true;
3537 default:
3538 return false;
3539 }
3540}
3541
 // For a foldable copy (same opcode set as isFoldableCopy), returns 2 for
 // the 16-bit t16 movs and 1 for everything else.
 // NOTE(review): presumably the operand index of the copy's source value
 // (the t16 movs carry an extra src-modifiers operand before src0) —
 // confirm against the callers.
3543 switch (MI.getOpcode()) {
3544 case AMDGPU::V_MOV_B16_t16_e32:
3545 case AMDGPU::V_MOV_B16_t16_e64:
3546 return 2;
3547 case AMDGPU::V_MOV_B32_e32:
3548 case AMDGPU::V_MOV_B32_e64:
3549 case AMDGPU::V_MOV_B64_PSEUDO:
3550 case AMDGPU::V_MOV_B64_e32:
3551 case AMDGPU::V_MOV_B64_e64:
3552 case AMDGPU::S_MOV_B32:
3553 case AMDGPU::S_MOV_B64:
3554 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3555 case AMDGPU::COPY:
3556 case AMDGPU::WWM_COPY:
3557 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3558 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3559 case AMDGPU::V_ACCVGPR_MOV_B32:
3560 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3561 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3562 return 1;
3563 default:
3564 llvm_unreachable("MI is not a foldable copy");
3565 }
3566}
3567
// Named modifier operands stripped by removeModOperands when an instruction
// is rewritten to a form without source modifiers. Listed in operand order.
3568static constexpr AMDGPU::OpName ModifierOpNames[] = {
3569 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3570 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3571 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3572
 // Remove every ModifierOpNames operand that this opcode defines.
3574 unsigned Opc = MI.getOpcode();
 // Iterate in reverse: getNamedOperandIdx indices are computed from the
 // opcode, so they stay valid only if operands are removed from the
 // highest index downward.
3575 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3576 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3577 if (Idx >= 0)
3578 MI.removeOperand(Idx);
3579 }
3580}
3581
3583 const MCInstrDesc &NewDesc) const {
 // Swap in a new instruction descriptor and trim any implicit operands
 // that the new opcode does not declare.
3584 MI.setDesc(NewDesc);
3585
3586 // Remove any leftover implicit operands from mutating the instruction. e.g.
3587 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3588 // anymore.
3589 const MCInstrDesc &Desc = MI.getDesc();
3590 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3591 Desc.implicit_defs().size();
3592
 // Delete from the tail down to the first index covered by the new
 // descriptor's declared + implicit operand count.
3593 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3594 MI.removeOperand(I);
3595}
3596
3597std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3598 unsigned SubRegIndex) {
3599 switch (SubRegIndex) {
3600 case AMDGPU::NoSubRegister:
3601 return Imm;
3602 case AMDGPU::sub0:
3603 return SignExtend64<32>(Imm);
3604 case AMDGPU::sub1:
3605 return SignExtend64<32>(Imm >> 32);
3606 case AMDGPU::lo16:
3607 return SignExtend64<16>(Imm);
3608 case AMDGPU::hi16:
3609 return SignExtend64<16>(Imm >> 16);
3610 case AMDGPU::sub1_lo16:
3611 return SignExtend64<16>(Imm >> 32);
3612 case AMDGPU::sub1_hi16:
3613 return SignExtend64<16>(Imm >> 48);
3614 default:
3615 return std::nullopt;
3616 }
3617
3618 llvm_unreachable("covered subregister switch");
3619}
3620
3621static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3622 switch (Opc) {
3623 case AMDGPU::V_MAC_F16_e32:
3624 case AMDGPU::V_MAC_F16_e64:
3625 case AMDGPU::V_MAD_F16_e64:
3626 return AMDGPU::V_MADAK_F16;
3627 case AMDGPU::V_MAC_F32_e32:
3628 case AMDGPU::V_MAC_F32_e64:
3629 case AMDGPU::V_MAD_F32_e64:
3630 return AMDGPU::V_MADAK_F32;
3631 case AMDGPU::V_FMAC_F32_e32:
3632 case AMDGPU::V_FMAC_F32_e64:
3633 case AMDGPU::V_FMA_F32_e64:
3634 return AMDGPU::V_FMAAK_F32;
3635 case AMDGPU::V_FMAC_F16_e32:
3636 case AMDGPU::V_FMAC_F16_e64:
3637 case AMDGPU::V_FMAC_F16_t16_e64:
3638 case AMDGPU::V_FMAC_F16_fake16_e64:
3639 case AMDGPU::V_FMAC_F16_t16_e32:
3640 case AMDGPU::V_FMAC_F16_fake16_e32:
3641 case AMDGPU::V_FMA_F16_e64:
3642 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3643 ? AMDGPU::V_FMAAK_F16_t16
3644 : AMDGPU::V_FMAAK_F16_fake16
3645 : AMDGPU::V_FMAAK_F16;
3646 case AMDGPU::V_FMAC_F64_e32:
3647 case AMDGPU::V_FMAC_F64_e64:
3648 case AMDGPU::V_FMA_F64_e64:
3649 return AMDGPU::V_FMAAK_F64;
3650 default:
3651 llvm_unreachable("invalid instruction");
3652 }
3653}
3654
3655static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3656 switch (Opc) {
3657 case AMDGPU::V_MAC_F16_e32:
3658 case AMDGPU::V_MAC_F16_e64:
3659 case AMDGPU::V_MAD_F16_e64:
3660 return AMDGPU::V_MADMK_F16;
3661 case AMDGPU::V_MAC_F32_e32:
3662 case AMDGPU::V_MAC_F32_e64:
3663 case AMDGPU::V_MAD_F32_e64:
3664 return AMDGPU::V_MADMK_F32;
3665 case AMDGPU::V_FMAC_F32_e32:
3666 case AMDGPU::V_FMAC_F32_e64:
3667 case AMDGPU::V_FMA_F32_e64:
3668 return AMDGPU::V_FMAMK_F32;
3669 case AMDGPU::V_FMAC_F16_e32:
3670 case AMDGPU::V_FMAC_F16_e64:
3671 case AMDGPU::V_FMAC_F16_t16_e64:
3672 case AMDGPU::V_FMAC_F16_fake16_e64:
3673 case AMDGPU::V_FMAC_F16_t16_e32:
3674 case AMDGPU::V_FMAC_F16_fake16_e32:
3675 case AMDGPU::V_FMA_F16_e64:
3676 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3677 ? AMDGPU::V_FMAMK_F16_t16
3678 : AMDGPU::V_FMAMK_F16_fake16
3679 : AMDGPU::V_FMAMK_F16;
3680 case AMDGPU::V_FMAC_F64_e32:
3681 case AMDGPU::V_FMAC_F64_e64:
3682 case AMDGPU::V_FMA_F64_e64:
3683 return AMDGPU::V_FMAMK_F64;
3684 default:
3685 llvm_unreachable("invalid instruction");
3686 }
3687}
3688
3690 Register Reg, MachineRegisterInfo *MRI) const {
3691 int64_t Imm;
3692 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3693 return false;
3694
3695 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3696
3697 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3698
3699 unsigned Opc = UseMI.getOpcode();
3700 if (Opc == AMDGPU::COPY) {
3701 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3702
3703 Register DstReg = UseMI.getOperand(0).getReg();
3704 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3705
3706 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3707
3708 if (HasMultipleUses) {
3709 // TODO: This should fold in more cases with multiple use, but we need to
3710 // more carefully consider what those uses are.
3711 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3712
3713 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3714 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3715 return false;
3716
3717 // Most of the time folding a 32-bit inline constant is free (though this
3718 // might not be true if we can't later fold it into a real user).
3719 //
3720 // FIXME: This isInlineConstant check is imprecise if
3721 // getConstValDefinedInReg handled the tricky non-mov cases.
3722 if (ImmDefSize == 32 &&
3724 return false;
3725 }
3726
3727 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3728 RI.getSubRegIdxSize(UseSubReg) == 16;
3729
3730 if (Is16Bit) {
3731 if (RI.hasVGPRs(DstRC))
3732 return false; // Do not clobber vgpr_hi16
3733
3734 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3735 return false;
3736 }
3737
3738 MachineFunction *MF = UseMI.getMF();
3739
3740 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3741 MCRegister MovDstPhysReg =
3742 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3743
3744 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3745
3746 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3747 for (unsigned MovOp :
3748 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3749 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3750 const MCInstrDesc &MovDesc = get(MovOp);
3751
3752 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3753 if (Is16Bit) {
3754 // We just need to find a correctly sized register class, so the
3755 // subregister index compatibility doesn't matter since we're statically
3756 // extracting the immediate value.
3757 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3758 if (!MovDstRC)
3759 continue;
3760
3761 if (MovDstPhysReg) {
3762 // FIXME: We probably should not do this. If there is a live value in
3763 // the high half of the register, it will be corrupted.
3764 MovDstPhysReg =
3765 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3766 if (!MovDstPhysReg)
3767 continue;
3768 }
3769 }
3770
3771 // Result class isn't the right size, try the next instruction.
3772 if (MovDstPhysReg) {
3773 if (!MovDstRC->contains(MovDstPhysReg))
3774 return false;
3775 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3776 // TODO: This will be overly conservative in the case of 16-bit virtual
3777 // SGPRs. We could hack up the virtual register uses to use a compatible
3778 // 32-bit class.
3779 continue;
3780 }
3781
3782 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3783
3784 // Ensure the interpreted immediate value is a valid operand in the new
3785 // mov.
3786 //
3787 // FIXME: isImmOperandLegal should have form that doesn't require existing
3788 // MachineInstr or MachineOperand
3789 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3790 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3791 break;
3792
3793 NewOpc = MovOp;
3794 break;
3795 }
3796
3797 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3798 return false;
3799
3800 if (Is16Bit) {
3801 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3802 if (MovDstPhysReg)
3803 UseMI.getOperand(0).setReg(MovDstPhysReg);
3804 assert(UseMI.getOperand(1).getReg().isVirtual());
3805 }
3806
3807 const MCInstrDesc &NewMCID = get(NewOpc);
3808 UseMI.setDesc(NewMCID);
3809 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3810 UseMI.addImplicitDefUseOperands(*MF);
3811 return true;
3812 }
3813
3814 if (HasMultipleUses)
3815 return false;
3816
3817 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3818 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3819 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3820 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3821 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3822 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3823 Opc == AMDGPU::V_FMAC_F64_e64) {
3824 // Don't fold if we are using source or output modifiers. The new VOP2
3825 // instructions don't have them.
3827 return false;
3828
3829 // If this is a free constant, there's no reason to do this.
3830 // TODO: We could fold this here instead of letting SIFoldOperands do it
3831 // later.
3832 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3833
3834 // Any src operand can be used for the legality check.
3835 if (isInlineConstant(UseMI, Src0Idx, Imm))
3836 return false;
3837
3838 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3839
3840 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3841 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3842
3843 auto CopyRegOperandToNarrowerRC =
3844 [MRI, this](MachineInstr &MI, unsigned OpNo,
3845 const TargetRegisterClass *NewRC) -> void {
3846 if (!MI.getOperand(OpNo).isReg())
3847 return;
3848 Register Reg = MI.getOperand(OpNo).getReg();
3849 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3850 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3851 return;
3852 Register Tmp = MRI->createVirtualRegister(NewRC);
3853 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3854 get(AMDGPU::COPY), Tmp)
3855 .addReg(Reg);
3856 MI.getOperand(OpNo).setReg(Tmp);
3857 MI.getOperand(OpNo).setIsKill();
3858 };
3859
3860 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3861 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3862 (Src1->isReg() && Src1->getReg() == Reg)) {
3863 MachineOperand *RegSrc =
3864 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3865 if (!RegSrc->isReg())
3866 return false;
3867 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3868 ST.getConstantBusLimit(Opc) < 2)
3869 return false;
3870
3871 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3872 return false;
3873
3874 // If src2 is also a literal constant then we have to choose which one to
3875 // fold. In general it is better to choose madak so that the other literal
3876 // can be materialized in an sgpr instead of a vgpr:
3877 // s_mov_b32 s0, literal
3878 // v_madak_f32 v0, s0, v0, literal
3879 // Instead of:
3880 // v_mov_b32 v1, literal
3881 // v_madmk_f32 v0, v0, literal, v1
3882 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3883 if (Def && Def->isMoveImmediate() &&
3884 !isInlineConstant(Def->getOperand(1)))
3885 return false;
3886
3887 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3888 if (pseudoToMCOpcode(NewOpc) == -1)
3889 return false;
3890
3891 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3892 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3893
3894 // FIXME: This would be a lot easier if we could return a new instruction
3895 // instead of having to modify in place.
3896
3897 Register SrcReg = RegSrc->getReg();
3898 unsigned SrcSubReg = RegSrc->getSubReg();
3899 Src0->setReg(SrcReg);
3900 Src0->setSubReg(SrcSubReg);
3901 Src0->setIsKill(RegSrc->isKill());
3902
3903 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3904 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3905 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3906 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3907 UseMI.untieRegOperand(
3908 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3909
3910 Src1->ChangeToImmediate(*SubRegImm);
3911
3913 UseMI.setDesc(get(NewOpc));
3914
3915 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3916 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3917 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3918 Register Tmp = MRI->createVirtualRegister(NewRC);
3919 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3920 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3921 UseMI.getOperand(0).getReg())
3922 .addReg(Tmp, RegState::Kill);
3923 UseMI.getOperand(0).setReg(Tmp);
3924 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3925 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3926 }
3927
3928 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3929 if (DeleteDef)
3930 DefMI.eraseFromParent();
3931
3932 return true;
3933 }
3934
3935 // Added part is the constant: Use v_madak_{f16, f32}.
3936 if (Src2->isReg() && Src2->getReg() == Reg) {
3937 if (ST.getConstantBusLimit(Opc) < 2) {
3938 // Not allowed to use constant bus for another operand.
3939 // We can however allow an inline immediate as src0.
3940 bool Src0Inlined = false;
3941 if (Src0->isReg()) {
3942 // Try to inline constant if possible.
3943 // If the Def moves immediate and the use is single
3944 // We are saving VGPR here.
3945 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3946 if (Def && Def->isMoveImmediate() &&
3947 isInlineConstant(Def->getOperand(1)) &&
3948 MRI->hasOneNonDBGUse(Src0->getReg())) {
3949 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3950 Src0Inlined = true;
3951 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3952 RI.isSGPRReg(*MRI, Src0->getReg())) {
3953 return false;
3954 }
3955 // VGPR is okay as Src0 - fallthrough
3956 }
3957
3958 if (Src1->isReg() && !Src0Inlined) {
3959 // We have one slot for inlinable constant so far - try to fill it
3960 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3961 if (Def && Def->isMoveImmediate() &&
3962 isInlineConstant(Def->getOperand(1)) &&
3963 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3964 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3965 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3966 return false;
3967 // VGPR is okay as Src1 - fallthrough
3968 }
3969 }
3970
3971 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3972 if (pseudoToMCOpcode(NewOpc) == -1)
3973 return false;
3974
3975 // FIXME: This would be a lot easier if we could return a new instruction
3976 // instead of having to modify in place.
3977
3978 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3979 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3980 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3981 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3982 UseMI.untieRegOperand(
3983 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3984
3985 const std::optional<int64_t> SubRegImm =
3986 extractSubregFromImm(Imm, Src2->getSubReg());
3987
3988 // ChangingToImmediate adds Src2 back to the instruction.
3989 Src2->ChangeToImmediate(*SubRegImm);
3990
3991 // These come before src2.
3993 UseMI.setDesc(get(NewOpc));
3994
3995 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3996 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3997 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3998 Register Tmp = MRI->createVirtualRegister(NewRC);
3999 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
4000 UseMI.getDebugLoc(), get(AMDGPU::COPY),
4001 UseMI.getOperand(0).getReg())
4002 .addReg(Tmp, RegState::Kill);
4003 UseMI.getOperand(0).setReg(Tmp);
4004 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
4005 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
4006 }
4007
4008 // It might happen that UseMI was commuted
4009 // and we now have SGPR as SRC1. If so 2 inlined
4010 // constant and SGPR are illegal.
4012
4013 bool DeleteDef = MRI->use_nodbg_empty(Reg);
4014 if (DeleteDef)
4015 DefMI.eraseFromParent();
4016
4017 return true;
4018 }
4019 }
4020
4021 return false;
4022}
4023
// Return true if the two base-operand lists are element-wise identical
// (same length, and each pair of MachineOperands compares isIdenticalTo).
// Used to establish that two memory instructions address off the same base.
// NOTE(review): the parameter list (presumably two
// ArrayRef<const MachineOperand *>, per the call site in
// checkInstOffsetsDoNotOverlap) is elided in this extract — confirm.
static bool
  if (BaseOps1.size() != BaseOps2.size())
    return false;
  for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
    // Structural comparison, not pointer identity.
    if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
      return false;
  }
  return true;
}
4035
4036static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
4037 LocationSize WidthB, int OffsetB) {
4038 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
4039 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
4040 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
4041 return LowWidth.hasValue() &&
4042 LowOffset + (int)LowWidth.getValue() <= HighOffset;
4043}
4044
4045bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
4046 const MachineInstr &MIb) const {
4047 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
4048 int64_t Offset0, Offset1;
4049 LocationSize Dummy0 = LocationSize::precise(0);
4050 LocationSize Dummy1 = LocationSize::precise(0);
4051 bool Offset0IsScalable, Offset1IsScalable;
4052 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
4053 Dummy0, &RI) ||
4054 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
4055 Dummy1, &RI))
4056 return false;
4057
4058 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
4059 return false;
4060
4061 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
4062 // FIXME: Handle ds_read2 / ds_write2.
4063 return false;
4064 }
4065 LocationSize Width0 = MIa.memoperands().front()->getSize();
4066 LocationSize Width1 = MIb.memoperands().front()->getSize();
4067 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
4068}
4069
// Per-encoding-class (DS / MUBUF / MTBUF / SMRD / FLAT) disjointness check
// for two memory instructions. Conservatively returns false when aliasing
// cannot be ruled out.
// NOTE(review): the first line of the signature is elided in this extract —
// presumably `bool SIInstrInfo::areMemAccessesTriviallyDisjoint(...)`.
                                                  const MachineInstr &MIb) const {
  assert(MIa.mayLoadOrStore() &&
         "MIa must load from or modify a memory location");
  assert(MIb.mayLoadOrStore() &&
         "MIb must load from or modify a memory location");

  // NOTE(review): the condition guarding this early-out is elided in this
  // extract — confirm against the upstream source.
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Conservatively treat LDS DMA as aliasing everything.
  if (isLDSDMA(MIa) || isLDSDMA(MIb))
    return false;

  // Bundles may contain multiple accesses; don't reason about them.
  if (MIa.isBundle() || MIb.isBundle())
    return false;

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
  if (isDS(MIa)) {
    if (isDS(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    // DS vs non-DS: disjoint unless MIb is a FLAT access (segment-specific
    // flats never touch LDS).
    return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
  }

  if (isMUBUF(MIa) || isMTBUF(MIa)) {
    if (isMUBUF(MIb) || isMTBUF(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    if (isFLAT(MIb))
      return isFLATScratch(MIb);

    return !isSMRD(MIb);
  }

  if (isSMRD(MIa)) {
    if (isSMRD(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    if (isFLAT(MIb))
      return isFLATScratch(MIb);

    return !isMUBUF(MIb) && !isMTBUF(MIb);
  }

  if (isFLAT(MIa)) {
    if (isFLAT(MIb)) {
      // Global and scratch flat segments are disjoint by construction.
      if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
          (isFLATGlobal(MIa) && isFLATScratch(MIb)))
        return true;

      return checkInstOffsetsDoNotOverlap(MIa, MIb);
    }

    return false;
  }

  return false;
}
4136
4138 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4139 if (Reg.isPhysical())
4140 return false;
4141 auto *Def = MRI.getUniqueVRegDef(Reg);
4142 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4143 Imm = Def->getOperand(1).getImm();
4144 if (DefMI)
4145 *DefMI = Def;
4146 return true;
4147 }
4148 return false;
4149}
4150
4151static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4152 MachineInstr **DefMI = nullptr) {
4153 if (!MO->isReg())
4154 return false;
4155 const MachineFunction *MF = MO->getParent()->getMF();
4156 const MachineRegisterInfo &MRI = MF->getRegInfo();
4157 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4158}
4159
4161 MachineInstr &NewMI) {
4162 if (LV) {
4163 unsigned NumOps = MI.getNumOperands();
4164 for (unsigned I = 1; I < NumOps; ++I) {
4165 MachineOperand &Op = MI.getOperand(I);
4166 if (Op.isReg() && Op.isKill())
4167 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4168 }
4169 }
4170}
4171
4172static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4173 switch (Opc) {
4174 case AMDGPU::V_MAC_F16_e32:
4175 case AMDGPU::V_MAC_F16_e64:
4176 return AMDGPU::V_MAD_F16_e64;
4177 case AMDGPU::V_MAC_F32_e32:
4178 case AMDGPU::V_MAC_F32_e64:
4179 return AMDGPU::V_MAD_F32_e64;
4180 case AMDGPU::V_MAC_LEGACY_F32_e32:
4181 case AMDGPU::V_MAC_LEGACY_F32_e64:
4182 return AMDGPU::V_MAD_LEGACY_F32_e64;
4183 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4184 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4185 return AMDGPU::V_FMA_LEGACY_F32_e64;
4186 case AMDGPU::V_FMAC_F16_e32:
4187 case AMDGPU::V_FMAC_F16_e64:
4188 case AMDGPU::V_FMAC_F16_t16_e64:
4189 case AMDGPU::V_FMAC_F16_fake16_e64:
4190 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4191 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4192 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4193 : AMDGPU::V_FMA_F16_gfx9_e64;
4194 case AMDGPU::V_FMAC_F32_e32:
4195 case AMDGPU::V_FMAC_F32_e64:
4196 return AMDGPU::V_FMA_F32_e64;
4197 case AMDGPU::V_FMAC_F64_e32:
4198 case AMDGPU::V_FMAC_F64_e64:
4199 return AMDGPU::V_FMA_F64_e64;
4200 default:
4201 llvm_unreachable("invalid instruction");
4202 }
4203}
4204
4205/// Helper struct for the implementation of 3-address conversion to communicate
4206/// updates made to instruction operands.
4208 /// Other instruction whose def is no longer used by the converted
4209 /// instruction.
4211};
4212
// Wrapper around convertToThreeAddressImpl that additionally handles
// single-instruction bundles and keeps LiveVariables/LiveIntervals in sync.
// NOTE(review): the first line of the signature (return type, name, and the
// MachineInstr parameter) is elided in this extract.
                                         LiveVariables *LV,
                                         LiveIntervals *LIS) const {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineInstr *CandidateMI = &MI;

  if (MI.isBundle()) {
    // This is a temporary placeholder for bundle handling that enables us to
    // exercise the relevant code paths in the two-address instruction pass.
    if (MI.getBundleSize() != 1)
      return nullptr;
    // Convert the one instruction inside the bundle.
    CandidateMI = MI.getNextNode();
  }

  // NOTE(review): the declaration of the ThreeAddressUpdates value `U` is
  // elided in this extract.
  MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
  if (!NewMI)
    return nullptr;

  if (MI.isBundle()) {
    // The converted instruction replaces the bundled one; remove it and
    // untie any tied defs remaining on the BUNDLE header.
    CandidateMI->eraseFromBundle();

    for (MachineOperand &MO : MI.all_defs()) {
      if (MO.isTied())
        MI.untieRegOperand(MO.getOperandNo());
    }
  } else {
    updateLiveVariables(LV, MI, *NewMI);
    if (LIS) {
      LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
      // SlotIndex of defs needs to be updated when converting to early-clobber
      MachineOperand &Def = NewMI->getOperand(0);
      if (Def.isEarlyClobber() && Def.isReg() &&
          LIS->hasInterval(Def.getReg())) {
        SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
        SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
        auto &LI = LIS->getInterval(Def.getReg());
        // Move the def's start (and value-number def slot) from the normal
        // register slot to the early-clobber slot, in the main range and all
        // subranges.
        auto UpdateDefIndex = [&](LiveRange &LR) {
          auto *S = LR.find(OldIndex);
          if (S != LR.end() && S->start == OldIndex) {
            assert(S->valno && S->valno->def == OldIndex);
            S->start = NewIndex;
            S->valno->def = NewIndex;
          }
        };
        UpdateDefIndex(LI);
        for (auto &SR : LI.subranges())
          UpdateDefIndex(SR);
      }
    }
  }

  if (U.RemoveMIUse) {
    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    // The only user is the instruction which will be killed.
    Register DefReg = U.RemoveMIUse->getOperand(0).getReg();

    if (MRI.hasOneNonDBGUse(DefReg)) {
      // We cannot just remove the DefMI here, calling pass will crash.
      U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
      U.RemoveMIUse->getOperand(0).setIsDead(true);
      // Strip every operand except the (now implicitly defined) result.
      for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
        U.RemoveMIUse->removeOperand(I);
      if (LV)
        LV->getVarInfo(DefReg).AliveBlocks.clear();
    }

    if (MI.isBundle()) {
      VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
      if (!VRI.Reads && !VRI.Writes) {
        // The bundle no longer touches DefReg; drop its use operand from the
        // BUNDLE header.
        for (MachineOperand &MO : MI.all_uses()) {
          if (MO.isReg() && MO.getReg() == DefReg) {
            assert(MO.getSubReg() == 0 &&
                   "tied sub-registers in bundles currently not supported");
            MI.removeOperand(MO.getOperandNo());
            break;
          }
        }

        if (LIS)
          LIS->shrinkToUses(&LIS->getInterval(DefReg));
      }
    } else if (LIS) {
      LiveInterval &DefLI = LIS->getInterval(DefReg);

      // We cannot delete the original instruction here, so hack out the use
      // in the original instruction with a dummy register so we can use
      // shrinkToUses to deal with any multi-use edge cases. Other targets do
      // not have the complexity of deleting a use to consider here.
      Register DummyReg = MRI.cloneVirtualRegister(DefReg);
      for (MachineOperand &MIOp : MI.uses()) {
        if (MIOp.isReg() && MIOp.getReg() == DefReg) {
          MIOp.setIsUndef(true);
          MIOp.setReg(DummyReg);
        }
      }

      if (MI.isBundle()) {
        VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
        if (!VRI.Reads && !VRI.Writes) {
          for (MachineOperand &MIOp : MI.uses()) {
            if (MIOp.isReg() && MIOp.getReg() == DefReg) {
              MIOp.setIsUndef(true);
              MIOp.setReg(DummyReg);
            }
          }
        }

        MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
                                                false, /*isUndef=*/true));
      }

      LIS->shrinkToUses(&DefLI);
    }
  }

  // For bundles the rewritten BUNDLE header stands in for the result.
  return MI.isBundle() ? &MI : NewMI;
}
4331
// Core of 2-address -> 3-address conversion: handles MFMA, WMMA, and
// MAC/FMAC -> MAD/FMA (with FMAAK/FMAMK immediate-folding special cases).
// Returns the new instruction, or nullptr if no conversion applies.
// NOTE(review): the return-type line of the signature is elided in this
// extract.
SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
                                       ThreeAddressUpdates &U) const {
  MachineBasicBlock &MBB = *MI.getParent();
  unsigned Opc = MI.getOpcode();

  // Handle MFMA.
  int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
  if (NewMFMAOpc != -1) {
    // NOTE(review): the preceding declaration line (`MachineInstrBuilder
    // MIB =`, per the uses below) is elided in this extract.
        BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
    for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
      MIB.add(MI.getOperand(I));
    return MIB;
  }

  if (SIInstrInfo::isWMMA(MI)) {
    unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
    MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                                  .setMIFlags(MI.getFlags());
    for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
      MIB->addOperand(MI.getOperand(I));
    return MIB;
  }

  assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
         Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
         "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
         "present pre-RA");

  // Handle MAC/FMAC.
  bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
  bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
                  Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
                  Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
                  Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
  bool Src0Literal = false;

  switch (Opc) {
  default:
    return nullptr;
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_t16_e64:
  case AMDGPU::V_FMAC_F16_fake16_e64:
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAC_LEGACY_F32_e64:
  case AMDGPU::V_FMAC_F32_e64:
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
  case AMDGPU::V_FMAC_F64_e64:
    break;
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_MAC_F32_e32:
  case AMDGPU::V_MAC_LEGACY_F32_e32:
  case AMDGPU::V_FMAC_F32_e32:
  case AMDGPU::V_FMAC_LEGACY_F32_e32:
  case AMDGPU::V_FMAC_F64_e32: {
    // VOP2 forms may carry a non-inline (literal) src0 immediate.
    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::src0);
    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
    if (!Src0->isReg() && !Src0->isImm())
      return nullptr;

    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
      Src0Literal = true;

    break;
  }
  }

  MachineInstrBuilder MIB;
  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
  const MachineOperand *Src0Mods =
      getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mods =
      getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
  const MachineOperand *Src2Mods =
      getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
  const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);

  // Try the FMAAK/FMAMK immediate-folding forms first; they have no
  // modifier operands, so bail to plain MAD/FMA if any modifier is set.
  if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
      (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
      // If we have an SGPR input, we will violate the constant bus restriction.
      (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
       !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
    MachineInstr *DefMI;

    int64_t Imm;
    // src2 foldable to an immediate -> FMAAK (addend is the constant).
    if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
      unsigned NewOpc = getNewFMAAKInst(ST, Opc);
      if (pseudoToMCOpcode(NewOpc) != -1) {
        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src0)
                  .add(*Src1)
                  .addImm(Imm)
                  .setMIFlags(MI.getFlags());
        U.RemoveMIUse = DefMI;
        return MIB;
      }
    }
    // src1 (or src0, below) foldable -> FMAMK (multiplicand is the constant).
    unsigned NewOpc = getNewFMAMKInst(ST, Opc);
    if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
      if (pseudoToMCOpcode(NewOpc) != -1) {
        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src0)
                  .addImm(Imm)
                  .add(*Src2)
                  .setMIFlags(MI.getFlags());
        U.RemoveMIUse = DefMI;
        return MIB;
      }
    }
    if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
      if (Src0Literal) {
        // The literal came directly from the instruction; there is no
        // defining move to erase.
        Imm = Src0->getImm();
        DefMI = nullptr;
      }
      // NOTE(review): one line is elided here in this extract — presumably
      // a call checking Src1's legality in the src0 slot (operands swap).
      if (pseudoToMCOpcode(NewOpc) != -1 &&
              MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
              Src1)) {
        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src1)
                  .addImm(Imm)
                  .add(*Src2)
                  .setMIFlags(MI.getFlags());
        U.RemoveMIUse = DefMI;
        return MIB;
      }
    }
  }

  // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
  // if VOP3 does not allow a literal operand.
  if (Src0Literal && !ST.hasVOP3Literal())
    return nullptr;

  unsigned NewOpc = getNewFMAInst(ST, Opc);

  if (pseudoToMCOpcode(NewOpc) == -1)
    return nullptr;

  MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
            .add(*Dst)
            .addImm(Src0Mods ? Src0Mods->getImm() : 0)
            .add(*Src0)
            .addImm(Src1Mods ? Src1Mods->getImm() : 0)
            .add(*Src1)
            .addImm(Src2Mods ? Src2Mods->getImm() : 0)
            .add(*Src2)
            .addImm(Clamp ? Clamp->getImm() : 0)
            .addImm(Omod ? Omod->getImm() : 0)
            .setMIFlags(MI.getFlags());
  if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
    MIB.addImm(OpSel ? OpSel->getImm() : 0);
  return MIB;
}
4498
// It's not generally safe to move VALU instructions across these since it will
// start using the register as a base index rather than directly.
// XXX - Why isn't hasSideEffects sufficient for these?
// NOTE(review): the signature line is elided in this extract; per the body
// this takes a const MachineInstr & and returns bool.
  switch (MI.getOpcode()) {
  case AMDGPU::S_SET_GPR_IDX_ON:
  case AMDGPU::S_SET_GPR_IDX_MODE:
  case AMDGPU::S_SET_GPR_IDX_OFF:
    return true;
  default:
    return false;
  }
}
4512
// Decide whether MI must act as a scheduling boundary (nothing may be moved
// across it).
// NOTE(review): the first line of the signature (name and the MachineInstr
// parameter) is elided in this extract.
                                 const MachineBasicBlock *MBB,
                                 const MachineFunction &MF) const {
  // Skipping the check for SP writes in the base implementation. The reason it
  // was added was apparently due to compile time concerns.
  //
  // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
  // but is probably avoidable.

  // Copied from base implementation.
  // Terminators and labels can't be scheduled around.
  if (MI.isTerminator() || MI.isPosition())
    return true;

  // INLINEASM_BR can jump to another block
  if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
    return true;

  // A SCHED_BARRIER with a zero mask is a full boundary.
  if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
    return true;

  // Target-independent instructions do not have an implicit-use of EXEC, even
  // when they operate on VGPRs. Treating EXEC modifications as scheduling
  // boundaries prevents incorrect movements of such instructions.
  return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
         MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
         MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
         MI.getOpcode() == AMDGPU::S_SETPRIO ||
         MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
         // NOTE(review): the final operand of this disjunction is elided in
         // this extract.
}
4544
// NOTE(review): the signature line is elided in this extract. Per the body,
// this classifies opcodes by value; the listed DS_* opcodes and anything
// matching isGWS() are presumably the GDS-only operations — confirm against
// the header.
  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
         Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
         Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}
4550
// Conservatively determine whether MI may access the scratch (private)
// address space.
// NOTE(review): the signature line is elided in this extract; per the body
// this takes a const MachineInstr &MI and returns bool.
  // Instructions that access scratch use FLAT encoding or BUF encodings.
  if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
    return false;

  // SCRATCH instructions always access scratch.
  if (isFLATScratch(MI))
    return true;

  // If FLAT_SCRATCH registers are not initialized, we can never access scratch
  // via the aperture.
  if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access scratch.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves scratch.
  return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::FLAT_ADDRESS) {
      // A flat access is only known scratch-free if !noalias.addrspace
      // metadata excludes the private address space.
      const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
      return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
                        *MD, AMDGPUAS::PRIVATE_ADDRESS);
    }
    return AS == AMDGPUAS::PRIVATE_ADDRESS;
  });
}
4581
// Conservatively determine whether a FLAT instruction may access VMEM.
// NOTE(review): the signature line is elided in this extract; per the
// assert below MI must satisfy isFLAT().
  assert(isFLAT(MI));

  // All flat instructions use the VMEM counter except prefetch.
  if (!usesVM_CNT(MI))
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access VMEM.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
  // (GDS) address space is not supported by flat operations. Therefore, simply
  // return true unless only the LDS address space is found.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    // NOTE(review): one line is elided here in this extract, between the AS
    // read and the LDS test below.
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      return true;
  }

  return false;
}
4608
// Conservatively determine whether a FLAT instruction may access LDS.
// NOTE(review): the signature line is elided in this extract; per the
// assert below MI must satisfy isFLAT().
  assert(isFLAT(MI));

  // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
  if (!usesLGKM_CNT(MI))
    return false;

  // If in tgsplit mode then there can be no use of LDS.
  if (ST.isTgSplitEnabled())
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access LDS.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves LDS.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    // NOTE(review): the condition line testing AS is elided in this extract.
      return true;
  }

  return false;
}
4634
// Cheap test for whether MI implicitly defines the MODE register.
// NOTE(review): the signature line is elided in this extract; per the body
// this takes a MachineInstr and returns bool.
  // Skip the full operand and register alias search modifiesRegister
  // does. There's only a handful of instructions that touch this, it's only an
  // implicit def, and doesn't alias any other registers.
  return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
}
4641
// Conservatively report whether executing MI with an empty EXEC mask could
// have unwanted side effects (I/O, traps, scalar stores, lane accesses...).
// NOTE(review): the signature line is elided in this extract.
  unsigned Opcode = MI.getOpcode();

  if (MI.mayStore() && isSMRD(MI))
    return true; // scalar store or atomic

  // This will terminate the function when other lanes may need to continue.
  if (MI.isReturn())
    return true;

  // These instructions cause shader I/O that may cause hardware lockups
  // when executed with an empty EXEC mask.
  //
  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
  // EXEC = 0, but checking for that case here seems not worth it
  // given the typical code patterns.
  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
      isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
      Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
      Opcode == AMDGPU::S_SETHALT)
    return true;

  if (MI.isCall() || MI.isInlineAsm())
    return true; // conservative assumption

  // Assume that barrier interactions are only intended with active lanes.
  if (isBarrier(Opcode))
    return true;

  // A mode change is a scalar operation that influences vector instructions.
  // NOTE(review): the condition guarding this early-out is elided in this
  // extract.
    return true;

  // These are like SALU instructions in terms of effects, so it's questionable
  // whether we should return true for those.
  //
  // However, executing them with EXEC = 0 causes them to operate on undefined
  // data, which we avoid by returning true here.
  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
      Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
      Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
      Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
    return true;

  return false;
}
4688
// Conservatively determine whether MI reads EXEC (directly or implicitly).
// NOTE(review): the first line of the signature (name and the
// MachineRegisterInfo parameter) is elided in this extract.
                              const MachineInstr &MI) const {
  if (MI.isMetaInstruction())
    return false;

  // This won't read exec if this is an SGPR->SGPR copy.
  if (MI.isCopyLike()) {
    if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
      return true;

    // Make sure this isn't copying exec as a normal operand
    return MI.readsRegister(AMDGPU::EXEC, &RI);
  }

  // Make a conservative assumption about the callee.
  if (MI.isCall())
    return true;

  // Be conservative with any unhandled generic opcodes.
  if (!isTargetSpecificOpcode(MI.getOpcode()))
    return true;

  // Non-SALU instructions are implicitly EXEC-dependent; SALU may still
  // read it explicitly.
  return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
}
4713
4714bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4715 switch (Imm.getBitWidth()) {
4716 case 1: // This likely will be a condition code mask.
4717 return true;
4718
4719 case 32:
4720 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4721 ST.hasInv2PiInlineImm());
4722 case 64:
4723 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4724 ST.hasInv2PiInlineImm());
4725 case 16:
4726 return ST.has16BitInsts() &&
4727 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4728 ST.hasInv2PiInlineImm());
4729 default:
4730 llvm_unreachable("invalid bitwidth");
4731 }
4732}
4733
// Floating-point variant: bitcast the value and dispatch on its semantics.
// NOTE(review): the signature line is elided in this extract — presumably
// `bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {`.
  APInt IntImm = Imm.bitcastToAPInt();
  int64_t IntImmVal = IntImm.getSExtValue();
  bool HasInv2Pi = ST.hasInv2PiInlineImm();
  switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
  default:
    llvm_unreachable("invalid fltSemantics");
  // NOTE(review): the case labels (presumably the 32/64-bit IEEE semantics)
  // are elided in this extract.
    return isInlineConstant(IntImm);
  // NOTE(review): case label (presumably bfloat) elided in this extract.
    return ST.has16BitInsts() &&
           AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
  // NOTE(review): case label (presumably IEEE half) elided in this extract.
    return ST.has16BitInsts() &&
           AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
  }
}
4752
// Operand-type-directed inline-constant check for a raw 64-bit immediate.
// NOTE(review): throughout the switch below, the `case AMDGPU::OPERAND_*:`
// label lines are elided in this extract; only the case bodies remain.
bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
  // MachineOperand provides no way to tell the true operand size, since it only
  // records a 64-bit value. We need to know the size to determine if a 32-bit
  // floating point immediate bit pattern is legal for an integer immediate. It
  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
  switch (OperandType) {
    int32_t Trunc = static_cast<int32_t>(Imm);
    return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
  }
    return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
    // We would expect inline immediates to not be concerned with an integer/fp
    // distinction. However, in the case of 16-bit integer operations, the
    // "floating point" values appear to not work. It seems read the low 16-bits
    // of 32-bit immediates, which happens to always work for the integer
    // values.
    //
    // See llvm bugzilla 46302.
    //
    // TODO: Theoretically we could use op-sel to use the high bits of the
    // 32-bit FP values.
    return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
    return false;
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      // A few special case instructions have 16-bit operands on subtargets
      // where 16-bit instructions are not legal.
      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
      // constants in these cases
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
    }

    return false;
  }
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
    }
    return false;
  }
    return false;
    return isLegalAV64PseudoImm(Imm);
    // Always embedded in the instruction for free.
    return true;
    // Just ignore anything else.
    return true;
  default:
    llvm_unreachable("invalid operand type");
  }
}
4851
// Return true if two MachineOperands are of the same kind and hold the same
// register or immediate value. Only register and immediate kinds are
// supported.
static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  // Operands of different kinds never compare equal.
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
  // NOTE(review): case label (register operand kind) elided in this extract.
    return Op0.getReg() == Op1.getReg();
  // NOTE(review): case label (immediate operand kind) elided in this extract.
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}
4866
4868 const MCOperandInfo &OpInfo) const {
4869 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4870 return true;
4871
4872 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4873 return false;
4874
4875 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4876 return true;
4877
4878 return ST.hasVOP3Literal();
4879}
4880
4881bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4882 int64_t ImmVal) const {
4883 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4884 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4885 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4886 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4887 AMDGPU::OpName::src2))
4888 return false;
4889 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4890 }
4891
4892 return isLiteralOperandLegal(InstDesc, OpInfo);
4893}
4894
4895bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4896 const MachineOperand &MO) const {
4897 if (MO.isImm())
4898 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4899
4900 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4901 "unexpected imm-like operand kind");
4902 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4903 return isLiteralOperandLegal(InstDesc, OpInfo);
4904}
4905
4907 // 2 32-bit inline constants packed into one.
4908 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4909 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4910}
4911
4912bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4913 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4914 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4915 return false;
4916
4917 int Op32 = AMDGPU::getVOPe32(Opcode);
4918 if (Op32 == -1)
4919 return false;
4920
4921 return pseudoToMCOpcode(Op32) != -1;
4922}
4923
4924bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4925 // The src0_modifier operand is present on all instructions
4926 // that have modifiers.
4927
4928 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4929}
4930
4932 AMDGPU::OpName OpName) const {
4933 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4934 return Mods && Mods->getImm();
4935}
4936
4938 return any_of(ModifierOpNames,
4939 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4940}
4941
4943 const MachineRegisterInfo &MRI) const {
4944 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4945 // Can't shrink instruction with three operands.
4946 if (Src2) {
4947 switch (MI.getOpcode()) {
4948 default: return false;
4949
4950 case AMDGPU::V_ADDC_U32_e64:
4951 case AMDGPU::V_SUBB_U32_e64:
4952 case AMDGPU::V_SUBBREV_U32_e64: {
4953 const MachineOperand *Src1
4954 = getNamedOperand(MI, AMDGPU::OpName::src1);
4955 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4956 return false;
4957 // Additional verification is needed for sdst/src2.
4958 return true;
4959 }
4960 case AMDGPU::V_MAC_F16_e64:
4961 case AMDGPU::V_MAC_F32_e64:
4962 case AMDGPU::V_MAC_LEGACY_F32_e64:
4963 case AMDGPU::V_FMAC_F16_e64:
4964 case AMDGPU::V_FMAC_F16_t16_e64:
4965 case AMDGPU::V_FMAC_F16_fake16_e64:
4966 case AMDGPU::V_FMAC_F32_e64:
4967 case AMDGPU::V_FMAC_F64_e64:
4968 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4969 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4970 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4971 return false;
4972 break;
4973
4974 case AMDGPU::V_CNDMASK_B32_e64:
4975 break;
4976 }
4977 }
4978
4979 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4980 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4981 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4982 return false;
4983
4984 // We don't need to check src0, all input types are legal, so just make sure
4985 // src0 isn't using any modifiers.
4986 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4987 return false;
4988
4989 // Can it be shrunk to a valid 32 bit opcode?
4990 if (!hasVALU32BitEncoding(MI.getOpcode()))
4991 return false;
4992
4993 // Check output modifiers
4994 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4995 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4996 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4997 // TODO: Can we avoid checking bound_ctrl/fi here?
4998 // They are only used by permlane*_swap special case.
4999 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
5000 !hasModifiersSet(MI, AMDGPU::OpName::fi);
5001}
5002
5003// Set VCC operand with all flags from \p Orig, except for setting it as
5004// implicit.
5006 const MachineOperand &Orig) {
5007
5008 for (MachineOperand &Use : MI.implicit_operands()) {
5009 if (Use.isUse() &&
5010 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
5011 Use.setIsUndef(Orig.isUndef());
5012 Use.setIsKill(Orig.isKill());
5013 return;
5014 }
5015 }
5016}
5017
5019 unsigned Op32) const {
5020 MachineBasicBlock *MBB = MI.getParent();
5021
5022 const MCInstrDesc &Op32Desc = get(Op32);
5023 MachineInstrBuilder Inst32 =
5024 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
5025 .setMIFlags(MI.getFlags());
5026
5027 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
5028 // For VOPC instructions, this is replaced by an implicit def of vcc.
5029
5030 // We assume the defs of the shrunk opcode are in the same order, and the
5031 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
5032 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
5033 Inst32.add(MI.getOperand(I));
5034
5035 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
5036
5037 int Idx = MI.getNumExplicitDefs();
5038 for (const MachineOperand &Use : MI.explicit_uses()) {
5039 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
5041 continue;
5042
5043 if (&Use == Src2) {
5044 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
5045 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
5046 // replaced with an implicit read of vcc or vcc_lo. The implicit read
5047 // of vcc was already added during the initial BuildMI, but we
5048 // 1) may need to change vcc to vcc_lo to preserve the original register
5049 // 2) have to preserve the original flags.
5050 copyFlagsToImplicitVCC(*Inst32, *Src2);
5051 continue;
5052 }
5053 }
5054
5055 Inst32.add(Use);
5056 }
5057
5058 // FIXME: Losing implicit operands
5059 fixImplicitOperands(*Inst32);
5060 return Inst32;
5061}
5062
5064 // Null is free
5065 Register Reg = RegOp.getReg();
5066 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5067 return false;
5068
5069 // SGPRs use the constant bus
5070
5071 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5072 // physical register operands should also count, except for exec.
5073 if (RegOp.isImplicit())
5074 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5075
5076 // SGPRs use the constant bus
5077 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5078 AMDGPU::SReg_64RegClass.contains(Reg);
5079}
5080
5082 const MachineRegisterInfo &MRI) const {
5083 Register Reg = RegOp.getReg();
5084 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5085 : physRegUsesConstantBus(RegOp);
5086}
5087
5089 const MachineOperand &MO,
5090 const MCOperandInfo &OpInfo) const {
5091 // Literal constants use the constant bus.
5092 if (!MO.isReg())
5093 return !isInlineConstant(MO, OpInfo);
5094
5095 Register Reg = MO.getReg();
5096 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5098}
5099
5101 for (const MachineOperand &MO : MI.implicit_operands()) {
5102 // We only care about reads.
5103 if (MO.isDef())
5104 continue;
5105
5106 switch (MO.getReg()) {
5107 case AMDGPU::VCC:
5108 case AMDGPU::VCC_LO:
5109 case AMDGPU::VCC_HI:
5110 case AMDGPU::M0:
5111 case AMDGPU::FLAT_SCR:
5112 return MO.getReg();
5113
5114 default:
5115 break;
5116 }
5117 }
5118
5119 return Register();
5120}
5121
5122static bool shouldReadExec(const MachineInstr &MI) {
5123 if (SIInstrInfo::isVALU(MI)) {
5124 switch (MI.getOpcode()) {
5125 case AMDGPU::V_READLANE_B32:
5126 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5127 case AMDGPU::V_WRITELANE_B32:
5128 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5129 return false;
5130 }
5131
5132 return true;
5133 }
5134
5135 if (MI.isPreISelOpcode() ||
5136 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5139 return false;
5140
5141 return true;
5142}
5143
5144static bool isRegOrFI(const MachineOperand &MO) {
5145 return MO.isReg() || MO.isFI();
5146}
5147
5148static bool isSubRegOf(const SIRegisterInfo &TRI,
5149 const MachineOperand &SuperVec,
5150 const MachineOperand &SubReg) {
5151 if (SubReg.getReg().isPhysical())
5152 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5153
5154 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5155 SubReg.getReg() == SuperVec.getReg();
5156}
5157
5158// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5159bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5160 const MachineRegisterInfo &MRI,
5161 StringRef &ErrInfo) const {
5162 Register DstReg = MI.getOperand(0).getReg();
5163 Register SrcReg = MI.getOperand(1).getReg();
5164 // This is a check for copy from vector register to SGPR
5165 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5166 ErrInfo = "illegal copy from vector register to SGPR";
5167 return false;
5168 }
5169 return true;
5170}
5171
5173 StringRef &ErrInfo) const {
5174 uint32_t Opcode = MI.getOpcode();
5175 const MachineFunction *MF = MI.getMF();
5176 const MachineRegisterInfo &MRI = MF->getRegInfo();
5177
5178 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5179 // Find a better property to recognize the point where instruction selection
5180 // is just done.
5181 // We can only enforce this check after SIFixSGPRCopies pass so that the
5182 // illegal copies are legalized and thereafter we don't expect a pass
5183 // inserting similar copies.
5184 if (!MRI.isSSA() && MI.isCopy())
5185 return verifyCopy(MI, MRI, ErrInfo);
5186
5187 if (SIInstrInfo::isGenericOpcode(Opcode))
5188 return true;
5189
5190 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5191 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5192 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5193 int Src3Idx = -1;
5194 if (Src0Idx == -1) {
5195 // VOPD V_DUAL_* instructions use different operand names.
5196 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5197 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5198 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5199 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5200 }
5201
5202 // Make sure the number of operands is correct.
5203 const MCInstrDesc &Desc = get(Opcode);
5204 if (!Desc.isVariadic() &&
5205 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5206 ErrInfo = "Instruction has wrong number of operands.";
5207 return false;
5208 }
5209
5210 if (MI.isInlineAsm()) {
5211 // Verify register classes for inlineasm constraints.
5212 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5213 I != E; ++I) {
5214 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5215 if (!RC)
5216 continue;
5217
5218 const MachineOperand &Op = MI.getOperand(I);
5219 if (!Op.isReg())
5220 continue;
5221
5222 Register Reg = Op.getReg();
5223 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5224 ErrInfo = "inlineasm operand has incorrect register class.";
5225 return false;
5226 }
5227 }
5228
5229 return true;
5230 }
5231
5232 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5233 ErrInfo = "missing memory operand from image instruction.";
5234 return false;
5235 }
5236
5237 // Make sure the register classes are correct.
5238 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5239 const MachineOperand &MO = MI.getOperand(i);
5240 if (MO.isFPImm()) {
5241 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5242 "all fp values to integers.";
5243 return false;
5244 }
5245
5246 const MCOperandInfo &OpInfo = Desc.operands()[i];
5247 int16_t RegClass = getOpRegClassID(OpInfo);
5248
5249 switch (OpInfo.OperandType) {
5251 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5252 ErrInfo = "Illegal immediate value for operand.";
5253 return false;
5254 }
5255 break;
5269 break;
5271 break;
5272 break;
5286 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5287 ErrInfo = "Illegal immediate value for operand.";
5288 return false;
5289 }
5290 break;
5291 }
5294 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5295 ErrInfo = "Expected inline constant for operand.";
5296 return false;
5297 }
5298 break;
5301 break;
5306 // Check if this operand is an immediate.
5307 // FrameIndex operands will be replaced by immediates, so they are
5308 // allowed.
5309 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5310 ErrInfo = "Expected immediate, but got non-immediate";
5311 return false;
5312 }
5313 break;
5317 break;
5318 default:
5319 if (OpInfo.isGenericType())
5320 continue;
5321 break;
5322 }
5323
5324 if (!MO.isReg())
5325 continue;
5326 Register Reg = MO.getReg();
5327 if (!Reg)
5328 continue;
5329
5330 // FIXME: Ideally we would have separate instruction definitions with the
5331 // aligned register constraint.
5332 // FIXME: We do not verify inline asm operands, but custom inline asm
5333 // verification is broken anyway
5334 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5335 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5336 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5337 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5338 if (const TargetRegisterClass *SubRC =
5339 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5340 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5341 if (RC)
5342 RC = SubRC;
5343 }
5344 }
5345
5346 // Check that this is the aligned version of the class.
5347 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5348 ErrInfo = "Subtarget requires even aligned vector registers";
5349 return false;
5350 }
5351 }
5352
5353 if (RegClass != -1) {
5354 if (Reg.isVirtual())
5355 continue;
5356
5357 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5358 if (!RC->contains(Reg)) {
5359 ErrInfo = "Operand has incorrect register class.";
5360 return false;
5361 }
5362 }
5363 }
5364
5365 // Verify SDWA
5366 if (isSDWA(MI)) {
5367 if (!ST.hasSDWA()) {
5368 ErrInfo = "SDWA is not supported on this target";
5369 return false;
5370 }
5371
5372 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5373 AMDGPU::OpName::dst_sel}) {
5374 const MachineOperand *MO = getNamedOperand(MI, Op);
5375 if (!MO)
5376 continue;
5377 int64_t Imm = MO->getImm();
5378 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5379 ErrInfo = "Invalid SDWA selection";
5380 return false;
5381 }
5382 }
5383
5384 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5385
5386 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5387 if (OpIdx == -1)
5388 continue;
5389 const MachineOperand &MO = MI.getOperand(OpIdx);
5390
5391 if (!ST.hasSDWAScalar()) {
5392 // Only VGPRS on VI
5393 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5394 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5395 return false;
5396 }
5397 } else {
5398 // No immediates on GFX9
5399 if (!MO.isReg()) {
5400 ErrInfo =
5401 "Only reg allowed as operands in SDWA instructions on GFX9+";
5402 return false;
5403 }
5404 }
5405 }
5406
5407 if (!ST.hasSDWAOmod()) {
5408 // No omod allowed on VI
5409 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5410 if (OMod != nullptr &&
5411 (!OMod->isImm() || OMod->getImm() != 0)) {
5412 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5413 return false;
5414 }
5415 }
5416
5417 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5418 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5419 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5420 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5421 const MachineOperand *Src0ModsMO =
5422 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5423 unsigned Mods = Src0ModsMO->getImm();
5424 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5425 Mods & SISrcMods::SEXT) {
5426 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5427 return false;
5428 }
5429 }
5430
5431 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5432 if (isVOPC(BasicOpcode)) {
5433 if (!ST.hasSDWASdst() && DstIdx != -1) {
5434 // Only vcc allowed as dst on VI for VOPC
5435 const MachineOperand &Dst = MI.getOperand(DstIdx);
5436 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5437 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5438 return false;
5439 }
5440 } else if (!ST.hasSDWAOutModsVOPC()) {
5441 // No clamp allowed on GFX9 for VOPC
5442 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5443 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5444 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5445 return false;
5446 }
5447
5448 // No omod allowed on GFX9 for VOPC
5449 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5450 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5451 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5452 return false;
5453 }
5454 }
5455 }
5456
5457 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5458 if (DstUnused && DstUnused->isImm() &&
5459 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5460 const MachineOperand &Dst = MI.getOperand(DstIdx);
5461 if (!Dst.isReg() || !Dst.isTied()) {
5462 ErrInfo = "Dst register should have tied register";
5463 return false;
5464 }
5465
5466 const MachineOperand &TiedMO =
5467 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5468 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5469 ErrInfo =
5470 "Dst register should be tied to implicit use of preserved register";
5471 return false;
5472 }
5473 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5474 ErrInfo = "Dst register should use same physical register as preserved";
5475 return false;
5476 }
5477 }
5478 }
5479
5480 // Verify MIMG / VIMAGE / VSAMPLE
5481 if (isImage(Opcode) && !MI.mayStore()) {
5482 // Ensure that the return type used is large enough for all the options
5483 // being used TFE/LWE require an extra result register.
5484 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5485 if (DMask) {
5486 uint64_t DMaskImm = DMask->getImm();
5487 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5488 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5489 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5490 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5491
5492 // Adjust for packed 16 bit values
5493 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5494 RegCount = divideCeil(RegCount, 2);
5495
5496 // Adjust if using LWE or TFE
5497 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5498 RegCount += 1;
5499
5500 const uint32_t DstIdx =
5501 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5502 const MachineOperand &Dst = MI.getOperand(DstIdx);
5503 if (Dst.isReg()) {
5504 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5505 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5506 if (RegCount > DstSize) {
5507 ErrInfo = "Image instruction returns too many registers for dst "
5508 "register class";
5509 return false;
5510 }
5511 }
5512 }
5513 }
5514
5515 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5516 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5517 unsigned ConstantBusCount = 0;
5518 bool UsesLiteral = false;
5519 const MachineOperand *LiteralVal = nullptr;
5520
5521 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5522 if (ImmIdx != -1) {
5523 ++ConstantBusCount;
5524 UsesLiteral = true;
5525 LiteralVal = &MI.getOperand(ImmIdx);
5526 }
5527
5528 SmallVector<Register, 2> SGPRsUsed;
5529 Register SGPRUsed;
5530
5531 // Only look at the true operands. Only a real operand can use the constant
5532 // bus, and we don't want to check pseudo-operands like the source modifier
5533 // flags.
5534 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5535 if (OpIdx == -1)
5536 continue;
5537 const MachineOperand &MO = MI.getOperand(OpIdx);
5538 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5539 if (MO.isReg()) {
5540 SGPRUsed = MO.getReg();
5541 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5542 ++ConstantBusCount;
5543 SGPRsUsed.push_back(SGPRUsed);
5544 }
5545 } else if (!MO.isFI()) { // Treat FI like a register.
5546 if (!UsesLiteral) {
5547 ++ConstantBusCount;
5548 UsesLiteral = true;
5549 LiteralVal = &MO;
5550 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5551 assert(isVOP2(MI) || isVOP3(MI));
5552 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5553 return false;
5554 }
5555 }
5556 }
5557 }
5558
5559 SGPRUsed = findImplicitSGPRRead(MI);
5560 if (SGPRUsed) {
5561 // Implicit uses may safely overlap true operands
5562 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5563 return !RI.regsOverlap(SGPRUsed, SGPR);
5564 })) {
5565 ++ConstantBusCount;
5566 SGPRsUsed.push_back(SGPRUsed);
5567 }
5568 }
5569
5570 // v_writelane_b32 is an exception from constant bus restriction:
5571 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5572 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5573 Opcode != AMDGPU::V_WRITELANE_B32) {
5574 ErrInfo = "VOP* instruction violates constant bus restriction";
5575 return false;
5576 }
5577
5578 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5579 ErrInfo = "VOP3 instruction uses literal";
5580 return false;
5581 }
5582 }
5583
5584 // Special case for writelane - this can break the multiple constant bus rule,
5585 // but still can't use more than one SGPR register
5586 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5587 unsigned SGPRCount = 0;
5588 Register SGPRUsed;
5589
5590 for (int OpIdx : {Src0Idx, Src1Idx}) {
5591 if (OpIdx == -1)
5592 break;
5593
5594 const MachineOperand &MO = MI.getOperand(OpIdx);
5595
5596 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5597 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5598 if (MO.getReg() != SGPRUsed)
5599 ++SGPRCount;
5600 SGPRUsed = MO.getReg();
5601 }
5602 }
5603 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5604 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5605 return false;
5606 }
5607 }
5608 }
5609
5610 // Verify misc. restrictions on specific instructions.
5611 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5612 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5613 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5614 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5615 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5616 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5617 if (!compareMachineOp(Src0, Src1) &&
5618 !compareMachineOp(Src0, Src2)) {
5619 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5620 return false;
5621 }
5622 }
5623 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5624 SISrcMods::ABS) ||
5625 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5626 SISrcMods::ABS) ||
5627 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5628 SISrcMods::ABS)) {
5629 ErrInfo = "ABS not allowed in VOP3B instructions";
5630 return false;
5631 }
5632 }
5633
5634 if (isSOP2(MI) || isSOPC(MI)) {
5635 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5636 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5637
5638 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5639 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5640 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5641 !Src0.isIdenticalTo(Src1)) {
5642 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5643 return false;
5644 }
5645 }
5646
5647 if (isSOPK(MI)) {
5648 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5649 if (Desc.isBranch()) {
5650 if (!Op->isMBB()) {
5651 ErrInfo = "invalid branch target for SOPK instruction";
5652 return false;
5653 }
5654 } else {
5655 uint64_t Imm = Op->getImm();
5656 if (sopkIsZext(Opcode)) {
5657 if (!isUInt<16>(Imm)) {
5658 ErrInfo = "invalid immediate for SOPK instruction";
5659 return false;
5660 }
5661 } else {
5662 if (!isInt<16>(Imm)) {
5663 ErrInfo = "invalid immediate for SOPK instruction";
5664 return false;
5665 }
5666 }
5667 }
5668 }
5669
5670 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5671 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5672 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5673 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5674 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5675 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5676
5677 const unsigned StaticNumOps =
5678 Desc.getNumOperands() + Desc.implicit_uses().size();
5679 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5680
5681 // Require additional implicit operands. This allows a fixup done by the
5682 // post RA scheduler where the main implicit operand is killed and
5683 // implicit-defs are added for sub-registers that remain live after this
5684 // instruction.
5685 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5686 ErrInfo = "missing implicit register operands";
5687 return false;
5688 }
5689
5690 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5691 if (IsDst) {
5692 if (!Dst->isUse()) {
5693 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5694 return false;
5695 }
5696
5697 unsigned UseOpIdx;
5698 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5699 UseOpIdx != StaticNumOps + 1) {
5700 ErrInfo = "movrel implicit operands should be tied";
5701 return false;
5702 }
5703 }
5704
5705 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5706 const MachineOperand &ImpUse
5707 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5708 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5709 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5710 ErrInfo = "src0 should be subreg of implicit vector use";
5711 return false;
5712 }
5713 }
5714
5715 // Make sure we aren't losing exec uses in the td files. This mostly requires
5716 // being careful when using let Uses to try to add other use registers.
5717 if (shouldReadExec(MI)) {
5718 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5719 ErrInfo = "VALU instruction does not implicitly read exec mask";
5720 return false;
5721 }
5722 }
5723
5724 if (isSMRD(MI)) {
5725 if (MI.mayStore() &&
5726 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5727 // The register offset form of scalar stores may only use m0 as the
5728 // soffset register.
5729 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5730 if (Soff && Soff->getReg() != AMDGPU::M0) {
5731 ErrInfo = "scalar stores must use m0 as offset register";
5732 return false;
5733 }
5734 }
5735 }
5736
5737 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5738 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5739 if (Offset->getImm() != 0) {
5740 ErrInfo = "subtarget does not support offsets in flat instructions";
5741 return false;
5742 }
5743 }
5744
5745 if (isDS(MI) && !ST.hasGDS()) {
5746 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5747 if (GDSOp && GDSOp->getImm() != 0) {
5748 ErrInfo = "GDS is not supported on this subtarget";
5749 return false;
5750 }
5751 }
5752
5753 if (isImage(MI)) {
5754 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5755 if (DimOp) {
5756 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5757 AMDGPU::OpName::vaddr0);
5758 AMDGPU::OpName RSrcOpName =
5759 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5760 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5761 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5762 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5763 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5764 const AMDGPU::MIMGDimInfo *Dim =
5766
5767 if (!Dim) {
5768 ErrInfo = "dim is out of range";
5769 return false;
5770 }
5771
5772 bool IsA16 = false;
5773 if (ST.hasR128A16()) {
5774 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5775 IsA16 = R128A16->getImm() != 0;
5776 } else if (ST.hasA16()) {
5777 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5778 IsA16 = A16->getImm() != 0;
5779 }
5780
5781 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5782
5783 unsigned AddrWords =
5784 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5785
5786 unsigned VAddrWords;
5787 if (IsNSA) {
5788 VAddrWords = RsrcIdx - VAddr0Idx;
5789 if (ST.hasPartialNSAEncoding() &&
5790 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5791 unsigned LastVAddrIdx = RsrcIdx - 1;
5792 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5793 }
5794 } else {
5795 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5796 if (AddrWords > 12)
5797 AddrWords = 16;
5798 }
5799
5800 if (VAddrWords != AddrWords) {
5801 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5802 << " but got " << VAddrWords << "\n");
5803 ErrInfo = "bad vaddr size";
5804 return false;
5805 }
5806 }
5807 }
5808
5809 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5810 if (DppCt) {
5811 using namespace AMDGPU::DPP;
5812
5813 unsigned DC = DppCt->getImm();
5814 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5815 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5816 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5817 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5818 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5819 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5820 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5821 ErrInfo = "Invalid dpp_ctrl value";
5822 return false;
5823 }
5824 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5825 !ST.hasDPPWavefrontShifts()) {
5826 ErrInfo = "Invalid dpp_ctrl value: "
5827 "wavefront shifts are not supported on GFX10+";
5828 return false;
5829 }
5830 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5831 !ST.hasDPPBroadcasts()) {
5832 ErrInfo = "Invalid dpp_ctrl value: "
5833 "broadcasts are not supported on GFX10+";
5834 return false;
5835 }
5836 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5837 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5838 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5839 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5840 !ST.hasGFX90AInsts()) {
5841 ErrInfo = "Invalid dpp_ctrl value: "
5842 "row_newbroadcast/row_share is not supported before "
5843 "GFX90A/GFX10";
5844 return false;
5845 }
5846 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5847 ErrInfo = "Invalid dpp_ctrl value: "
5848 "row_share and row_xmask are not supported before GFX10";
5849 return false;
5850 }
5851 }
5852
5853 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5855 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5856 ErrInfo = "Invalid dpp_ctrl value: "
5857 "DP ALU dpp only support row_newbcast";
5858 return false;
5859 }
5860 }
5861
5862 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5863 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5864 AMDGPU::OpName DataName =
5865 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5866 const MachineOperand *Data = getNamedOperand(MI, DataName);
5867 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5868 if (Data && !Data->isReg())
5869 Data = nullptr;
5870
5871 if (ST.hasGFX90AInsts()) {
5872 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5873 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5874 ErrInfo = "Invalid register class: "
5875 "vdata and vdst should be both VGPR or AGPR";
5876 return false;
5877 }
5878 if (Data && Data2 &&
5879 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5880 ErrInfo = "Invalid register class: "
5881 "both data operands should be VGPR or AGPR";
5882 return false;
5883 }
5884 } else {
5885 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5886 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5887 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5888 ErrInfo = "Invalid register class: "
5889 "agpr loads and stores not supported on this GPU";
5890 return false;
5891 }
5892 }
5893 }
5894
5895 if (ST.needsAlignedVGPRs()) {
5896 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5898 if (!Op)
5899 return true;
5900 Register Reg = Op->getReg();
5901 if (Reg.isPhysical())
5902 return !(RI.getHWRegIndex(Reg) & 1);
5903 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5904 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5905 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5906 };
5907
5908 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5909 Opcode == AMDGPU::DS_GWS_BARRIER) {
5910
5911 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5912 ErrInfo = "Subtarget requires even aligned vector registers "
5913 "for DS_GWS instructions";
5914 return false;
5915 }
5916 }
5917
5918 if (isMIMG(MI)) {
5919 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5920 ErrInfo = "Subtarget requires even aligned vector registers "
5921 "for vaddr operand of image instructions";
5922 return false;
5923 }
5924 }
5925 }
5926
5927 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5928 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5929 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5930 ErrInfo = "Invalid register class: "
5931 "v_accvgpr_write with an SGPR is not supported on this GPU";
5932 return false;
5933 }
5934 }
5935
5936 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5937 const MachineOperand &SrcOp = MI.getOperand(1);
5938 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5939 ErrInfo = "pseudo expects only physical SGPRs";
5940 return false;
5941 }
5942 }
5943
5944 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5945 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5946 if (!ST.hasScaleOffset()) {
5947 ErrInfo = "Subtarget does not support offset scaling";
5948 return false;
5949 }
5950 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5951 ErrInfo = "Instruction does not support offset scaling";
5952 return false;
5953 }
5954 }
5955 }
5956
5957 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5958 // information.
5959 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5960 for (unsigned I = 0; I < 3; ++I) {
5962 return false;
5963 }
5964 }
5965
5966 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5967 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5968 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5969 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5970 &AMDGPU::SReg_64RegClass) ||
5971 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5972 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5973 return false;
5974 }
5975 }
5976
5977 return true;
5978}
5979
  // S_MOV_B32 is special-cased: a register source, or an AGPR destination,
  // must become a generic COPY; only an immediate moving into a plain VGPR
  // can use V_MOV_B32_e32 directly.
  if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
    const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
    return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg())
               ? AMDGPU::COPY
               : AMDGPU::V_MOV_B32_e32;
  }
  // Every other opcode uses the static opcode-to-opcode mapping.
  return getVALUOp(MI.getOpcode());
}
5989
5990// It is more readable to list mapped opcodes on the same line.
5991// clang-format off
5992
5993unsigned SIInstrInfo::getVALUOp(unsigned Opc) const {
5994 switch (Opc) {
5995 default: return AMDGPU::INSTRUCTION_LIST_END;
5996 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5997 case AMDGPU::COPY: return AMDGPU::COPY;
5998 case AMDGPU::PHI: return AMDGPU::PHI;
5999 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
6000 case AMDGPU::WQM: return AMDGPU::WQM;
6001 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
6002 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
6003 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
6004 case AMDGPU::S_ADD_I32:
6005 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
6006 case AMDGPU::S_ADDC_U32:
6007 return AMDGPU::V_ADDC_U32_e32;
6008 case AMDGPU::S_SUB_I32:
6009 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
6010 // FIXME: These are not consistently handled, and selected when the carry is
6011 // used.
6012 case AMDGPU::S_ADD_U32:
6013 return AMDGPU::V_ADD_CO_U32_e32;
6014 case AMDGPU::S_SUB_U32:
6015 return AMDGPU::V_SUB_CO_U32_e32;
6016 case AMDGPU::S_ADD_U64_PSEUDO:
6017 return AMDGPU::V_ADD_U64_PSEUDO;
6018 case AMDGPU::S_SUB_U64_PSEUDO:
6019 return AMDGPU::V_SUB_U64_PSEUDO;
6020 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
6021 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
6022 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
6023 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
6024 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
6025 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
6026 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
6027 case AMDGPU::S_XNOR_B32:
6028 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
6029 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
6030 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
6031 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
6032 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
6033 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
6034 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
6035 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
6036 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
6037 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
6038 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
6039 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
6040 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
6041 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
6042 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
6043 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
6044 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
6045 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
6046 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
6047 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
6048 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
6049 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
6050 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
6051 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
6052 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
6053 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
6054 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
6055 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
6056 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
6057 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
6058 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
6059 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
6060 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
6061 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
6062 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
6063 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
6064 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
6065 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
6066 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
6067 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6068 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6069 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6070 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
6071 case AMDGPU::S_CVT_F32_F16:
6072 case AMDGPU::S_CVT_HI_F32_F16:
6073 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6074 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6075 case AMDGPU::S_CVT_F16_F32:
6076 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6077 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6078 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6079 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6080 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6081 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6082 case AMDGPU::S_CEIL_F16:
6083 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6084 : AMDGPU::V_CEIL_F16_fake16_e64;
6085 case AMDGPU::S_FLOOR_F16:
6086 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6087 : AMDGPU::V_FLOOR_F16_fake16_e64;
6088 case AMDGPU::S_TRUNC_F16:
6089 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6090 : AMDGPU::V_TRUNC_F16_fake16_e64;
6091 case AMDGPU::S_RNDNE_F16:
6092 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6093 : AMDGPU::V_RNDNE_F16_fake16_e64;
6094 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6095 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6096 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6097 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6098 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6099 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6100 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6101 case AMDGPU::S_ADD_F16:
6102 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6103 : AMDGPU::V_ADD_F16_fake16_e64;
6104 case AMDGPU::S_SUB_F16:
6105 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6106 : AMDGPU::V_SUB_F16_fake16_e64;
6107 case AMDGPU::S_MIN_F16:
6108 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6109 : AMDGPU::V_MIN_F16_fake16_e64;
6110 case AMDGPU::S_MAX_F16:
6111 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6112 : AMDGPU::V_MAX_F16_fake16_e64;
6113 case AMDGPU::S_MINIMUM_F16:
6114 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6115 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6116 case AMDGPU::S_MAXIMUM_F16:
6117 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6118 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6119 case AMDGPU::S_MUL_F16:
6120 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6121 : AMDGPU::V_MUL_F16_fake16_e64;
6122 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6123 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6124 case AMDGPU::S_FMAC_F16:
6125 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6126 : AMDGPU::V_FMAC_F16_fake16_e64;
6127 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6128 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6129 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6130 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6131 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6132 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6133 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6134 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6135 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6136 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6137 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6138 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6139 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6140 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6141 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6142 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6143 case AMDGPU::S_CMP_LT_F16:
6144 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6145 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6146 case AMDGPU::S_CMP_EQ_F16:
6147 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6148 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6149 case AMDGPU::S_CMP_LE_F16:
6150 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6151 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6152 case AMDGPU::S_CMP_GT_F16:
6153 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6154 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6155 case AMDGPU::S_CMP_LG_F16:
6156 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6157 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6158 case AMDGPU::S_CMP_GE_F16:
6159 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6160 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6161 case AMDGPU::S_CMP_O_F16:
6162 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6163 : AMDGPU::V_CMP_O_F16_fake16_e64;
6164 case AMDGPU::S_CMP_U_F16:
6165 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6166 : AMDGPU::V_CMP_U_F16_fake16_e64;
6167 case AMDGPU::S_CMP_NGE_F16:
6168 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6169 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6170 case AMDGPU::S_CMP_NLG_F16:
6171 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6172 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6173 case AMDGPU::S_CMP_NGT_F16:
6174 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6175 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6176 case AMDGPU::S_CMP_NLE_F16:
6177 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6178 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6179 case AMDGPU::S_CMP_NEQ_F16:
6180 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6181 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6182 case AMDGPU::S_CMP_NLT_F16:
6183 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6184 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6185 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6186 case AMDGPU::V_S_EXP_F16_e64:
6187 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6188 : AMDGPU::V_EXP_F16_fake16_e64;
6189 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6190 case AMDGPU::V_S_LOG_F16_e64:
6191 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6192 : AMDGPU::V_LOG_F16_fake16_e64;
6193 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6194 case AMDGPU::V_S_RCP_F16_e64:
6195 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6196 : AMDGPU::V_RCP_F16_fake16_e64;
6197 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6198 case AMDGPU::V_S_RSQ_F16_e64:
6199 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6200 : AMDGPU::V_RSQ_F16_fake16_e64;
6201 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6202 case AMDGPU::V_S_SQRT_F16_e64:
6203 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6204 : AMDGPU::V_SQRT_F16_fake16_e64;
6205 }
6207 "Unexpected scalar opcode without corresponding vector one!");
6208}
6209
6210// clang-format on
6211
6215 const DebugLoc &DL, Register Reg,
6216 bool IsSCCLive,
6217 SlotIndexes *Indexes) const {
6218 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6219 const SIInstrInfo *TII = ST.getInstrInfo();
6221 if (IsSCCLive) {
6222 // Insert two move instructions, one to save the original value of EXEC and
6223 // the other to turn on all bits in EXEC. This is required as we can't use
6224 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6225 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6227 auto FlipExecMI =
6228 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6229 if (Indexes) {
6230 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6231 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6232 }
6233 } else {
6234 auto SaveExec =
6235 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6236 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6237 if (Indexes)
6238 Indexes->insertMachineInstrInMaps(*SaveExec);
6239 }
6240}
6241
6244 const DebugLoc &DL, Register Reg,
6245 SlotIndexes *Indexes) const {
6247 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6248 .addReg(Reg, RegState::Kill);
6249 if (Indexes)
6250 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6251}
6252
6256 "Not a whole wave func");
6257 MachineBasicBlock &MBB = *MF.begin();
6258 for (MachineInstr &MI : MBB)
6259 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6260 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6261 return &MI;
6262
6263 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6264}
6265
                                              unsigned OpNo) const {
  const MCInstrDesc &Desc = get(MI.getOpcode());
  // Variadic instructions and operands without a static register-class
  // constraint in the descriptor: derive the class from the register that is
  // actually used by the operand.
  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
      Desc.operands()[OpNo].RegClass == -1) {
    Register Reg = MI.getOperand(OpNo).getReg();

    if (Reg.isVirtual()) {
      const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
      return MRI.getRegClass(Reg);
    }
    return RI.getPhysRegBaseClass(Reg);
  }

  // Otherwise use the register class declared by the instruction description;
  // a negative ID means there is no constraint (return null).
  int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
  return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
}
6283
  MachineBasicBlock *MBB = MI.getParent();
  MachineOperand &MO = MI.getOperand(OpIdx);
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  // Pick a move opcode sized to the operand's required register class.
  unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
  const TargetRegisterClass *RC = RI.getRegClass(RCID);
  unsigned Size = RI.getRegSizeInBits(*RC);
  unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
                    : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
                                 : AMDGPU::V_MOV_B32_e32;
  // Register operands are simply copied; immediates destined for an SGPR
  // class use a scalar move instead.
  if (MO.isReg())
    Opcode = AMDGPU::COPY;
  else if (RI.isSGPRClass(RC))
    Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;

  // Materialize the operand into a fresh virtual register of the equivalent
  // VGPR class, then rewrite the instruction to use that register.
  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
  Register Reg = MRI.createVirtualRegister(VRC);
  DebugLoc DL = MBB->findDebugLoc(I);
  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
  MO.ChangeToRegister(Reg, false);
}
6306
    const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
    unsigned SubIdx, const TargetRegisterClass *SubRC) const {
  // A physical super-register can be indexed directly without emitting code.
  if (!SuperReg.getReg().isVirtual())
    return RI.getSubReg(SuperReg.getReg(), SubIdx);

  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  Register SubReg = MRI.createVirtualRegister(SubRC);

  // Compose with any subregister index already present on the operand, then
  // copy the selected piece into a fresh register of the subregister class.
  unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
      .addReg(SuperReg.getReg(), {}, NewSubIdx);
  return SubReg;
}
6323
    const MachineOperand &Op, const TargetRegisterClass *SuperRC,
    unsigned SubIdx, const TargetRegisterClass *SubRC) const {
  // Immediates are split arithmetically: sub0 takes the low 32 bits and sub1
  // the high 32 bits. Other indices are unsupported for immediates.
  if (Op.isImm()) {
    if (SubIdx == AMDGPU::sub0)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
    if (SubIdx == AMDGPU::sub1)
      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));

    llvm_unreachable("Unhandled register index for immediate");
  }

  // Register operands go through the COPY-based subregister extraction.
  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
                                       SubIdx, SubRC);
  return MachineOperand::CreateReg(SubReg, false);
}
6341
6342// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6343void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6344 assert(Inst.getNumExplicitOperands() == 3);
6345 MachineOperand Op1 = Inst.getOperand(1);
6346 Inst.removeOperand(1);
6347 Inst.addOperand(Op1);
6348}
6349
                                      const MCOperandInfo &OpInfo,
                                      const MachineOperand &MO) const {
  // Only register operands can be legal as register operands.
  if (!MO.isReg())
    return false;

  Register Reg = MO.getReg();

  // DRC is the register class required by the operand description.
  const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
  if (Reg.isPhysical())
    return DRC->contains(Reg);

  const TargetRegisterClass *RC = MRI.getRegClass(Reg);

  // With a subregister index, some legal super-class of the register's class
  // must project onto the required class through that index.
  if (MO.getSubReg()) {
    const MachineFunction *MF = MO.getParent()->getMF();
    const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
    if (!SuperRC)
      return false;
    return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
  }

  // Otherwise the register's class must share a common subclass with the
  // required class.
  return RI.getCommonSubClass(DRC, RC) != nullptr;
}
6374
                                      const MachineOperand &MO) const {
  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
  unsigned Opc = MI.getOpcode();

  // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
  // information.
  if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
      MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
    constexpr AMDGPU::OpName OpNames[] = {
        AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};

    for (auto [I, OpName] : enumerate(OpNames)) {
      int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
      if (static_cast<unsigned>(SrcIdx) == OpIdx &&
        return false;
    }
  }

  // The operand must first satisfy the descriptor's register-class
  // requirement.
  if (!isLegalRegOperand(MRI, OpInfo, MO))
    return false;

  // check Accumulate GPR operand
  bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
  if (IsAGPR && !ST.hasMAIInsts())
    return false;
  if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
      (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
    return false;
  // Atomics should have both vdst and vdata either vgpr or agpr.
  const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
  const int DataIdx = AMDGPU::getNamedOperandIdx(
      Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
  if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
      MI.getOperand(DataIdx).isReg() &&
      RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
    return false;
  if ((int)OpIdx == DataIdx) {
    if (VDstIdx != -1 &&
        RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
      return false;
    // DS instructions with 2 src operands also must have tied RC.
    const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
    if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
        RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
      return false;
  }

  // Check V_ACCVGPR_WRITE_B32_e64
  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
      (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
      RI.isSGPRReg(MRI, MO.getReg()))
    return false;

  // On affected subtargets, SALU instructions with a 64-bit destination (or
  // the 64-bit bit-compare forms) must not read flat_scratch_base_hi.
  if (ST.hasFlatScratchHiInB64InstHazard() &&
      MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
    if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
      if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
          64)
        return false;
    }
    if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
      return false;
  }

  return true;
}
6444
                                       const MCOperandInfo &OpInfo,
                                       const MachineOperand &MO) const {
  // Registers go through the full register-class legality check.
  if (MO.isReg())
    return isLegalRegOperand(MRI, OpInfo, MO);

  // Handle non-register types that are treated like immediates.
  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
  return true;
}
6455
    const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
    const MachineOperand *MO) const {
  constexpr unsigned NumOps = 3;
  // First NumOps entries name the source operands; the next NumOps name the
  // corresponding source-modifier operands.
  constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
      AMDGPU::OpName::src0, AMDGPU::OpName::src1,
      AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
      AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};

  assert(SrcN < NumOps);

  // If no operand was supplied, look it up by name; a missing operand is
  // trivially legal.
  if (!MO) {
    int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
    if (SrcIdx == -1)
      return true;
    MO = &MI.getOperand(SrcIdx);
  }

  // Only SGPR sources are subject to this restriction.
  if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
    return true;

  int ModsIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
  if (ModsIdx == -1)
    return true;

  unsigned Mods = MI.getOperand(ModsIdx).getImm();
  bool OpSel = Mods & SISrcMods::OP_SEL_0;
  bool OpSelHi = Mods & SISrcMods::OP_SEL_1;

  // An SGPR source is legal only when neither op_sel nor op_sel_hi is set.
  return !OpSel && !OpSelHi;
}
6488
                                 const MachineOperand *MO) const {
  const MachineFunction &MF = *MI.getMF();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
  // DefinedRC is null when the operand has no register-class constraint
  // (i.e. it expects an immediate).
  int64_t RegClass = getOpRegClassID(OpInfo);
  const TargetRegisterClass *DefinedRC =
      RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
  if (!MO)
    MO = &MI.getOperand(OpIdx);

  const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);

  // VALU: enforce the constant-bus and literal limits across all operands,
  // counting the candidate operand plus every other use.
  if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
    const MachineOperand *UsedLiteral = nullptr;

    int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
    int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;

    // TODO: Be more permissive with frame indexes.
    if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
      if (!LiteralLimit--)
        return false;

      UsedLiteral = MO;
    }

    if (MO->isReg())
      SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));

    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI.getOperand(i);
      if (Op.isReg()) {
        if (Op.isUse()) {
          // Distinct SGPRs each consume a constant-bus slot.
          RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
          if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
            if (--ConstantBusLimit <= 0)
              return false;
          }
        }
      } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
                 !isInlineConstant(Op, InstDesc.operands()[i])) {
        // The same literal may be used multiple times.
        if (!UsedLiteral)
          UsedLiteral = &Op;
        else if (UsedLiteral->isIdenticalTo(Op))
          continue;

        if (!LiteralLimit--)
          return false;
        if (--ConstantBusLimit <= 0)
          return false;
      }
    }
  } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
    // There can be at most one literal operand, but it can be repeated.
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI.getOperand(i);
      if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
          !isInlineConstant(Op, InstDesc.operands()[i]) &&
          !Op.isIdenticalTo(*MO))
        return false;

      // Do not fold a non-inlineable and non-register operand into an
      // instruction that already has a frame index. The frame index handling
      // code could not handle well when a frame index co-exists with another
      // non-register operand, unless that operand is an inlineable immediate.
      if (Op.isFI())
        return false;
    }
  } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
             isF16PseudoScalarTrans(MI.getOpcode())) {
    return false;
  }

  if (MO->isReg()) {
    if (!DefinedRC)
      return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
    return isLegalRegOperand(MI, OpIdx, *MO);
  }

  // 64-bit immediates: validate against the subtarget's inline-constant and
  // literal-encoding capabilities.
  if (MO->isImm()) {
    uint64_t Imm = MO->getImm();
    bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
    bool Is64BitOp = Is64BitFPOp ||
                     OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
                     OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
                     OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
    if (Is64BitOp &&
        !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
      if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
          (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
        return false;

      // FIXME: We can use sign extended 64-bit literals, but only for signed
      // operands. At the moment we do not know if an operand is signed.
      // Such operand will be encoded as its low 32 bits and then either
      // correctly sign extended or incorrectly zero extended by HW.
      // If 64-bit literals are supported and the literal will be encoded
      // as full 64 bit we still can use it.
      if (!Is64BitFPOp && (int32_t)Imm < 0 &&
          (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
        return false;
    }
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}
6611
  bool IsGFX950Only = ST.hasGFX950Insts();
  bool IsGFX940Only = ST.hasGFX940Insts();

  // Only the GFX940/GFX950 subtargets are handled here; elsewhere nothing is
  // flagged.
  if (!IsGFX950Only && !IsGFX940Only)
    return false;

  // Only VALU instructions are candidates.
  if (!isVALU(MI))
    return false;

  // V_COS, V_EXP, V_RCP, etc.
  if (isTRANS(MI))
    return true;

  // DOT2, DOT2C, DOT4, etc.
  if (isDOT(MI))
    return true;

  // MFMA, SMFMA
  if (isMFMA(MI))
    return true;

  // Remaining cases: specific packed-math, QSAD/MQSAD, and FP8/BF8 packed
  // conversion opcodes.
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_CVT_PK_BF8_F32_e64:
  case AMDGPU::V_CVT_PK_FP8_F32_e64:
  case AMDGPU::V_MQSAD_PK_U16_U8_e64:
  case AMDGPU::V_MQSAD_U32_U8_e64:
  case AMDGPU::V_PK_ADD_F16:
  case AMDGPU::V_PK_ADD_F32:
  case AMDGPU::V_PK_ADD_I16:
  case AMDGPU::V_PK_ADD_U16:
  case AMDGPU::V_PK_ASHRREV_I16:
  case AMDGPU::V_PK_FMA_F16:
  case AMDGPU::V_PK_FMA_F32:
  case AMDGPU::V_PK_FMAC_F16_e32:
  case AMDGPU::V_PK_FMAC_F16_e64:
  case AMDGPU::V_PK_LSHLREV_B16:
  case AMDGPU::V_PK_LSHRREV_B16:
  case AMDGPU::V_PK_MAD_I16:
  case AMDGPU::V_PK_MAD_U16:
  case AMDGPU::V_PK_MAX_F16:
  case AMDGPU::V_PK_MAX_I16:
  case AMDGPU::V_PK_MAX_U16:
  case AMDGPU::V_PK_MIN_F16:
  case AMDGPU::V_PK_MIN_I16:
  case AMDGPU::V_PK_MIN_U16:
  case AMDGPU::V_PK_MOV_B32:
  case AMDGPU::V_PK_MUL_F16:
  case AMDGPU::V_PK_MUL_F32:
  case AMDGPU::V_PK_MUL_LO_U16:
  case AMDGPU::V_PK_SUB_I16:
  case AMDGPU::V_PK_SUB_U16:
  case AMDGPU::V_QSAD_PK_U16_U8_e64:
    return true;
  default:
    return false;
  }
}
6671
// Legalizes the source operands of a VOP2-encoded instruction: enforces the
// pre-GFX10 single-constant-bus restriction, handles the V_WRITELANE_B32 /
// V_READLANE_B32 / V_FMAC special cases, and otherwise attempts to commute
// src0/src1 before falling back to inserting a move for src1.
// NOTE(review): the leading signature line (original line 6672) is missing
// from this extraction — presumably SIInstrInfo::legalizeOperandsVOP2 taking
// (MachineRegisterInfo &MRI, MachineInstr &MI); confirm against upstream.
6673 MachineInstr &MI) const {
6674 unsigned Opc = MI.getOpcode();
6675 const MCInstrDesc &InstrDesc = get(Opc);
6676
6677 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6678 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6679
6680 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6681 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6682
6683 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6684 // we need to only have one constant bus use before GFX10.
6685 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6686 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6687 RI.isSGPRReg(MRI, Src0.getReg()))
6688 legalizeOpWithMove(MI, Src0Idx);
6689
6690 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6691 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6692 // src0/src1 with V_READFIRSTLANE.
6693 if (Opc == AMDGPU::V_WRITELANE_B32) {
6694 const DebugLoc &DL = MI.getDebugLoc();
6695 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6696 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6697 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6698 .add(Src0);
6699 Src0.ChangeToRegister(Reg, false);
6700 }
6701 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6702 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6703 const DebugLoc &DL = MI.getDebugLoc();
6704 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6705 .add(Src1);
6706 Src1.ChangeToRegister(Reg, false);
6707 }
6708 return;
6709 }
6710
6711 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6712 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6713 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6714 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6715 legalizeOpWithMove(MI, Src2Idx);
6716 }
6717
6718 // VOP2 src0 instructions support all operand types, so we don't need to check
6719 // their legality. If src1 is already legal, we don't need to do anything.
6720 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6721 return;
6722
6723 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6724 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6725 // select is uniform.
6726 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6727 RI.isVGPR(MRI, Src1.getReg())) {
6728 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6729 const DebugLoc &DL = MI.getDebugLoc();
6730 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6731 .add(Src1);
6732 Src1.ChangeToRegister(Reg, false);
6733 return;
6734 }
6735
6736 // We do not use commuteInstruction here because it is too aggressive and will
6737 // commute if it is possible. We only want to commute here if it improves
6738 // legality. This can be called a fairly large number of times so don't waste
6739 // compile time pointlessly swapping and checking legality again.
6740 if (HasImplicitSGPR || !MI.isCommutable()) {
6741 legalizeOpWithMove(MI, Src1Idx);
6742 return;
6743 }
6744
6745 // If src0 can be used as src1, commuting will make the operands legal.
6746 // Otherwise we have to give up and insert a move.
6747 //
6748 // TODO: Other immediate-like operand kinds could be commuted if there was a
6749 // MachineOperand::ChangeTo* for them.
6750 if ((!Src1.isImm() && !Src1.isReg()) ||
6751 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6752 legalizeOpWithMove(MI, Src1Idx);
6753 return;
6754 }
6755
6756 int CommutedOpc = commuteOpcode(MI);
6757 if (CommutedOpc == -1) {
6758 legalizeOpWithMove(MI, Src1Idx);
6759 return;
6760 }
6761
6762 MI.setDesc(get(CommutedOpc));
6763
// Manually swap src0 and src1 in place, preserving subregister indices and
// kill flags, instead of going through commuteInstruction (see note above).
6764 Register Src0Reg = Src0.getReg();
6765 unsigned Src0SubReg = Src0.getSubReg();
6766 bool Src0Kill = Src0.isKill();
6767
6768 if (Src1.isImm())
6769 Src0.ChangeToImmediate(Src1.getImm());
6770 else if (Src1.isReg()) {
6771 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6772 Src0.setSubReg(Src1.getSubReg());
6773 } else
6774 llvm_unreachable("Should only have register or immediate operands");
6775
6776 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6777 Src1.setSubReg(Src0SubReg);
// NOTE(review): original line 6778 appears to be missing from this
// extraction; confirm the full function body against the upstream file.
6779}
6780
6781// Legalize VOP3 operands. All operand types are supported for any operand
6782// but only one literal constant and only starting from GFX10.
// NOTE(review): the signature line (original line 6783) is missing from this
// extraction — presumably SIInstrInfo::legalizeOperandsVOP3 taking
// (MachineRegisterInfo &MRI, MachineInstr &MI); confirm against upstream.
6784 MachineInstr &MI) const {
6785 unsigned Opc = MI.getOpcode();
6786
// Named-operand indices for src0/src1/src2; -1 if the operand is absent.
6787 int VOP3Idx[3] = {
6788 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6789 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6790 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6791 };
6792
6793 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6794 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6795 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6796 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6797 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6798 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6799 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6800 // src1 and src2 must be scalar
6801 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6802 const DebugLoc &DL = MI.getDebugLoc();
6803 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6804 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6805 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6806 .add(Src1);
6807 Src1.ChangeToRegister(Reg, false);
6808 }
6809 if (VOP3Idx[2] != -1) {
6810 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6811 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6812 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6813 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6814 .add(Src2);
6815 Src2.ChangeToRegister(Reg, false);
6816 }
6817 }
6818 }
6819
6820 // Find the one SGPR operand we are allowed to use.
6821 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6822 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6823 SmallDenseSet<unsigned> SGPRsUsed;
6824 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6825 if (SGPRReg) {
6826 SGPRsUsed.insert(SGPRReg);
6827 --ConstantBusLimit;
6828 }
6829
// Walk src0..src2 in order, budgeting constant-bus and literal uses; any
// operand that exceeds the budget is materialized via legalizeOpWithMove.
6830 for (int Idx : VOP3Idx) {
6831 if (Idx == -1)
6832 break;
6833 MachineOperand &MO = MI.getOperand(Idx);
6834
6835 if (!MO.isReg()) {
6836 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6837 continue;
6838
6839 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6840 --LiteralLimit;
6841 --ConstantBusLimit;
6842 continue;
6843 }
6844
6845 --LiteralLimit;
6846 --ConstantBusLimit;
6847 legalizeOpWithMove(MI, Idx);
6848 continue;
6849 }
6850
6851 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6852 continue; // VGPRs are legal
6853
6854 // We can use one SGPR in each VOP3 instruction prior to GFX10
6855 // and two starting from GFX10.
6856 if (SGPRsUsed.count(MO.getReg()))
6857 continue;
6858 if (ConstantBusLimit > 0) {
6859 SGPRsUsed.insert(MO.getReg());
6860 --ConstantBusLimit;
6861 continue;
6862 }
6863
6864 // If we make it this far, then the operand is not legal and we must
6865 // legalize it.
6866 legalizeOpWithMove(MI, Idx);
6867 }
6868
6869 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6870 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6871 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6872 legalizeOpWithMove(MI, VOP3Idx[2]);
6873
6874 // Fix the register class of packed FP32 instructions on gfx12+. See
6875 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
// NOTE(review): original line 6876 (the guarding condition for the loop
// below) is missing from this extraction; confirm against upstream.
6877 for (unsigned I = 0; I < 3; ++I) {
6878 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
6879 legalizeOpWithMove(MI, VOP3Idx[I]);
6880 }
6881 }
6882}
6883
// Copies a VGPR-held value into a fresh SGPR virtual register using
// V_READFIRSTLANE_B32 (per-32-bit subregister for wide registers, joined by a
// REG_SEQUENCE), and returns the new SGPR register. AGPR sources are first
// copied into VGPRs.
// NOTE(review): the leading signature lines (original lines 6884-6885) are
// missing from this extraction — presumably SIInstrInfo::readlaneVGPRToSGPR
// taking (Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
// const TargetRegisterClass *DstRC = nullptr); confirm against upstream.
6886 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6887 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6888 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6889 if (DstRC)
6890 SRC = RI.getCommonSubClass(SRC, DstRC);
6891
6892 Register DstReg = MRI.createVirtualRegister(SRC);
6893 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6894
6895 if (RI.hasAGPRs(VRC)) {
6896 VRC = RI.getEquivalentVGPRClass(VRC);
6897 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6898 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6899 get(TargetOpcode::COPY), NewSrcReg)
6900 .addReg(SrcReg);
6901 SrcReg = NewSrcReg;
6902 }
6903
6904 if (SubRegs == 1) {
6905 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6906 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6907 .addReg(SrcReg);
6908 return DstReg;
6909 }
6910
// NOTE(review): original line 6911 (apparently the declaration of the SRegs
// vector used below) is missing from this extraction; confirm upstream.
6912 for (unsigned i = 0; i < SubRegs; ++i) {
6913 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6914 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6915 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6916 .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
6917 SRegs.push_back(SGPR);
6918 }
6919
// NOTE(review): original line 6920 (apparently the declaration of MIB) is
// missing from this extraction; confirm upstream.
6921 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6922 get(AMDGPU::REG_SEQUENCE), DstReg);
6923 for (unsigned i = 0; i < SubRegs; ++i) {
6924 MIB.addReg(SRegs[i]);
6925 MIB.addImm(RI.getSubRegFromChannel(i));
6926 }
6927 return DstReg;
6928}
6929
// Legalizes SMRD operands by moving a VGPR-held sbase/soffset into SGPRs via
// readlaneVGPRToSGPR.
// NOTE(review): the leading signature line (original line 6930) is missing
// from this extraction — presumably SIInstrInfo::legalizeOperandsSMRD taking
// (MachineRegisterInfo &MRI, MachineInstr &MI); confirm against upstream.
6931 MachineInstr &MI) const {
6932
6933 // If the pointer is store in VGPRs, then we need to move them to
6934 // SGPRs using v_readfirstlane. This is safe because we only select
6935 // loads with uniform pointers to SMRD instruction so we know the
6936 // pointer value is uniform.
6937 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6938 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6939 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6940 SBase->setReg(SGPR);
6941 }
// soffset gets the same treatment as sbase.
6942 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6943 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6944 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6945 SOff->setReg(SGPR);
6946 }
6947}
6948
// Attempts to rewrite a FLAT instruction whose saddr operand holds a VGPR
// into the corresponding VADDR-form opcode, moving the address into the vaddr
// slot in place (so callers' iterators remain valid). Returns true on
// success, false if the rewrite is not possible.
// NOTE(review): the leading signature line (original line 6949) is missing
// from this extraction — presumably bool SIInstrInfo::moveFlatAddrToVGPR(
// MachineInstr &Inst) const; confirm against upstream.
6950 unsigned Opc = Inst.getOpcode();
6951 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6952 if (OldSAddrIdx < 0)
6953 return false;
6954
6955 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6956
6957 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6958 if (NewOpc < 0)
// NOTE(review): original line 6959 (the fallback lookup assigned to NewOpc)
// is missing from this extraction; confirm against upstream.
6960 if (NewOpc < 0)
6961 return false;
6962
6963 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6964 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6965 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6966 return false;
6967
6968 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6969 if (NewVAddrIdx < 0)
6970 return false;
6971
6972 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6973
6974 // Check vaddr, it shall be zero or absent.
6975 MachineInstr *VAddrDef = nullptr;
6976 if (OldVAddrIdx >= 0) {
6977 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6978 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6979 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6980 !VAddrDef->getOperand(1).isImm() ||
6981 VAddrDef->getOperand(1).getImm() != 0)
6982 return false;
6983 }
6984
6985 const MCInstrDesc &NewDesc = get(NewOpc);
6986 Inst.setDesc(NewDesc);
6987
6988 // Callers expect iterator to be valid after this call, so modify the
6989 // instruction in place.
6990 if (OldVAddrIdx == NewVAddrIdx) {
6991 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6992 // Clear use list from the old vaddr holding a zero register.
6993 MRI.removeRegOperandFromUseList(&NewVAddr);
6994 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6995 Inst.removeOperand(OldSAddrIdx);
6996 // Update the use list with the pointer we have just moved from vaddr to
6997 // saddr position. Otherwise new vaddr will be missing from the use list.
6998 MRI.removeRegOperandFromUseList(&NewVAddr);
6999 MRI.addRegOperandToUseList(&NewVAddr);
7000 } else {
7001 assert(OldSAddrIdx == NewVAddrIdx);
7002
7003 if (OldVAddrIdx >= 0) {
7004 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
7005 AMDGPU::OpName::vdst_in);
7006
7007 // removeOperand doesn't try to fixup tied operand indexes at it goes, so
7008 // it asserts. Untie the operands for now and retie them afterwards.
7009 if (NewVDstIn != -1) {
7010 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
7011 Inst.untieRegOperand(OldVDstIn);
7012 }
7013
7014 Inst.removeOperand(OldVAddrIdx);
7015
7016 if (NewVDstIn != -1) {
7017 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
7018 Inst.tieOperands(NewVDst, NewVDstIn);
7019 }
7020 }
7021 }
7022
// Clean up the now-dead zero-materializing def of the old vaddr, if any.
7023 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
7024 VAddrDef->eraseFromParent();
7025
7026 return true;
7027}
7028
7029// FIXME: Remove this when SelectionDAG is obsoleted.
// Legalizes the saddr operand of a FLAT instruction: if it was selected into
// a VGPR, insert a readfirstlane (valid because DAG divergence analysis
// considered the address uniform).
// NOTE(review): the leading signature line (original line 7030) is missing
// from this extraction — presumably SIInstrInfo::legalizeOperandsFLAT taking
// (MachineRegisterInfo &MRI, MachineInstr &MI); confirm against upstream.
7031 MachineInstr &MI) const {
7032 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
7033 return;
7034
7035 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
7036 // thinks they are uniform, so a readfirstlane should be valid.
7037 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
7038 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
7039 return;
7040
// NOTE(review): original line 7041 (the condition guarding this early
// return) is missing from this extraction; confirm against upstream.
7042 return;
7043
7044 const TargetRegisterClass *DeclaredRC =
7045 getRegClass(MI.getDesc(), SAddr->getOperandNo());
7046
7047 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
7048 SAddr->setReg(ToSGPR);
7049}
7050
// Rewrites operand Op to use a COPY into the requested register class DstRC,
// inserting the copy before iterator I in InsertMBB. Tries to fold immediate
// defs into the copy, and adds an implicit EXEC read to VGPR copies whose
// source is not an IMPLICIT_DEF chain.
// NOTE(review): the leading signature lines (original lines 7051-7052 and
// 7054-7055) are missing from this extraction — presumably
// SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
// MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC,
// MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL);
// confirm against upstream.
7053 const TargetRegisterClass *DstRC,
7056 const DebugLoc &DL) const {
7057 Register OpReg = Op.getReg();
7058 unsigned OpSubReg = Op.getSubReg();
7059
7060 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
7061 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
7062
7063 // Check if operand is already the correct register class.
7064 if (DstRC == OpRC)
7065 return;
7066
7067 Register DstReg = MRI.createVirtualRegister(DstRC);
7068 auto Copy =
7069 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
7070 Op.setReg(DstReg);
7071
7072 MachineInstr *Def = MRI.getVRegDef(OpReg);
7073 if (!Def)
7074 return;
7075
7076 // Try to eliminate the copy if it is copying an immediate value.
7077 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7078 foldImmediate(*Copy, *Def, OpReg, &MRI);
7079
// Walk back through the virtual-register copy chain to see whether the
// ultimate source is an IMPLICIT_DEF.
7080 bool ImpDef = Def->isImplicitDef();
7081 while (!ImpDef && Def && Def->isCopy()) {
7082 if (Def->getOperand(1).getReg().isPhysical())
7083 break;
7084 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
7085 ImpDef = Def && Def->isImplicitDef();
7086 }
7087 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
7088 !ImpDef)
7089 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
7090}
7091
7092// Emit the actual waterfall loop, executing the wrapped instruction for each
7093// unique value of \p ScalarOps across all lanes. In the best case we execute 1
7094// iteration, in the worst case we execute 64 (once per lane).
7095static void
// NOTE(review): the parameter lines carrying TII and MRI (original lines
// 7096-7097) are missing from this extraction; confirm against upstream.
7098 MachineBasicBlock &LoopBB,
7099 MachineBasicBlock &BodyBB,
7100 const DebugLoc &DL,
7101 ArrayRef<MachineOperand *> ScalarOps) {
7102 MachineFunction &MF = *LoopBB.getParent();
7103 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7104 const SIRegisterInfo *TRI = ST.getRegisterInfo();
// NOTE(review): original lines 7105 and 7108 (apparently declarations of the
// insertion iterator and the lane-mask constants LMC used below) are missing
// from this extraction; confirm against upstream.
7106 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7107
7109 Register CondReg;
7110
// For each scalar operand, read the first active lane's value into SGPRs and
// build a lane-equality condition; conditions for multiple operands are
// combined with AND into CondReg.
7111 for (MachineOperand *ScalarOp : ScalarOps) {
7112 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7113 unsigned NumSubRegs = RegSize / 32;
7114 Register VScalarOp = ScalarOp->getReg();
7115
7116 if (NumSubRegs == 1) {
7117 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7118
7119 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7120 .addReg(VScalarOp);
7121
7122 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7123
7124 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7125 .addReg(CurReg)
7126 .addReg(VScalarOp);
7127
7128 // Combine the comparison results with AND.
7129 if (!CondReg) // First.
7130 CondReg = NewCondReg;
7131 else { // If not the first, we create an AND.
7132 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7133 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7134 .addReg(CondReg)
7135 .addReg(NewCondReg);
7136 CondReg = AndReg;
7137 }
7138
7139 // Update ScalarOp operand to use the SGPR ScalarOp.
7140 ScalarOp->setReg(CurReg);
7141 ScalarOp->setIsKill();
7142 } else {
// Wide operand: readfirstlane 32 bits at a time, compare 64 bits at a time.
7143 SmallVector<Register, 8> ReadlanePieces;
7144 RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7145 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7146 "Unhandled register size");
7147
7148 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7149 Register CurRegLo =
7150 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7151 Register CurRegHi =
7152 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7153
7154 // Read the next variant <- also loop target.
7155 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7156 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7157
7158 // Read the next variant <- also loop target.
7159 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7160 .addReg(VScalarOp, VScalarOpUndef,
7161 TRI->getSubRegFromChannel(Idx + 1));
7162
7163 ReadlanePieces.push_back(CurRegLo);
7164 ReadlanePieces.push_back(CurRegHi);
7165
7166 // Comparison is to be done as 64-bit.
7167 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7168 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7169 .addReg(CurRegLo)
7170 .addImm(AMDGPU::sub0)
7171 .addReg(CurRegHi)
7172 .addImm(AMDGPU::sub1);
7173
7174 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7175 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7176 NewCondReg)
7177 .addReg(CurReg);
7178 if (NumSubRegs <= 2)
7179 Cmp.addReg(VScalarOp);
7180 else
7181 Cmp.addReg(VScalarOp, VScalarOpUndef,
7182 TRI->getSubRegFromChannel(Idx, 2));
7183
7184 // Combine the comparison results with AND.
7185 if (!CondReg) // First.
7186 CondReg = NewCondReg;
7187 else { // If not the first, we create an AND.
7188 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7189 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7190 .addReg(CondReg)
7191 .addReg(NewCondReg);
7192 CondReg = AndReg;
7193 }
7194 } // End for loop.
7195
7196 const auto *SScalarOpRC =
7197 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7198 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7199
7200 // Build scalar ScalarOp.
7201 auto Merge =
7202 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7203 unsigned Channel = 0;
7204 for (Register Piece : ReadlanePieces) {
7205 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7206 }
7207
7208 // Update ScalarOp operand to use the SGPR ScalarOp.
7209 ScalarOp->setReg(SScalarOp);
7210 ScalarOp->setIsKill();
7211 }
7212 }
7213
7214 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7215 MRI.setSimpleHint(SaveExec, CondReg);
7216
7217 // Update EXEC to matching lanes, saving original to SaveExec.
7218 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7219 .addReg(CondReg, RegState::Kill);
7220
7221 // The original instruction is here; we insert the terminators after it.
7222 I = BodyBB.end();
7223
7224 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7225 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7226 .addReg(LMC.ExecReg)
7227 .addReg(SaveExec);
7228
7229 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7230}
7231
7232// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7233// with SGPRs by iterating over all unique values across all lanes.
7234// Returns the loop basic block that now contains \p MI.
7235static MachineBasicBlock *
// NOTE(review): the parameter lines preceding Begin/End (original lines
// 7236-7238 — apparently TII, MI, ScalarOps, and MDT) are missing from this
// extraction; confirm against upstream.
7239 MachineBasicBlock::iterator Begin = nullptr,
7240 MachineBasicBlock::iterator End = nullptr) {
7241 MachineBasicBlock &MBB = *MI.getParent();
7242 MachineFunction &MF = *MBB.getParent();
7243 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7244 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7245 MachineRegisterInfo &MRI = MF.getRegInfo();
7246 if (!Begin.isValid())
7247 Begin = &MI;
7248 if (!End.isValid()) {
7249 End = &MI;
7250 ++End;
7251 }
7252 const DebugLoc &DL = MI.getDebugLoc();
// NOTE(review): original line 7253 (apparently the declaration of the
// lane-mask constants LMC used below) is missing from this extraction.
7254 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7255
7256 // Save SCC. Waterfall Loop may overwrite SCC.
7257 Register SaveSCCReg;
7258
7259 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7260 // rather than unlimited scan everywhere
7261 bool SCCNotDead =
7262 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7263 std::numeric_limits<unsigned>::max()) !=
// NOTE(review): the enumerator this compares against (original line 7264)
// is missing from this extraction; confirm against upstream.
7265 if (SCCNotDead) {
// Materialize SCC into a register: S_CSELECT_B32 writes 1 if SCC is set,
// 0 otherwise.
7266 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7267 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7268 .addImm(1)
7269 .addImm(0);
7270 }
7271
7272 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7273
7274 // Save the EXEC mask
7275 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7276
7277 // Killed uses in the instruction we are waterfalling around will be
7278 // incorrect due to the added control-flow.
// NOTE(review): original line 7279 (apparently the declaration of AfterMI)
// is missing from this extraction.
7280 ++AfterMI;
7281 for (auto I = Begin; I != AfterMI; I++) {
7282 for (auto &MO : I->all_uses())
7283 MRI.clearKillFlags(MO.getReg());
7284 }
7285
7286 // To insert the loop we need to split the block. Move everything after this
7287 // point to a new block, and insert a new empty block between the two.
// NOTE(review): original lines 7288-7289 and 7291 (apparently the creation
// of LoopBB/BodyBB and the MBBI iterator) are missing from this extraction.
7290 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7292 ++MBBI;
7293
7294 MF.insert(MBBI, LoopBB);
7295 MF.insert(MBBI, BodyBB);
7296 MF.insert(MBBI, RemainderBB);
7297
7298 LoopBB->addSuccessor(BodyBB);
7299 BodyBB->addSuccessor(LoopBB);
7300 BodyBB->addSuccessor(RemainderBB);
7301
7302 // Move Begin to MI to the BodyBB, and the remainder of the block to
7303 // RemainderBB.
7304 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7305 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7306 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7307
7308 MBB.addSuccessor(LoopBB);
7309
7310 // Update dominators. We know that MBB immediately dominates LoopBB, that
7311 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7312 // RemainderBB. RemainderBB immediately dominates all of the successors
7313 // transferred to it from MBB that MBB used to properly dominate.
7314 if (MDT) {
7315 MDT->addNewBlock(LoopBB, &MBB);
7316 MDT->addNewBlock(BodyBB, LoopBB);
7317 MDT->addNewBlock(RemainderBB, BodyBB);
7318 for (auto &Succ : RemainderBB->successors()) {
7319 if (MDT->properlyDominates(&MBB, Succ)) {
7320 MDT->changeImmediateDominator(Succ, RemainderBB);
7321 }
7322 }
7323 }
7324
7325 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7326
7327 MachineBasicBlock::iterator First = RemainderBB->begin();
7328 // Restore SCC
7329 if (SCCNotDead) {
7330 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7331 .addReg(SaveSCCReg, RegState::Kill)
7332 .addImm(0);
7333 }
7334
7335 // Restore the EXEC mask
7336 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7337 .addReg(SaveExec);
7338 return BodyBB;
7339}
7340
7341// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7342static std::tuple<unsigned, unsigned>
// NOTE(review): the parameter line (original line 7343 — apparently taking
// const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) is missing
// from this extraction; confirm against upstream.
7344 MachineBasicBlock &MBB = *MI.getParent();
7345 MachineFunction &MF = *MBB.getParent();
7346 MachineRegisterInfo &MRI = MF.getRegInfo();
7347
7348 // Extract the ptr from the resource descriptor.
7349 unsigned RsrcPtr =
7350 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7351 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7352
7353 // Create an empty resource descriptor
7354 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7355 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7356 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7357 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7358 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7359
7360 // Zero64 = 0
7361 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7362 .addImm(0);
7363
7364 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7365 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7366 .addImm(Lo_32(RsrcDataFormat));
7367
7368 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7369 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7370 .addImm(Hi_32(RsrcDataFormat));
7371
7372 // NewSRsrc = {Zero64, SRsrcFormat}
7373 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7374 .addReg(Zero64)
7375 .addImm(AMDGPU::sub0_sub1)
7376 .addReg(SRsrcFormatLo)
7377 .addImm(AMDGPU::sub2)
7378 .addReg(SRsrcFormatHi)
7379 .addImm(AMDGPU::sub3);
7380
7381 return std::tuple(RsrcPtr, NewSRsrc);
7382}
7383
7386 MachineDominatorTree *MDT) const {
7387 MachineFunction &MF = *MI.getMF();
7388 MachineRegisterInfo &MRI = MF.getRegInfo();
7389 MachineBasicBlock *CreatedBB = nullptr;
7390
7391 // Legalize VOP2
7392 if (isVOP2(MI) || isVOPC(MI)) {
7394 return CreatedBB;
7395 }
7396
7397 // Legalize VOP3
7398 if (isVOP3(MI)) {
7400 return CreatedBB;
7401 }
7402
7403 // Legalize SMRD
7404 if (isSMRD(MI)) {
7406 return CreatedBB;
7407 }
7408
7409 // Legalize FLAT
7410 if (isFLAT(MI)) {
7412 return CreatedBB;
7413 }
7414
7415 // Legalize PHI
7416 // The register class of the operands must be the same type as the register
7417 // class of the output.
7418 if (MI.getOpcode() == AMDGPU::PHI) {
7419 const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
7420 assert(!RI.isSGPRClass(VRC));
7421
7422 // Update all the operands so they have the same type.
7423 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7424 MachineOperand &Op = MI.getOperand(I);
7425 if (!Op.isReg() || !Op.getReg().isVirtual())
7426 continue;
7427
7428 // MI is a PHI instruction.
7429 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7431
7432 // Avoid creating no-op copies with the same src and dst reg class. These
7433 // confuse some of the machine passes.
7434 legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
7435 }
7436 }
7437
7438 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7439 // VGPR dest type and SGPR sources, insert copies so all operands are
7440 // VGPRs. This seems to help operand folding / the register coalescer.
7441 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7442 MachineBasicBlock *MBB = MI.getParent();
7443 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7444 if (RI.hasVGPRs(DstRC)) {
7445 // Update all the operands so they are VGPR register classes. These may
7446 // not be the same register class because REG_SEQUENCE supports mixing
7447 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7448 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7449 MachineOperand &Op = MI.getOperand(I);
7450 if (!Op.isReg() || !Op.getReg().isVirtual())
7451 continue;
7452
7453 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7454 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7455 if (VRC == OpRC)
7456 continue;
7457
7458 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7459 Op.setIsKill();
7460 }
7461 }
7462
7463 return CreatedBB;
7464 }
7465
7466 // Legalize INSERT_SUBREG
7467 // src0 must have the same register class as dst
7468 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7469 Register Dst = MI.getOperand(0).getReg();
7470 Register Src0 = MI.getOperand(1).getReg();
7471 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7472 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7473 if (DstRC != Src0RC) {
7474 MachineBasicBlock *MBB = MI.getParent();
7475 MachineOperand &Op = MI.getOperand(1);
7476 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7477 }
7478 return CreatedBB;
7479 }
7480
7481 // Legalize SI_INIT_M0
7482 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7483 MachineOperand &Src = MI.getOperand(0);
7484 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7485 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7486 return CreatedBB;
7487 }
7488
7489 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7490 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7491 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7492 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7493 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7494 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7495 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7496 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7497 MachineOperand &Src = MI.getOperand(1);
7498 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7499 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7500 return CreatedBB;
7501 }
7502
7503 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7504 //
7505 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7506 // scratch memory access. In both cases, the legalization never involves
7507 // conversion to the addr64 form.
7509 (isMUBUF(MI) || isMTBUF(MI)))) {
7510 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7511 ? AMDGPU::OpName::rsrc
7512 : AMDGPU::OpName::srsrc;
7513 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7514 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7515 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7516
7517 AMDGPU::OpName SampOpName =
7518 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7519 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7520 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7521 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7522
7523 return CreatedBB;
7524 }
7525
7526 // Legalize SI_CALL
7527 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7528 MachineOperand *Dest = &MI.getOperand(0);
7529 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7530 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
7531 // following copies, we also need to move copies from and to physical
7532 // registers into the loop block.
7533 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7534 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7535
7536 // Also move the copies to physical registers into the loop block
7537 MachineBasicBlock &MBB = *MI.getParent();
7539 while (Start->getOpcode() != FrameSetupOpcode)
7540 --Start;
7542 while (End->getOpcode() != FrameDestroyOpcode)
7543 ++End;
7544 // Also include following copies of the return value
7545 ++End;
7546 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7547 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7548 ++End;
7549 CreatedBB =
7550 loadScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7551 }
7552 }
7553
7554 // Legalize s_sleep_var.
7555 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7556 const DebugLoc &DL = MI.getDebugLoc();
7557 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7558 int Src0Idx =
7559 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7560 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7561 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7562 .add(Src0);
7563 Src0.ChangeToRegister(Reg, false);
7564 return nullptr;
7565 }
7566
7567 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
7568 // operands are scalar.
7569 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7570 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7571 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7572 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7573 for (MachineOperand &Src : MI.explicit_operands()) {
7574 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7575 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7576 }
7577 return CreatedBB;
7578 }
7579
7580 // Legalize MUBUF instructions.
7581 bool isSoffsetLegal = true;
7582 int SoffsetIdx =
7583 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7584 if (SoffsetIdx != -1) {
7585 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7586 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7587 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7588 isSoffsetLegal = false;
7589 }
7590 }
7591
7592 bool isRsrcLegal = true;
7593 int RsrcIdx =
7594 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7595 if (RsrcIdx != -1) {
7596 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7597 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7598 isRsrcLegal = false;
7599 }
7600
7601 // The operands are legal.
7602 if (isRsrcLegal && isSoffsetLegal)
7603 return CreatedBB;
7604
7605 if (!isRsrcLegal) {
7606 // Legalize a VGPR Rsrc
7607 //
7608 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7609 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7610 // a zero-value SRsrc.
7611 //
7612 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7613 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7614 // above.
7615 //
7616 // Otherwise we are on non-ADDR64 hardware, and/or we have
7617 // idxen/offen/bothen and we fall back to a waterfall loop.
7618
7619 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7620 MachineBasicBlock &MBB = *MI.getParent();
7621
7622 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7623 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7624 // This is already an ADDR64 instruction so we need to add the pointer
7625 // extracted from the resource descriptor to the current value of VAddr.
7626 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7627 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7628 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7629
7630 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7631 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7632 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7633
7634 unsigned RsrcPtr, NewSRsrc;
7635 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7636
7637 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7638 const DebugLoc &DL = MI.getDebugLoc();
7639 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7640 .addDef(CondReg0)
7641 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7642 .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
7643 .addImm(0);
7644
7645 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7646 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7647 .addDef(CondReg1, RegState::Dead)
7648 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7649 .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
7650 .addReg(CondReg0, RegState::Kill)
7651 .addImm(0);
7652
7653 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7654 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7655 .addReg(NewVAddrLo)
7656 .addImm(AMDGPU::sub0)
7657 .addReg(NewVAddrHi)
7658 .addImm(AMDGPU::sub1);
7659
7660 VAddr->setReg(NewVAddr);
7661 Rsrc->setReg(NewSRsrc);
7662 } else if (!VAddr && ST.hasAddr64()) {
7663 // This instructions is the _OFFSET variant, so we need to convert it to
7664 // ADDR64.
7665 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7666 "FIXME: Need to emit flat atomics here");
7667
7668 unsigned RsrcPtr, NewSRsrc;
7669 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7670
7671 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7672 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7673 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7674 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7675 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7676
7677 // Atomics with return have an additional tied operand and are
7678 // missing some of the special bits.
7679 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7680 MachineInstr *Addr64;
7681
7682 if (!VDataIn) {
7683 // Regular buffer load / store.
7685 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7686 .add(*VData)
7687 .addReg(NewVAddr)
7688 .addReg(NewSRsrc)
7689 .add(*SOffset)
7690 .add(*Offset);
7691
7692 if (const MachineOperand *CPol =
7693 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7694 MIB.addImm(CPol->getImm());
7695 }
7696
7697 if (const MachineOperand *TFE =
7698 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7699 MIB.addImm(TFE->getImm());
7700 }
7701
7702 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7703
7704 MIB.cloneMemRefs(MI);
7705 Addr64 = MIB;
7706 } else {
7707 // Atomics with return.
7708 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7709 .add(*VData)
7710 .add(*VDataIn)
7711 .addReg(NewVAddr)
7712 .addReg(NewSRsrc)
7713 .add(*SOffset)
7714 .add(*Offset)
7715 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7716 .cloneMemRefs(MI);
7717 }
7718
7719 MI.removeFromParent();
7720
7721 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7722 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7723 NewVAddr)
7724 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7725 .addImm(AMDGPU::sub0)
7726 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7727 .addImm(AMDGPU::sub1);
7728 } else {
7729 // Legalize a VGPR Rsrc and soffset together.
7730 if (!isSoffsetLegal) {
7731 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7732 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7733 return CreatedBB;
7734 }
7735 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7736 return CreatedBB;
7737 }
7738 }
7739
7740 // Legalize a VGPR soffset.
7741 if (!isSoffsetLegal) {
7742 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7743 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7744 return CreatedBB;
7745 }
7746 return CreatedBB;
7747}
7748
  // Queue MI for VALU lowering.
  InstrList.insert(MI);
  // Also add MUBUF instructions (identified by the presence of an srsrc
  // operand) to the deferred list; moveToVALU() processes those only after
  // the main worklist has drained.
  int RsrcIdx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
  if (RsrcIdx != -1) {
    DeferredList.insert(MI);
  }
}
7758
  // True if MI was placed on the deferred list by insert().
  return DeferredList.contains(MI);
}
7762
// Legalize size mismatches between 16bit and 32bit registers in v2s copy
// lowering (change sgpr to vgpr).
// This is mainly caused by 16bit SALU and 16bit VALU using regs of different
// sizes. Need to legalize the size of the operands during the vgpr lowering
// chain. This can be removed after we have sgpr16 in place.
                                             MachineRegisterInfo &MRI) const {
  // Only relevant for targets with true 16-bit instructions.
  if (!ST.useRealTrue16Insts())
    return;

  unsigned Opcode = MI.getOpcode();
  MachineBasicBlock *MBB = MI.getParent();
  // Skip operands that cannot carry a class mismatch: index 0 (the def), an
  // out-of-range index, or an operand with no register-class constraint.
  if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
      OpIdx >= get(Opcode).getNumOperands() ||
      get(Opcode).operands()[OpIdx].RegClass == -1)
    return;

  MachineOperand &Op = MI.getOperand(OpIdx);
  if (!Op.isReg() || !Op.getReg().isVirtual())
    return;

  // Only VGPR operands need 16/32-bit size legalization here.
  const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
  if (!RI.isVGPRClass(CurrRC))
    return;

  int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
  const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
  if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
    // 32-bit value used where a 16-bit one is expected: refer to its lo16
    // subregister instead of the full register.
    Op.setSubReg(AMDGPU::lo16);
  } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
    // 16-bit value used where a 32-bit one is expected: widen it by packing
    // the value into the lo16 lane of a fresh 32-bit VGPR with an undefined
    // hi16 half.
    const DebugLoc &DL = MI.getDebugLoc();
    Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
    BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
    BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
        .addReg(Op.getReg())
        .addImm(AMDGPU::lo16)
        .addReg(Undef)
        .addImm(AMDGPU::hi16);
    Op.setReg(NewDstReg);
  }
}
                                             MachineRegisterInfo &MRI) const {
  // Legalize every explicit use operand (operand 0 is the def, so start at 1).
  // NOTE(review): the loop body is not visible in this extract; presumably it
  // forwards each index to the per-operand legalizeOperandsVALUt16 overload —
  // confirm against the full source.
  for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
}
7811
                            MachineDominatorTree *MDT) const {
  // Drain the worklist, lowering each SALU instruction to the VALU.
  // Lowering one instruction may push its SGPR users back onto the worklist.
  while (!Worklist.empty()) {
    MachineInstr &Inst = *Worklist.top();
    Worklist.erase_top();
    // Skip MachineInstr in the deferred list.
    if (Worklist.isDeferred(&Inst))
      continue;
    moveToVALUImpl(Worklist, MDT, Inst);
  }

  // Deferred list of instructions will be processed once
  // all the MachineInstr in the worklist are done.
  for (MachineInstr *Inst : Worklist.getDeferredList()) {
    moveToVALUImpl(Worklist, MDT, *Inst);
    assert(Worklist.empty() &&
           "Deferred MachineInstr are not supposed to re-populate worklist");
  }
}
7832
                                 MachineInstr &Inst) const {
  // Lower a single SALU instruction to an equivalent VALU sequence. The
  // switch below handles the many opcodes that need a custom expansion
  // (and returns); anything that falls out of the switch goes through the
  // generic opcode-replacement path at the bottom.

  if (!MBB)
    return;
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned Opcode = Inst.getOpcode();
  unsigned NewOpcode = getVALUOp(Inst);
  const DebugLoc &DL = Inst.getDebugLoc();

  // Handle some special cases
  switch (Opcode) {
  default:
    break;
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32: {
    // FIXME: The u32 versions currently selected use the carry.
    bool Changed;
    MachineBasicBlock *CreatedBBTmp = nullptr;
    std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
    if (Changed)
      return;

    // Default handling
    break;
  }

  case AMDGPU::S_MUL_U64:
    if (ST.hasVectorMulU64()) {
      NewOpcode = AMDGPU::V_MUL_U64_e64;
      break;
    }
    // Split s_mul_u64 in 32-bit vector multiplications.
    splitScalarSMulU64(Worklist, Inst, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_MUL_U64_U32_PSEUDO:
  case AMDGPU::S_MUL_I64_I32_PSEUDO:
    // This is a special case of s_mul_u64 where all the operands are either
    // zero extended or sign extended.
    splitScalarSMulPseudo(Worklist, Inst, MDT);
    Inst.eraseFromParent();
    return;

  // 64-bit scalar bitwise ops are expanded into two 32-bit halves.
  case AMDGPU::S_AND_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_OR_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_XOR_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_NAND_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_NOR_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_XNOR_B64:
    // With DL insts a 32-bit XNOR exists; otherwise split into NOT + XOR.
    if (ST.hasDLInsts())
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
    else
      splitScalar64BitXnor(Worklist, Inst, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_ANDN2_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_ORN2_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_BREV_B64:
    // true: the two 32-bit halves must also be swapped for a 64-bit reverse.
    splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_NOT_B64:
    splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_BCNT1_I32_B64:
    splitScalar64BitBCNT(Worklist, Inst);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_BFE_I64:
    splitScalar64BitBFE(Worklist, Inst);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_FLBIT_I32_B64:
    splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
    Inst.eraseFromParent();
    return;
  case AMDGPU::S_FF1_I32_B64:
    splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
    Inst.eraseFromParent();
    return;

  // On targets that only have the "rev" VALU shift forms the operand order
  // is reversed, so swap the operands when switching opcodes.
  case AMDGPU::S_LSHL_B32:
    if (ST.hasOnlyRevVALUShifts()) {
      NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
      swapOperands(Inst);
    }
    break;
  case AMDGPU::S_ASHR_I32:
    if (ST.hasOnlyRevVALUShifts()) {
      NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
      swapOperands(Inst);
    }
    break;
  case AMDGPU::S_LSHR_B32:
    if (ST.hasOnlyRevVALUShifts()) {
      NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
      swapOperands(Inst);
    }
    break;
  case AMDGPU::S_LSHL_B64:
    if (ST.hasOnlyRevVALUShifts()) {
      NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
                      ? AMDGPU::V_LSHLREV_B64_pseudo_e64
                      : AMDGPU::V_LSHLREV_B64_e64;
      swapOperands(Inst);
    }
    break;
  case AMDGPU::S_ASHR_I64:
    if (ST.hasOnlyRevVALUShifts()) {
      NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
      swapOperands(Inst);
    }
    break;
  case AMDGPU::S_LSHR_B64:
    if (ST.hasOnlyRevVALUShifts()) {
      NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
      swapOperands(Inst);
    }
    break;

  case AMDGPU::S_ABS_I32:
    lowerScalarAbs(Worklist, Inst);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_ABSDIFF_I32:
    lowerScalarAbsDiff(Worklist, Inst);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_CBRANCH_SCC0:
  case AMDGPU::S_CBRANCH_SCC1: {
    // Clear unused bits of vcc
    Register CondReg = Inst.getOperand(1).getReg();
    bool IsSCC = CondReg == AMDGPU::SCC;
    BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
        .addReg(LMC.ExecReg)
        .addReg(IsSCC ? LMC.VccReg : CondReg);
    Inst.removeOperand(1);
  } break;

  case AMDGPU::S_BFE_U64:
  case AMDGPU::S_BFM_B64:
    llvm_unreachable("Moving this op to VALU not implemented");

  case AMDGPU::S_PACK_LL_B32_B16:
  case AMDGPU::S_PACK_LH_B32_B16:
  case AMDGPU::S_PACK_HL_B32_B16:
  case AMDGPU::S_PACK_HH_B32_B16:
    movePackToVALU(Worklist, MRI, Inst);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_XNOR_B32:
    lowerScalarXnor(Worklist, Inst);
    Inst.eraseFromParent();
    return;

  // NAND/NOR have no VALU equivalent; expand to NOT + AND/OR.
  case AMDGPU::S_NAND_B32:
    splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_NOR_B32:
    splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_ANDN2_B32:
    splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_ORN2_B32:
    splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
    Inst.eraseFromParent();
    return;

  // TODO: remove as soon as everything is ready
  // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
  // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
  // can only be selected from the uniform SDNode.
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::V_ADDC_U32_e64
                       : AMDGPU::V_SUBB_U32_e64;
    const auto *CarryRC = RI.getWaveMaskRegClass();

    // The carry-in must live in the wave-mask class; copy it there if the
    // existing vreg cannot simply be constrained.
    Register CarryInReg = Inst.getOperand(4).getReg();
    if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
      Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
      BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
          .addReg(CarryInReg);
    }

    Register CarryOutReg = Inst.getOperand(1).getReg();

    Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
        MRI.getRegClass(Inst.getOperand(0).getReg())));
    MachineInstr *CarryOp =
        BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
            .addReg(CarryOutReg, RegState::Define)
            .add(Inst.getOperand(2))
            .add(Inst.getOperand(3))
            .addReg(CarryInReg)
            .addImm(0);
    legalizeOperands(*CarryOp);
    MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
    addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
    Inst.eraseFromParent();
  }
    return;
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    MachineOperand &Dest0 = Inst.getOperand(0);
    MachineOperand &Dest1 = Inst.getOperand(1);
    MachineOperand &Src0 = Inst.getOperand(2);
    MachineOperand &Src1 = Inst.getOperand(3);

    unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::V_ADD_CO_U32_e64
                       : AMDGPU::V_SUB_CO_U32_e64;
    const TargetRegisterClass *NewRC =
        RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
    Register DestReg = MRI.createVirtualRegister(NewRC);
    MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
                                 .addReg(Dest1.getReg(), RegState::Define)
                                 .add(Src0)
                                 .add(Src1)
                                 .addImm(0); // clamp bit

    legalizeOperands(*NewInstr, MDT);
    MRI.replaceRegWith(Dest0.getReg(), DestReg);
    addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
    Inst.eraseFromParent();
  }
    return;
  case AMDGPU::S_LSHL1_ADD_U32:
  case AMDGPU::S_LSHL2_ADD_U32:
  case AMDGPU::S_LSHL3_ADD_U32:
  case AMDGPU::S_LSHL4_ADD_U32: {
    // Lower to V_LSHL_ADD_U32 with an explicit shift-amount immediate.
    MachineOperand &Dest = Inst.getOperand(0);
    MachineOperand &Src0 = Inst.getOperand(1);
    MachineOperand &Src1 = Inst.getOperand(2);
    unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32   ? 1
                         : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
                         : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
                                                             : 4);

    const TargetRegisterClass *NewRC =
        RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
    Register DestReg = MRI.createVirtualRegister(NewRC);
    MachineInstr *NewInstr =
        BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
            .add(Src0)
            .addImm(ShiftAmt)
            .add(Src1);

    legalizeOperands(*NewInstr, MDT);
    MRI.replaceRegWith(Dest.getReg(), DestReg);
    addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
    Inst.eraseFromParent();
  }
    return;
  case AMDGPU::S_CSELECT_B32:
  case AMDGPU::S_CSELECT_B64:
    lowerSelect(Worklist, Inst, MDT);
    Inst.eraseFromParent();
    return;
  case AMDGPU::S_CMP_EQ_I32:
  case AMDGPU::S_CMP_LG_I32:
  case AMDGPU::S_CMP_GT_I32:
  case AMDGPU::S_CMP_GE_I32:
  case AMDGPU::S_CMP_LT_I32:
  case AMDGPU::S_CMP_LE_I32:
  case AMDGPU::S_CMP_EQ_U32:
  case AMDGPU::S_CMP_LG_U32:
  case AMDGPU::S_CMP_GT_U32:
  case AMDGPU::S_CMP_GE_U32:
  case AMDGPU::S_CMP_LT_U32:
  case AMDGPU::S_CMP_LE_U32:
  case AMDGPU::S_CMP_EQ_U64:
  case AMDGPU::S_CMP_LG_U64:
  case AMDGPU::S_CMP_LT_F32:
  case AMDGPU::S_CMP_EQ_F32:
  case AMDGPU::S_CMP_LE_F32:
  case AMDGPU::S_CMP_GT_F32:
  case AMDGPU::S_CMP_LG_F32:
  case AMDGPU::S_CMP_GE_F32:
  case AMDGPU::S_CMP_O_F32:
  case AMDGPU::S_CMP_U_F32:
  case AMDGPU::S_CMP_NGE_F32:
  case AMDGPU::S_CMP_NLG_F32:
  case AMDGPU::S_CMP_NGT_F32:
  case AMDGPU::S_CMP_NLE_F32:
  case AMDGPU::S_CMP_NEQ_F32:
  case AMDGPU::S_CMP_NLT_F32: {
    // Scalar compares write SCC; the VALU compare writes a lane mask
    // (CondReg) instead, so SCC users must be rewritten to use CondReg.
    Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
    auto NewInstr =
        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
            .setMIFlags(Inst.getFlags());
    if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
        0) {
      NewInstr
          .addImm(0)               // src0_modifiers
          .add(Inst.getOperand(0)) // src0
          .addImm(0)               // src1_modifiers
          .add(Inst.getOperand(1)) // src1
          .addImm(0);              // clamp
    } else {
      NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
    }
    legalizeOperands(*NewInstr, MDT);
    int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
    const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
    addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
    Inst.eraseFromParent();
    return;
  }
  case AMDGPU::S_CMP_LT_F16:
  case AMDGPU::S_CMP_EQ_F16:
  case AMDGPU::S_CMP_LE_F16:
  case AMDGPU::S_CMP_GT_F16:
  case AMDGPU::S_CMP_LG_F16:
  case AMDGPU::S_CMP_GE_F16:
  case AMDGPU::S_CMP_O_F16:
  case AMDGPU::S_CMP_U_F16:
  case AMDGPU::S_CMP_NGE_F16:
  case AMDGPU::S_CMP_NLG_F16:
  case AMDGPU::S_CMP_NGT_F16:
  case AMDGPU::S_CMP_NLE_F16:
  case AMDGPU::S_CMP_NEQ_F16:
  case AMDGPU::S_CMP_NLT_F16: {
    // Same as the 32-bit compare case above, plus t16 operand legalization
    // and an optional op_sel operand on the VALU form.
    Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
    auto NewInstr =
        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
            .setMIFlags(Inst.getFlags());
    if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
      NewInstr
          .addImm(0)               // src0_modifiers
          .add(Inst.getOperand(0)) // src0
          .addImm(0)               // src1_modifiers
          .add(Inst.getOperand(1)) // src1
          .addImm(0);              // clamp
      if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
        NewInstr.addImm(0); // op_sel0
    } else {
      NewInstr
          .add(Inst.getOperand(0))
          .add(Inst.getOperand(1));
    }
    legalizeOperandsVALUt16(*NewInstr, MRI);
    legalizeOperands(*NewInstr, MDT);
    int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
    const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
    addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
    Inst.eraseFromParent();
    return;
  }
  case AMDGPU::S_CVT_HI_F32_F16: {
    // Convert the high half: with true16 insts read hi16 directly, otherwise
    // shift the upper 16 bits down first.
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    if (ST.useRealTrue16Insts()) {
      BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
          .add(Inst.getOperand(1));
      BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
          .addImm(0) // src0_modifiers
          .addReg(TmpReg, {}, AMDGPU::hi16)
          .addImm(0)  // clamp
          .addImm(0)  // omod
          .addImm(0); // op_sel0
    } else {
      BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
          .addImm(16)
          .add(Inst.getOperand(1));
      BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
          .addImm(0) // src0_modifiers
          .addReg(TmpReg)
          .addImm(0)  // clamp
          .addImm(0); // omod
    }

    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
    Inst.eraseFromParent();
    return;
  }
  case AMDGPU::S_MINIMUM_F32:
  case AMDGPU::S_MAXIMUM_F32: {
    Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
                                 .addImm(0) // src0_modifiers
                                 .add(Inst.getOperand(1))
                                 .addImm(0) // src1_modifiers
                                 .add(Inst.getOperand(2))
                                 .addImm(0)  // clamp
                                 .addImm(0); // omod
    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);

    legalizeOperands(*NewInstr, MDT);
    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
    Inst.eraseFromParent();
    return;
  }
  case AMDGPU::S_MINIMUM_F16:
  case AMDGPU::S_MAXIMUM_F16: {
    // The true16 form writes a 16-bit VGPR; otherwise use a full VGPR_32.
    Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
                                                    ? &AMDGPU::VGPR_16RegClass
                                                    : &AMDGPU::VGPR_32RegClass);
    MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
                                 .addImm(0) // src0_modifiers
                                 .add(Inst.getOperand(1))
                                 .addImm(0) // src1_modifiers
                                 .add(Inst.getOperand(2))
                                 .addImm(0)  // clamp
                                 .addImm(0)  // omod
                                 .addImm(0); // opsel0
    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
    legalizeOperandsVALUt16(*NewInstr, MRI);
    legalizeOperands(*NewInstr, MDT);
    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
    Inst.eraseFromParent();
    return;
  }
  case AMDGPU::V_S_EXP_F16_e64:
  case AMDGPU::V_S_LOG_F16_e64:
  case AMDGPU::V_S_RCP_F16_e64:
  case AMDGPU::V_S_RSQ_F16_e64:
  case AMDGPU::V_S_SQRT_F16_e64: {
    Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
                                                    ? &AMDGPU::VGPR_16RegClass
                                                    : &AMDGPU::VGPR_32RegClass);
    auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
                        .add(Inst.getOperand(1)) // src0_modifiers
                        .add(Inst.getOperand(2))
                        .add(Inst.getOperand(3)) // clamp
                        .add(Inst.getOperand(4)) // omod
                        .setMIFlags(Inst.getFlags());
    if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
      NewInstr.addImm(0); // opsel0
    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
    legalizeOperandsVALUt16(*NewInstr, MRI);
    legalizeOperands(*NewInstr, MDT);
    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
    Inst.eraseFromParent();
    return;
  }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
    // We cannot move this instruction to the VALU, so we should try to
    // legalize its operands instead.
    legalizeOperands(Inst, MDT);
    return;
  }
  // Handle converting generic instructions like COPY-to-SGPR into
  // COPY-to-VGPR.
  if (NewOpcode == Opcode) {
    Register DstReg = Inst.getOperand(0).getReg();
    const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);

    // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
    // hope for the best.
    if (Inst.isCopy() && DstReg.isPhysical() &&
        RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
      Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
              get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
          .add(Inst.getOperand(1));
      BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
              DstReg)
          .addReg(NewDst);

      Inst.eraseFromParent();
      return;
    }

    if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
      Register NewDstReg = Inst.getOperand(1).getReg();
      const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
      if (const TargetRegisterClass *CommonRC =
              RI.getCommonSubClass(NewDstRC, SrcRC)) {
        // Instead of creating a copy where src and dst are the same register
        // class, we just replace all uses of dst with src. These kinds of
        // copies interfere with the heuristics MachineSink uses to decide
        // whether or not to split a critical edge. Since the pass assumes
        // that copies will end up as machine instructions and not be
        // eliminated.
        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
        MRI.replaceRegWith(DstReg, NewDstReg);
        MRI.clearKillFlags(NewDstReg);
        Inst.getOperand(0).setReg(DstReg);

        if (!MRI.constrainRegClass(NewDstReg, CommonRC))
          llvm_unreachable("failed to constrain register");

        Inst.eraseFromParent();

        for (MachineOperand &UseMO :
             make_early_inc_range(MRI.use_operands(NewDstReg))) {
          MachineInstr &UseMI = *UseMO.getParent();

          // Legalize t16 operands since replaceReg is called after
          // addUsersToVALU.

          unsigned OpIdx = UseMI.getOperandNo(&UseMO);
          if (const TargetRegisterClass *OpRC =
                  getRegClass(UseMI.getDesc(), OpIdx))
            MRI.constrainRegClass(NewDstReg, OpRC);
        }

        return;
      }
    }

    // If this is a v2s copy between 16bit and 32bit reg,
    // replace vgpr copy to reg_sequence/extract_subreg
    // This can be remove after we have sgpr16 in place
    if (ST.useRealTrue16Insts() && Inst.isCopy() &&
        Inst.getOperand(1).getReg().isVirtual() &&
        RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
      const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
      if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
        // 16-bit src, 32-bit dst: pack src into lo16 with an undefined hi16.
        Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
        Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
                get(AMDGPU::IMPLICIT_DEF), Undef);
        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
                get(AMDGPU::REG_SEQUENCE), NewDstReg)
            .addReg(Inst.getOperand(1).getReg())
            .addImm(AMDGPU::lo16)
            .addReg(Undef)
            .addImm(AMDGPU::hi16);
        Inst.eraseFromParent();
        MRI.replaceRegWith(DstReg, NewDstReg);
        addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
        return;
      } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
                                             AMDGPU::lo16)) {
        // 32-bit src, 16-bit dst: read only the lo16 half of the source.
        Inst.getOperand(1).setSubReg(AMDGPU::lo16);
        Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
        MRI.replaceRegWith(DstReg, NewDstReg);
        addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
        return;
      }
    }

    Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
    MRI.replaceRegWith(DstReg, NewDstReg);
    legalizeOperands(Inst, MDT);
    addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
    return;
  }

  // Use the new VALU Opcode.
  auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
                      .setMIFlags(Inst.getFlags());
  if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
    // Intersperse VOP3 modifiers among the SALU operands.
    NewInstr->addOperand(Inst.getOperand(0));
    if (AMDGPU::getNamedOperandIdx(NewOpcode,
                                   AMDGPU::OpName::src0_modifiers) >= 0)
      NewInstr.addImm(0);
    if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
      const MachineOperand &Src = Inst.getOperand(1);
      NewInstr->addOperand(Src);
    }

    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
      // We are converting these to a BFE, so we need to add the missing
      // operands for the size and offset.
      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
      NewInstr.addImm(0);
      NewInstr.addImm(Size);
    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
      // The VALU version adds the second operand to the result, so insert an
      // extra 0 operand.
      NewInstr.addImm(0);
    } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
      const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
      // If we need to move this to VGPRs, we need to unpack the second
      // operand back into the 2 separate ones for bit offset and width.
      assert(OffsetWidthOp.isImm() &&
             "Scalar BFE is only implemented for constant width and offset");
      uint32_t Imm = OffsetWidthOp.getImm();

      uint32_t Offset = Imm & 0x3f;               // Extract bits [5:0].
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
      NewInstr.addImm(Offset);
      NewInstr.addImm(BitWidth);
    } else {
      // Generic path: add each remaining source plus any modifier/clamp/omod/
      // op_sel operands the VOP3 encoding expects.
      if (AMDGPU::getNamedOperandIdx(NewOpcode,
                                     AMDGPU::OpName::src1_modifiers) >= 0)
        NewInstr.addImm(0);
      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
        NewInstr->addOperand(Inst.getOperand(2));
      if (AMDGPU::getNamedOperandIdx(NewOpcode,
                                     AMDGPU::OpName::src2_modifiers) >= 0)
        NewInstr.addImm(0);
      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
        NewInstr->addOperand(Inst.getOperand(3));
      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
        NewInstr.addImm(0);
      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
        NewInstr.addImm(0);
      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
        NewInstr.addImm(0);
    }
  } else {
    // Just copy the SALU operands.
    for (const MachineOperand &Op : Inst.explicit_operands())
      NewInstr->addOperand(Op);
  }

  // Remove any references to SCC. Vector instructions can't read from it, and
  // We're just about to add the implicit use / defs of VCC, and we don't want
  // both.
  for (MachineOperand &Op : Inst.implicit_operands()) {
    if (Op.getReg() == AMDGPU::SCC) {
      // Only propagate through live-def of SCC.
      if (Op.isDef() && !Op.isDead())
        addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
      if (Op.isUse())
        addSCCDefsToVALUWorklist(NewInstr, Worklist);
    }
  }
  Inst.eraseFromParent();
  Register NewDstReg;
  if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
    Register DstReg = NewInstr->getOperand(0).getReg();
    assert(DstReg.isVirtual());
    // Update the destination register class.
    const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
    assert(NewDstRC);
    NewDstReg = MRI.createVirtualRegister(NewDstRC);
    MRI.replaceRegWith(DstReg, NewDstReg);
  }
  fixImplicitOperands(*NewInstr);

  legalizeOperandsVALUt16(*NewInstr, MRI);

  // Legalize the operands
  legalizeOperands(*NewInstr, MDT);
  if (NewDstReg)
    addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
}
8520
// Add/sub require special handling to deal with carry outs.
std::pair<bool, MachineBasicBlock *>
SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
                              MachineDominatorTree *MDT) const {
  // Rewrites S_ADD_I32/S_SUB_I32 in place to the no-carry VALU form when the
  // subtarget supports it. Returns {true, possibly-new block} on success, or
  // {false, nullptr} if the caller must expand the carry-out form itself.
  if (ST.hasAddNoCarryInsts()) {
    // Assume there is no user of scc since we don't select this in that case.
    // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
    // is used.

    MachineBasicBlock &MBB = *Inst.getParent();
    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

    // The VALU result must live in a VGPR; rewrite all uses of the old
    // (SGPR) destination to the new register below.
    Register OldDstReg = Inst.getOperand(0).getReg();
    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    unsigned Opc = Inst.getOpcode();
    assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);

    unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
      AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;

    // Drop the SCC operand; the no-carry VALU form does not write SCC.
    assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
    Inst.removeOperand(3);

    Inst.setDesc(get(NewOpc));
    Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
    Inst.addImplicitDefUseOperands(*MBB.getParent());
    MRI.replaceRegWith(OldDstReg, ResultReg);
    // legalizeOperands may create a new basic block; propagate it to the
    // caller so iteration can continue from the right place.
    MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);

    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return std::pair(true, NewBB);
  }

  return std::pair(false, nullptr);
}
8557
8558void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8559 MachineDominatorTree *MDT) const {
8560
8561 MachineBasicBlock &MBB = *Inst.getParent();
8562 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8563 MachineBasicBlock::iterator MII = Inst;
8564 const DebugLoc &DL = Inst.getDebugLoc();
8565
8566 MachineOperand &Dest = Inst.getOperand(0);
8567 MachineOperand &Src0 = Inst.getOperand(1);
8568 MachineOperand &Src1 = Inst.getOperand(2);
8569 MachineOperand &Cond = Inst.getOperand(3);
8570
8571 Register CondReg = Cond.getReg();
8572 bool IsSCC = (CondReg == AMDGPU::SCC);
8573
8574 // If this is a trivial select where the condition is effectively not SCC
8575 // (CondReg is a source of copy to SCC), then the select is semantically
8576 // equivalent to copying CondReg. Hence, there is no need to create
8577 // V_CNDMASK, we can just use that and bail out.
8578 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8579 (Src1.getImm() == 0)) {
8580 MRI.replaceRegWith(Dest.getReg(), CondReg);
8581 return;
8582 }
8583
8584 Register NewCondReg = CondReg;
8585 if (IsSCC) {
8586 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8587 NewCondReg = MRI.createVirtualRegister(TC);
8588
8589 // Now look for the closest SCC def if it is a copy
8590 // replacing the CondReg with the COPY source register
8591 bool CopyFound = false;
8592 for (MachineInstr &CandI :
8594 Inst.getParent()->rend())) {
8595 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8596 -1) {
8597 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8598 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8599 .addReg(CandI.getOperand(1).getReg());
8600 CopyFound = true;
8601 }
8602 break;
8603 }
8604 }
8605 if (!CopyFound) {
8606 // SCC def is not a copy
8607 // Insert a trivial select instead of creating a copy, because a copy from
8608 // SCC would semantically mean just copying a single bit, but we may need
8609 // the result to be a vector condition mask that needs preserving.
8610 unsigned Opcode =
8611 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8612 auto NewSelect =
8613 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8614 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8615 }
8616 }
8617
8618 Register NewDestReg = MRI.createVirtualRegister(
8619 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8620 MachineInstr *NewInst;
8621 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8622 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8623 .addImm(0)
8624 .add(Src1) // False
8625 .addImm(0)
8626 .add(Src0) // True
8627 .addReg(NewCondReg);
8628 } else {
8629 NewInst =
8630 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8631 .add(Src1) // False
8632 .add(Src0) // True
8633 .addReg(NewCondReg);
8634 }
8635 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8636 legalizeOperands(*NewInst, MDT);
8637 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8638}
8639
8640void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8641 MachineInstr &Inst) const {
8642 MachineBasicBlock &MBB = *Inst.getParent();
8643 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8644 MachineBasicBlock::iterator MII = Inst;
8645 const DebugLoc &DL = Inst.getDebugLoc();
8646
8647 MachineOperand &Dest = Inst.getOperand(0);
8648 MachineOperand &Src = Inst.getOperand(1);
8649 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8650 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8651
8652 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8653 : AMDGPU::V_SUB_CO_U32_e32;
8654
8655 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8656 .addImm(0)
8657 .addReg(Src.getReg());
8658
8659 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8660 .addReg(Src.getReg())
8661 .addReg(TmpReg);
8662
8663 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8664 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8665}
8666
8667void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8668 MachineInstr &Inst) const {
8669 MachineBasicBlock &MBB = *Inst.getParent();
8670 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8671 MachineBasicBlock::iterator MII = Inst;
8672 const DebugLoc &DL = Inst.getDebugLoc();
8673
8674 MachineOperand &Dest = Inst.getOperand(0);
8675 MachineOperand &Src1 = Inst.getOperand(1);
8676 MachineOperand &Src2 = Inst.getOperand(2);
8677 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8678 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8679 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8680
8681 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8682 : AMDGPU::V_SUB_CO_U32_e32;
8683
8684 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8685 .addReg(Src1.getReg())
8686 .addReg(Src2.getReg());
8687
8688 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8689
8690 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8691 .addReg(SubResultReg)
8692 .addReg(TmpReg);
8693
8694 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8695 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8696}
8697
void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
                                  MachineInstr &Inst) const {
  // Lower a 32-bit scalar XNOR. With DL instructions there is a native
  // V_XNOR_B32; otherwise split into NOT + XOR, keeping as much work as
  // possible on the scalar unit.
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  if (ST.hasDLInsts()) {
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    // Both sources must be VGPR-legal before emitting the VALU xnor.
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
        .add(Src0)
        .add(Src1);

    MRI.replaceRegWith(Dest.getReg(), NewDest);
    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  } else {
    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
    // invert either source and then perform the XOR. If either source is a
    // scalar register, then we can leave the inversion on the scalar unit to
    // achieve a better distribution of scalar and vector instructions.
    bool Src0IsSGPR = Src0.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
    bool Src1IsSGPR = Src1.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
    MachineInstr *Xor;
    Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    // Build a pair of scalar instructions and add them to the work list.
    // The next iteration over the work list will lower these to the vector
    // unit as necessary.
    if (Src0IsSGPR) {
      // Temp = ~Src0; NewDest = Temp ^ Src1
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
                .addReg(Temp)
                .add(Src1);
    } else if (Src1IsSGPR) {
      // Temp = ~Src1; NewDest = Src0 ^ Temp
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
                .add(Src0)
                .addReg(Temp);
    } else {
      // Neither source is known SGPR: xor first, then invert the result. The
      // NOT also goes on the worklist since it consumes the VALU-bound xor.
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
                .add(Src0)
                .add(Src1);
      MachineInstr *Not =
          BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
      Worklist.insert(Not);
    }

    MRI.replaceRegWith(Dest.getReg(), NewDest);

    Worklist.insert(Xor);

    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  }
}
8762
8763void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8764 MachineInstr &Inst,
8765 unsigned Opcode) const {
8766 MachineBasicBlock &MBB = *Inst.getParent();
8767 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8768 MachineBasicBlock::iterator MII = Inst;
8769 const DebugLoc &DL = Inst.getDebugLoc();
8770
8771 MachineOperand &Dest = Inst.getOperand(0);
8772 MachineOperand &Src0 = Inst.getOperand(1);
8773 MachineOperand &Src1 = Inst.getOperand(2);
8774
8775 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8776 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8777
8778 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8779 .add(Src0)
8780 .add(Src1);
8781
8782 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8783 .addReg(Interm);
8784
8785 Worklist.insert(&Op);
8786 Worklist.insert(&Not);
8787
8788 MRI.replaceRegWith(Dest.getReg(), NewDest);
8789 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8790}
8791
8792void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8793 MachineInstr &Inst,
8794 unsigned Opcode) const {
8795 MachineBasicBlock &MBB = *Inst.getParent();
8796 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8797 MachineBasicBlock::iterator MII = Inst;
8798 const DebugLoc &DL = Inst.getDebugLoc();
8799
8800 MachineOperand &Dest = Inst.getOperand(0);
8801 MachineOperand &Src0 = Inst.getOperand(1);
8802 MachineOperand &Src1 = Inst.getOperand(2);
8803
8804 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8805 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8806
8807 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8808 .add(Src1);
8809
8810 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8811 .add(Src0)
8812 .addReg(Interm);
8813
8814 Worklist.insert(&Not);
8815 Worklist.insert(&Op);
8816
8817 MRI.replaceRegWith(Dest.getReg(), NewDest);
8818 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8819}
8820
void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
                                          MachineInstr &Inst, unsigned Opcode,
                                          bool Swap) const {
  // Split a 64-bit scalar unary op into two 32-bit instructions of
  // \p Opcode, one per register half, recombined with a REG_SEQUENCE. If
  // \p Swap is set the low/high halves are exchanged in the result.
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // An immediate source has no register class; fall back to SGPR_32.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC =
      RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);

  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);

  if (Swap)
    std::swap(DestSub0, DestSub1);

  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // The two halves may themselves still need lowering; queue them.
  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
8879
// There is not a vector equivalent of s_mul_u64. For this reason, we need to
// split the s_mul_u64 in 32-bit vector multiplications.
void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
                                     MachineInstr &Inst,
                                     MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  // Subregister extraction must land in VGPR classes since the products are
  // computed on the VALU.
  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src0SubRC))
    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
  const TargetRegisterClass *Src1SubRC =
      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src1SubRC))
    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);

  // First, we extract the low 32-bit and high 32-bit values from each of the
  // operands.
  MachineOperand Op0L =
      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
  MachineOperand Op1L =
      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
  MachineOperand Op0H =
      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
  MachineOperand Op1H =
      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

  // The multiplication is done as follows:
  //
  //                            Op1H  Op1L
  //                          * Op0H  Op0L
  //                       --------------------
  //                       Op1H*Op0L  Op1L*Op0L
  //          + Op1H*Op0H  Op1L*Op0H
  // -----------------------------------------
  // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
  //
  // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
  // value and that would overflow.
  // The low 32-bit value is Op1L*Op0L.
  // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).

  Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Op1L_Op0H =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
          .add(Op1L)
          .add(Op0H);

  Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Op1H_Op0L =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
          .add(Op1H)
          .add(Op0L);

  // Carry out of the low product: the upper 32 bits of Op1L*Op0L.
  Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Carry =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
          .add(Op1L)
          .add(Op0L);

  MachineInstr *LoHalf =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
          .add(Op1L)
          .add(Op0L);

  Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
                          .addReg(Op1L_Op0H_Reg)
                          .addReg(Op1H_Op0L_Reg);

  MachineInstr *HiHalf =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
          .addReg(AddReg)
          .addReg(CarryReg);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*Op1L_Op0H, MDT);
  legalizeOperands(*Op1H_Op0L, MDT);
  legalizeOperands(*Carry, MDT);
  legalizeOperands(*LoHalf, MDT);
  legalizeOperands(*Add, MDT);
  legalizeOperands(*HiHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
8988
// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
// multiplications.
void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
                                        MachineInstr &Inst,
                                        MachineDominatorTree *MDT) const {
  // These pseudos multiply two values whose upper 32 bits are known to be
  // the zero/sign extension of the low halves, so only the low halves are
  // multiplied: low = mul_lo(lo0, lo1), high = mul_hi[_u/_i](lo0, lo1).
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  // Extract low halves into VGPR-compatible classes for the VALU multiply.
  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src0SubRC))
    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
  const TargetRegisterClass *Src1SubRC =
      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src1SubRC))
    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);

  // First, we extract the low 32-bit and high 32-bit values from each of the
  // operands.
  MachineOperand Op0L =
      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
  MachineOperand Op1L =
      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);

  // Unsigned pseudo uses the unsigned high-multiply, signed uses the signed
  // one.
  unsigned Opc = Inst.getOpcode();
  unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
                        ? AMDGPU::V_MUL_HI_U32_e64
                        : AMDGPU::V_MUL_HI_I32_e64;
  MachineInstr *HiHalf =
      BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);

  MachineInstr *LoHalf =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
          .add(Op1L)
          .add(Op0L);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*HiHalf, MDT);
  legalizeOperands(*LoHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
9053
void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
                                           MachineInstr &Inst, unsigned Opcode,
                                           MachineDominatorTree *MDT) const {
  // Split a 64-bit scalar binary op into two 32-bit instructions of
  // \p Opcode (one per register half) and recombine with a REG_SEQUENCE.
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // Immediate sources have no register class; fall back to SGPR_32.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC =
      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);
  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC =
      RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);

  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
                              .add(SrcReg0Sub0)
                              .add(SrcReg1Sub0);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
                              .add(SrcReg0Sub1)
                              .add(SrcReg1Sub1);

  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // The half instructions may still need lowering themselves; queue them.
  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
9120
9121void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9122 MachineInstr &Inst,
9123 MachineDominatorTree *MDT) const {
9124 MachineBasicBlock &MBB = *Inst.getParent();
9125 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9126
9127 MachineOperand &Dest = Inst.getOperand(0);
9128 MachineOperand &Src0 = Inst.getOperand(1);
9129 MachineOperand &Src1 = Inst.getOperand(2);
9130 const DebugLoc &DL = Inst.getDebugLoc();
9131
9132 MachineBasicBlock::iterator MII = Inst;
9133
9134 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9135
9136 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9137
9138 MachineOperand* Op0;
9139 MachineOperand* Op1;
9140
9141 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9142 Op0 = &Src0;
9143 Op1 = &Src1;
9144 } else {
9145 Op0 = &Src1;
9146 Op1 = &Src0;
9147 }
9148
9149 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9150 .add(*Op0);
9151
9152 Register NewDest = MRI.createVirtualRegister(DestRC);
9153
9154 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9155 .addReg(Interm)
9156 .add(*Op1);
9157
9158 MRI.replaceRegWith(Dest.getReg(), NewDest);
9159
9160 Worklist.insert(&Xor);
9161}
9162
void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
                                       MachineInstr &Inst) const {
  // Split a 64-bit scalar popcount into two chained V_BCNT_U32_B32: the
  // second bcnt takes the first's result as its add-accumulator operand, so
  // ResultReg = popcount(hi) + popcount(lo).
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  // An immediate source has no register class; fall back to SGPR_32.
  const TargetRegisterClass *SrcRC = Src.isReg() ?
    MRI.getRegClass(Src.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC =
      RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  // MidReg = bcnt(lo) + 0
  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);

  // ResultReg = bcnt(hi) + MidReg
  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  // We don't need to legalize operands here. src0 for either instruction can be
  // an SGPR, and the second input is unused or determined here.
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
9200
void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
                                      MachineInstr &Inst) const {
  // Split a 64-bit S_BFE_I64 sign-extend-in-register into 32-bit VALU ops.
  // Only the sext_inreg pattern (offset 0, width <= 32) is supported.
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  // S_BFE packs its control operand as offset | (width << 16).
  uint32_t Imm = Inst.getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset;

  // Only sext_inreg cases handled.
  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
         Offset == 0 && "Not implemented");

  if (BitWidth < 32) {
    // Sign-extend within the low half, then broadcast the sign bit into the
    // high half.
    Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    // MidRegLo = bfe_i32(src.sub0, 0, BitWidth)
    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
        .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
        .addImm(0)
        .addImm(BitWidth);

    // MidRegHi = MidRegLo >> 31 (arithmetic): all sign bits.
    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
        .addImm(31)
        .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
        .addReg(MidRegLo)
        .addImm(AMDGPU::sub0)
        .addReg(MidRegHi)
        .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // BitWidth == 32: the low half is unchanged; only the high half needs the
  // replicated sign bit.
  MachineOperand &Src = Inst.getOperand(1);
  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
      .addImm(31)
      .addReg(Src.getReg(), {}, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(Src.getReg(), {}, AMDGPU::sub0)
      .addImm(AMDGPU::sub0)
      .addReg(TmpReg)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
9261
void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
                                          MachineInstr &Inst, unsigned Opcode,
                                          MachineDominatorTree *MDT) const {
  // (S_FLBIT_I32_B64 hi:lo) ->
  // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
  // (S_FF1_I32_B64 hi:lo) ->
  // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))

  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);

  const MCInstrDesc &InstDesc = get(Opcode);

  // Count-leading (ctlz) biases the low half by 32; count-trailing (cttz)
  // biases the high half by 32.
  bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
  unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
                                               : AMDGPU::V_ADD_CO_U32_e32;

  // An immediate source has no register class; fall back to SGPR_32.
  const TargetRegisterClass *SrcRC =
      Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
  const TargetRegisterClass *SrcSubRC =
      RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 =
      buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 =
      buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);

  Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // Per-half bit scans.
  BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);

  BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);

  // Saturating add of 32 to the half that is "further away" for this scan
  // direction; clamp keeps the all-zeros (-1) result from wrapping.
  BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
      .addReg(IsCtlz ? MidReg1 : MidReg2)
      .addImm(32)
      .addImm(1); // enable clamp

  BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
      .addReg(MidReg3)
      .addReg(IsCtlz ? MidReg2 : MidReg1);

  MRI.replaceRegWith(Dest.getReg(), MidReg4);

  addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
}
9316
9317void SIInstrInfo::addUsersToMoveToVALUWorklist(
9318 Register DstReg, MachineRegisterInfo &MRI,
9319 SIInstrWorklist &Worklist) const {
9320 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9321 MachineInstr &UseMI = *MO.getParent();
9322
9323 unsigned OpNo = 0;
9324
9325 switch (UseMI.getOpcode()) {
9326 case AMDGPU::COPY:
9327 case AMDGPU::WQM:
9328 case AMDGPU::SOFT_WQM:
9329 case AMDGPU::STRICT_WWM:
9330 case AMDGPU::STRICT_WQM:
9331 case AMDGPU::REG_SEQUENCE:
9332 case AMDGPU::PHI:
9333 case AMDGPU::INSERT_SUBREG:
9334 break;
9335 default:
9336 OpNo = MO.getOperandNo();
9337 break;
9338 }
9339
9340 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9341 MRI.constrainRegClass(DstReg, OpRC);
9342
9343 if (!RI.hasVectorRegisters(OpRC))
9344 Worklist.insert(&UseMI);
9345 else
9346 // Legalization could change user list.
9347 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9348 }
9349}
9350
9351void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9353 MachineInstr &Inst) const {
9354 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9355 MachineBasicBlock *MBB = Inst.getParent();
9356 MachineOperand &Src0 = Inst.getOperand(1);
9357 MachineOperand &Src1 = Inst.getOperand(2);
9358 const DebugLoc &DL = Inst.getDebugLoc();
9359
9360 if (ST.useRealTrue16Insts()) {
9361 Register SrcReg0, SrcReg1;
9362 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9363 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9364 BuildMI(*MBB, Inst, DL,
9365 get(Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg0)
9366 .add(Src0);
9367 } else {
9368 SrcReg0 = Src0.getReg();
9369 }
9370
9371 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9372 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9373 BuildMI(*MBB, Inst, DL,
9374 get(Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg1)
9375 .add(Src1);
9376 } else {
9377 SrcReg1 = Src1.getReg();
9378 }
9379
9380 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9381 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9382
9383 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9384 switch (Inst.getOpcode()) {
9385 case AMDGPU::S_PACK_LL_B32_B16:
9386 NewMI
9387 .addReg(SrcReg0, {},
9388 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9389 .addImm(AMDGPU::lo16)
9390 .addReg(SrcReg1, {},
9391 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9392 .addImm(AMDGPU::hi16);
9393 break;
9394 case AMDGPU::S_PACK_LH_B32_B16:
9395 NewMI
9396 .addReg(SrcReg0, {},
9397 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9398 .addImm(AMDGPU::lo16)
9399 .addReg(SrcReg1, {}, AMDGPU::hi16)
9400 .addImm(AMDGPU::hi16);
9401 break;
9402 case AMDGPU::S_PACK_HL_B32_B16:
9403 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9404 .addImm(AMDGPU::lo16)
9405 .addReg(SrcReg1, {},
9406 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9407 .addImm(AMDGPU::hi16);
9408 break;
9409 case AMDGPU::S_PACK_HH_B32_B16:
9410 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9411 .addImm(AMDGPU::lo16)
9412 .addReg(SrcReg1, {}, AMDGPU::hi16)
9413 .addImm(AMDGPU::hi16);
9414 break;
9415 default:
9416 llvm_unreachable("unhandled s_pack_* instruction");
9417 }
9418
9419 MachineOperand &Dest = Inst.getOperand(0);
9420 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9421 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9422 return;
9423 }
9424
9425 switch (Inst.getOpcode()) {
9426 case AMDGPU::S_PACK_LL_B32_B16: {
9427 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9428 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9429
9430 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9431 // 0.
9432 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9433 .addImm(0xffff);
9434
9435 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9436 .addReg(ImmReg, RegState::Kill)
9437 .add(Src0);
9438
9439 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9440 .add(Src1)
9441 .addImm(16)
9442 .addReg(TmpReg, RegState::Kill);
9443 break;
9444 }
9445 case AMDGPU::S_PACK_LH_B32_B16: {
9446 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9447 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9448 .addImm(0xffff);
9449 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9450 .addReg(ImmReg, RegState::Kill)
9451 .add(Src0)
9452 .add(Src1);
9453 break;
9454 }
9455 case AMDGPU::S_PACK_HL_B32_B16: {
9456 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9457 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9458 .addImm(16)
9459 .add(Src0);
9460 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9461 .add(Src1)
9462 .addImm(16)
9463 .addReg(TmpReg, RegState::Kill);
9464 break;
9465 }
9466 case AMDGPU::S_PACK_HH_B32_B16: {
9467 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9468 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9469 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9470 .addImm(16)
9471 .add(Src0);
9472 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9473 .addImm(0xffff0000);
9474 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9475 .add(Src1)
9476 .addReg(ImmReg, RegState::Kill)
9477 .addReg(TmpReg, RegState::Kill);
9478 break;
9479 }
9480 default:
9481 llvm_unreachable("unhandled s_pack_* instruction");
9482 }
9483
9484 MachineOperand &Dest = Inst.getOperand(0);
9485 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9486 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9487}
9488
/// After the instruction defining SCC is moved to the VALU, rewrite all SCC
/// users in the same block to read \p NewCond instead, queueing non-copy users
/// so they are moved to the VALU as well.
void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
                                               MachineInstr &SCCDefInst,
                                               SIInstrWorklist &Worklist,
                                               Register NewCond) const {

  // Ensure that def inst defines SCC, which is still live.
  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
         !Op.isDead() && Op.getParent() == &SCCDefInst);
  // Copies are erased only after the walk; erasing while iterating would
  // invalidate the instruction range below.
  SmallVector<MachineInstr *, 4> CopyToDelete;
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineInstr &MI : // Skip the def inst itself.
       make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
                  SCCDefInst.getParent()->end())) {
    // Check if SCC is used first.
    int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
    if (SCCIdx != -1) {
      if (MI.isCopy()) {
        // A plain copy out of SCC: forward NewCond to the copy's users and
        // mark the copy for deletion.
        MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
        Register DestReg = MI.getOperand(0).getReg();

        MRI.replaceRegWith(DestReg, NewCond);
        CopyToDelete.push_back(&MI);
      } else {

        // Redirect the SCC use to the new condition register (if one was
        // provided) and queue the user for VALU conversion.
        if (NewCond.isValid())
          MI.getOperand(SCCIdx).setReg(NewCond);

        Worklist.insert(&MI);
      }
    }
    // Exit if we find another SCC def.
    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
      break;
  }
  for (auto &Copy : CopyToDelete)
    Copy->eraseFromParent();
}
9527
9528// Instructions that use SCC may be converted to VALU instructions. When that
9529// happens, the SCC register is changed to VCC_LO. The instruction that defines
9530// SCC must be changed to an instruction that defines VCC. This function makes
9531// sure that the instruction that defines SCC is added to the moveToVALU
9532// worklist.
9533void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9534 SIInstrWorklist &Worklist) const {
9535 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9536 // then there is nothing to do because the defining instruction has been
9537 // converted to a VALU already. If SCC then that instruction needs to be
9538 // converted to a VALU.
9539 for (MachineInstr &MI :
9540 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9541 SCCUseInst->getParent()->rend())) {
9542 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9543 break;
9544 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9545 Worklist.insert(&MI);
9546 break;
9547 }
9548 }
9549}
9550
9551const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9552 const MachineInstr &Inst) const {
9553 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9554
9555 switch (Inst.getOpcode()) {
9556 // For target instructions, getOpRegClass just returns the virtual register
9557 // class associated with the operand, so we need to find an equivalent VGPR
9558 // register class in order to move the instruction to the VALU.
9559 case AMDGPU::COPY:
9560 case AMDGPU::PHI:
9561 case AMDGPU::REG_SEQUENCE:
9562 case AMDGPU::INSERT_SUBREG:
9563 case AMDGPU::WQM:
9564 case AMDGPU::SOFT_WQM:
9565 case AMDGPU::STRICT_WWM:
9566 case AMDGPU::STRICT_WQM: {
9567 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9568 if (RI.isAGPRClass(SrcRC)) {
9569 if (RI.isAGPRClass(NewDstRC))
9570 return nullptr;
9571
9572 switch (Inst.getOpcode()) {
9573 case AMDGPU::PHI:
9574 case AMDGPU::REG_SEQUENCE:
9575 case AMDGPU::INSERT_SUBREG:
9576 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9577 break;
9578 default:
9579 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9580 }
9581
9582 if (!NewDstRC)
9583 return nullptr;
9584 } else {
9585 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9586 return nullptr;
9587
9588 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9589 if (!NewDstRC)
9590 return nullptr;
9591 }
9592
9593 return NewDstRC;
9594 }
9595 default:
9596 return NewDstRC;
9597 }
9598}
9599
9600// Find the one SGPR operand we are allowed to use.
9601Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9602 int OpIndices[3]) const {
9603 const MCInstrDesc &Desc = MI.getDesc();
9604
9605 // Find the one SGPR operand we are allowed to use.
9606 //
9607 // First we need to consider the instruction's operand requirements before
9608 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9609 // of VCC, but we are still bound by the constant bus requirement to only use
9610 // one.
9611 //
9612 // If the operand's class is an SGPR, we can never move it.
9613
9614 Register SGPRReg = findImplicitSGPRRead(MI);
9615 if (SGPRReg)
9616 return SGPRReg;
9617
9618 Register UsedSGPRs[3] = {Register()};
9619 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9620
9621 for (unsigned i = 0; i < 3; ++i) {
9622 int Idx = OpIndices[i];
9623 if (Idx == -1)
9624 break;
9625
9626 const MachineOperand &MO = MI.getOperand(Idx);
9627 if (!MO.isReg())
9628 continue;
9629
9630 // Is this operand statically required to be an SGPR based on the operand
9631 // constraints?
9632 const TargetRegisterClass *OpRC =
9633 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9634 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9635 if (IsRequiredSGPR)
9636 return MO.getReg();
9637
9638 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9639 Register Reg = MO.getReg();
9640 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9641 if (RI.isSGPRClass(RegRC))
9642 UsedSGPRs[i] = Reg;
9643 }
9644
9645 // We don't have a required SGPR operand, so we have a bit more freedom in
9646 // selecting operands to move.
9647
9648 // Try to select the most used SGPR. If an SGPR is equal to one of the
9649 // others, we choose that.
9650 //
9651 // e.g.
9652 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9653 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9654
9655 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9656 // prefer those.
9657
9658 if (UsedSGPRs[0]) {
9659 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9660 SGPRReg = UsedSGPRs[0];
9661 }
9662
9663 if (!SGPRReg && UsedSGPRs[1]) {
9664 if (UsedSGPRs[1] == UsedSGPRs[2])
9665 SGPRReg = UsedSGPRs[1];
9666 }
9667
9668 return SGPRReg;
9669}
9670
9672 AMDGPU::OpName OperandName) const {
9673 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9674 return nullptr;
9675
9676 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9677 if (Idx == -1)
9678 return nullptr;
9679
9680 return &MI.getOperand(Idx);
9681}
9682
9684 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9685 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9688 return (Format << 44) |
9689 (1ULL << 56) | // RESOURCE_LEVEL = 1
9690 (3ULL << 60); // OOB_SELECT = 3
9691 }
9692
9693 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9694 if (ST.isAmdHsaOS()) {
9695 // Set ATC = 1. GFX9 doesn't have this bit.
9696 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9697 RsrcDataFormat |= (1ULL << 56);
9698
9699 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9700 // BTW, it disables TC L2 and therefore decreases performance.
9701 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9702 RsrcDataFormat |= (2ULL << 59);
9703 }
9704
9705 return RsrcDataFormat;
9706}
9707
9711 0xffffffff; // Size;
9712
9713 // GFX9 doesn't have ELEMENT_SIZE.
9714 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9715 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9716 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9717 }
9718
9719 // IndexStride = 64 / 32.
9720 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9721 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9722
9723 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9724 // Clear them unless we want a huge stride.
9725 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9726 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9727 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9728
9729 return Rsrc23;
9730}
9731
9733 unsigned Opc = MI.getOpcode();
9734
9735 return isSMRD(Opc);
9736}
9737
9739 return get(Opc).mayLoad() &&
9740 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9741}
9742
9744 TypeSize &MemBytes) const {
9745 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9746 if (!Addr || !Addr->isFI())
9747 return Register();
9748
9749 assert(!MI.memoperands_empty() &&
9750 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9751
9752 FrameIndex = Addr->getIndex();
9753
9754 int VDataIdx =
9755 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
9756 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), VDataIdx));
9757 return MI.getOperand(VDataIdx).getReg();
9758}
9759
9761 TypeSize &MemBytes) const {
9762 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9763 assert(Addr && Addr->isFI());
9764 FrameIndex = Addr->getIndex();
9765
9766 int DataIdx =
9767 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::data);
9768 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), DataIdx));
9769 return MI.getOperand(DataIdx).getReg();
9770}
9771
9773 int &FrameIndex,
9774 TypeSize &MemBytes) const {
9775 if (!MI.mayLoad())
9776 return Register();
9777
9778 if (isMUBUF(MI) || isVGPRSpill(MI))
9779 return isStackAccess(MI, FrameIndex, MemBytes);
9780
9781 if (isSGPRSpill(MI))
9782 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9783
9784 return Register();
9785}
9786
9788 int &FrameIndex,
9789 TypeSize &MemBytes) const {
9790 if (!MI.mayStore())
9791 return Register();
9792
9793 if (isMUBUF(MI) || isVGPRSpill(MI))
9794 return isStackAccess(MI, FrameIndex, MemBytes);
9795
9796 if (isSGPRSpill(MI))
9797 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9798
9799 return Register();
9800}
9801
9803 unsigned Size = 0;
9805 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9806 while (++I != E && I->isInsideBundle()) {
9807 assert(!I->isBundle() && "No nested bundle!");
9809 }
9810
9811 return Size;
9812}
9813
9815 unsigned Opc = MI.getOpcode();
9817 unsigned DescSize = Desc.getSize();
9818
9819 // If we have a definitive size, we can use it. Otherwise we need to inspect
9820 // the operands to know the size.
9821 if (isFixedSize(MI)) {
9822 unsigned Size = DescSize;
9823
9824 // If we hit the buggy offset, an extra nop will be inserted in MC so
9825 // estimate the worst case.
9826 if (MI.isBranch() && ST.hasOffset3fBug())
9827 Size += 4;
9828
9829 return Size;
9830 }
9831
9832 // Instructions may have a 32-bit literal encoded after them. Check
9833 // operands that could ever be literals.
9834 if (isVALU(MI) || isSALU(MI)) {
9835 if (isDPP(MI))
9836 return DescSize;
9837 bool HasLiteral = false;
9838 unsigned LiteralSize = 4;
9839 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9840 const MachineOperand &Op = MI.getOperand(I);
9841 const MCOperandInfo &OpInfo = Desc.operands()[I];
9842 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9843 HasLiteral = true;
9844 if (ST.has64BitLiterals()) {
9845 switch (OpInfo.OperandType) {
9846 default:
9847 break;
9849 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9850 LiteralSize = 8;
9851 break;
9853 // A 32-bit literal is only valid when the value fits in BOTH signed
9854 // and unsigned 32-bit ranges [0, 2^31-1], matching the MC code
9855 // emitter's getLit64Encoding logic. This is because of the lack of
9856 // abilility to tell signedness of the literal, therefore we need to
9857 // be conservative and assume values outside this range require a
9858 // 64-bit literal encoding (8 bytes).
9859 if (!Op.isImm() || !isInt<32>(Op.getImm()) ||
9860 !isUInt<32>(Op.getImm()))
9861 LiteralSize = 8;
9862 break;
9863 }
9864 }
9865 break;
9866 }
9867 }
9868 return HasLiteral ? DescSize + LiteralSize : DescSize;
9869 }
9870
9871 // Check whether we have extra NSA words.
9872 if (isMIMG(MI)) {
9873 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9874 if (VAddr0Idx < 0)
9875 return 8;
9876
9877 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9878 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9879 }
9880
9881 switch (Opc) {
9882 case TargetOpcode::BUNDLE:
9883 return getInstBundleSize(MI);
9884 case TargetOpcode::INLINEASM:
9885 case TargetOpcode::INLINEASM_BR: {
9886 const MachineFunction *MF = MI.getMF();
9887 const char *AsmStr = MI.getOperand(0).getSymbolName();
9888 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9889 }
9890 default:
9891 if (MI.isMetaInstruction())
9892 return 0;
9893
9894 // If D16 Pseudo inst, get correct MC code size
9895 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9896 if (D16Info) {
9897 // Assume d16_lo/hi inst are always in same size
9898 unsigned LoInstOpcode = D16Info->LoOp;
9899 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9900 DescSize = Desc.getSize();
9901 }
9902
9903 // If FMA Pseudo inst, get correct MC code size
9904 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9905 // All potential lowerings are the same size; arbitrarily pick one.
9906 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9907 DescSize = Desc.getSize();
9908 }
9909
9910 return DescSize;
9911 }
9912}
9913
9915 if (!isFLAT(MI))
9916 return false;
9917
9918 if (MI.memoperands_empty())
9919 return true;
9920
9921 for (const MachineMemOperand *MMO : MI.memoperands()) {
9922 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9923 return true;
9924 }
9925 return false;
9926}
9927
9930 static const std::pair<int, const char *> TargetIndices[] = {
9931 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9932 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9933 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9934 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9935 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9936 return ArrayRef(TargetIndices);
9937}
9938
9939/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9940/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9946
9947/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9948/// pass.
9954
9955// Called during:
9956// - pre-RA scheduling and post-RA scheduling
9959 const ScheduleDAGMI *DAG) const {
9960 // Borrowed from Arm Target
9961 // We would like to restrict this hazard recognizer to only
9962 // post-RA scheduling; we can tell that we're post-RA because we don't
9963 // track VRegLiveness.
9964 if (!DAG->hasVRegLiveness())
9965 return new GCNHazardRecognizer(DAG->MF);
9967}
9968
9969std::pair<unsigned, unsigned>
9971 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9972}
9973
9976 static const std::pair<unsigned, const char *> TargetFlags[] = {
9977 {MO_GOTPCREL, "amdgpu-gotprel"},
9978 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9979 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9980 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9981 {MO_REL32_LO, "amdgpu-rel32-lo"},
9982 {MO_REL32_HI, "amdgpu-rel32-hi"},
9983 {MO_REL64, "amdgpu-rel64"},
9984 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9985 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9986 {MO_ABS64, "amdgpu-abs64"},
9987 };
9988
9989 return ArrayRef(TargetFlags);
9990}
9991
9994 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9995 {
9996 {MONoClobber, "amdgpu-noclobber"},
9997 {MOLastUse, "amdgpu-last-use"},
9998 {MOCooperative, "amdgpu-cooperative"},
9999 {MOThreadPrivate, "amdgpu-thread-private"},
10000 };
10001
10002 return ArrayRef(TargetFlags);
10003}
10004
10006 const MachineFunction &MF) const {
10008 assert(SrcReg.isVirtual());
10009 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
10010 return AMDGPU::WWM_COPY;
10011
10012 return AMDGPU::COPY;
10013}
10014
10016 uint32_t Opcode = MI.getOpcode();
10017 // Check if it is SGPR spill or wwm-register spill Opcode.
10018 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
10019 return true;
10020
10021 const MachineFunction *MF = MI.getMF();
10022 const MachineRegisterInfo &MRI = MF->getRegInfo();
10024
10025 // See if this is Liverange split instruction inserted for SGPR or
10026 // wwm-register. The implicit def inserted for wwm-registers should also be
10027 // included as they can appear at the bb begin.
10028 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
10029 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
10030 return false;
10031
10032 Register Reg = MI.getOperand(0).getReg();
10033 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
10034 return IsLRSplitInst;
10035
10036 return MFI->isWWMReg(Reg);
10037}
10038
10040 Register Reg) const {
10041 // We need to handle instructions which may be inserted during register
10042 // allocation to handle the prolog. The initial prolog instruction may have
10043 // been separated from the start of the block by spills and copies inserted
10044 // needed by the prolog. However, the insertions for scalar registers can
10045 // always be placed at the BB top as they are independent of the exec mask
10046 // value.
10047 bool IsNullOrVectorRegister = true;
10048 if (Reg) {
10049 const MachineFunction *MF = MI.getMF();
10050 const MachineRegisterInfo &MRI = MF->getRegInfo();
10051 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
10052 }
10053
10054 return IsNullOrVectorRegister &&
10055 (canAddToBBProlog(MI) ||
10056 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10057 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
10058}
10059
10063 const DebugLoc &DL,
10064 Register DestReg) const {
10065 if (ST.hasAddNoCarryInsts())
10066 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
10067
10068 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10069 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
10070 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
10071
10072 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10073 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10074}
10075
10078 const DebugLoc &DL,
10079 Register DestReg,
10080 RegScavenger &RS) const {
10081 if (ST.hasAddNoCarryInsts())
10082 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
10083
10084 // If available, prefer to use vcc.
10085 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
10086 ? Register(RI.getVCC())
10087 : RS.scavengeRegisterBackwards(
10088 *RI.getBoolRC(), I, /* RestoreAfter */ false,
10089 0, /* AllowSpill */ false);
10090
10091 // TODO: Users need to deal with this.
10092 if (!UnusedCarry.isValid())
10093 return MachineInstrBuilder();
10094
10095 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10096 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10097}
10098
10099bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10100 switch (Opcode) {
10101 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10102 case AMDGPU::SI_KILL_I1_TERMINATOR:
10103 return true;
10104 default:
10105 return false;
10106 }
10107}
10108
10110 switch (Opcode) {
10111 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10112 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10113 case AMDGPU::SI_KILL_I1_PSEUDO:
10114 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10115 default:
10116 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10117 }
10118}
10119
10120bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10121 return Imm <= getMaxMUBUFImmOffset(ST);
10122}
10123
10125 // GFX12 field is non-negative 24-bit signed byte offset.
10126 const unsigned OffsetBits =
10127 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10128 return (1 << OffsetBits) - 1;
10129}
10130
10132 if (!ST.isWave32())
10133 return;
10134
10135 if (MI.isInlineAsm())
10136 return;
10137
10138 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10139 return;
10140
10141 for (auto &Op : MI.implicit_operands()) {
10142 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10143 Op.setReg(AMDGPU::VCC_LO);
10144 }
10145}
10146
10148 if (!isSMRD(MI))
10149 return false;
10150
10151 // Check that it is using a buffer resource.
10152 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10153 if (Idx == -1) // e.g. s_memtime
10154 return false;
10155
10156 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10157 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10158}
10159
10160// Given Imm, split it into the values to put into the SOffset and ImmOffset
10161// fields in an MUBUF instruction. Return false if it is not possible (due to a
10162// hardware bug needing a workaround).
10163//
10164// The required alignment ensures that individual address components remain
10165// aligned if they are aligned to begin with. It also ensures that additional
10166// offsets within the given alignment can be added to the resulting ImmOffset.
10168 uint32_t &ImmOffset, Align Alignment) const {
10169 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10170 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10171 uint32_t Overflow = 0;
10172
10173 if (Imm > MaxImm) {
10174 if (Imm <= MaxImm + 64) {
10175 // Use an SOffset inline constant for 4..64
10176 Overflow = Imm - MaxImm;
10177 Imm = MaxImm;
10178 } else {
10179 // Try to keep the same value in SOffset for adjacent loads, so that
10180 // the corresponding register contents can be re-used.
10181 //
10182 // Load values with all low-bits (except for alignment bits) set into
10183 // SOffset, so that a larger range of values can be covered using
10184 // s_movk_i32.
10185 //
10186 // Atomic operations fail to work correctly when individual address
10187 // components are unaligned, even if their sum is aligned.
10188 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10189 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10190 Imm = Low;
10191 Overflow = High - Alignment.value();
10192 }
10193 }
10194
10195 if (Overflow > 0) {
10196 // There is a hardware bug in SI and CI which prevents address clamping in
10197 // MUBUF instructions from working correctly with SOffsets. The immediate
10198 // offset is unaffected.
10199 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10200 return false;
10201
10202 // It is not possible to set immediate in SOffset field on some targets.
10203 if (ST.hasRestrictedSOffset())
10204 return false;
10205 }
10206
10207 ImmOffset = Imm;
10208 SOffset = Overflow;
10209 return true;
10210}
10211
10212// Depending on the used address space and instructions, some immediate offsets
10213// are allowed and some are not.
10214// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10215// scratch instruction offsets can also be negative. On GFX12, offsets can be
10216// negative for all variants.
10217//
10218// There are several bugs related to these offsets:
10219// On gfx10.1, flat instructions that go into the global address space cannot
10220// use an offset.
10221//
10222// For scratch instructions, the address can be either an SGPR or a VGPR.
10223// The following offsets can be used, depending on the architecture (x means
10224// cannot be used):
10225// +----------------------------+------+------+
10226// | Address-Mode | SGPR | VGPR |
10227// +----------------------------+------+------+
10228// | gfx9 | | |
10229// | negative, 4-aligned offset | x | ok |
10230// | negative, unaligned offset | x | ok |
10231// +----------------------------+------+------+
10232// | gfx10 | | |
10233// | negative, 4-aligned offset | ok | ok |
10234// | negative, unaligned offset | ok | x |
10235// +----------------------------+------+------+
10236// | gfx10.3 | | |
10237// | negative, 4-aligned offset | ok | ok |
10238// | negative, unaligned offset | ok | ok |
10239// +----------------------------+------+------+
10240//
10241// This function ignores the addressing mode, so if an offset cannot be used in
10242// one addressing mode, it is considered illegal.
10243bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10244 uint64_t FlatVariant) const {
10245 // TODO: Should 0 be special cased?
10246 if (!ST.hasFlatInstOffsets())
10247 return false;
10248
10249 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10250 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10251 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10252 return false;
10253
10254 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10255 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10256 (Offset % 4) != 0) {
10257 return false;
10258 }
10259
10260 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10261 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10262 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10263}
10264
10265// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10266std::pair<int64_t, int64_t>
10267SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10268 uint64_t FlatVariant) const {
10269 int64_t RemainderOffset = COffsetVal;
10270 int64_t ImmField = 0;
10271
10272 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10273 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10274
10275 if (AllowNegative) {
10276 // Use signed division by a power of two to truncate towards 0.
10277 int64_t D = 1LL << NumBits;
10278 RemainderOffset = (COffsetVal / D) * D;
10279 ImmField = COffsetVal - RemainderOffset;
10280
10281 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10282 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10283 (ImmField % 4) != 0) {
10284 // Make ImmField a multiple of 4
10285 RemainderOffset += ImmField % 4;
10286 ImmField -= ImmField % 4;
10287 }
10288 } else if (COffsetVal >= 0) {
10289 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10290 RemainderOffset = COffsetVal - ImmField;
10291 }
10292
10293 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10294 assert(RemainderOffset + ImmField == COffsetVal);
10295 return {ImmField, RemainderOffset};
10296}
10297
10299 if (ST.hasNegativeScratchOffsetBug() &&
10300 FlatVariant == SIInstrFlags::FlatScratch)
10301 return false;
10302
10303 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10304}
10305
10306static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10307 switch (ST.getGeneration()) {
10308 default:
10309 break;
10312 return SIEncodingFamily::SI;
10315 return SIEncodingFamily::VI;
10319 return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
10322 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10326 }
10327 llvm_unreachable("Unknown subtarget generation!");
10328}
10329
10330bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10331 switch(MCOp) {
10332 // These opcodes use indirect register addressing so
10333 // they need special handling by codegen (currently missing).
10334 // Therefore it is too risky to allow these opcodes
10335 // to be selected by dpp combiner or sdwa peepholer.
10336 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10337 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10338 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10339 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10340 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10341 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10342 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10343 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10344 return true;
10345 default:
10346 return false;
10347 }
10348}
10349
// Expands to case labels for every encoding variant (_dpp, _e32, _e64,
// _e64_dpp, _sdwa) of a VALU opcode.
#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
  case OPCODE##_dpp: \
  case OPCODE##_e32: \
  case OPCODE##_e64: \
  case OPCODE##_e64_dpp: \
  case OPCODE##_sdwa:
10356
10357static bool isRenamedInGFX9(int Opcode) {
10358 switch (Opcode) {
10359 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10360 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10361 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10362 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10363 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10364 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10365 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10366 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10367 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10368 //
10369 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10370 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10371 case AMDGPU::V_FMA_F16_gfx9_e64:
10372 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10373 case AMDGPU::V_INTERP_P2_F16:
10374 case AMDGPU::V_MAD_F16_e64:
10375 case AMDGPU::V_MAD_U16_e64:
10376 case AMDGPU::V_MAD_I16_e64:
10377 return true;
10378 default:
10379 return false;
10380 }
10381}
10382
// Translate a pseudo opcode into the real MC opcode for the current
// subtarget's encoding family, or return -1 when the instruction has no
// encoding there (or is asm-only).
// NOTE(review): several statements were lost in this extraction (embedded
// lines 10390-10391, 10396-10397, 10401-10408, 10421-10425, 10433-10437,
// presumably the `Gen = ...` adjustments and the SDWA/GFX11.7/GFX1250/
// GFX90A fallback lookups); verify against the original SIInstrInfo.cpp.
10383int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10384 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10385 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10386
10387 unsigned Gen = subtargetEncodingFamily(ST);
10388
10389 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10391
10392 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10393 // subtarget has UnpackedD16VMem feature.
10394 // TODO: remove this when we discard GFX80 encoding.
10395 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10397
10398 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10399 switch (ST.getGeneration()) {
10400 default:
10402 break;
10405 break;
10408 break;
10409 }
10410 }
10411
 // MFMA pseudos may have an early-clobber variant that carries the real
 // encoding.
10412 if (isMAI(Opcode)) {
10413 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10414 if (MFMAOp != -1)
10415 Opcode = MFMAOp;
10416 }
10417
10418 int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10419
10420 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())
10422
10423 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
10425
10426 // -1 means that Opcode is already a native instruction.
10427 if (MCOp == -1)
10428 return Opcode;
10429
10430 if (ST.hasGFX90AInsts()) {
10431 uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
10432 if (ST.hasGFX940Insts())
10434 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10436 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10438 if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
10439 MCOp = NMCOp;
10440 }
10441
10442 // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
10443 // encoding in the given subtarget generation.
10444 if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
10445 return -1;
10446
10447 if (isAsmOnlyOpcode(MCOp))
10448 return -1;
10449
10450 return MCOp;
10451}
10452
// Return the reg:subreg pair of \p RegOpnd, or an empty pair when the
// operand is an undef use.
// NOTE(review): the declarator line (presumably
// `TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand
// &RegOpnd)`) was lost in this extraction.
10453static
10455 assert(RegOpnd.isReg());
10456 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10457 getRegSubRegPair(RegOpnd);
10458}
10459
// Walk a REG_SEQUENCE's (source-reg, subreg-index) operand pairs and return
// the source register that supplies subregister \p SubReg.
// NOTE(review): the declarator lines and the fall-through return at the end
// (embedded lines 10460-10461 and 10468) were lost in this extraction.
10462 assert(MI.isRegSequence());
 // Operands come as: dst, (reg, subreg-imm) pairs; index the pairs.
10463 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10464 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10465 auto &RegOp = MI.getOperand(1 + 2 * I);
10466 return getRegOrUndef(RegOp);
10467 }
10469}
10470
10471// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10472// Following a subreg of reg:subreg isn't supported
// NOTE(review): the declarator lines (presumably
// `static bool followSubRegDef(MachineInstr &MI,
//                              TargetInstrInfo::RegSubRegPair &RSR)`) were
// lost in this extraction.
10475 if (!RSR.SubReg)
10476 return false;
10477 switch (MI.getOpcode()) {
10478 default: break;
10479 case AMDGPU::REG_SEQUENCE:
 // Step to the REG_SEQUENCE source that defines the wanted subreg.
10480 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10481 return true;
10482 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10483 case AMDGPU::INSERT_SUBREG:
10484 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10485 // inserted the subreg we're looking for
10486 RSR = getRegOrUndef(MI.getOperand(2));
10487 else { // the subreg in the rest of the reg
10488 auto R1 = getRegOrUndef(MI.getOperand(1));
10489 if (R1.SubReg) // subreg of subreg isn't supported
10490 return false;
10491 RSR.Reg = R1.Reg;
10492 }
10493 return true;
10494 }
10495 return false;
10496}
10497
// Follow COPY / V_MOV_B32 / subreg-manipulation pseudos backwards from the
// reg:subreg pair \p P to the instruction that ultimately defines the value;
// return nullptr if the chain cannot be followed (non-virtual reg, undef
// source, or lost subreg information).
// NOTE(review): the first half of the declarator (presumably
// `MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair
// &P,`) was lost in this extraction.
10499 const MachineRegisterInfo &MRI) {
10500 assert(MRI.isSSA());
10501 if (!P.Reg.isVirtual())
10502 return nullptr;
10503
10504 auto RSR = P;
10505 auto *DefInst = MRI.getVRegDef(RSR.Reg);
 // Each iteration either steps DefInst one def further back or leaves it
 // null, which terminates the walk at the current MI.
10506 while (auto *MI = DefInst) {
10507 DefInst = nullptr;
10508 switch (MI->getOpcode()) {
10509 case AMDGPU::COPY:
10510 case AMDGPU::V_MOV_B32_e32: {
10511 auto &Op1 = MI->getOperand(1);
10512 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10513 if (Op1.isUndef())
10514 return nullptr;
10515 RSR = getRegSubRegPair(Op1);
10516 DefInst = MRI.getVRegDef(RSR.Reg);
10517 }
10518 break;
10519 }
10520 default:
10521 if (followSubRegDef(*MI, RSR)) {
10522 if (!RSR.Reg)
10523 return nullptr;
10524 DefInst = MRI.getVRegDef(RSR.Reg);
10525 }
10526 }
10527 if (!DefInst)
10528 return MI;
10529 }
10530 return nullptr;
10531}
10532
// Conservatively check whether EXEC may be modified between \p DefMI and
// \p UseMI. The scan is same-block only and bounded, so a `true` result may
// be over-conservative.
// NOTE(review): the first declarator line (presumably
// `bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,`)
// was lost in this extraction.
10534 Register VReg,
10535 const MachineInstr &DefMI,
10536 const MachineInstr &UseMI) {
10537 assert(MRI.isSSA() && "Must be run on SSA");
10538
10539 auto *TRI = MRI.getTargetRegisterInfo();
10540 auto *DefBB = DefMI.getParent();
10541
10542 // Don't bother searching between blocks, although it is possible this block
10543 // doesn't modify exec.
10544 if (UseMI.getParent() != DefBB)
10545 return true;
10546
 // Bound the scan; give up (conservatively `true`) past this many insts.
10547 const int MaxInstScan = 20;
10548 int NumInst = 0;
10549
10550 // Stop scan at the use.
10551 auto E = UseMI.getIterator();
10552 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10553 if (I->isDebugInstr())
10554 continue;
10555
10556 if (++NumInst > MaxInstScan)
10557 return true;
10558
10559 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10560 return true;
10561 }
10562
10563 return false;
10564}
10565
// Conservatively check whether EXEC may be modified between \p DefMI and any
// use of \p VReg. All uses must be in \p DefMI's block and reached within a
// bounded scan, otherwise `true` is returned conservatively.
// NOTE(review): the first declarator line (presumably
// `bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo
// &MRI,`) was lost in this extraction.
10567 Register VReg,
10568 const MachineInstr &DefMI) {
10569 assert(MRI.isSSA() && "Must be run on SSA");
10570
10571 auto *TRI = MRI.getTargetRegisterInfo();
10572 auto *DefBB = DefMI.getParent();
10573
 // First pass: count the uses and reject cross-block or PHI uses.
10574 const int MaxUseScan = 10;
10575 int NumUse = 0;
10576
10577 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10578 auto &UseInst = *Use.getParent();
10579 // Don't bother searching between blocks, although it is possible this block
10580 // doesn't modify exec.
10581 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10582 return true;
10583
10584 if (++NumUse > MaxUseScan)
10585 return true;
10586 }
10587
10588 if (NumUse == 0)
10589 return false;
10590
10591 const int MaxInstScan = 20;
10592 int NumInst = 0;
10593
10594 // Stop scan when we have seen all the uses.
10595 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10596 assert(I != DefBB->end());
10597
10598 if (I->isDebugInstr())
10599 continue;
10600
10601 if (++NumInst > MaxInstScan)
10602 return true;
10603
10604 for (const MachineOperand &Op : I->operands()) {
10605 // We don't check reg masks here as they're used only on calls:
10606 // 1. EXEC is only considered const within one BB
10607 // 2. Call should be a terminator instruction if present in a BB
10608
10609 if (!Op.isReg())
10610 continue;
10611
10612 Register Reg = Op.getReg();
10613 if (Op.isUse()) {
10614 if (Reg == VReg && --NumUse == 0)
10615 return false;
10616 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10617 return true;
10618 }
10619 }
10620}
10621
// Place the PHI-destination COPY right before the first non-PHI reader of
// \p Dst (if one appears before LastPHIIt); otherwise defer to the default
// TargetInstrInfo placement.
// NOTE(review): the declarator lines (presumably
// `MachineInstr *SIInstrInfo::createPHIDestinationCopy(
//      MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,`)
// were lost in this extraction.
10624 const DebugLoc &DL, Register Src, Register Dst) const {
10625 auto Cur = MBB.begin();
10626 if (Cur != MBB.end())
10627 do {
10628 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10629 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10630 ++Cur;
10631 } while (Cur != MBB.end() && Cur != LastPHIIt);
10632
10633 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10634 Dst);
10635}
10636
// When the insertion point is an SI_IF/SI_ELSE/SI_IF_BREAK that defines the
// PHI source, emit the copy as a lane-mask mov-terminator right after it;
// otherwise defer to the default TargetInstrInfo placement.
// NOTE(review): the declarator lines (presumably
// `MachineInstr *SIInstrInfo::createPHISourceCopy(
//      MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,`) were
// lost in this extraction.
10639 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10640 if (InsPt != MBB.end() &&
10641 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10642 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10643 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10644 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10645 InsPt++;
10646 return BuildMI(MBB, InsPt, DL,
10647 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10648 .addReg(Src, {}, SrcSubReg)
10649 .addReg(AMDGPU::EXEC, RegState::Implicit);
10650 }
10651 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10652 Dst);
10653}
10654
10655bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10656
// foldMemoryOperand hook: never folds, but constrains the register class of
// a virtual register that is full-copied to/from a physical SReg so the
// default folder cannot try to spill $m0/$exec through it.
// NOTE(review): the declarator lines (presumably
// `MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
//      MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,`)
// were lost in this extraction.
10659 MachineBasicBlock::iterator InsertPt, int FrameIndex, MachineInstr *&CopyMI,
10660 LiveIntervals *LIS, VirtRegMap *VRM) const {
10661 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10662 //
10663 // %0:sreg_32 = COPY $m0
10664 //
10665 // We explicitly chose SReg_32 for the virtual register so such a copy might
10666 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10667 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10668 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10669 // TargetInstrInfo::foldMemoryOperand() is going to try.
10670 // A similar issue also exists with spilling and reloading $exec registers.
10671 //
10672 // To prevent that, constrain the %0 register class here.
10673 if (isFullCopyInstr(MI)) {
10674 Register DstReg = MI.getOperand(0).getReg();
10675 Register SrcReg = MI.getOperand(1).getReg();
 // Exactly one side virtual: the copy crosses the phys/virt boundary.
10676 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10677 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10678 MachineRegisterInfo &MRI = MF.getRegInfo();
10679 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10680 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10681 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10682 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10683 return nullptr;
10684 }
10685 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10686 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10687 return nullptr;
10688 }
10689 }
10690 }
10691
10692 return nullptr;
10693}
10694
// Compute the scheduling latency of \p MI; for a bundle, combine the
// latencies of the bundled instructions.
// NOTE(review): the first declarator line and the bundle iterator
// initialization (embedded lines 10695 and 10699, presumably
// `unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData
// *ItinData,` and the declaration of `I` from `MI.getIterator()`) were lost
// in this extraction.
10696 const MachineInstr &MI,
10697 unsigned *PredCost) const {
10698 if (MI.isBundle()) {
10700 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10701 unsigned Lat = 0, Count = 0;
10702 for (++I; I != E && I->isBundledWithPred(); ++I) {
10703 ++Count;
10704 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10705 }
 // Max member latency plus one cycle per extra bundled instruction.
10706 return Lat + Count - 1;
10707 }
10708
10709 return SchedModel.computeInstrLatency(&MI);
10710}
10711
// Return the call-target operand (src0) of a call instruction.
// NOTE(review): the declarator continuation and the fallback (embedded
// lines 10713 and 10717, presumably
// `SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {` and an
// unreachable/default return) were lost in this extraction.
10712const MachineOperand &
10714 if (const MachineOperand *CallAddrOp =
10715 getNamedOperand(MI, AMDGPU::OpName::src0))
10716 return *CallAddrOp;
10718}
10719
// Classify the value uniformity of a generic (pre-instruction-selection)
// MachineInstr: address-space casts from private to flat, private/flat
// loads, and atomics are treated specially.
// NOTE(review): the declarator line and several `return` lines (embedded
// numbers 10720-10721, 10736-10737, 10748-10751, 10762, 10781-10783,
// 10790-10792) were dropped by this extraction; the exact returned
// ValueUniformity values at those points are not recoverable from this view.
10722 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10723 unsigned Opcode = MI.getOpcode();
10724
 // Decide uniformity of a private->flat addrspacecast; intrinsic form
 // carries the source in operand 2, the generic opcode in operand 1.
10725 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10726 Register Dst = MI.getOperand(0).getReg();
10727 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10728 : MI.getOperand(1).getReg();
10729 LLT DstTy = MRI.getType(Dst);
10730 LLT SrcTy = MRI.getType(Src);
10731 unsigned DstAS = DstTy.getAddressSpace();
10732 unsigned SrcAS = SrcTy.getAddressSpace();
10733 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10734 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10735 ST.hasGloballyAddressableScratch()
10738 };
10739
10740 // If the target supports globally addressable scratch, the mapping from
10741 // scratch memory to the flat aperture changes therefore an address space cast
10742 // is no longer uniform.
10743 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10744 return HandleAddrSpaceCast(MI);
10745
10746 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10747 auto IID = GI->getIntrinsicID();
10752
10753 switch (IID) {
10754 case Intrinsic::amdgcn_addrspacecast_nonnull:
10755 return HandleAddrSpaceCast(MI);
10756 case Intrinsic::amdgcn_if:
10757 case Intrinsic::amdgcn_else:
10758 // FIXME: Uniform if second result
10759 break;
10760 }
10761
10763 }
10764
10765 // Loads from the private and flat address spaces are divergent, because
10766 // threads can execute the load instruction with the same inputs and get
10767 // different results.
10768 //
10769 // All other loads are not divergent, because if threads issue loads with the
10770 // same arguments, they will always get the same result.
10771 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10772 Opcode == AMDGPU::G_SEXTLOAD) {
10773 if (MI.memoperands_empty())
10774 return ValueUniformity::NeverUniform; // conservative assumption
10775
10776 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10777 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10778 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10779 })) {
10780 // At least one MMO in a non-global address space.
10782 }
10784 }
10785
10786 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10787 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10788 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10789 AMDGPU::isGenericAtomic(Opcode)) {
10791 }
10793}
10794
// Lazily construct and cache the AMDGPU-specific MIR formatter.
// NOTE(review): the declarator line (presumably
// `const MIRFormatter *SIInstrInfo::getMIRFormatter() const {`) was lost in
// this extraction.
10796 if (!Formatter)
10797 Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
10798 return Formatter.get();
10799}
10800
// Classify the value uniformity of a selected MachineInstr: explicit
// never-uniform instructions, lane reads, copies from physical registers,
// atomics, private/flat loads, and finally a register-bank scan over the
// operands.
// NOTE(review): the declarator line and several `return` lines (embedded
// numbers 10801, 10804-10805, 10810-10811, 10818-10821, 10825-10826,
// 10833-10834, 10847-10851, 10872-10873, 10882) were dropped by this
// extraction; the exact ValueUniformity values returned at those points are
// not recoverable from this view.
10802
10803 if (isNeverUniform(MI))
10805
10806 unsigned opcode = MI.getOpcode();
10807 if (opcode == AMDGPU::V_READLANE_B32 ||
10808 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10809 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10811
 // A copy from a physical register is uniform exactly when the source is
 // an SGPR.
10812 if (isCopyInstr(MI)) {
10813 const MachineOperand &srcOp = MI.getOperand(1);
10814 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10815 const TargetRegisterClass *regClass =
10816 RI.getPhysRegBaseClass(srcOp.getReg());
10817 return RI.isSGPRClass(regClass) ? ValueUniformity::AlwaysUniform
10819 }
10821 }
10822
10823 // GMIR handling
10824 if (MI.isPreISelOpcode())
10826
10827 // Atomics are divergent because they are executed sequentially: when an
10828 // atomic operation refers to the same address in each thread, then each
10829 // thread after the first sees the value written by the previous thread as
10830 // original value.
10831
10832 if (isAtomic(MI))
10834
10835 // Loads from the private and flat address spaces are divergent, because
10836 // threads can execute the load instruction with the same inputs and get
10837 // different results.
10838 if (isFLAT(MI) && MI.mayLoad()) {
10839 if (MI.memoperands_empty())
10840 return ValueUniformity::NeverUniform; // conservative assumption
10841
10842 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10843 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10844 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10845 })) {
10846 // At least one MMO in a non-global address space.
10848 }
10849
10851 }
10852
10853 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10854 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10855
10856 // FIXME: It's conceptually broken to report this for an instruction, and not
10857 // a specific def operand. For inline asm in particular, there could be mixed
10858 // uniform and divergent results.
10859 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10860 const MachineOperand &SrcOp = MI.getOperand(I);
10861 if (!SrcOp.isReg())
10862 continue;
10863
10864 Register Reg = SrcOp.getReg();
10865 if (!Reg || !SrcOp.readsReg())
10866 continue;
10867
10868 // If RegBank is null, this is unassigned or an unallocatable special
10869 // register, which are all scalars.
10870 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10871 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10873 }
10874
10875 // TODO: Uniformity check conditions above can be rearranged for more
10876 // readability
10877
10878 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10879 // currently turned into no-op COPYs by SelectionDAG ISel and are
10880 // therefore no longer recognizable.
10881
10883}
10884
// Map the function's calling convention to the DS "shader type" value
// (used e.g. by ds_ordered_count); unsupported graphics conventions emit a
// diagnostic and fall through to 0.
// NOTE(review): the declarator line and most `case` labels (embedded
// numbers 10885, 10887, 10889, 10891, 10893-10895, 10901-10902) were
// dropped by this extraction; which conventions map to 1/2/3 is not
// recoverable from this view.
10886 switch (MF.getFunction().getCallingConv()) {
10888 return 1;
10890 return 2;
10892 return 3;
10896 const Function &F = MF.getFunction();
10897 F.getContext().diagnose(DiagnosticInfoUnsupported(
10898 F, "ds_ordered_count unsupported for this calling conv"));
10899 [[fallthrough]];
10900 }
10903 case CallingConv::C:
10904 case CallingConv::Fast:
10905 default:
10906 // Assume other calling conventions are various compute callable functions
10907 return 0;
10908 }
10909}
10910
// TargetInstrInfo::analyzeCompare hook: decompose an S_CMP*/S_CMPK* into
// its source register(s), compare mask and immediate value so that
// optimizeCompareInstr can try to eliminate it.
// NOTE(review): the first declarator line (presumably
// `bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register
// &SrcReg,`) was lost in this extraction.
10912 Register &SrcReg2, int64_t &CmpMask,
10913 int64_t &CmpValue) const {
 // Subregister compares are not handled.
10914 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10915 return false;
10916
10917 switch (MI.getOpcode()) {
10918 default:
10919 break;
10920 case AMDGPU::S_CMP_EQ_U32:
10921 case AMDGPU::S_CMP_EQ_I32:
10922 case AMDGPU::S_CMP_LG_U32:
10923 case AMDGPU::S_CMP_LG_I32:
10924 case AMDGPU::S_CMP_LT_U32:
10925 case AMDGPU::S_CMP_LT_I32:
10926 case AMDGPU::S_CMP_GT_U32:
10927 case AMDGPU::S_CMP_GT_I32:
10928 case AMDGPU::S_CMP_LE_U32:
10929 case AMDGPU::S_CMP_LE_I32:
10930 case AMDGPU::S_CMP_GE_U32:
10931 case AMDGPU::S_CMP_GE_I32:
10932 case AMDGPU::S_CMP_EQ_U64:
10933 case AMDGPU::S_CMP_LG_U64:
 // Reg/reg or reg/imm forms.
10934 SrcReg = MI.getOperand(0).getReg();
10935 if (MI.getOperand(1).isReg()) {
10936 if (MI.getOperand(1).getSubReg())
10937 return false;
10938 SrcReg2 = MI.getOperand(1).getReg();
10939 CmpValue = 0;
10940 } else if (MI.getOperand(1).isImm()) {
10941 SrcReg2 = Register();
10942 CmpValue = MI.getOperand(1).getImm();
10943 } else {
10944 return false;
10945 }
10946 CmpMask = ~0;
10947 return true;
10948 case AMDGPU::S_CMPK_EQ_U32:
10949 case AMDGPU::S_CMPK_EQ_I32:
10950 case AMDGPU::S_CMPK_LG_U32:
10951 case AMDGPU::S_CMPK_LG_I32:
10952 case AMDGPU::S_CMPK_LT_U32:
10953 case AMDGPU::S_CMPK_LT_I32:
10954 case AMDGPU::S_CMPK_GT_U32:
10955 case AMDGPU::S_CMPK_GT_I32:
10956 case AMDGPU::S_CMPK_LE_U32:
10957 case AMDGPU::S_CMPK_LE_I32:
10958 case AMDGPU::S_CMPK_GE_U32:
10959 case AMDGPU::S_CMPK_GE_I32:
 // S_CMPK always compares a register against an inline constant.
10960 SrcReg = MI.getOperand(0).getReg();
10961 SrcReg2 = Register();
10962 CmpValue = MI.getOperand(1).getImm();
10963 CmpMask = ~0;
10964 return true;
10965 }
10966
10967 return false;
10968}
10969
// Return true if SCC is not live into any successor of \p MBB, i.e. it is
// dead at the end of the block.
// NOTE(review): the declarator line (presumably
// `static bool isSCCDeadOnExit(MachineBasicBlock *MBB) {`) was lost in
// this extraction.
10971 for (MachineBasicBlock *S : MBB->successors()) {
10972 if (S->isLiveIn(AMDGPU::SCC))
10973 return false;
10974 }
10975 return true;
10976}
10977
10978// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
10979// (incoming SCC) = !(SCC defined by SCCDef).
10980// Return true if all uses can be re-written, false otherwise.
10981bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
10982 MachineBasicBlock *MBB = SCCDef->getParent();
10983 SmallVector<MachineInstr *> InvertInstr;
10984 bool SCCIsDead = false;
10985
10986 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
10987 constexpr unsigned ScanLimit = 12;
10988 unsigned Count = 0;
10989 for (MachineInstr &MI :
10990 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
10991 if (++Count > ScanLimit)
10992 return false;
10993 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
10994 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
10995 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
10996 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10997 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
10998 InvertInstr.push_back(&MI);
10999 else
11000 return false;
11001 }
11002 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
11003 SCCIsDead = true;
11004 break;
11005 }
11006 }
11007 if (!SCCIsDead && isSCCDeadOnExit(MBB))
11008 SCCIsDead = true;
11009
11010 // SCC may have more uses. Can't invert all of them.
11011 if (!SCCIsDead)
11012 return false;
11013
11014 // Invert uses
11015 for (MachineInstr *MI : InvertInstr) {
11016 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
11017 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
11018 swapOperands(*MI);
11019 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11020 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
11021 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
11022 ? AMDGPU::S_CBRANCH_SCC1
11023 : AMDGPU::S_CBRANCH_SCC0));
11024 } else {
11025 llvm_unreachable("SCC used but no inversion handling");
11026 }
11027 }
11028 return true;
11029}
11030
11031// SCC is already valid after SCCValid.
11032// SCCRedefine will redefine SCC to the same value already available after
11033// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
11034// update kill/dead flags if necessary.
11035bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
11036 bool NeedInversion) const {
11037 MachineInstr *KillsSCC = nullptr;
11038 if (SCCValid->getParent() != SCCRedefine->getParent())
11039 return false;
11040 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
11041 SCCRedefine->getIterator())) {
11042 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
11043 return false;
11044 if (MI.killsRegister(AMDGPU::SCC, &RI))
11045 KillsSCC = &MI;
11046 }
11047 if (NeedInversion && !invertSCCUse(SCCRedefine))
11048 return false;
11049 if (MachineOperand *SccDef =
11050 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
11051 SccDef->setIsDead(false);
11052 if (KillsSCC)
11053 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
11054 SCCRedefine->eraseFromParent();
11055 return true;
11056}
11057
11058static bool foldableSelect(const MachineInstr &Def) {
11059 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
11060 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
11061 return false;
11062 bool Op1IsNonZeroImm =
11063 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
11064 bool Op2IsZeroImm =
11065 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
11066 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
11067 return false;
11068 return true;
11069}
11070
11071static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11072 unsigned &NewDefOpc) {
11073 // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0.
11074 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
11075 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11076 Def.getOpcode() != AMDGPU::S_ADD_U32)
11077 return false;
11078 const MachineOperand &AddSrc1 = Def.getOperand(1);
11079 const MachineOperand &AddSrc2 = Def.getOperand(2);
11080 int64_t addend;
11081
11082 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11083 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11084 (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
11085 (!getFoldableImm(&AddSrc2, addend) || addend != 1))
11086 return false;
11087
11088 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11089 const MachineOperand *SccDef =
11090 Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
11091 if (!SccDef->isDead())
11092 return false;
11093 NewDefOpc = AMDGPU::S_ADD_U32;
11094 }
11095 NeedInversion = !NeedInversion;
11096 return true;
11097}
11098
// TargetInstrInfo::optimizeCompareInstr hook: try to delete an S_CMP*/
// S_CMPK* whose SCC result is already produced (possibly inverted) by the
// instruction that defines its source register.
// NOTE(review): the first declarator line (presumably
// `bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register
// SrcReg,`) was lost in this extraction.
11100 Register SrcReg2, int64_t CmpMask,
11101 int64_t CmpValue,
11102 const MachineRegisterInfo *MRI) const {
11103 if (!SrcReg || SrcReg.isPhysical())
11104 return false;
11105
 // A reg/reg compare is only handled if the second source is a foldable
 // immediate (then treated as reg/imm).
11106 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
11107 return false;
11108
11109 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11110 this](bool NeedInversion) -> bool {
11111 if (CmpValue != 0)
11112 return false;
11113
11114 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11115 if (!Def)
11116 return false;
11117
11118 // For S_OP that set SCC = DST!=0, do the transformation
11119 //
11120 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11121 //
11122 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11123 // do the transformation:
11124 //
11125 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11126 //
11127 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11128 // for S_CSELECT* already has the same value that will be calculated by
11129 // s_cmp_lg_*
11130 //
11131 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11132 // (non-zero imm), 0)
11133
11134 unsigned NewDefOpc = Def->getOpcode();
11135 if (!setsSCCIfResultIsNonZero(*Def) &&
11136 !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
11137 !foldableSelect(*Def))
11138 return false;
11139
11140 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11141 return false;
11142
 // setsSCCIfResultIsZero may have requested an S_ADD_I32 -> S_ADD_U32
 // opcode change.
11143 if (NewDefOpc != Def->getOpcode())
11144 Def->setDesc(get(NewDefOpc));
11145
11146 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11147 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11148 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11149 // sX = s_cselect_b64 (non-zero imm), 0
11150 // sLo = copy sX.sub0
11151 // sHi = copy sX.sub1
11152 // sY = s_or_b32 sLo, sHi
11153 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11154 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11155 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11156 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11157 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11158 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11159 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
11160 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11161 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11162 Def2->getOperand(1).isReg() &&
11163 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11164 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11165 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11166 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
11167 if (Select && foldableSelect(*Select))
11168 optimizeSCC(Select, Def, /*NeedInversion=*/false);
11169 }
11170 }
11171 }
11172 return true;
11173 };
11174
11175 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11176 this](int64_t ExpectedValue, unsigned SrcSize,
11177 bool IsReversible, bool IsSigned) -> bool {
11178 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11179 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11180 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11181 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11182 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11183 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11184 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11185 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11186 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11187 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11188 //
11189 // Signed ge/gt are not used for the sign bit.
11190 //
11191 // If result of the AND is unused except in the compare:
11192 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11193 //
11194 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11195 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11196 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11197 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11198 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11199 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11200
11201 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11202 if (!Def)
11203 return false;
11204
11205 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11206 Def->getOpcode() != AMDGPU::S_AND_B64)
11207 return false;
11208
 // Accepts an operand that is (or folds to) a single-bit mask; the mask
 // value is captured into Mask.
11209 int64_t Mask;
11210 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11211 if (MO->isImm())
11212 Mask = MO->getImm();
11213 else if (!getFoldableImm(MO, Mask))
11214 return false;
11215 Mask &= maxUIntN(SrcSize);
11216 return isPowerOf2_64(Mask);
11217 };
11218
11219 MachineOperand *SrcOp = &Def->getOperand(1);
11220 if (isMask(SrcOp))
11221 SrcOp = &Def->getOperand(2);
11222 else if (isMask(&Def->getOperand(2)))
11223 SrcOp = &Def->getOperand(1);
11224 else
11225 return false;
11226
11227 // A valid Mask is required to have a single bit set, hence a non-zero and
11228 // power-of-two value. This verifies that we will not do 64-bit shift below.
11229 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11230 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11231 if (IsSigned && BitNo == SrcSize - 1)
11232 return false;
11233
11234 ExpectedValue <<= BitNo;
11235
11236 bool IsReversedCC = false;
11237 if (CmpValue != ExpectedValue) {
11238 if (!IsReversible)
11239 return false;
11240 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11241 if (!IsReversedCC)
11242 return false;
11243 }
11244
11245 Register DefReg = Def->getOperand(0).getReg();
11246 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11247 return false;
11248
11249 if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
11250 return false;
11251
11252 if (!MRI->use_nodbg_empty(DefReg)) {
11253 assert(!IsReversedCC);
11254 return true;
11255 }
11256
11257 // Replace AND with unused result with a S_BITCMP.
11258 MachineBasicBlock *MBB = Def->getParent();
11259
11260 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11261 : AMDGPU::S_BITCMP1_B32
11262 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11263 : AMDGPU::S_BITCMP1_B64;
11264
11265 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11266 .add(*SrcOp)
11267 .addImm(BitNo);
11268 Def->eraseFromParent();
11269
11270 return true;
11271 };
11272
 // Dispatch on the compare opcode; eq-style compares against 0 need SCC
 // inversion for the select-based transform, lg-style do not.
11273 switch (CmpInstr.getOpcode()) {
11274 default:
11275 break;
11276 case AMDGPU::S_CMP_EQ_U32:
11277 case AMDGPU::S_CMP_EQ_I32:
11278 case AMDGPU::S_CMPK_EQ_U32:
11279 case AMDGPU::S_CMPK_EQ_I32:
11280 return optimizeCmpAnd(1, 32, true, false) ||
11281 optimizeCmpSelect(/*NeedInversion=*/true);
11282 case AMDGPU::S_CMP_GE_U32:
11283 case AMDGPU::S_CMPK_GE_U32:
11284 return optimizeCmpAnd(1, 32, false, false);
11285 case AMDGPU::S_CMP_GE_I32:
11286 case AMDGPU::S_CMPK_GE_I32:
11287 return optimizeCmpAnd(1, 32, false, true);
11288 case AMDGPU::S_CMP_EQ_U64:
11289 return optimizeCmpAnd(1, 64, true, false);
11290 case AMDGPU::S_CMP_LG_U32:
11291 case AMDGPU::S_CMP_LG_I32:
11292 case AMDGPU::S_CMPK_LG_U32:
11293 case AMDGPU::S_CMPK_LG_I32:
11294 return optimizeCmpAnd(0, 32, true, false) ||
11295 optimizeCmpSelect(/*NeedInversion=*/false);
11296 case AMDGPU::S_CMP_GT_U32:
11297 case AMDGPU::S_CMPK_GT_U32:
11298 return optimizeCmpAnd(0, 32, false, false);
11299 case AMDGPU::S_CMP_GT_I32:
11300 case AMDGPU::S_CMPK_GT_I32:
11301 return optimizeCmpAnd(0, 32, false, true);
11302 case AMDGPU::S_CMP_LG_U64:
11303 return optimizeCmpAnd(0, 64, true, false) ||
11304 optimizeCmpSelect(/*NeedInversion=*/false);
11305 }
11306
11307 return false;
11308}
11309
// On subtargets that require even-aligned VGPR tuples, force alignment of a
// 32-bit data operand by wrapping it into an aligned 64-bit REG_SEQUENCE
// with an undef high half and rewriting the operand to sub0 of that tuple.
// NOTE(review): the declarator start and two statements (embedded lines
// 11310, 11325 and 11328 — presumably
// `void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,`, the
// `MachineRegisterInfo &MRI = ...` reference and the creation of the
// `Undef` register) were lost in this extraction.
11311 AMDGPU::OpName OpName) const {
11312 if (!ST.needsAlignedVGPRs())
11313 return;
11314
11315 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11316 if (OpNo < 0)
11317 return;
11318 MachineOperand &Op = MI.getOperand(OpNo);
 // Only 32-bit operands need the wrapping; larger tuples are already
 // constrained by their register class.
11319 if (getOpSize(MI, OpNo) > 4)
11320 return;
11321
11322 // Add implicit aligned super-reg to force alignment on the data operand.
11323 const DebugLoc &DL = MI.getDebugLoc();
11324 MachineBasicBlock *BB = MI.getParent();
11326 Register DataReg = Op.getReg();
11327 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11329 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11330 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11331 Register NewVR =
11332 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11333 : &AMDGPU::VReg_64_Align2RegClass);
11334 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11335 .addReg(DataReg, {}, Op.getSubReg())
11336 .addImm(AMDGPU::sub0)
11337 .addReg(Undef)
11338 .addImm(AMDGPU::sub1);
11339 Op.setReg(NewVR);
11340 Op.setSubReg(AMDGPU::sub0);
 // Keep the tuple alive via an implicit use on the instruction.
11341 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11342}
11343
// IGLP scheduling pseudos are excluded from global-memory-object treatment.
// NOTE(review): the declarator line and the delegating return (embedded
// lines 11344 and 11348, presumably
// `bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {`
// and `return TargetInstrInfo::isGlobalMemoryObject(MI);`) were lost in
// this extraction.
11345 if (isIGLP(*MI))
11346 return false;
11347
11349}
11350
// Return true for WMMA/SWMMAC instructions that execute on the XDL pipe.
// NOTE(review): the declarator line (embedded 11351, presumably
// `bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {`) was lost
// in this extraction.
11352 if (!isWMMA(MI) && !isSWMMAC(MI))
11353 return false;
11354
 // On GFX1250 consult the generated per-opcode table; otherwise all
 // WMMA/SWMMAC count as XDL.
11355 if (ST.hasGFX1250Insts())
11356 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11357
11358 return true;
11359}
11360
// Return true if \p MI executes on the XDL (matrix) pipe: DOT/XDL-WMMA on
// GFX12+, otherwise MAI instructions excluding DGEMM and ACCVGPR moves
// (with a per-opcode table lookup on GFX940).
// NOTE(review): the declarator line (embedded 11361, presumably
// `bool SIInstrInfo::isXDL(const MachineInstr &MI) const {`) was lost in
// this extraction.
11362 unsigned Opcode = MI.getOpcode();
11363
11364 if (AMDGPU::isGFX12Plus(ST))
11365 return isDOT(MI) || isXDLWMMA(MI);
11366
11367 if (!isMAI(MI) || isDGEMM(Opcode) ||
11368 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11369 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11370 return false;
11371
11372 if (!ST.hasGFX940Insts())
11373 return true;
11374
11375 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11376}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static MachineBasicBlock * loadScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, unsigned &NewDefOpc)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:145
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
MIRFormater - Interface to format MIR operand based on target.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI void moveOperands(MachineOperand *Dst, MachineOperand *Src, unsigned NumOps)
Move NumOps operands from Src to Dst, updating use-def lists as needed.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
LLVM_ABI void clearVirtRegs()
clearVirtRegs - Remove all virtual registers (after physreg assignment).
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(Register Reg) const
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
iterator_range< use_iterator > use_operands(Register Reg) const
LLVM_ABI void removeRegOperandFromUseList(MachineOperand *MO)
Remove MO from its use-def list.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI void addRegOperandToUseList(MachineOperand *MO)
Add MO to the linked list of operands for its register.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
bool isXDLWMMA(const MachineInstr &MI) const
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
bool isSpill(uint32_t Opcode) const
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool setsSCCIfResultIsNonZero(const MachineInstr &MI)
const MIRFormatter * getMIRFormatter() const override
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
Register isStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
ValueUniformity getGenericValueUniformity(const MachineInstr &MI) const
static bool isMAI(const MCInstrDesc &Desc)
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const override
static bool usesLGKM_CNT(const MachineInstr &MI)
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
bool isAlwaysGDS(uint32_t Opcode) const
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool isWWMRegSpillOpcode(uint32_t Opcode)
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
ValueUniformity getValueUniformity(const MachineInstr &MI) const final
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
std::optional< int64_t > getImmOrMaterializedImm(MachineOperand &Op) const
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
unsigned getScratchReservedForDynamicVGPRs() const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int32_t getCommuteRev(uint32_t Opcode)
LLVM_READONLY int32_t getCommuteOrig(uint32_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READONLY int32_t getGlobalVaddrOp(uint32_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getIfAddr64Inst(uint32_t Opcode)
Check if Opcode is an Addr64 opcode.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
LLVM_READONLY int32_t getAddr64Inst(uint32_t Opcode)
int32_t getMCOpcode(uint32_t Opcode, unsigned Gen)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:204
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:227
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:213
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:203
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:209
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:212
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:219
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:214
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:228
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:240
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:215
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:251
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:206
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:226
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:245
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:216
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:241
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:223
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:205
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:231
LLVM_READONLY int32_t getBasicFromSDWAOp(uint32_t Opcode)
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:598
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:600
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:597
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:599
@ TI_CONSTDATA_START
Definition AMDGPU.h:596
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if an signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
constexpr RegState getUndefRegState(bool B)
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
MachineCycleInfo::CycleT MachineCycle
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:57
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
constexpr bool all() const
Definition LaneBitmask.h:54
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:61
MachineInstr * top() const
Definition SIInstrInfo.h:66
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:85
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.