LLVM 23.0.0git
SIInstrInfo.cpp
Go to the documentation of this file.
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/MC/MCContext.h"
38
39using namespace llvm;
40
41#define DEBUG_TYPE "si-instr-info"
42
43#define GET_INSTRINFO_CTOR_DTOR
44#include "AMDGPUGenInstrInfo.inc"
45
46namespace llvm::AMDGPU {
47#define GET_D16ImageDimIntrinsics_IMPL
48#define GET_ImageDimIntrinsicTable_IMPL
49#define GET_RsrcIntrinsics_IMPL
50#include "AMDGPUGenSearchableTables.inc"
51} // namespace llvm::AMDGPU
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
65
67 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
68 AMDGPU::ADJCALLSTACKDOWN),
69 RI(ST), ST(ST) {
70 SchedModel.init(&ST);
71}
72
73//===----------------------------------------------------------------------===//
74// TargetInstrInfo callbacks
75//===----------------------------------------------------------------------===//
76
77static unsigned getNumOperandsNoGlue(SDNode *Node) {
78 unsigned N = Node->getNumOperands();
79 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
80 --N;
81 return N;
82}
83
84/// Returns true if both nodes have the same value for the given
85/// operand \p Op, or if both nodes do not have this operand.
87 AMDGPU::OpName OpName) {
88 unsigned Opc0 = N0->getMachineOpcode();
89 unsigned Opc1 = N1->getMachineOpcode();
90
91 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
92 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
93
94 if (Op0Idx == -1 && Op1Idx == -1)
95 return true;
96
97
98 if ((Op0Idx == -1 && Op1Idx != -1) ||
99 (Op1Idx == -1 && Op0Idx != -1))
100 return false;
101
102 // getNamedOperandIdx returns the index for the MachineInstr's operands,
103 // which includes the result as the first operand. We are indexing into the
104 // MachineSDNode's operands, so we need to skip the result operand to get
105 // the real index.
106 --Op0Idx;
107 --Op1Idx;
108
109 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
110}
111
112static bool canRemat(const MachineInstr &MI) {
113
117 return true;
118
119 if (SIInstrInfo::isSMRD(MI)) {
120 return !MI.memoperands_empty() &&
121 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
122 return MMO->isLoad() && MMO->isInvariant();
123 });
124 }
125
126 return false;
127}
128
130 const MachineInstr &MI) const {
131
132 if (canRemat(MI)) {
133 // Normally VALU use of exec would block the rematerialization, but that
134 // is OK in this case to have an implicit exec read as all VALU do.
135 // We really want all of the generic logic for this except for this.
136
137 // Another potential implicit use is mode register. The core logic of
138 // the RA will not attempt rematerialization if mode is set anywhere
139 // in the function, otherwise it is safe since mode is not changed.
140
141 // There is difference to generic method which does not allow
142 // rematerialization if there are virtual register uses. We allow this,
143 // therefore this method includes SOP instructions as well.
144 if (!MI.hasImplicitDef() &&
145 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
146 !MI.mayRaiseFPException())
147 return true;
148 }
149
151}
152
153// Returns true if the scalar result of a VALU instruction depends on exec.
154bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
155 // Ignore comparisons which are only used masked with exec.
156 // This allows some hoisting/sinking of VALU comparisons.
157 if (MI.isCompare()) {
158 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
159 if (!Dst)
160 return true;
161
162 Register DstReg = Dst->getReg();
163 if (!DstReg.isVirtual())
164 return true;
165
166 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
167 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
168 switch (Use.getOpcode()) {
169 case AMDGPU::S_AND_SAVEEXEC_B32:
170 case AMDGPU::S_AND_SAVEEXEC_B64:
171 break;
172 case AMDGPU::S_AND_B32:
173 case AMDGPU::S_AND_B64:
174 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
175 return true;
176 break;
177 default:
178 return true;
179 }
180 }
181 return false;
182 }
183
184 // If it is not convergent it does not depend on EXEC.
185 if (!MI.isConvergent())
186 return false;
187
188 switch (MI.getOpcode()) {
189 default:
190 break;
191 case AMDGPU::V_READFIRSTLANE_B32:
192 return true;
193 }
194
195 return false;
196}
197
199 // Any implicit use of exec by VALU is not a real register read.
200 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
201 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
202}
203
205 MachineBasicBlock *SuccToSinkTo,
206 MachineCycleInfo *CI) const {
207 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
208 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
209 return true;
210
211 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
212 // Check if sinking of MI would create temporal divergent use.
213 for (auto Op : MI.uses()) {
214 if (Op.isReg() && Op.getReg().isVirtual() &&
215 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
216 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
217
218 // SgprDef defined inside cycle
219 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
220 if (FromCycle == nullptr)
221 continue;
222
223 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
224 // Check if there is a FromCycle that contains SgprDef's basic block but
225 // does not contain SuccToSinkTo and also has divergent exit condition.
226 while (FromCycle && !FromCycle->contains(ToCycle)) {
228 FromCycle->getExitingBlocks(ExitingBlocks);
229
230 // FromCycle has divergent exit condition.
231 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
232 if (hasDivergentBranch(ExitingBlock))
233 return false;
234 }
235
236 FromCycle = FromCycle->getParentCycle();
237 }
238 }
239 }
240
241 return true;
242}
243
245 int64_t &Offset0,
246 int64_t &Offset1) const {
247 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
248 return false;
249
250 unsigned Opc0 = Load0->getMachineOpcode();
251 unsigned Opc1 = Load1->getMachineOpcode();
252
253 // Make sure both are actually loads.
254 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
255 return false;
256
257 // A mayLoad instruction without a def is not a load. Likely a prefetch.
258 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
259 return false;
260
261 if (isDS(Opc0) && isDS(Opc1)) {
262
263 // FIXME: Handle this case:
264 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
265 return false;
266
267 // Check base reg.
268 if (Load0->getOperand(0) != Load1->getOperand(0))
269 return false;
270
271 // Skip read2 / write2 variants for simplicity.
272 // TODO: We should report true if the used offsets are adjacent (excluded
273 // st64 versions).
274 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
275 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
276 if (Offset0Idx == -1 || Offset1Idx == -1)
277 return false;
278
279 // XXX - be careful of dataless loads
280 // getNamedOperandIdx returns the index for MachineInstrs. Since they
281 // include the output in the operand list, but SDNodes don't, we need to
282 // subtract the index by one.
283 Offset0Idx -= get(Opc0).NumDefs;
284 Offset1Idx -= get(Opc1).NumDefs;
285 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
286 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
287 return true;
288 }
289
290 if (isSMRD(Opc0) && isSMRD(Opc1)) {
291 // Skip time and cache invalidation instructions.
292 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
293 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
294 return false;
295
296 unsigned NumOps = getNumOperandsNoGlue(Load0);
297 if (NumOps != getNumOperandsNoGlue(Load1))
298 return false;
299
300 // Check base reg.
301 if (Load0->getOperand(0) != Load1->getOperand(0))
302 return false;
303
304 // Match register offsets, if both register and immediate offsets present.
305 assert(NumOps == 4 || NumOps == 5);
306 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
307 return false;
308
309 const ConstantSDNode *Load0Offset =
311 const ConstantSDNode *Load1Offset =
313
314 if (!Load0Offset || !Load1Offset)
315 return false;
316
317 Offset0 = Load0Offset->getZExtValue();
318 Offset1 = Load1Offset->getZExtValue();
319 return true;
320 }
321
322 // MUBUF and MTBUF can access the same addresses.
323 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
324
325 // MUBUF and MTBUF have vaddr at different indices.
326 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
327 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
328 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
329 return false;
330
331 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
332 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
333
334 if (OffIdx0 == -1 || OffIdx1 == -1)
335 return false;
336
337 // getNamedOperandIdx returns the index for MachineInstrs. Since they
338 // include the output in the operand list, but SDNodes don't, we need to
339 // subtract the index by one.
340 OffIdx0 -= get(Opc0).NumDefs;
341 OffIdx1 -= get(Opc1).NumDefs;
342
343 SDValue Off0 = Load0->getOperand(OffIdx0);
344 SDValue Off1 = Load1->getOperand(OffIdx1);
345
346 // The offset might be a FrameIndexSDNode.
347 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
348 return false;
349
350 Offset0 = Off0->getAsZExtVal();
351 Offset1 = Off1->getAsZExtVal();
352 return true;
353 }
354
355 return false;
356}
357
358static bool isStride64(unsigned Opc) {
359 switch (Opc) {
360 case AMDGPU::DS_READ2ST64_B32:
361 case AMDGPU::DS_READ2ST64_B64:
362 case AMDGPU::DS_WRITE2ST64_B32:
363 case AMDGPU::DS_WRITE2ST64_B64:
364 return true;
365 default:
366 return false;
367 }
368}
369
372 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
373 const TargetRegisterInfo *TRI) const {
374 if (!LdSt.mayLoadOrStore())
375 return false;
376
377 unsigned Opc = LdSt.getOpcode();
378 OffsetIsScalable = false;
379 const MachineOperand *BaseOp, *OffsetOp;
380 int DataOpIdx;
381
382 if (isDS(LdSt)) {
383 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
384 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
385 if (OffsetOp) {
386 // Normal, single offset LDS instruction.
387 if (!BaseOp) {
388 // DS_CONSUME/DS_APPEND use M0 for the base address.
389 // TODO: find the implicit use operand for M0 and use that as BaseOp?
390 return false;
391 }
392 BaseOps.push_back(BaseOp);
393 Offset = OffsetOp->getImm();
394 // Get appropriate operand, and compute width accordingly.
395 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
396 if (DataOpIdx == -1)
397 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
398 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
399 Width = LocationSize::precise(64);
400 else
401 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
402 } else {
403 // The 2 offset instructions use offset0 and offset1 instead. We can treat
404 // these as a load with a single offset if the 2 offsets are consecutive.
405 // We will use this for some partially aligned loads.
406 const MachineOperand *Offset0Op =
407 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
408 const MachineOperand *Offset1Op =
409 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
410
411 unsigned Offset0 = Offset0Op->getImm() & 0xff;
412 unsigned Offset1 = Offset1Op->getImm() & 0xff;
413 if (Offset0 + 1 != Offset1)
414 return false;
415
416 // Each of these offsets is in element sized units, so we need to convert
417 // to bytes of the individual reads.
418
419 unsigned EltSize;
420 if (LdSt.mayLoad())
421 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
422 else {
423 assert(LdSt.mayStore());
424 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
425 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
426 }
427
428 if (isStride64(Opc))
429 EltSize *= 64;
430
431 BaseOps.push_back(BaseOp);
432 Offset = EltSize * Offset0;
433 // Get appropriate operand(s), and compute width accordingly.
434 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
435 if (DataOpIdx == -1) {
436 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
437 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
438 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
439 Width = LocationSize::precise(
440 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
441 } else {
442 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
443 }
444 }
445 return true;
446 }
447
448 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
449 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
450 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
451 return false;
452 BaseOps.push_back(RSrc);
453 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
454 if (BaseOp && !BaseOp->isFI())
455 BaseOps.push_back(BaseOp);
456 const MachineOperand *OffsetImm =
457 getNamedOperand(LdSt, AMDGPU::OpName::offset);
458 Offset = OffsetImm->getImm();
459 const MachineOperand *SOffset =
460 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
461 if (SOffset) {
462 if (SOffset->isReg())
463 BaseOps.push_back(SOffset);
464 else
465 Offset += SOffset->getImm();
466 }
467 // Get appropriate operand, and compute width accordingly.
468 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
469 if (DataOpIdx == -1)
470 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
471 if (DataOpIdx == -1) // LDS DMA
472 return false;
473 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
474 return true;
475 }
476
477 if (isImage(LdSt)) {
478 auto RsrcOpName =
479 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
480 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
481 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
482 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
483 if (VAddr0Idx >= 0) {
484 // GFX10 possible NSA encoding.
485 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
486 BaseOps.push_back(&LdSt.getOperand(I));
487 } else {
488 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
489 }
490 Offset = 0;
491 // Get appropriate operand, and compute width accordingly.
492 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
493 if (DataOpIdx == -1)
494 return false; // no return sampler
495 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
496 return true;
497 }
498
499 if (isSMRD(LdSt)) {
500 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
501 if (!BaseOp) // e.g. S_MEMTIME
502 return false;
503 BaseOps.push_back(BaseOp);
504 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
505 Offset = OffsetOp ? OffsetOp->getImm() : 0;
506 // Get appropriate operand, and compute width accordingly.
507 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
508 if (DataOpIdx == -1)
509 return false;
510 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
511 return true;
512 }
513
514 if (isFLAT(LdSt)) {
515 // Instructions have either vaddr or saddr or both or none.
516 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
517 if (BaseOp)
518 BaseOps.push_back(BaseOp);
519 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
520 if (BaseOp)
521 BaseOps.push_back(BaseOp);
522 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
523 // Get appropriate operand, and compute width accordingly.
524 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
525 if (DataOpIdx == -1)
526 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
527 if (DataOpIdx == -1) // LDS DMA
528 return false;
529 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
530 return true;
531 }
532
533 return false;
534}
535
536static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
538 const MachineInstr &MI2,
540 // Only examine the first "base" operand of each instruction, on the
541 // assumption that it represents the real base address of the memory access.
542 // Other operands are typically offsets or indices from this base address.
543 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
544 return true;
545
546 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
547 return false;
548
549 auto *MO1 = *MI1.memoperands_begin();
550 auto *MO2 = *MI2.memoperands_begin();
551 if (MO1->getAddrSpace() != MO2->getAddrSpace())
552 return false;
553
554 const auto *Base1 = MO1->getValue();
555 const auto *Base2 = MO2->getValue();
556 if (!Base1 || !Base2)
557 return false;
558 Base1 = getUnderlyingObject(Base1);
559 Base2 = getUnderlyingObject(Base2);
560
561 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
562 return false;
563
564 return Base1 == Base2;
565}
566
568 int64_t Offset1, bool OffsetIsScalable1,
570 int64_t Offset2, bool OffsetIsScalable2,
571 unsigned ClusterSize,
572 unsigned NumBytes) const {
573 // If the mem ops (to be clustered) do not have the same base ptr, then they
574 // should not be clustered
575 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
576 if (!BaseOps1.empty() && !BaseOps2.empty()) {
577 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
578 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
579 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
580 return false;
581
582 const SIMachineFunctionInfo *MFI =
583 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
584 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
585 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
586 // If only one base op is empty, they do not have the same base ptr
587 return false;
588 }
589
590 // In order to avoid register pressure, on an average, the number of DWORDS
591 // loaded together by all clustered mem ops should not exceed
592 // MaxMemoryClusterDWords. This is an empirical value based on certain
593 // observations and performance related experiments.
594 // The good thing about this heuristic is - it avoids clustering of too many
595 // sub-word loads, and also avoids clustering of wide loads. Below is the
596 // brief summary of how the heuristic behaves for various `LoadSize` when
597 // MaxMemoryClusterDWords is 8.
598 //
599 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
600 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
601 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
602 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
603 // (5) LoadSize >= 17: do not cluster
604 const unsigned LoadSize = NumBytes / ClusterSize;
605 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
606 return NumDWords <= MaxMemoryClusterDWords;
607}
608
609// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
610// the first 16 loads will be interleaved with the stores, and the next 16 will
611// be clustered as expected. It should really split into 2 16 store batches.
612//
613// Loads are clustered until this returns false, rather than trying to schedule
614// groups of stores. This also means we have to deal with saying different
615// address space loads should be clustered, and ones which might cause bank
616// conflicts.
617//
618// This might be deprecated so it might not be worth that much effort to fix.
620 int64_t Offset0, int64_t Offset1,
621 unsigned NumLoads) const {
622 assert(Offset1 > Offset0 &&
623 "Second offset should be larger than first offset!");
624 // If we have less than 16 loads in a row, and the offsets are within 64
625 // bytes, then schedule together.
626
627 // A cacheline is 64 bytes (for global memory).
628 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
629}
630
633 const DebugLoc &DL, MCRegister DestReg,
634 MCRegister SrcReg, bool KillSrc,
635 const char *Msg = "illegal VGPR to SGPR copy") {
636 MachineFunction *MF = MBB.getParent();
637
639 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
640
641 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
642 .addReg(SrcReg, getKillRegState(KillSrc));
643}
644
645/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
646/// possible to have a direct copy in these cases on GFX908, so an intermediate
647/// VGPR copy is required.
651 const DebugLoc &DL, MCRegister DestReg,
652 MCRegister SrcReg, bool KillSrc,
653 RegScavenger &RS, bool RegsOverlap,
654 Register ImpDefSuperReg = Register(),
655 Register ImpUseSuperReg = Register()) {
656 assert((TII.getSubtarget().hasMAIInsts() &&
657 !TII.getSubtarget().hasGFX90AInsts()) &&
658 "Expected GFX908 subtarget.");
659
660 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
661 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
662 "Source register of the copy should be either an SGPR or an AGPR.");
663
664 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
665 "Destination register of the copy should be an AGPR.");
666
667 const SIRegisterInfo &RI = TII.getRegisterInfo();
668
669 // First try to find defining accvgpr_write to avoid temporary registers.
670 // In the case of copies of overlapping AGPRs, we conservatively do not
671 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
672 // an accvgpr_write used for this same copy due to implicit-defs
673 if (!RegsOverlap) {
674 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
675 --Def;
676
677 if (!Def->modifiesRegister(SrcReg, &RI))
678 continue;
679
680 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
681 Def->getOperand(0).getReg() != SrcReg)
682 break;
683
684 MachineOperand &DefOp = Def->getOperand(1);
685 assert(DefOp.isReg() || DefOp.isImm());
686
687 if (DefOp.isReg()) {
688 bool SafeToPropagate = true;
689 // Check that register source operand is not clobbered before MI.
690 // Immediate operands are always safe to propagate.
691 for (auto I = Def; I != MI && SafeToPropagate; ++I)
692 if (I->modifiesRegister(DefOp.getReg(), &RI))
693 SafeToPropagate = false;
694
695 if (!SafeToPropagate)
696 break;
697
698 for (auto I = Def; I != MI; ++I)
699 I->clearRegisterKills(DefOp.getReg(), &RI);
700 }
701
702 MachineInstrBuilder Builder =
703 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
704 .add(DefOp);
705 if (ImpDefSuperReg)
706 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
707
708 if (ImpUseSuperReg) {
709 Builder.addReg(ImpUseSuperReg,
711 }
712
713 return;
714 }
715 }
716
717 RS.enterBasicBlockEnd(MBB);
718 RS.backward(std::next(MI));
719
720 // Ideally we want to have three registers for a long reg_sequence copy
721 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
722 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
723 *MBB.getParent());
724
725 // Registers in the sequence are allocated contiguously so we can just
726 // use register number to pick one of three round-robin temps.
727 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
728 Register Tmp =
729 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
730 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
731 "VGPR used for an intermediate copy should have been reserved.");
732
733 // Only loop through if there are any free registers left. We don't want to
734 // spill.
735 while (RegNo--) {
736 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
737 /* RestoreAfter */ false, 0,
738 /* AllowSpill */ false);
739 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
740 break;
741 Tmp = Tmp2;
742 RS.setRegUsed(Tmp);
743 }
744
745 // Insert copy to temporary VGPR.
746 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
747 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
748 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
749 } else {
750 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
751 }
752
753 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
754 .addReg(SrcReg, getKillRegState(KillSrc));
755 if (ImpUseSuperReg) {
756 UseBuilder.addReg(ImpUseSuperReg,
758 }
759
760 MachineInstrBuilder DefBuilder
761 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
762 .addReg(Tmp, RegState::Kill);
763
764 if (ImpDefSuperReg)
765 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
766}
767
770 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
771 const TargetRegisterClass *RC, bool Forward) {
772 const SIRegisterInfo &RI = TII.getRegisterInfo();
773 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
775 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
776
777 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
778 int16_t SubIdx = BaseIndices[Idx];
779 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
780 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
781 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
782 unsigned Opcode = AMDGPU::S_MOV_B32;
783
784 // Is SGPR aligned? If so try to combine with next.
785 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
786 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
787 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
788 // Can use SGPR64 copy
789 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
790 SubIdx = RI.getSubRegFromChannel(Channel, 2);
791 DestSubReg = RI.getSubReg(DestReg, SubIdx);
792 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
793 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
794 Opcode = AMDGPU::S_MOV_B64;
795 Idx++;
796 }
797
798 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
799 .addReg(SrcSubReg)
800 .addReg(SrcReg, RegState::Implicit);
801
802 if (!FirstMI)
803 FirstMI = LastMI;
804
805 if (!Forward)
806 I--;
807 }
808
809 assert(FirstMI && LastMI);
810 if (!Forward)
811 std::swap(FirstMI, LastMI);
812
813 FirstMI->addOperand(
814 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
815
816 if (KillSrc)
817 LastMI->addRegisterKilled(SrcReg, &RI);
818}
819
822 const DebugLoc &DL, Register DestReg,
823 Register SrcReg, bool KillSrc, bool RenamableDest,
824 bool RenamableSrc) const {
825 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
826 unsigned Size = RI.getRegSizeInBits(*RC);
827 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
828 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
829
830 // The rest of copyPhysReg assumes Src and Dst size are the same size.
831 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
832 // we remove Fix16BitCopies and this code block?
833 if (Fix16BitCopies) {
834 if (((Size == 16) != (SrcSize == 16))) {
835 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
836 assert(ST.useRealTrue16Insts());
837 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
838 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
839 RegToFix = SubReg;
840
841 if (DestReg == SrcReg) {
842 // Identity copy. Insert empty bundle since ExpandPostRA expects an
843 // instruction here.
844 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
845 return;
846 }
847 RC = RI.getPhysRegBaseClass(DestReg);
848 Size = RI.getRegSizeInBits(*RC);
849 SrcRC = RI.getPhysRegBaseClass(SrcReg);
850 SrcSize = RI.getRegSizeInBits(*SrcRC);
851 }
852 }
853
854 if (RC == &AMDGPU::VGPR_32RegClass) {
855 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
856 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
857 AMDGPU::AGPR_32RegClass.contains(SrcReg));
858 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
859 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
860 BuildMI(MBB, MI, DL, get(Opc), DestReg)
861 .addReg(SrcReg, getKillRegState(KillSrc));
862 return;
863 }
864
865 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
866 RC == &AMDGPU::SReg_32RegClass) {
867 if (SrcReg == AMDGPU::SCC) {
868 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
869 .addImm(1)
870 .addImm(0);
871 return;
872 }
873
874 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
875 if (DestReg == AMDGPU::VCC_LO) {
876 // FIXME: Hack until VReg_1 removed.
877 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
878 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
879 .addImm(0)
880 .addReg(SrcReg, getKillRegState(KillSrc));
881 return;
882 }
883
884 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
885 return;
886 }
887
888 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
889 .addReg(SrcReg, getKillRegState(KillSrc));
890 return;
891 }
892
893 if (RC == &AMDGPU::SReg_64RegClass) {
894 if (SrcReg == AMDGPU::SCC) {
895 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
896 .addImm(1)
897 .addImm(0);
898 return;
899 }
900
901 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
902 if (DestReg == AMDGPU::VCC) {
903 // FIXME: Hack until VReg_1 removed.
904 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
905 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
906 .addImm(0)
907 .addReg(SrcReg, getKillRegState(KillSrc));
908 return;
909 }
910
911 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
912 return;
913 }
914
915 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
916 .addReg(SrcReg, getKillRegState(KillSrc));
917 return;
918 }
919
920 if (DestReg == AMDGPU::SCC) {
921 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
922 // but SelectionDAG emits such copies for i1 sources.
923 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
924 // This copy can only be produced by patterns
925 // with explicit SCC, which are known to be enabled
926 // only for subtargets with S_CMP_LG_U64 present.
927 assert(ST.hasScalarCompareEq64());
928 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
929 .addReg(SrcReg, getKillRegState(KillSrc))
930 .addImm(0);
931 } else {
932 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
933 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
934 .addReg(SrcReg, getKillRegState(KillSrc))
935 .addImm(0);
936 }
937
938 return;
939 }
940
941 if (RC == &AMDGPU::AGPR_32RegClass) {
942 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
943 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
944 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
945 .addReg(SrcReg, getKillRegState(KillSrc));
946 return;
947 }
948
949 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
950 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
951 .addReg(SrcReg, getKillRegState(KillSrc));
952 return;
953 }
954
955 // FIXME: Pass should maintain scavenger to avoid scan through the block on
956 // every AGPR spill.
957 RegScavenger RS;
958 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
959 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
960 return;
961 }
962
963 if (Size == 16) {
964 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
965 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
966 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
967
968 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
969 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
970 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
971 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
972 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
973 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
974 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
975 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
976
977 if (IsSGPRDst) {
978 if (!IsSGPRSrc) {
979 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
980 return;
981 }
982
983 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
984 .addReg(NewSrcReg, getKillRegState(KillSrc));
985 return;
986 }
987
988 if (IsAGPRDst || IsAGPRSrc) {
989 if (!DstLow || !SrcLow) {
990 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
991 "Cannot use hi16 subreg with an AGPR!");
992 }
993
994 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
995 return;
996 }
997
998 if (ST.useRealTrue16Insts()) {
999 if (IsSGPRSrc) {
1000 assert(SrcLow);
1001 SrcReg = NewSrcReg;
1002 }
1003 // Use the smaller instruction encoding if possible.
1004 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1005 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1006 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1007 .addReg(SrcReg);
1008 } else {
1009 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1010 .addImm(0) // src0_modifiers
1011 .addReg(SrcReg)
1012 .addImm(0); // op_sel
1013 }
1014 return;
1015 }
1016
1017 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1018 if (!DstLow || !SrcLow) {
1019 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1020 "Cannot use hi16 subreg on VI!");
1021 }
1022
1023 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1024 .addReg(NewSrcReg, getKillRegState(KillSrc));
1025 return;
1026 }
1027
1028 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1029 .addImm(0) // src0_modifiers
1030 .addReg(NewSrcReg)
1031 .addImm(0) // clamp
1038 // First implicit operand is $exec.
1039 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1040 return;
1041 }
1042
1043 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1044 if (ST.hasMovB64()) {
1045 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1046 .addReg(SrcReg, getKillRegState(KillSrc));
1047 return;
1048 }
1049 if (ST.hasPkMovB32()) {
1050 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1052 .addReg(SrcReg)
1054 .addReg(SrcReg)
1055 .addImm(0) // op_sel_lo
1056 .addImm(0) // op_sel_hi
1057 .addImm(0) // neg_lo
1058 .addImm(0) // neg_hi
1059 .addImm(0) // clamp
1060 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1061 return;
1062 }
1063 }
1064
1065 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1066 if (RI.isSGPRClass(RC)) {
1067 if (!RI.isSGPRClass(SrcRC)) {
1068 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1069 return;
1070 }
1071 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1072 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1073 Forward);
1074 return;
1075 }
1076
1077 unsigned EltSize = 4;
1078 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1079 if (RI.isAGPRClass(RC)) {
1080 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1081 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1082 else if (RI.hasVGPRs(SrcRC) ||
1083 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1084 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1085 else
1086 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1087 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1088 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1089 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1090 (RI.isProperlyAlignedRC(*RC) &&
1091 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1092 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1093 if (ST.hasMovB64()) {
1094 Opcode = AMDGPU::V_MOV_B64_e32;
1095 EltSize = 8;
1096 } else if (ST.hasPkMovB32()) {
1097 Opcode = AMDGPU::V_PK_MOV_B32;
1098 EltSize = 8;
1099 }
1100 }
1101
1102 // For the cases where we need an intermediate instruction/temporary register
1103 // (destination is an AGPR), we need a scavenger.
1104 //
1105 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1106 // whole block for every handled copy.
1107 std::unique_ptr<RegScavenger> RS;
1108 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1109 RS = std::make_unique<RegScavenger>();
1110
1111 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1112
1113 // If there is an overlap, we can't kill the super-register on the last
1114 // instruction, since it will also kill the components made live by this def.
1115 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1116 const bool CanKillSuperReg = KillSrc && !Overlap;
1117
1118 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1119 unsigned SubIdx;
1120 if (Forward)
1121 SubIdx = SubIndices[Idx];
1122 else
1123 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1124 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1125 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1126 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1127
1128 bool IsFirstSubreg = Idx == 0;
1129 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1130
1131 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1132 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1133 Register ImpUseSuper = SrcReg;
1134 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1135 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1136 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1138 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1140 .addReg(SrcSubReg)
1142 .addReg(SrcSubReg)
1143 .addImm(0) // op_sel_lo
1144 .addImm(0) // op_sel_hi
1145 .addImm(0) // neg_lo
1146 .addImm(0) // neg_hi
1147 .addImm(0) // clamp
1148 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1149 if (IsFirstSubreg)
1151 } else {
1152 MachineInstrBuilder Builder =
1153 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1154 if (IsFirstSubreg)
1155 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1156
1157 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1158 }
1159 }
1160}
1161
1162int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1163 int32_t NewOpc;
1164
1165 // Try to map original to commuted opcode
1166 NewOpc = AMDGPU::getCommuteRev(Opcode);
1167 if (NewOpc != -1)
1168 // Check if the commuted (REV) opcode exists on the target.
1169 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1170
1171 // Try to map commuted to original opcode
1172 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1173 if (NewOpc != -1)
1174 // Check if the original (non-REV) opcode exists on the target.
1175 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1176
1177 return Opcode;
1178}
1179
const TargetRegisterClass *
  // Select results are always produced in a 32-bit VGPR.
  return &AMDGPU::VGPR_32RegClass;
}
1184
                                const DebugLoc &DL, Register DstReg,
                                Register TrueReg,
                                Register FalseReg) const {
  // Lower a select on a branch condition (as produced by analyzeBranch) into
  // a V_CNDMASK_B32 keyed off a lane-mask register in a wave-mask class.
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
         "Not a VGPR32 reg");

  if (Cond.size() == 1) {
    // The condition already is a lane-mask value: copy it into a fresh
    // wave-mask vreg and select with it directly.
    Register SReg = MRI.createVirtualRegister(BoolXExecRC);
    BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(Cond[0]);
    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
  } else if (Cond.size() == 2) {
    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
    switch (Cond[0].getImm()) {
    case SIInstrInfo::SCC_TRUE: {
      // Broadcast SCC into a full lane mask (all-ones when SCC is set).
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::SCC_FALSE: {
      // As SCC_TRUE, but with the cselect inputs swapped so the mask is set
      // when SCC is clear.
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCNZ: {
      // Cond[1] holds the VCC-like condition register; select on a copy of it.
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
          .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCZ: {
      // Inverted VCC test: swap TrueReg/FalseReg instead of inverting the
      // mask.
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
          .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(TrueReg)
          .addImm(0)
          .addReg(FalseReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECNZ: {
      // OrSaveExec with 0 is used here for its SCC side effect (EXEC != 0);
      // the saved mask in SReg2 is otherwise unused. SCC is then broadcast
      // into the select mask.
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECZ: {
      // NOTE(review): this case builds instructions but then falls into
      // llvm_unreachable below, so EXECZ is effectively unsupported here.
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      llvm_unreachable("Unhandled branch predicate EXECZ");
      break;
    }
    default:
      llvm_unreachable("invalid branch predicate");
    }
  } else {
    llvm_unreachable("Can only handle Cond size 1 or 2");
  }
}
1294
                           const DebugLoc &DL,
                           Register SrcReg, int Value) const {
  // Emit V_CMP_EQ_I32 comparing SrcReg against the immediate Value and
  // return the fresh lane-mask vreg holding the per-lane result.
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
      .addImm(Value)
      .addReg(SrcReg);

  return Reg;
}
1307
                           const DebugLoc &DL,
                           Register SrcReg, int Value) const {
  // Emit V_CMP_NE_I32 comparing SrcReg against the immediate Value and
  // return the fresh lane-mask vreg holding the per-lane result.
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
      .addImm(Value)
      .addReg(SrcReg);

  return Reg;
}
1320
                                       const Register Reg,
                                       int64_t &ImmVal) const {
  // Recognize instructions that materialize a compile-time-known constant.
  // On a match, store the constant in ImmVal and return true iff the
  // instruction's def operand is exactly Reg.
  switch (MI.getOpcode()) {
  // Plain moves: the value is src0's immediate (operand 1).
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOVK_I32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
  case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
  case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B16_t16_e32: {
    const MachineOperand &Src0 = MI.getOperand(1);
    if (Src0.isImm()) {
      ImmVal = Src0.getImm();
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  // VOP3 16-bit move: src0 is operand 2; operand 1 (src0_modifiers) must be
  // zero for the immediate to be the unmodified value.
  case AMDGPU::V_MOV_B16_t16_e64: {
    const MachineOperand &Src0 = MI.getOperand(2);
    if (Src0.isImm() && !MI.getOperand(1).getImm()) {
      ImmVal = Src0.getImm();
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  // Bit-reverse of an immediate source is still a known constant.
  case AMDGPU::S_BREV_B32:
  case AMDGPU::V_BFREV_B32_e32:
  case AMDGPU::V_BFREV_B32_e64: {
    const MachineOperand &Src0 = MI.getOperand(1);
    if (Src0.isImm()) {
      ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  // Bitwise-not of an immediate source, computed in 32 bits then widened.
  case AMDGPU::S_NOT_B32:
  case AMDGPU::V_NOT_B32_e32:
  case AMDGPU::V_NOT_B32_e64: {
    const MachineOperand &Src0 = MI.getOperand(1);
    if (Src0.isImm()) {
      ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  default:
    return false;
  }
}
1379
std::optional<int64_t>
  // An immediate operand resolves to its own value.
  if (Op.isImm())
    return Op.getImm();

  // Otherwise only a virtual register whose single def is a move-immediate
  // can be resolved to a constant here.
  if (!Op.isReg() || !Op.getReg().isVirtual())
    return std::nullopt;
  MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
  const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    const MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      // Narrow the materialized value to the subregister the use reads.
      return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
  }

  return std::nullopt;
}
1397

  // AGPR destinations are left as a generic COPY to be expanded later.
  if (RI.isAGPRClass(DstRC))
    return AMDGPU::COPY;
  if (RI.getRegSizeInBits(*DstRC) == 16) {
    // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
    // before RA.
    return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
  }
  // 32- and 64-bit cases pick the scalar or vector mov by register class.
  if (RI.getRegSizeInBits(*DstRC) == 32)
    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
    return AMDGPU::S_MOV_B64;
  if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
    return AMDGPU::V_MOV_B64_PSEUDO;
  // Any other size falls back to a generic COPY.
  return AMDGPU::COPY;
}
1415
// Return the GPR-index-mode indirect access pseudo whose VGPR tuple is wide
// enough to cover VecSize bits: a READ pseudo when IsIndirectSrc, otherwise
// a WRITE pseudo. Note there are no V13/V14/V15 tuples, hence the jump from
// 384 directly to 512 bits.
const MCInstrDesc &
                                     bool IsIndirectSrc) const {
  if (IsIndirectSrc) {
    if (VecSize <= 32) // 4 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
    if (VecSize <= 64) // 8 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
    if (VecSize <= 96) // 12 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
    if (VecSize <= 128) // 16 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
    if (VecSize <= 160) // 20 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
    if (VecSize <= 192) // 24 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
    if (VecSize <= 224) // 28 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
    if (VecSize <= 256) // 32 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
    if (VecSize <= 288) // 36 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
    if (VecSize <= 320) // 40 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
    if (VecSize <= 352) // 44 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
    if (VecSize <= 384) // 48 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
    if (VecSize <= 512) // 64 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
    if (VecSize <= 1024) // 128 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);

    llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
  }

  if (VecSize <= 32) // 4 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
  if (VecSize <= 64) // 8 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
  if (VecSize <= 96) // 12 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
  if (VecSize <= 128) // 16 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
  if (VecSize <= 160) // 20 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
  if (VecSize <= 192) // 24 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
  if (VecSize <= 224) // 28 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
  if (VecSize <= 256) // 32 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
  if (VecSize <= 288) // 36 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
  if (VecSize <= 320) // 40 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
  if (VecSize <= 352) // 44 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
  if (VecSize <= 384) // 48 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
  if (VecSize <= 512) // 64 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
  if (VecSize <= 1024) // 128 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);

  llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
}
1483
1484static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1485 if (VecSize <= 32) // 4 bytes
1486 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1487 if (VecSize <= 64) // 8 bytes
1488 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1489 if (VecSize <= 96) // 12 bytes
1490 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1491 if (VecSize <= 128) // 16 bytes
1492 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1493 if (VecSize <= 160) // 20 bytes
1494 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1495 if (VecSize <= 192) // 24 bytes
1496 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1497 if (VecSize <= 224) // 28 bytes
1498 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1499 if (VecSize <= 256) // 32 bytes
1500 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1501 if (VecSize <= 288) // 36 bytes
1502 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1503 if (VecSize <= 320) // 40 bytes
1504 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1505 if (VecSize <= 352) // 44 bytes
1506 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1507 if (VecSize <= 384) // 48 bytes
1508 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1509 if (VecSize <= 512) // 64 bytes
1510 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1511 if (VecSize <= 1024) // 128 bytes
1512 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1513
1514 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1515}
1516
1517static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1518 if (VecSize <= 32) // 4 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1520 if (VecSize <= 64) // 8 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1522 if (VecSize <= 96) // 12 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1524 if (VecSize <= 128) // 16 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1526 if (VecSize <= 160) // 20 bytes
1527 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1528 if (VecSize <= 192) // 24 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1530 if (VecSize <= 224) // 28 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1532 if (VecSize <= 256) // 32 bytes
1533 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1534 if (VecSize <= 288) // 36 bytes
1535 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1536 if (VecSize <= 320) // 40 bytes
1537 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1538 if (VecSize <= 352) // 44 bytes
1539 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1540 if (VecSize <= 384) // 48 bytes
1541 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1542 if (VecSize <= 512) // 64 bytes
1543 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1544 if (VecSize <= 1024) // 128 bytes
1545 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1546
1547 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1548}
1549
1550static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1551 if (VecSize <= 64) // 8 bytes
1552 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1553 if (VecSize <= 128) // 16 bytes
1554 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1555 if (VecSize <= 256) // 32 bytes
1556 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1557 if (VecSize <= 512) // 64 bytes
1558 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1559 if (VecSize <= 1024) // 128 bytes
1560 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1561
1562 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1563}
1564
const MCInstrDesc &
SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
                                             bool IsSGPR) const {
  // Return the movrel-based indirect-write pseudo for a VecSize-bit vector
  // with EltSize-bit elements. SGPR writes come in 32- and 64-bit element
  // flavors; VGPR writes only support 32-bit elements.
  if (IsSGPR) {
    switch (EltSize) {
    case 32:
      return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
    case 64:
      return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
    default:
      llvm_unreachable("invalid reg indexing elt size");
    }
  }

  assert(EltSize == 32 && "invalid reg indexing elt size");
}
1582
1583static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1584 switch (Size) {
1585 case 4:
1586 return AMDGPU::SI_SPILL_S32_SAVE;
1587 case 8:
1588 return AMDGPU::SI_SPILL_S64_SAVE;
1589 case 12:
1590 return AMDGPU::SI_SPILL_S96_SAVE;
1591 case 16:
1592 return AMDGPU::SI_SPILL_S128_SAVE;
1593 case 20:
1594 return AMDGPU::SI_SPILL_S160_SAVE;
1595 case 24:
1596 return AMDGPU::SI_SPILL_S192_SAVE;
1597 case 28:
1598 return AMDGPU::SI_SPILL_S224_SAVE;
1599 case 32:
1600 return AMDGPU::SI_SPILL_S256_SAVE;
1601 case 36:
1602 return AMDGPU::SI_SPILL_S288_SAVE;
1603 case 40:
1604 return AMDGPU::SI_SPILL_S320_SAVE;
1605 case 44:
1606 return AMDGPU::SI_SPILL_S352_SAVE;
1607 case 48:
1608 return AMDGPU::SI_SPILL_S384_SAVE;
1609 case 64:
1610 return AMDGPU::SI_SPILL_S512_SAVE;
1611 case 128:
1612 return AMDGPU::SI_SPILL_S1024_SAVE;
1613 default:
1614 llvm_unreachable("unknown register size");
1615 }
1616}
1617
1618static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1619 switch (Size) {
1620 case 2:
1621 return AMDGPU::SI_SPILL_V16_SAVE;
1622 case 4:
1623 return AMDGPU::SI_SPILL_V32_SAVE;
1624 case 8:
1625 return AMDGPU::SI_SPILL_V64_SAVE;
1626 case 12:
1627 return AMDGPU::SI_SPILL_V96_SAVE;
1628 case 16:
1629 return AMDGPU::SI_SPILL_V128_SAVE;
1630 case 20:
1631 return AMDGPU::SI_SPILL_V160_SAVE;
1632 case 24:
1633 return AMDGPU::SI_SPILL_V192_SAVE;
1634 case 28:
1635 return AMDGPU::SI_SPILL_V224_SAVE;
1636 case 32:
1637 return AMDGPU::SI_SPILL_V256_SAVE;
1638 case 36:
1639 return AMDGPU::SI_SPILL_V288_SAVE;
1640 case 40:
1641 return AMDGPU::SI_SPILL_V320_SAVE;
1642 case 44:
1643 return AMDGPU::SI_SPILL_V352_SAVE;
1644 case 48:
1645 return AMDGPU::SI_SPILL_V384_SAVE;
1646 case 64:
1647 return AMDGPU::SI_SPILL_V512_SAVE;
1648 case 128:
1649 return AMDGPU::SI_SPILL_V1024_SAVE;
1650 default:
1651 llvm_unreachable("unknown register size");
1652 }
1653}
1654
1655static unsigned getAVSpillSaveOpcode(unsigned Size) {
1656 switch (Size) {
1657 case 4:
1658 return AMDGPU::SI_SPILL_AV32_SAVE;
1659 case 8:
1660 return AMDGPU::SI_SPILL_AV64_SAVE;
1661 case 12:
1662 return AMDGPU::SI_SPILL_AV96_SAVE;
1663 case 16:
1664 return AMDGPU::SI_SPILL_AV128_SAVE;
1665 case 20:
1666 return AMDGPU::SI_SPILL_AV160_SAVE;
1667 case 24:
1668 return AMDGPU::SI_SPILL_AV192_SAVE;
1669 case 28:
1670 return AMDGPU::SI_SPILL_AV224_SAVE;
1671 case 32:
1672 return AMDGPU::SI_SPILL_AV256_SAVE;
1673 case 36:
1674 return AMDGPU::SI_SPILL_AV288_SAVE;
1675 case 40:
1676 return AMDGPU::SI_SPILL_AV320_SAVE;
1677 case 44:
1678 return AMDGPU::SI_SPILL_AV352_SAVE;
1679 case 48:
1680 return AMDGPU::SI_SPILL_AV384_SAVE;
1681 case 64:
1682 return AMDGPU::SI_SPILL_AV512_SAVE;
1683 case 128:
1684 return AMDGPU::SI_SPILL_AV1024_SAVE;
1685 default:
1686 llvm_unreachable("unknown register size");
1687 }
1688}
1689
1690static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1691 bool IsVectorSuperClass) {
1692 // Currently, there is only 32-bit WWM register spills needed.
1693 if (Size != 4)
1694 llvm_unreachable("unknown wwm register spill size");
1695
1696 if (IsVectorSuperClass)
1697 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1698
1699 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1700}
1701
    Register Reg, const TargetRegisterClass *RC, unsigned Size,
    const SIMachineFunctionInfo &MFI) const {
  // Pick the spill-save pseudo for a vector (VGPR/AGPR/AV) register of the
  // given size in bytes.
  bool IsVectorSuperClass = RI.isVectorSuperClass(RC);

  // Choose the right opcode if spilling a WWM register.
    return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);

  // TODO: Check if AGPRs are available
  if (ST.hasMAIInsts())
    return getAVSpillSaveOpcode(Size);

}
1717
    bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
    MachineInstr::MIFlag Flags) const {
  // Spill SrcReg to the stack slot at FrameIndex. SGPRs use dedicated
  // spill pseudos (possibly lowered to VGPR lanes); vector registers use
  // the scratch save pseudos.
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));
  unsigned SpillSize = RI.getSpillSize(*RC);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    // M0 and EXEC have dedicated roles and must never be spill sources.
    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
    assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
           SrcReg != AMDGPU::EXEC && "exec should not be spilled");

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use pseudo instruction for spilling SGPRs.
    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));

    // The SGPR spill/restore instructions only work on number sgprs, so we need
    // to make sure we are using the correct register class.
    if (SrcReg.isVirtual() && SpillSize == 4) {
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
    }

    BuildMI(MBB, MI, DL, OpDesc)
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)

    // SGPR spills lowered to VGPR lanes live in a dedicated stack ID.
    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    return;
  }

  unsigned Opcode =
      getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
  MFI->setHasSpilledVGPRs();

  BuildMI(MBB, MI, DL, get(Opcode))
    .addReg(SrcReg, getKillRegState(isKill)) // data
    .addFrameIndex(FrameIndex)               // addr
    .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
    .addImm(0)                               // offset
    .addMemOperand(MMO);
}
1773
1774static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1775 switch (Size) {
1776 case 4:
1777 return AMDGPU::SI_SPILL_S32_RESTORE;
1778 case 8:
1779 return AMDGPU::SI_SPILL_S64_RESTORE;
1780 case 12:
1781 return AMDGPU::SI_SPILL_S96_RESTORE;
1782 case 16:
1783 return AMDGPU::SI_SPILL_S128_RESTORE;
1784 case 20:
1785 return AMDGPU::SI_SPILL_S160_RESTORE;
1786 case 24:
1787 return AMDGPU::SI_SPILL_S192_RESTORE;
1788 case 28:
1789 return AMDGPU::SI_SPILL_S224_RESTORE;
1790 case 32:
1791 return AMDGPU::SI_SPILL_S256_RESTORE;
1792 case 36:
1793 return AMDGPU::SI_SPILL_S288_RESTORE;
1794 case 40:
1795 return AMDGPU::SI_SPILL_S320_RESTORE;
1796 case 44:
1797 return AMDGPU::SI_SPILL_S352_RESTORE;
1798 case 48:
1799 return AMDGPU::SI_SPILL_S384_RESTORE;
1800 case 64:
1801 return AMDGPU::SI_SPILL_S512_RESTORE;
1802 case 128:
1803 return AMDGPU::SI_SPILL_S1024_RESTORE;
1804 default:
1805 llvm_unreachable("unknown register size");
1806 }
1807}
1808
1809static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1810 switch (Size) {
1811 case 2:
1812 return AMDGPU::SI_SPILL_V16_RESTORE;
1813 case 4:
1814 return AMDGPU::SI_SPILL_V32_RESTORE;
1815 case 8:
1816 return AMDGPU::SI_SPILL_V64_RESTORE;
1817 case 12:
1818 return AMDGPU::SI_SPILL_V96_RESTORE;
1819 case 16:
1820 return AMDGPU::SI_SPILL_V128_RESTORE;
1821 case 20:
1822 return AMDGPU::SI_SPILL_V160_RESTORE;
1823 case 24:
1824 return AMDGPU::SI_SPILL_V192_RESTORE;
1825 case 28:
1826 return AMDGPU::SI_SPILL_V224_RESTORE;
1827 case 32:
1828 return AMDGPU::SI_SPILL_V256_RESTORE;
1829 case 36:
1830 return AMDGPU::SI_SPILL_V288_RESTORE;
1831 case 40:
1832 return AMDGPU::SI_SPILL_V320_RESTORE;
1833 case 44:
1834 return AMDGPU::SI_SPILL_V352_RESTORE;
1835 case 48:
1836 return AMDGPU::SI_SPILL_V384_RESTORE;
1837 case 64:
1838 return AMDGPU::SI_SPILL_V512_RESTORE;
1839 case 128:
1840 return AMDGPU::SI_SPILL_V1024_RESTORE;
1841 default:
1842 llvm_unreachable("unknown register size");
1843 }
1844}
1845
1846static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1847 switch (Size) {
1848 case 4:
1849 return AMDGPU::SI_SPILL_AV32_RESTORE;
1850 case 8:
1851 return AMDGPU::SI_SPILL_AV64_RESTORE;
1852 case 12:
1853 return AMDGPU::SI_SPILL_AV96_RESTORE;
1854 case 16:
1855 return AMDGPU::SI_SPILL_AV128_RESTORE;
1856 case 20:
1857 return AMDGPU::SI_SPILL_AV160_RESTORE;
1858 case 24:
1859 return AMDGPU::SI_SPILL_AV192_RESTORE;
1860 case 28:
1861 return AMDGPU::SI_SPILL_AV224_RESTORE;
1862 case 32:
1863 return AMDGPU::SI_SPILL_AV256_RESTORE;
1864 case 36:
1865 return AMDGPU::SI_SPILL_AV288_RESTORE;
1866 case 40:
1867 return AMDGPU::SI_SPILL_AV320_RESTORE;
1868 case 44:
1869 return AMDGPU::SI_SPILL_AV352_RESTORE;
1870 case 48:
1871 return AMDGPU::SI_SPILL_AV384_RESTORE;
1872 case 64:
1873 return AMDGPU::SI_SPILL_AV512_RESTORE;
1874 case 128:
1875 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1876 default:
1877 llvm_unreachable("unknown register size");
1878 }
1879}
1880
1881static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1882 bool IsVectorSuperClass) {
1883 // Currently, there is only 32-bit WWM register spills needed.
1884 if (Size != 4)
1885 llvm_unreachable("unknown wwm register spill size");
1886
1887 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1888 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1889
1890 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1891}
1892
    Register Reg, const TargetRegisterClass *RC, unsigned Size,
    const SIMachineFunctionInfo &MFI) const {
  // Pick the spill-restore pseudo for a vector (VGPR/AGPR/AV) register of
  // the given size in bytes.
  bool IsVectorSuperClass = RI.isVectorSuperClass(RC);

  // Choose the right opcode if restoring a WWM register.
    return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);

  // TODO: Check if AGPRs are available
  if (ST.hasMAIInsts())

  // Without MAI instructions, AGPR classes cannot be restored here.
  assert(!RI.isAGPRClass(RC));
}
1909
                                       Register DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       Register VReg, unsigned SubReg,
                                       MachineInstr::MIFlag Flags) const {
  // Reload DestReg from the stack slot at FrameIndex, mirroring the
  // structure of storeRegToStackSlot.
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned SpillSize = RI.getSpillSize(*RC);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    // M0 and EXEC have dedicated roles and must never be reload targets.
    assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
    assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
           DestReg != AMDGPU::EXEC && "exec should not be spilled");

    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
    // 32-bit SGPR reloads must target a class that excludes M0/EXEC.
    if (DestReg.isVirtual() && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
    }

    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    BuildMI(MBB, MI, DL, OpDesc, DestReg)
        .addFrameIndex(FrameIndex) // addr
        .addMemOperand(MMO)

    return;
  }

  unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
                                                   SpillSize, *MFI);
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex)           // vaddr
      .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
      .addImm(0)                           // offset
      .addMemOperand(MMO);
}
1961
1966
// Inserts `Quantity` wait states before MI using as few S_NOP instructions
// as possible. S_NOP's immediate encodes (count - 1), and the subtarget
// caps the per-instruction count at 2^getSNopBits() wait states.
// NOTE(review): the signature lines (original 1967-1968, presumably
// SIInstrInfo::insertNoops(MachineBasicBlock &MBB, ...iterator MI, ...))
// are missing from this listing.
1969 unsigned Quantity) const {
1970 DebugLoc DL = MBB.findDebugLoc(MI);
1971 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1972 while (Quantity > 0) {
// Emit one S_NOP covering up to MaxSNopCount wait states.
1973 unsigned Arg = std::min(Quantity, MaxSNopCount);
1974 Quantity -= Arg;
1975 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1976 }
1977 }
1978
// Appends a return terminator to an entry-function block that has no
// successors and no terminator: S_ENDPGM 0 for void returns, otherwise
// SI_RETURN_TO_EPILOG (graphics shaders returning values in registers).
// NOTE(review): the function signature (original line 1979) is missing
// from this listing — presumably SIInstrInfo::insertReturn(MachineBasicBlock &).
1980 auto *MF = MBB.getParent();
1981 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1982
1983 assert(Info->isEntryFunction());
1984
1985 if (MBB.succ_empty()) {
1986 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1987 if (HasNoTerminator) {
1988 if (Info->returnsVoid()) {
1989 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1990 } else {
1991 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1992 }
1993 }
1994 }
1995 }
1996
// Expands a simulated trap: branches into a trap block that issues s_trap,
// retrieves the queue doorbell ID via s_sendmsg_rtn, sets the
// queue-wave-abort bit, signals the host through m0 + s_sendmsg, and then
// parks the wave in a self-looping s_sethalt block.
// NOTE(review): this listing is missing original lines 1997-1999 (the
// insertSimulatedTrap signature — MRI is presumably a parameter) and the
// immediate-operand lines 2022 and 2038 that follow the two s_sendmsg
// builder calls below. Verify the message IDs against the original file.
2000 const DebugLoc &DL) const {
2001 MachineFunction *MF = MBB.getParent();
2002 constexpr unsigned DoorbellIDMask = 0x3ff;
2003 constexpr unsigned ECQueueWaveAbort = 0x400;
2004
2005 MachineBasicBlock *TrapBB = &MBB;
2006 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2007
// If MI is not already the last instruction of a successor-less block,
// split the block and branch to a fresh trap block when any lane is live.
2008 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2009 MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2010 TrapBB = MF->CreateMachineBasicBlock();
2011 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2012 MF->push_back(TrapBB);
2013 MBB.addSuccessor(TrapBB);
2014 }
2015 // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
2016 // will be a nop.
2017 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2018 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2019 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2020 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2021 DoorbellReg)
// TTMP2 preserves the caller's m0 across the sendmsg sequence.
2023 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2024 .addUse(AMDGPU::M0);
2025 Register DoorbellRegMasked =
2026 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2027 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2028 .addUse(DoorbellReg)
2029 .addImm(DoorbellIDMask);
2030 Register SetWaveAbortBit =
2031 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2032 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2033 .addUse(DoorbellRegMasked)
2034 .addImm(ECQueueWaveAbort);
2035 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2036 .addUse(SetWaveAbortBit);
2037 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
// Restore the saved m0 after signaling.
2039 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2040 .addUse(AMDGPU::TTMP2);
2041 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2042 TrapBB->addSuccessor(HaltLoopBB);
2043
// Halt-and-spin block: s_sethalt 5 halts the wave; the self-branch keeps
// the CFG well-formed if the halt is resumed.
2044 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2045 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2046 .addMBB(HaltLoopBB);
2047 MF->push_back(HaltLoopBB);
2048 HaltLoopBB->addSuccessor(HaltLoopBB);
2049
2050 return MBB.getNextNode();
2051 }
2052
// Returns the number of wait states an instruction provides: 0 for meta
// instructions (no machine code), the encoded count for S_NOP (immediate
// is count - 1), and 1 for everything else.
// NOTE(review): the signature line (original 2053) is missing from this
// listing.
2054 switch (MI.getOpcode()) {
2055 default:
2056 if (MI.isMetaInstruction())
2057 return 0;
2058 return 1; // FIXME: Do wait states equal cycles?
2059
2060 case AMDGPU::S_NOP:
2061 return MI.getOperand(0).getImm() + 1;
2062 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2063 // hazard, even if one exists, won't really be visible. Should we handle it?
2064 }
2065 }
2066
// Expands target pseudo-instructions after register allocation: *_term
// terminator aliases are rewritten to their real opcodes in place, and
// multi-instruction pseudos (64-bit moves, indirect reads/writes,
// PC-relative address materialization, WWM/WQM enter/exit, returns, traps)
// are lowered to real instruction sequences.
// NOTE(review): this listing is missing original line 2067 (the
// expandPostRAPseudo signature) and line 2070 (presumably the LaneMaskConstants
// LMC initialization used by the WWM/WQM cases), plus various builder-argument
// lines flagged below. Verify against the checked-in SIInstrInfo.cpp.
2068 MachineBasicBlock &MBB = *MI.getParent();
2069 DebugLoc DL = MBB.findDebugLoc(MI);
2071 switch (MI.getOpcode()) {
2072 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2073 case AMDGPU::S_MOV_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(AMDGPU::S_MOV_B64));
2077 break;
2078
2079 case AMDGPU::S_MOV_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_MOV_B32));
2083 break;
2084
2085 case AMDGPU::S_XOR_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(AMDGPU::S_XOR_B64));
2089 break;
2090
2091 case AMDGPU::S_XOR_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_XOR_B32));
2095 break;
2096 case AMDGPU::S_OR_B64_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(AMDGPU::S_OR_B64));
2100 break;
2101 case AMDGPU::S_OR_B32_term:
2102 // This is only a terminator to get the correct spill code placement during
2103 // register allocation.
2104 MI.setDesc(get(AMDGPU::S_OR_B32));
2105 break;
2106
2107 case AMDGPU::S_ANDN2_B64_term:
2108 // This is only a terminator to get the correct spill code placement during
2109 // register allocation.
2110 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2111 break;
2112
2113 case AMDGPU::S_ANDN2_B32_term:
2114 // This is only a terminator to get the correct spill code placement during
2115 // register allocation.
2116 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2117 break;
2118
2119 case AMDGPU::S_AND_B64_term:
2120 // This is only a terminator to get the correct spill code placement during
2121 // register allocation.
2122 MI.setDesc(get(AMDGPU::S_AND_B64));
2123 break;
2124
2125 case AMDGPU::S_AND_B32_term:
2126 // This is only a terminator to get the correct spill code placement during
2127 // register allocation.
2128 MI.setDesc(get(AMDGPU::S_AND_B32));
2129 break;
2130
2131 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2132 // This is only a terminator to get the correct spill code placement during
2133 // register allocation.
2134 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2135 break;
2136
2137 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2138 // This is only a terminator to get the correct spill code placement during
2139 // register allocation.
2140 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2141 break;
2142
2143 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2144 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2145 break;
2146
2147 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2148 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2149 break;
2150 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
// Pick the AGPR or VGPR 32-bit move depending on the allocated dest.
2151 Register Dst = MI.getOperand(0).getReg();
2152 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2153 MI.setDesc(
2154 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2155 break;
2156 }
2157 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2158 Register Dst = MI.getOperand(0).getReg();
// AGPR destination: split into two 32-bit accvgpr writes (lo then hi).
// NOTE(review): listing is missing the immediate/operand lines 2165-2166
// and 2169 following these two builder calls.
2159 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2160 int64_t Imm = MI.getOperand(1).getImm();
2161
2162 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2163 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2164 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2167 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2168 .addImm(SignExtend64<32>(Imm >> 32))
2170 MI.eraseFromParent();
2171 break;
2172 }
2173
// VGPR destination: identical to the plain 64-bit VGPR move pseudo.
2174 [[fallthrough]];
2175 }
2176 case AMDGPU::V_MOV_B64_PSEUDO: {
2177 Register Dst = MI.getOperand(0).getReg();
2178 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2179 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2180
2181 const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
2182 const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
2183
2184 const MachineOperand &SrcOp = MI.getOperand(1);
2185 // FIXME: Will this work for 64-bit floating point immediates?
2186 assert(!SrcOp.isFPImm());
// Prefer a single v_mov_b64 when the subtarget has it and the operand is
// encodable (register, inline constant, 32-bit literal, or 64-bit-literal
// capable hardware).
2187 if (ST.hasMovB64() && Mov64RC->contains(Dst)) {
2188 MI.setDesc(Mov64Desc);
2189 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2190 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2191 break;
2192 }
// NOTE(review): listing is missing src-modifier/immediate lines 2203,
// 2205, 2215, 2218, 2227, 2237 and 2240 inside the builder chains below.
2193 if (SrcOp.isImm()) {
2194 APInt Imm(64, SrcOp.getImm());
2195 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2196 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2197 const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
2198 const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
2199
// Equal inline-constant halves can be materialized with one v_pk_mov_b32.
2200 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
2201 PkMovRC->contains(Dst)) {
2202 BuildMI(MBB, MI, DL, PkMovDesc, Dst)
2204 .addImm(Lo.getSExtValue())
2206 .addImm(Lo.getSExtValue())
2207 .addImm(0) // op_sel_lo
2208 .addImm(0) // op_sel_hi
2209 .addImm(0) // neg_lo
2210 .addImm(0) // neg_hi
2211 .addImm(0); // clamp
2212 } else {
2213 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2214 .addImm(Lo.getSExtValue())
2216 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2217 .addImm(Hi.getSExtValue())
2219 }
2220 } else {
2221 assert(SrcOp.isReg());
2222 if (ST.hasPkMovB32() &&
2223 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2224 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2225 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2226 .addReg(SrcOp.getReg())
2228 .addReg(SrcOp.getReg())
2229 .addImm(0) // op_sel_lo
2230 .addImm(0) // op_sel_hi
2231 .addImm(0) // neg_lo
2232 .addImm(0) // neg_hi
2233 .addImm(0); // clamp
2234 } else {
2235 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2236 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2238 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2239 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2241 }
2242 }
2243 MI.eraseFromParent();
2244 break;
2245 }
2246 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
// NOTE(review): listing is missing line 2247 — presumably the
// expandMovDPP64(MI) call this case delegates to.
2248 break;
2249 }
2250 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2251 const MachineOperand &SrcOp = MI.getOperand(1);
2252 assert(!SrcOp.isFPImm());
2253
// 64-bit-literal hardware encodes any immediate in one s_mov_b64.
2254 if (ST.has64BitLiterals()) {
2255 MI.setDesc(get(AMDGPU::S_MOV_B64));
2256 break;
2257 }
2258
// Otherwise only 32-bit-representable or inline constants fit; anything
// else is split into two s_mov_b32 halves.
// NOTE(review): listing is missing builder lines 2273 and 2276 below.
2259 APInt Imm(64, SrcOp.getImm());
2260 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2261 MI.setDesc(get(AMDGPU::S_MOV_B64));
2262 break;
2263 }
2264
2265 Register Dst = MI.getOperand(0).getReg();
2266 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2267 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2268
2269 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2270 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2271 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2272 .addImm(Lo.getSExtValue())
2274 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2275 .addImm(Hi.getSExtValue())
2277 MI.eraseFromParent();
2278 break;
2279 }
2280 case AMDGPU::V_SET_INACTIVE_B32: {
2281 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2282 Register DstReg = MI.getOperand(0).getReg();
2283 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2284 .add(MI.getOperand(3))
2285 .add(MI.getOperand(4))
2286 .add(MI.getOperand(1))
2287 .add(MI.getOperand(2))
2288 .add(MI.getOperand(5));
2289 MI.eraseFromParent();
2290 break;
2291 }
2292 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2293 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2294 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2298 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2305 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2306 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2324 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
// Indirect write through M0: lower to v_movreld / s_movreld on the
// indexed subregister, keeping the whole vector implicitly def'd/used.
2325 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2326
2327 unsigned Opc;
2328 if (RI.hasVGPRs(EltRC)) {
2329 Opc = AMDGPU::V_MOVRELD_B32_e32;
2330 } else {
2331 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2332 : AMDGPU::S_MOVRELD_B32;
2333 }
2334
2335 const MCInstrDesc &OpDesc = get(Opc);
2336 Register VecReg = MI.getOperand(0).getReg();
2337 bool IsUndef = MI.getOperand(1).isUndef();
2338 unsigned SubReg = MI.getOperand(3).getImm();
2339 assert(VecReg == MI.getOperand(1).getReg());
2340
// NOTE(review): listing is missing line 2341 (presumably the
// MachineInstrBuilder MIB declaration) and line 2345 (the implicit-def
// of VecReg) — MIB is used below to tie the implicit operands.
2342 BuildMI(MBB, MI, DL, OpDesc)
2343 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2344 .add(MI.getOperand(2))
2346 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2347
2348 const int ImpDefIdx =
2349 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2350 const int ImpUseIdx = ImpDefIdx + 1;
2351 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2352 MI.eraseFromParent();
2353 break;
2354 }
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2357 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2358 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2359 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2360 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2361 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2362 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2363 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2364 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2365 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2366 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2367 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2368 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
// GPR-index-mode indirect write: bundle s_set_gpr_idx_on, the indexed
// v_mov write, and s_set_gpr_idx_off so the scheduler cannot split them.
2369 assert(ST.useVGPRIndexMode());
2370 Register VecReg = MI.getOperand(0).getReg();
2371 bool IsUndef = MI.getOperand(1).isUndef();
2372 MachineOperand &Idx = MI.getOperand(3);
2373 Register SubReg = MI.getOperand(4).getImm();
2374
// NOTE(review): listing is missing line 2377 (the gpr_idx mode immediate
// after .add(Idx)), 2381 (MIB declaration) and 2385 (implicit-def of
// VecReg).
2375 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2376 .add(Idx)
2378 SetOn->getOperand(3).setIsUndef();
2379
2380 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2382 BuildMI(MBB, MI, DL, OpDesc)
2383 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2384 .add(MI.getOperand(2))
2386 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2387
2388 const int ImpDefIdx =
2389 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2390 const int ImpUseIdx = ImpDefIdx + 1;
2391 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2392
2393 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2394
2395 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2396
2397 MI.eraseFromParent();
2398 break;
2399 }
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2402 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2403 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2404 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2405 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2406 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2407 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2408 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2409 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2410 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2411 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2412 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2413 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
// GPR-index-mode indirect read, bundled like the write case above.
// NOTE(review): listing is missing line 2422 (the gpr_idx mode immediate
// after .add(MI.getOperand(2))).
2414 assert(ST.useVGPRIndexMode());
2415 Register Dst = MI.getOperand(0).getReg();
2416 Register VecReg = MI.getOperand(1).getReg();
2417 bool IsUndef = MI.getOperand(1).isUndef();
2418 Register SubReg = MI.getOperand(3).getImm();
2419
2420 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2421 .add(MI.getOperand(2))
2423 SetOn->getOperand(3).setIsUndef();
2424
2425 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2426 .addDef(Dst)
2427 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2428 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2429
2430 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2431
2432 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2433
2434 MI.eraseFromParent();
2435 break;
2436 }
2437 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2438 MachineFunction &MF = *MBB.getParent();
2439 Register Reg = MI.getOperand(0).getReg();
2440 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2441 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2442 MachineOperand OpLo = MI.getOperand(1);
2443 MachineOperand OpHi = MI.getOperand(2);
2444
2445 // Create a bundle so these instructions won't be re-ordered by the
2446 // post-RA scheduler.
2447 MIBundleBuilder Bundler(MBB, MI);
2448 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2449
2450 // What we want here is an offset from the value returned by s_getpc (which
2451 // is the address of the s_add_u32 instruction) to the global variable, but
2452 // since the encoding of $symbol starts 4 bytes after the start of the
2453 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2454 // small. This requires us to add 4 to the global variable offset in order
2455 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2456 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2457 // instruction.
2458
2459 int64_t Adjust = 0;
2460 if (ST.hasGetPCZeroExtension()) {
2461 // Fix up hardware that does not sign-extend the 48-bit PC value by
2462 // inserting: s_sext_i32_i16 reghi, reghi
2463 Bundler.append(
2464 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2465 Adjust += 4;
2466 }
2467
2468 if (OpLo.isGlobal())
2469 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2470 Bundler.append(
2471 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2472
2473 if (OpHi.isGlobal())
2474 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2475 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2476 .addReg(RegHi)
2477 .add(OpHi));
2478
2479 finalizeBundle(MBB, Bundler.begin());
2480
2481 MI.eraseFromParent();
2482 break;
2483 }
2484 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
// 64-bit-add variant of the PC-relative sequence above: one s_add_u64
// with a +4 symbol-offset adjustment for the s_getpc encoding skew.
2485 MachineFunction &MF = *MBB.getParent();
2486 Register Reg = MI.getOperand(0).getReg();
2487 MachineOperand Op = MI.getOperand(1);
2488
2489 // Create a bundle so these instructions won't be re-ordered by the
2490 // post-RA scheduler.
2491 MIBundleBuilder Bundler(MBB, MI);
2492 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2493 if (Op.isGlobal())
2494 Op.setOffset(Op.getOffset() + 4);
2495 Bundler.append(
2496 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2497
2498 finalizeBundle(MBB, Bundler.begin());
2499
2500 MI.eraseFromParent();
2501 break;
2502 }
2503 case AMDGPU::ENTER_STRICT_WWM: {
2504 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2505 // Whole Wave Mode is entered.
2506 MI.setDesc(get(LMC.OrSaveExecOpc));
2507 break;
2508 }
2509 case AMDGPU::ENTER_STRICT_WQM: {
2510 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2511 // STRICT_WQM is entered.
2512 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2513 .addReg(LMC.ExecReg);
2514 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2515
2516 MI.eraseFromParent();
2517 break;
2518 }
2519 case AMDGPU::EXIT_STRICT_WWM:
2520 case AMDGPU::EXIT_STRICT_WQM: {
2521 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2522 // WWM/STRICT_WQM is exited.
2523 MI.setDesc(get(LMC.MovOpc));
2524 break;
2525 }
2526 case AMDGPU::SI_RETURN: {
2527 const MachineFunction *MF = MBB.getParent();
2528 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2529 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2530 // Hiding the return address use with SI_RETURN may lead to extra kills in
2531 // the function and missing live-ins. We are fine in practice because callee
2532 // saved register handling ensures the register value is restored before
2533 // RET, but we need the undef flag here to appease the MachineVerifier
2534 // liveness checks.
// NOTE(review): listing is missing line 2535 (presumably the
// MachineInstrBuilder MIB declaration consumed below).
2536 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2537 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2538
2539 MIB.copyImplicitOps(MI);
2540 MI.eraseFromParent();
2541 break;
2542 }
2543
2544 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2545 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2546 MI.setDesc(get(AMDGPU::S_MUL_U64));
2547 break;
2548
2549 case AMDGPU::S_GETPC_B64_pseudo:
2550 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2551 if (ST.hasGetPCZeroExtension()) {
2552 Register Dst = MI.getOperand(0).getReg();
2553 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2554 // Fix up hardware that does not sign-extend the 48-bit PC value by
2555 // inserting: s_sext_i32_i16 dsthi, dsthi
2556 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2557 DstHi)
2558 .addReg(DstHi);
2559 }
2560 break;
2561
2562 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
// Lower the scalar bf16 max pseudo to the packed op, selecting the high
// half of each packed source via OP_SEL_1 on the src modifiers.
2563 assert(ST.hasBF16PackedInsts());
2564 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2565 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2566 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2567 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2568 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2569 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2570 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2571 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2572 break;
2573 }
2574
2575 case AMDGPU::GET_STACK_BASE:
2576 // The stack starts at offset 0 unless we need to reserve some space at the
2577 // bottom.
2578 if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
2579 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2580 // some of the VGPRs. The size of the required scratch space has already
2581 // been computed by prolog epilog insertion.
2582 const SIMachineFunctionInfo *MFI =
2583 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2584 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2585 Register DestReg = MI.getOperand(0).getReg();
// NOTE(review): listing is missing lines 2587-2588 (the S_GETREG_B32
// immediate encoding the hwreg field being read).
2586 BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
2589 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2590 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2591 // SCC, so we need to check for 0 manually.
2592 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
2593 // Change the implicit-def of SCC to an explicit use (but first remove
2594 // the dead flag if present).
2595 MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
2596 MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
2597 MI.setDesc(get(AMDGPU::S_CMOVK_I32));
2598 MI.addOperand(MachineOperand::CreateImm(VGPRSize));
2599 } else {
2600 MI.setDesc(get(AMDGPU::S_MOV_B32));
2601 MI.addOperand(MachineOperand::CreateImm(0));
2602 MI.removeOperand(
2603 MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2604 }
2605 break;
2606 }
2607
2608 return true;
2609 }
2610
// Rematerializes Orig at (MBB, I) into DestReg, shrinking the instruction
// when only part of its result is live: a 64-bit S_MOV whose used lanes
// cover one half becomes a 32-bit S_MOV of that half, and a wide scalar
// load feeding a single subreg use becomes a narrower load with an
// adjusted offset. All other opcodes defer to the generic implementation.
// NOTE(review): listing is missing original lines 2611-2612 (the
// reMaterialize signature) and 2716 (presumably the NewMMOs SmallVector
// declaration used below).
2613 unsigned SubIdx, const MachineInstr &Orig,
2614 LaneBitmask UsedLanes) const {
2615
2616 // Try shrinking the instruction to remat only the part needed for current
2617 // context.
2618 // TODO: Handle more cases.
2619 unsigned Opcode = Orig.getOpcode();
2620 switch (Opcode) {
2621 case AMDGPU::S_MOV_B64:
2622 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2623 if (SubIdx != 0)
2624 break;
2625
2626 if (!Orig.getOperand(1).isImm())
2627 break;
2628
2629 // Shrink S_MOV_B64 to S_MOV_B32 when UsedLanes indicates only a single
2630 // 32-bit lane of the 64-bit value is live at the rematerialization point.
2631 if (UsedLanes.all())
2632 break;
2633
2634 // Determine which half of the 64-bit immediate corresponds to the use.
2635 unsigned OrigSubReg = Orig.getOperand(0).getSubReg();
2636 unsigned LoSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub0);
2637 unsigned HiSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub1);
2638
2639 bool NeedLo = (UsedLanes & RI.getSubRegIndexLaneMask(LoSubReg)).any();
2640 bool NeedHi = (UsedLanes & RI.getSubRegIndexLaneMask(HiSubReg)).any();
2641
// Both halves live: no shrink possible, fall through to the default path.
2642 if (NeedLo && NeedHi)
2643 break;
2644
2645 int64_t Imm64 = Orig.getOperand(1).getImm();
2646 int32_t Imm32 = NeedLo ? Lo_32(Imm64) : Hi_32(Imm64);
2647
2648 unsigned UseSubReg = NeedLo ? LoSubReg : HiSubReg;
2649
2650 // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
2651 BuildMI(MBB, I, Orig.getDebugLoc(), get(AMDGPU::S_MOV_B32))
2652 .addReg(DestReg, RegState::Define | RegState::Undef, UseSubReg)
2653 .addImm(Imm32);
2654 return;
2655 }
2656
2657 case AMDGPU::S_LOAD_DWORDX16_IMM:
2658 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2659 if (SubIdx != 0)
2660 break;
2661
2662 if (I == MBB.end())
2663 break;
2664
2665 if (I->isBundled())
2666 break;
2667
2668 // Look for a single use of the register that is also a subreg.
2669 Register RegToFind = Orig.getOperand(0).getReg();
2670 MachineOperand *UseMO = nullptr;
2671 for (auto &CandMO : I->operands()) {
2672 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2673 continue;
2674 if (UseMO) {
// More than one use in I — give up on shrinking.
2675 UseMO = nullptr;
2676 break;
2677 }
2678 UseMO = &CandMO;
2679 }
2680 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2681 break;
2682
2683 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2684 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2685
2686 MachineFunction *MF = MBB.getParent();
2687 MachineRegisterInfo &MRI = MF->getRegInfo();
2688 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2689
2690 unsigned NewOpcode = -1;
2691 if (SubregSize == 256)
2692 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2693 else if (SubregSize == 128)
2694 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2695 else
2696 break;
2697
// Retarget the single use at DestReg's full register and shrink DestReg's
// class to the narrower load's result class.
2698 const MCInstrDesc &TID = get(NewOpcode);
2699 const TargetRegisterClass *NewRC =
2700 RI.getAllocatableClass(getRegClass(TID, 0));
2701 MRI.setRegClass(DestReg, NewRC);
2702
2703 UseMO->setReg(DestReg);
2704 UseMO->setSubReg(AMDGPU::NoSubRegister);
2705
2706 // Use a smaller load with the desired size, possibly with updated offset.
2707 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2708 MI->setDesc(TID);
2709 MI->getOperand(0).setReg(DestReg);
2710 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2711 if (Offset) {
2712 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
// Offset is in bits; the load offset operand is in bytes.
2713 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2714 OffsetMO->setImm(FinalOffset);
2715 }
2717 for (const MachineMemOperand *MemOp : Orig.memoperands())
2718 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2719 SubregSize / 8));
2720 MI->setMemRefs(*MF, NewMMOs);
2721
2722 MBB.insert(I, MI);
2723 return;
2724 }
2725
2726 default:
2727 break;
2728 }
2729
2730 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, UsedLanes);
2731 }
2732
// Expands V_MOV_B64_DPP_PSEUDO. If the subtarget has a true 64-bit DPP mov
// and the DPP control is legal for DPALU, the pseudo is rewritten in place;
// otherwise it is split into two 32-bit V_MOV_B32_dpp halves (plus a
// REG_SEQUENCE when the destination is virtual). Returns the resulting
// instruction pair ({MI, nullptr} for the in-place case).
// NOTE(review): listing is missing original line 2734 (the
// SIInstrInfo::expandMovDPP64 signature) and line 2738 (presumably the
// isLegalDPALU_DPPControl(...) call whose arguments appear on 2739).
2733 std::pair<MachineInstr*, MachineInstr*>
2735 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2736
2737 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2739 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2740 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2741 return std::pair(&MI, nullptr);
2742 }
2743
2744 MachineBasicBlock &MBB = *MI.getParent();
2745 DebugLoc DL = MBB.findDebugLoc(MI);
2746 MachineFunction *MF = MBB.getParent();
2747 MachineRegisterInfo &MRI = MF->getRegInfo();
2748 Register Dst = MI.getOperand(0).getReg();
2749 unsigned Part = 0;
2750 MachineInstr *Split[2];
2751
// Build one 32-bit DPP mov per 64-bit half.
2752 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2753 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2754 if (Dst.isPhysical()) {
2755 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2756 } else {
2757 assert(MRI.isSSA());
2758 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2759 MovDPP.addDef(Tmp);
2760 }
2761
2762 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2763 const MachineOperand &SrcOp = MI.getOperand(I);
2764 assert(!SrcOp.isFPImm());
2765 if (SrcOp.isImm()) {
// Select the 32 bits of the immediate belonging to this half.
2766 APInt Imm(64, SrcOp.getImm());
2767 Imm.ashrInPlace(Part * 32);
2768 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2769 } else {
2770 assert(SrcOp.isReg());
2771 Register Src = SrcOp.getReg();
2772 if (Src.isPhysical())
2773 MovDPP.addReg(RI.getSubReg(Src, Sub));
2774 else
2775 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2776 }
2777 }
2778
// Copy the remaining DPP control immediates unchanged.
2779 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2780 MovDPP.addImm(MO.getImm());
2781
2782 Split[Part] = MovDPP;
2783 ++Part;
2784 }
2785
2786 if (Dst.isVirtual())
2787 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2788 .addReg(Split[0]->getOperand(0).getReg())
2789 .addImm(AMDGPU::sub0)
2790 .addReg(Split[1]->getOperand(0).getReg())
2791 .addImm(AMDGPU::sub1);
2792
2793 MI.eraseFromParent();
2794 return std::pair(Split[0], Split[1]);
2795 }
2796
// Reports WWM_COPY as a copy-like instruction (dest = operand 0,
// source = operand 1) so generic copy analyses can see through it.
// NOTE(review): the signature line (original 2798, presumably
// SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const) is missing
// from this listing.
2797 std::optional<DestSourcePair>
2799 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2800 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2801
2802 return std::nullopt;
2803 }
2804
// Swaps the src0/src1 modifier operands when commuting an instruction.
// Returns false (doing nothing) when the instruction carries no source
// modifiers.
// NOTE(review): the opening signature line (original 2805) is missing
// from this listing.
2806 AMDGPU::OpName Src0OpName,
2807 MachineOperand &Src1,
2808 AMDGPU::OpName Src1OpName) const {
2809 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2810 if (!Src0Mods)
2811 return false;
2812
2813 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2814 assert(Src1Mods &&
2815 "All commutable instructions have both src0 and src1 modifiers");
2816
2817 int Src0ModsVal = Src0Mods->getImm();
2818 int Src1ModsVal = Src1Mods->getImm();
2819
2820 Src1Mods->setImm(Src0ModsVal);
2821 Src0Mods->setImm(Src1ModsVal);
2822 return true;
2823 }
2824
// Exchanges a register operand with a non-register operand (immediate,
// frame index, or global address) in place, preserving the register's
// kill/dead/undef/debug flags and subreg index on its new slot. Returns
// nullptr if NonRegOp's kind is unsupported, leaving MI unchanged.
// NOTE(review): the opening signature line (original 2825) is missing
// from this listing.
2826 MachineOperand &RegOp,
2827 MachineOperand &NonRegOp) {
2828 Register Reg = RegOp.getReg();
2829 unsigned SubReg = RegOp.getSubReg();
2830 bool IsKill = RegOp.isKill();
2831 bool IsDead = RegOp.isDead();
2832 bool IsUndef = RegOp.isUndef();
2833 bool IsDebug = RegOp.isDebug();
2834
2835 if (NonRegOp.isImm())
2836 RegOp.ChangeToImmediate(NonRegOp.getImm());
2837 else if (NonRegOp.isFI())
2838 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2839 else if (NonRegOp.isGlobal()) {
2840 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2841 NonRegOp.getTargetFlags());
2842 } else
2843 return nullptr;
2844
2845 // Make sure we don't reinterpret a subreg index in the target flags.
2846 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2847
2848 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2849 NonRegOp.setSubReg(SubReg);
2850
2851 return &MI;
2852 }
2853
// NOTE(review): the signature's first line (original 2854, carrying the
// function name) was dropped by extraction — presumably
// swapImmOperands(MachineInstr &MI, ...); confirm.
// Swaps two immediate operands in place, exchanging both the values and the
// target flags so neither operand's flags are misattributed after the swap.
2855 MachineOperand &NonRegOp1,
2856 MachineOperand &NonRegOp2) {
2857 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2858 int64_t NonRegVal = NonRegOp1.getImm();
2859
2860 NonRegOp1.setImm(NonRegOp2.getImm());
2861 NonRegOp2.setImm(NonRegVal);
2862 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2863 NonRegOp2.setTargetFlags(TargetFlags);
2864 return &MI;
2865}
2866
2867bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2868 unsigned OpIdx1) const {
2869 const MCInstrDesc &InstDesc = MI.getDesc();
2870 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2871 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2872
2873 unsigned Opc = MI.getOpcode();
2874 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2875
2876 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2877 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2878
2879 // Swap doesn't breach constant bus or literal limits
2880 // It may move literal to position other than src0, this is not allowed
2881 // pre-gfx10 However, most test cases need literals in Src0 for VOP
2882 // FIXME: After gfx9, literal can be in place other than Src0
2883 if (isVALU(MI)) {
2884 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2885 !isInlineConstant(MO0, OpInfo1))
2886 return false;
2887 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2888 !isInlineConstant(MO1, OpInfo0))
2889 return false;
2890 }
2891
2892 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2893 if (OpInfo1.RegClass == -1)
2894 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2895 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2896 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2897 }
2898 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2899 if (OpInfo0.RegClass == -1)
2900 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2901 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2902 isLegalRegOperand(MI, OpIdx0, MO1);
2903 }
2904
2905 // No need to check 64-bit literals since swapping does not bring new
2906 // 64-bit literals into current instruction to fold to 32-bit
2907
2908 return isImmOperandLegal(MI, OpIdx1, MO0);
2909}
2910
// NOTE(review): the signature's first line (original 2911) was dropped by
// extraction — presumably
// MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool
// NewMI, ...); confirm.
// Commutes src0/src1 of MI in place (NewMI must be false), handling the
// reg/reg, reg/non-reg, and imm/imm cases, then swaps the source modifier and
// sel operands and installs the commuted opcode. Returns nullptr on failure.
2912 unsigned Src0Idx,
2913 unsigned Src1Idx) const {
2914 assert(!NewMI && "this should never be used");
2915
2916 unsigned Opc = MI.getOpcode();
2917 int CommutedOpcode = commuteOpcode(Opc);
2918 if (CommutedOpcode == -1)
2919 return nullptr;
2920
 // Normalize so Src0Idx is always the lower index.
2921 if (Src0Idx > Src1Idx)
2922 std::swap(Src0Idx, Src1Idx);
2923
2924 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2925 static_cast<int>(Src0Idx) &&
2926 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2927 static_cast<int>(Src1Idx) &&
2928 "inconsistency with findCommutedOpIndices");
2929
2930 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2931 return nullptr;
2932
2933 MachineInstr *CommutedMI = nullptr;
2934 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2935 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2936 if (Src0.isReg() && Src1.isReg()) {
2937 // Be sure to copy the source modifiers to the right place.
2938 CommutedMI =
2939 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2940 } else if (Src0.isReg() && !Src1.isReg()) {
2941 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2942 } else if (!Src0.isReg() && Src1.isReg()) {
2943 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2944 } else if (Src0.isImm() && Src1.isImm()) {
2945 CommutedMI = swapImmOperands(MI, Src0, Src1);
2946 } else {
2947 // FIXME: Found two non registers to commute. This does happen.
2948 return nullptr;
2949 }
2950
2951 if (CommutedMI) {
 // Operand values were swapped above; now swap the per-source modifier and
 // SDWA sel operands so they follow their operands.
2952 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2953 Src1, AMDGPU::OpName::src1_modifiers);
2954
2955 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2956 AMDGPU::OpName::src1_sel);
2957
2958 CommutedMI->setDesc(get(CommutedOpcode));
2959 }
2960
2961 return CommutedMI;
2962}
2963
2964// This needs to be implemented because the source modifiers may be inserted
2965// between the true commutable operands, and the base
2966// TargetInstrInfo::commuteInstruction uses it.
// NOTE(review): the signature's first line (original 2967) was dropped by
// extraction — presumably
// bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, ...);
// confirm. Delegates to the MCInstrDesc overload below.
2968 unsigned &SrcOpIdx0,
2969 unsigned &SrcOpIdx1) const {
2970 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2971}
2972
// NOTE(review): the signature's first line (original 2973) was dropped by
// extraction — presumably
// bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc, ...);
// confirm. Locates src0/src1 of a commutable descriptor and fixes up the
// caller-supplied indices via fixCommutedOpIndices.
2974 unsigned &SrcOpIdx0,
2975 unsigned &SrcOpIdx1) const {
2976 if (!Desc.isCommutable())
2977 return false;
2978
2979 unsigned Opc = Desc.getOpcode();
2980 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2981 if (Src0Idx == -1)
2982 return false;
2983
2984 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2985 if (Src1Idx == -1)
2986 return false;
2987
2988 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2989}
2990
// NOTE(review): the signature's first line (original 2991) was dropped by
// extraction — presumably
// bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, ...); confirm.
// Tests whether a byte offset fits in the branch's signed SIMM16 dword field.
2992 int64_t BrOffset) const {
2993 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2994 // because its dest block is unanalyzable.
2995 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2996
2997 // Convert to dwords.
2998 BrOffset /= 4;
2999
3000 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
3001 // from the next instruction.
3002 BrOffset -= 1;
3003
 // BranchOffsetBits defaults to 16 but is narrowable via
 // -amdgpu-s-branch-bits for testing long-branch expansion.
3004 return isIntN(BranchOffsetBits, BrOffset);
3005}
3006
// NOTE(review): the extraction dropped this function's signature lines
// (original 3007-3008); from the body it returns the MBB operand of a branch —
// presumably SIInstrInfo::getBranchDestBlock. Confirm against the repository.
3009 return MI.getOperand(0).getMBB();
3010}
3011
// NOTE(review): the signature line (original 3012) was dropped by extraction.
// Scans the block's terminators for the divergent control-flow pseudos
// SI_IF / SI_ELSE / SI_LOOP.
3013 for (const MachineInstr &MI : MBB->terminators()) {
3014 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
3015 MI.getOpcode() == AMDGPU::SI_LOOP)
3016 return true;
3017 }
3018 return false;
3019}
3020
// NOTE(review): the extraction dropped several lines of this function: the
// signature's first line (original 3021, presumably
// SIInstrInfo::insertIndirectBranch), and interior lines 3034 (presumably the
// SIMachineFunctionInfo lookup that defines MFI), 3042 (the addSym operand of
// the S_ADD_PC_I64 build), 3066 (the S_WAITCNT_DEPCTR immediate), and 3156
// (the declaration of the Offset expression). Confirm against the repository
// before editing.
//
// Expands an out-of-range unconditional branch in the (empty) block MBB into
// either a single s_add_pc_i64 (on subtargets that have it) or a
// s_getpc_b64 / s_add_u32 / s_addc_u32 / s_setpc_b64 sequence whose 64-bit
// offset is resolved via MC temp symbols. If no SGPR pair can be scavenged,
// an emergency spill/restore path through RestoreBB is used instead.
3022 MachineBasicBlock &DestBB,
3023 MachineBasicBlock &RestoreBB,
3024 const DebugLoc &DL, int64_t BrOffset,
3025 RegScavenger *RS) const {
3026 assert(MBB.empty() &&
3027 "new block should be inserted for expanding unconditional branch");
3028 assert(MBB.pred_size() == 1);
3029 assert(RestoreBB.empty() &&
3030 "restore block should be inserted for restoring clobbered registers");
3031
3032 MachineFunction *MF = MBB.getParent();
3033 MachineRegisterInfo &MRI = MF->getRegInfo();
3035 auto I = MBB.end();
3036 auto &MCCtx = MF->getContext();
3037
 // Fast path: a single PC-relative 64-bit add jump when available.
3038 if (ST.useAddPC64Inst()) {
3039 MCSymbol *Offset =
3040 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
3041 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
3043 MCSymbol *PostAddPCLabel =
3044 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
3045 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
 // Offset = DestBB - (address immediately after the s_add_pc_i64).
3046 auto *OffsetExpr = MCBinaryExpr::createSub(
3047 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
3048 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
3049 Offset->setVariableValue(OffsetExpr);
3050 return;
3051 }
3052
3053 assert(RS && "RegScavenger required for long branching");
3054
3055 // FIXME: Virtual register workaround for RegScavenger not working with empty
3056 // blocks.
3057 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3058
3059 // Note: as this is used after hazard recognizer we need to apply some hazard
3060 // workarounds directly.
3061 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
3062 ST.hasVALUReadSGPRHazard();
3063 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
3064 if (FlushSGPRWrites)
3065 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
3067 };
3068
3069 // We need to compute the offset relative to the instruction immediately after
3070 // s_getpc_b64. Insert pc arithmetic code before last terminator.
3071 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
3072 ApplyHazardWorkarounds();
3073
3074 MCSymbol *PostGetPCLabel =
3075 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
3076 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
3077
3078 MCSymbol *OffsetLo =
3079 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
3080 MCSymbol *OffsetHi =
3081 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
 // 64-bit PC += offset, done as a 32-bit add + add-with-carry on the subregs.
3082 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
3083 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
3084 .addReg(PCReg, {}, AMDGPU::sub0)
3085 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET)
3086 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
3087 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
3088 .addReg(PCReg, {}, AMDGPU::sub1)
3089 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
3090 ApplyHazardWorkarounds();
3091
3092 // Insert the indirect branch after the other terminator.
3093 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
3094 .addReg(PCReg);
3095
3096 // If a spill is needed for the pc register pair, we need to insert a spill
3097 // restore block right before the destination block, and insert a short branch
3098 // into the old destination block's fallthrough predecessor.
3099 // e.g.:
3100 //
3101 // s_cbranch_scc0 skip_long_branch:
3102 //
3103 // long_branch_bb:
3104 // spill s[8:9]
3105 // s_getpc_b64 s[8:9]
3106 // s_add_u32 s8, s8, restore_bb
3107 // s_addc_u32 s9, s9, 0
3108 // s_setpc_b64 s[8:9]
3109 //
3110 // skip_long_branch:
3111 // foo;
3112 //
3113 // .....
3114 //
3115 // dest_bb_fallthrough_predecessor:
3116 // bar;
3117 // s_branch dest_bb
3118 //
3119 // restore_bb:
3120 // restore s[8:9]
3121 // fallthrough dest_bb
3122 ///
3123 // dest_bb:
3124 // buzz;
3125
3126 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3127 Register Scav;
3128
3129 // If we've previously reserved a register for long branches
3130 // avoid running the scavenger and just use those registers
3131 if (LongBranchReservedReg) {
3132 RS->enterBasicBlock(MBB);
3133 Scav = LongBranchReservedReg;
3134 } else {
3135 RS->enterBasicBlockEnd(MBB);
3136 Scav = RS->scavengeRegisterBackwards(
3137 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3138 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3139 }
3140 if (Scav) {
3141 RS->setRegUsed(Scav);
3142 MRI.replaceRegWith(PCReg, Scav);
3143 MRI.clearVirtRegs();
3144 } else {
3145 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3146 // SGPR spill.
3147 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3148 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3149 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3150 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3151 MRI.clearVirtRegs();
3152 }
3153
 // Without a scavenged register, branch to RestoreBB (which restores the
 // spilled pair and falls through to DestBB) instead of DestBB directly.
3154 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3155 // Now, the distance could be defined.
3157 MCSymbolRefExpr::create(DestLabel, MCCtx),
3158 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3159 // Add offset assignments.
3160 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3161 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3162 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3163 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3164}
3165
3166unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3167 switch (Cond) {
3168 case SIInstrInfo::SCC_TRUE:
3169 return AMDGPU::S_CBRANCH_SCC1;
3170 case SIInstrInfo::SCC_FALSE:
3171 return AMDGPU::S_CBRANCH_SCC0;
3172 case SIInstrInfo::VCCNZ:
3173 return AMDGPU::S_CBRANCH_VCCNZ;
3174 case SIInstrInfo::VCCZ:
3175 return AMDGPU::S_CBRANCH_VCCZ;
3176 case SIInstrInfo::EXECNZ:
3177 return AMDGPU::S_CBRANCH_EXECNZ;
3178 case SIInstrInfo::EXECZ:
3179 return AMDGPU::S_CBRANCH_EXECZ;
3180 default:
3181 llvm_unreachable("invalid branch predicate");
3182 }
3183}
3184
3185SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3186 switch (Opcode) {
3187 case AMDGPU::S_CBRANCH_SCC0:
3188 return SCC_FALSE;
3189 case AMDGPU::S_CBRANCH_SCC1:
3190 return SCC_TRUE;
3191 case AMDGPU::S_CBRANCH_VCCNZ:
3192 return VCCNZ;
3193 case AMDGPU::S_CBRANCH_VCCZ:
3194 return VCCZ;
3195 case AMDGPU::S_CBRANCH_EXECNZ:
3196 return EXECNZ;
3197 case AMDGPU::S_CBRANCH_EXECZ:
3198 return EXECZ;
3199 default:
3200 return INVALID_BR;
3201 }
3202}
3203
// NOTE(review): the extraction dropped the signature lines (original
// 3204-3206 and 3208) — presumably
// bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
//     MachineBasicBlock::iterator I, MachineBasicBlock *&TBB,
//     MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, ...);
// confirm. Returns true when the branch sequence cannot be analyzed.
3207 MachineBasicBlock *&FBB,
3209 bool AllowModify) const {
3210 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3211 // Unconditional Branch
3212 TBB = I->getOperand(0).getMBB();
3213 return false;
3214 }
3215
3216 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3217 if (Pred == INVALID_BR)
3218 return true;
3219
3220 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3221 Cond.push_back(MachineOperand::CreateImm(Pred));
3222 Cond.push_back(I->getOperand(1)); // Save the branch register.
3223
3224 ++I;
3225
3226 if (I == MBB.end()) {
3227 // Conditional branch followed by fall-through.
3228 TBB = CondBB;
3229 return false;
3230 }
3231
 // Conditional branch followed by an unconditional branch.
3232 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3233 TBB = CondBB;
3234 FBB = I->getOperand(0).getMBB();
3235 return false;
3236 }
3237
 // Anything else is unanalyzable.
3238 return true;
3239}
3240
// NOTE(review): the extraction dropped the signature lines (original 3241 and
// 3243) — presumably
// bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
//     MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
//     SmallVectorImpl<MachineOperand> &Cond, ...); confirm.
// Skips exec-manipulation pseudo-terminators before delegating to
// analyzeBranchImpl; returns true (unanalyzable) for divergent kill/if/else
// terminators.
3242 MachineBasicBlock *&FBB,
3244 bool AllowModify) const {
3245 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3246 auto E = MBB.end();
3247 if (I == E)
3248 return false;
3249
3250 // Skip over the instructions that are artificially terminators for special
3251 // exec management.
3252 while (I != E && !I->isBranch() && !I->isReturn()) {
3253 switch (I->getOpcode()) {
3254 case AMDGPU::S_MOV_B64_term:
3255 case AMDGPU::S_XOR_B64_term:
3256 case AMDGPU::S_OR_B64_term:
3257 case AMDGPU::S_ANDN2_B64_term:
3258 case AMDGPU::S_AND_B64_term:
3259 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3260 case AMDGPU::S_MOV_B32_term:
3261 case AMDGPU::S_XOR_B32_term:
3262 case AMDGPU::S_OR_B32_term:
3263 case AMDGPU::S_ANDN2_B32_term:
3264 case AMDGPU::S_AND_B32_term:
3265 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3266 break;
3267 case AMDGPU::SI_IF:
3268 case AMDGPU::SI_ELSE:
3269 case AMDGPU::SI_KILL_I1_TERMINATOR:
3270 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3271 // FIXME: It's messy that these need to be considered here at all.
3272 return true;
3273 default:
3274 llvm_unreachable("unexpected non-branch terminator inst");
3275 }
3276
3277 ++I;
3278 }
3279
 // Only artificial terminators: nothing to analyze, fallthrough block.
3280 if (I == E)
3281 return false;
3282
3283 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3284}
3285
// NOTE(review): the signature's first line (original 3286) was dropped by
// extraction — presumably
// unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, ...); confirm.
// Erases branch/return terminators (leaving the artificial exec-management
// terminators in place), returning the count and optionally the byte size.
3287 int *BytesRemoved) const {
3288 unsigned Count = 0;
3289 unsigned RemovedSize = 0;
3290 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3291 // Skip over artificial terminators when removing instructions.
3292 if (MI.isBranch() || MI.isReturn()) {
3293 RemovedSize += getInstSizeInBytes(MI);
3294 MI.eraseFromParent();
3295 ++Count;
3296 }
3297 }
3298
3299 if (BytesRemoved)
3300 *BytesRemoved = RemovedSize;
3301
3302 return Count;
3303}
3304
3305// Copy the flags onto the implicit condition register operand.
// NOTE(review): the signature's first line (original 3306) was dropped by
// extraction — presumably
// static void preserveCondRegFlags(MachineOperand &CondReg, ...); confirm.
3307 const MachineOperand &OrigCond) {
3308 CondReg.setIsUndef(OrigCond.isUndef());
3309 CondReg.setIsKill(OrigCond.isKill());
3310}
3311
// NOTE(review): the extraction dropped the signature lines (original 3312 and
// 3315) — presumably
// unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
//     MachineBasicBlock *TBB, MachineBasicBlock *FBB,
//     ArrayRef<MachineOperand> Cond, ...); confirm.
// Emits an unconditional, conditional, or conditional-plus-fallthrough branch
// sequence and reports the number of instructions (and optionally bytes,
// doubled when the subtarget has the 0x3f-offset hardware bug) added.
3313 MachineBasicBlock *FBB,
3316 const DebugLoc &DL,
3317 int *BytesAdded) const {
3318 if (!FBB && Cond.empty()) {
3319 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3320 .addMBB(TBB);
3321 if (BytesAdded)
3322 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3323 return 1;
3324 }
3325
3326 assert(TBB && Cond[0].isImm());
3327
3328 unsigned Opcode
3329 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3330
3331 if (!FBB) {
3332 MachineInstr *CondBr =
3333 BuildMI(&MBB, DL, get(Opcode))
3334 .addMBB(TBB);
3335
3336 // Copy the flags onto the implicit condition register operand.
3337 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3338 fixImplicitOperands(*CondBr);
3339
3340 if (BytesAdded)
3341 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3342 return 1;
3343 }
3344
3345 assert(TBB && FBB);
3346
 // Conditional branch to TBB plus unconditional branch to FBB.
3347 MachineInstr *CondBr =
3348 BuildMI(&MBB, DL, get(Opcode))
3349 .addMBB(TBB);
3350 fixImplicitOperands(*CondBr);
3351 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3352 .addMBB(FBB);
3353
3354 MachineOperand &CondReg = CondBr->getOperand(1);
3355 CondReg.setIsUndef(Cond[1].isUndef());
3356 CondReg.setIsKill(Cond[1].isKill());
3357
3358 if (BytesAdded)
3359 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3360
3361 return 2;
3362}
3363
// NOTE(review): the extraction dropped the signature lines (original
// 3364-3365) — presumably
// bool SIInstrInfo::reverseBranchCondition(
//     SmallVectorImpl<MachineOperand> &Cond) const; confirm.
// Negates the BranchPredicate immediate in Cond[0]; predicate pairs here are
// encoded as +/- values, so negation flips the sense. Returns true on failure.
3366 if (Cond.size() != 2) {
3367 return true;
3368 }
3369
3370 if (Cond[0].isImm()) {
3371 Cond[0].setImm(-Cond[0].getImm());
3372 return false;
3373 }
3374
3375 return true;
3376}
3377
// NOTE(review): the extraction dropped the signature lines (original
// 3378-3379) — presumably
// bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
//     ArrayRef<MachineOperand> Cond, ...); confirm.
// Decides whether an if-converted select is profitable: VCC-based selects
// require VGPR classes (expanded to per-dword v_cndmask_b32), SCC-based
// selects require SGPR classes (s_cselect_b32/b64).
3380 Register DstReg, Register TrueReg,
3381 Register FalseReg, int &CondCycles,
3382 int &TrueCycles, int &FalseCycles) const {
3383 switch (Cond[0].getImm()) {
3384 case VCCNZ:
3385 case VCCZ: {
3386 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3387 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3388 if (MRI.getRegClass(FalseReg) != RC)
3389 return false;
3390
3391 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3392 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3393
3394 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3395 return RI.hasVGPRs(RC) && NumInsts <= 6;
3396 }
3397 case SCC_TRUE:
3398 case SCC_FALSE: {
3399 // FIXME: We could insert for VGPRs if we could replace the original compare
3400 // with a vector one.
3401 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3402 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3403 if (MRI.getRegClass(FalseReg) != RC)
3404 return false;
3405
3406 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3407
3408 // Multiples of 8 can do s_cselect_b64
3409 if (NumInsts % 2 == 0)
3410 NumInsts /= 2;
3411
3412 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3413 return RI.isSGPRClass(RC);
3414 }
3415 default:
3416 return false;
3417 }
3418}
3419
// NOTE(review): the extraction dropped this function's leading signature
// lines (presumably SIInstrInfo::insertSelect(MachineBasicBlock &MBB, ...,
// const DebugLoc &DL, Register DstReg, ArrayRef<MachineOperand> Cond, ...))
// and the interior declarations of `Select` (original lines 3435, 3452, 3506)
// and loop-trailing lines 3494/3499/3518 — confirm against the repository.
//
// Materializes a select: canonicalizes the predicate to its "true" form
// (swapping true/false sources for VCCZ/SCC_FALSE), emits a single
// s_cselect/v_cndmask for 32-bit (or SCC 64-bit) destinations, and otherwise
// splits wide destinations into 32- or 64-bit pieces rejoined with a
// REG_SEQUENCE.
3423 Register TrueReg, Register FalseReg) const {
3424 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3425 if (Pred == VCCZ || Pred == SCC_FALSE) {
3426 Pred = static_cast<BranchPredicate>(-Pred);
3427 std::swap(TrueReg, FalseReg);
3428 }
3429
3430 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3431 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3432 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3433
3434 if (DstSize == 32) {
3436 if (Pred == SCC_TRUE) {
3437 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3438 .addReg(TrueReg)
3439 .addReg(FalseReg);
3440 } else {
3441 // Instruction's operands are backwards from what is expected.
3442 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3443 .addReg(FalseReg)
3444 .addReg(TrueReg);
3445 }
3446
3447 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3448 return;
3449 }
3450
3451 if (DstSize == 64 && Pred == SCC_TRUE) {
3453 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3454 .addReg(TrueReg)
3455 .addReg(FalseReg);
3456
3457 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3458 return;
3459 }
3460
3461 static const int16_t Sub0_15[] = {
3462 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3463 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3464 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3465 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3466 };
3467
3468 static const int16_t Sub0_15_64[] = {
3469 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3470 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3471 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3472 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3473 };
3474
3475 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3476 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3477 const int16_t *SubIndices = Sub0_15;
3478 int NElts = DstSize / 32;
3479
3480 // 64-bit select is only available for SALU.
3481 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3482 if (Pred == SCC_TRUE) {
3483 if (NElts % 2) {
3484 SelOp = AMDGPU::S_CSELECT_B32;
3485 EltRC = &AMDGPU::SGPR_32RegClass;
3486 } else {
3487 SelOp = AMDGPU::S_CSELECT_B64;
3488 EltRC = &AMDGPU::SGPR_64RegClass;
3489 SubIndices = Sub0_15_64;
3490 NElts /= 2;
3491 }
3492 }
3493
 // Build the REG_SEQUENCE first, then insert the per-element selects before
 // it so the sequence collects the element results.
3495 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3496
3497 I = MIB->getIterator();
3498
3500 for (int Idx = 0; Idx != NElts; ++Idx) {
3501 Register DstElt = MRI.createVirtualRegister(EltRC);
3502 Regs.push_back(DstElt);
3503
3504 unsigned SubIdx = SubIndices[Idx];
3505
3507 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
 // v_cndmask takes (false, true); s_cselect takes (true, false).
3508 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3509 .addReg(FalseReg, {}, SubIdx)
3510 .addReg(TrueReg, {}, SubIdx);
3511 } else {
3512 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3513 .addReg(TrueReg, {}, SubIdx)
3514 .addReg(FalseReg, {}, SubIdx);
3515 }
3516
3517 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3519
3520 MIB.addReg(DstElt)
3521 .addImm(SubIdx);
3522 }
3523}
3524
// NOTE(review): the signature line (original 3525) was dropped by extraction —
// from the opcode list and the companion switch below, this is presumably
// SIInstrInfo::isFoldableCopy; confirm. Returns true for mov/copy-like
// opcodes whose immediate source may be folded into users.
3526 switch (MI.getOpcode()) {
3527 case AMDGPU::V_MOV_B16_t16_e32:
3528 case AMDGPU::V_MOV_B16_t16_e64:
3529 case AMDGPU::V_MOV_B32_e32:
3530 case AMDGPU::V_MOV_B32_e64:
3531 case AMDGPU::V_MOV_B64_PSEUDO:
3532 case AMDGPU::V_MOV_B64_e32:
3533 case AMDGPU::V_MOV_B64_e64:
3534 case AMDGPU::S_MOV_B32:
3535 case AMDGPU::S_MOV_B64:
3536 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3537 case AMDGPU::COPY:
3538 case AMDGPU::WWM_COPY:
3539 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3540 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3541 case AMDGPU::V_ACCVGPR_MOV_B32:
3542 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3543 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3544 return true;
3545 default:
3546 return false;
3547 }
3548}
3549
// NOTE(review): the signature line (original 3550) was dropped by extraction;
// the unreachable message says it only accepts foldable copies, and it returns
// 2 for the 16-bit movs and 1 otherwise — presumably a source-index or
// size-in-units query for foldable copies. Confirm against the repository.
3551 switch (MI.getOpcode()) {
3552 case AMDGPU::V_MOV_B16_t16_e32:
3553 case AMDGPU::V_MOV_B16_t16_e64:
3554 return 2;
3555 case AMDGPU::V_MOV_B32_e32:
3556 case AMDGPU::V_MOV_B32_e64:
3557 case AMDGPU::V_MOV_B64_PSEUDO:
3558 case AMDGPU::V_MOV_B64_e32:
3559 case AMDGPU::V_MOV_B64_e64:
3560 case AMDGPU::S_MOV_B32:
3561 case AMDGPU::S_MOV_B64:
3562 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3563 case AMDGPU::COPY:
3564 case AMDGPU::WWM_COPY:
3565 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3566 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3567 case AMDGPU::V_ACCVGPR_MOV_B32:
3568 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3569 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3570 return 1;
3571 default:
3572 llvm_unreachable("MI is not a foldable copy");
3573 }
3574}
3575
// Operand names stripped by removeModOperands() below when an instruction is
// rewritten to a form without source modifiers.
3576static constexpr AMDGPU::OpName ModifierOpNames[] = {
3577 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3578 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3579 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3580
// NOTE(review): the signature line (original 3581) was dropped by extraction —
// presumably void SIInstrInfo::removeModOperands(MachineInstr &MI) const;
// confirm. Removes every present modifier operand; iterating ModifierOpNames
// in reverse keeps earlier operand indices valid as later ones are removed.
3582 unsigned Opc = MI.getOpcode();
3583 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3584 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3585 if (Idx >= 0)
3586 MI.removeOperand(Idx);
3587 }
3588}
3589
// NOTE(review): the signature's first line (original 3590, carrying the
// function name and first parameter) was dropped by extraction — presumably
// void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI, ...); confirm.
// Swaps in NewDesc and trims operands beyond the new descriptor's explicit +
// implicit operand count.
3591 const MCInstrDesc &NewDesc) const {
3592 MI.setDesc(NewDesc);
3593
3594 // Remove any leftover implicit operands from mutating the instruction. e.g.
3595 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3596 // anymore.
3597 const MCInstrDesc &Desc = MI.getDesc();
3598 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3599 Desc.implicit_defs().size();
3600
 // Iterate from the back so removal doesn't shift the indices still to visit.
3601 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3602 MI.removeOperand(I);
3603}
3604
3605std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3606 unsigned SubRegIndex) {
3607 switch (SubRegIndex) {
3608 case AMDGPU::NoSubRegister:
3609 return Imm;
3610 case AMDGPU::sub0:
3611 return SignExtend64<32>(Imm);
3612 case AMDGPU::sub1:
3613 return SignExtend64<32>(Imm >> 32);
3614 case AMDGPU::lo16:
3615 return SignExtend64<16>(Imm);
3616 case AMDGPU::hi16:
3617 return SignExtend64<16>(Imm >> 16);
3618 case AMDGPU::sub1_lo16:
3619 return SignExtend64<16>(Imm >> 32);
3620 case AMDGPU::sub1_hi16:
3621 return SignExtend64<16>(Imm >> 48);
3622 default:
3623 return std::nullopt;
3624 }
3625
3626 llvm_unreachable("covered subregister switch");
3627}
3628
3629static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3630 switch (Opc) {
3631 case AMDGPU::V_MAC_F16_e32:
3632 case AMDGPU::V_MAC_F16_e64:
3633 case AMDGPU::V_MAD_F16_e64:
3634 return AMDGPU::V_MADAK_F16;
3635 case AMDGPU::V_MAC_F32_e32:
3636 case AMDGPU::V_MAC_F32_e64:
3637 case AMDGPU::V_MAD_F32_e64:
3638 return AMDGPU::V_MADAK_F32;
3639 case AMDGPU::V_FMAC_F32_e32:
3640 case AMDGPU::V_FMAC_F32_e64:
3641 case AMDGPU::V_FMA_F32_e64:
3642 return AMDGPU::V_FMAAK_F32;
3643 case AMDGPU::V_FMAC_F16_e32:
3644 case AMDGPU::V_FMAC_F16_e64:
3645 case AMDGPU::V_FMAC_F16_t16_e64:
3646 case AMDGPU::V_FMAC_F16_fake16_e64:
3647 case AMDGPU::V_FMAC_F16_t16_e32:
3648 case AMDGPU::V_FMAC_F16_fake16_e32:
3649 case AMDGPU::V_FMA_F16_e64:
3650 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3651 ? AMDGPU::V_FMAAK_F16_t16
3652 : AMDGPU::V_FMAAK_F16_fake16
3653 : AMDGPU::V_FMAAK_F16;
3654 case AMDGPU::V_FMAC_F64_e32:
3655 case AMDGPU::V_FMAC_F64_e64:
3656 case AMDGPU::V_FMA_F64_e64:
3657 return AMDGPU::V_FMAAK_F64;
3658 default:
3659 llvm_unreachable("invalid instruction");
3660 }
3661}
3662
3663static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3664 switch (Opc) {
3665 case AMDGPU::V_MAC_F16_e32:
3666 case AMDGPU::V_MAC_F16_e64:
3667 case AMDGPU::V_MAD_F16_e64:
3668 return AMDGPU::V_MADMK_F16;
3669 case AMDGPU::V_MAC_F32_e32:
3670 case AMDGPU::V_MAC_F32_e64:
3671 case AMDGPU::V_MAD_F32_e64:
3672 return AMDGPU::V_MADMK_F32;
3673 case AMDGPU::V_FMAC_F32_e32:
3674 case AMDGPU::V_FMAC_F32_e64:
3675 case AMDGPU::V_FMA_F32_e64:
3676 return AMDGPU::V_FMAMK_F32;
3677 case AMDGPU::V_FMAC_F16_e32:
3678 case AMDGPU::V_FMAC_F16_e64:
3679 case AMDGPU::V_FMAC_F16_t16_e64:
3680 case AMDGPU::V_FMAC_F16_fake16_e64:
3681 case AMDGPU::V_FMAC_F16_t16_e32:
3682 case AMDGPU::V_FMAC_F16_fake16_e32:
3683 case AMDGPU::V_FMA_F16_e64:
3684 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3685 ? AMDGPU::V_FMAMK_F16_t16
3686 : AMDGPU::V_FMAMK_F16_fake16
3687 : AMDGPU::V_FMAMK_F16;
3688 case AMDGPU::V_FMAC_F64_e32:
3689 case AMDGPU::V_FMAC_F64_e64:
3690 case AMDGPU::V_FMA_F64_e64:
3691 return AMDGPU::V_FMAMK_F64;
3692 default:
3693 llvm_unreachable("invalid instruction");
3694 }
3695}
3696
3698 Register Reg, MachineRegisterInfo *MRI) const {
3699 int64_t Imm;
3700 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3701 return false;
3702
3703 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3704
3705 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3706
3707 unsigned Opc = UseMI.getOpcode();
3708 if (Opc == AMDGPU::COPY) {
3709 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3710
3711 Register DstReg = UseMI.getOperand(0).getReg();
3712 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3713
3714 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3715
3716 if (HasMultipleUses) {
3717 // TODO: This should fold in more cases with multiple use, but we need to
3718 // more carefully consider what those uses are.
3719 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3720
3721 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3722 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3723 return false;
3724
3725 // Most of the time folding a 32-bit inline constant is free (though this
3726 // might not be true if we can't later fold it into a real user).
3727 //
3728 // FIXME: This isInlineConstant check is imprecise if
3729 // getConstValDefinedInReg handled the tricky non-mov cases.
3730 if (ImmDefSize == 32 &&
3732 return false;
3733 }
3734
3735 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3736 RI.getSubRegIdxSize(UseSubReg) == 16;
3737
3738 if (Is16Bit) {
3739 if (RI.hasVGPRs(DstRC))
3740 return false; // Do not clobber vgpr_hi16
3741
3742 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3743 return false;
3744 }
3745
3746 MachineFunction *MF = UseMI.getMF();
3747
3748 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3749 MCRegister MovDstPhysReg =
3750 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3751
3752 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3753
3754 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3755 for (unsigned MovOp :
3756 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3757 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3758 const MCInstrDesc &MovDesc = get(MovOp);
3759
3760 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3761 if (Is16Bit) {
3762 // We just need to find a correctly sized register class, so the
3763 // subregister index compatibility doesn't matter since we're statically
3764 // extracting the immediate value.
3765 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3766 if (!MovDstRC)
3767 continue;
3768
3769 if (MovDstPhysReg) {
3770 // FIXME: We probably should not do this. If there is a live value in
3771 // the high half of the register, it will be corrupted.
3772 MovDstPhysReg =
3773 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3774 if (!MovDstPhysReg)
3775 continue;
3776 }
3777 }
3778
3779 // Result class isn't the right size, try the next instruction.
3780 if (MovDstPhysReg) {
3781 if (!MovDstRC->contains(MovDstPhysReg))
3782 return false;
3783 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3784 // TODO: This will be overly conservative in the case of 16-bit virtual
3785 // SGPRs. We could hack up the virtual register uses to use a compatible
3786 // 32-bit class.
3787 continue;
3788 }
3789
3790 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3791
3792 // Ensure the interpreted immediate value is a valid operand in the new
3793 // mov.
3794 //
3795 // FIXME: isImmOperandLegal should have form that doesn't require existing
3796 // MachineInstr or MachineOperand
3797 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3798 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3799 break;
3800
3801 NewOpc = MovOp;
3802 break;
3803 }
3804
3805 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3806 return false;
3807
3808 if (Is16Bit) {
3809 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3810 if (MovDstPhysReg)
3811 UseMI.getOperand(0).setReg(MovDstPhysReg);
3812 assert(UseMI.getOperand(1).getReg().isVirtual());
3813 }
3814
3815 const MCInstrDesc &NewMCID = get(NewOpc);
3816 UseMI.setDesc(NewMCID);
3817 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3818 UseMI.addImplicitDefUseOperands(*MF);
3819 return true;
3820 }
3821
3822 if (HasMultipleUses)
3823 return false;
3824
3825 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3826 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3827 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3828 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3829 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3830 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3831 Opc == AMDGPU::V_FMAC_F64_e64) {
3832 // Don't fold if we are using source or output modifiers. The new VOP2
3833 // instructions don't have them.
3835 return false;
3836
3837 // If this is a free constant, there's no reason to do this.
3838 // TODO: We could fold this here instead of letting SIFoldOperands do it
3839 // later.
3840 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3841
3842 // Any src operand can be used for the legality check.
3843 if (isInlineConstant(UseMI, Src0Idx, Imm))
3844 return false;
3845
3846 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3847
3848 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3849 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3850
3851 auto CopyRegOperandToNarrowerRC =
3852 [MRI, this](MachineInstr &MI, unsigned OpNo,
3853 const TargetRegisterClass *NewRC) -> void {
3854 if (!MI.getOperand(OpNo).isReg())
3855 return;
3856 Register Reg = MI.getOperand(OpNo).getReg();
3857 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3858 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3859 return;
3860 Register Tmp = MRI->createVirtualRegister(NewRC);
3861 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3862 get(AMDGPU::COPY), Tmp)
3863 .addReg(Reg);
3864 MI.getOperand(OpNo).setReg(Tmp);
3865 MI.getOperand(OpNo).setIsKill();
3866 };
3867
3868 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3869 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3870 (Src1->isReg() && Src1->getReg() == Reg)) {
3871 MachineOperand *RegSrc =
3872 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3873 if (!RegSrc->isReg())
3874 return false;
3875 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3876 ST.getConstantBusLimit(Opc) < 2)
3877 return false;
3878
3879 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3880 return false;
3881
3882 // If src2 is also a literal constant then we have to choose which one to
3883 // fold. In general it is better to choose madak so that the other literal
3884 // can be materialized in an sgpr instead of a vgpr:
3885 // s_mov_b32 s0, literal
3886 // v_madak_f32 v0, s0, v0, literal
3887 // Instead of:
3888 // v_mov_b32 v1, literal
3889 // v_madmk_f32 v0, v0, literal, v1
3890 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3891 if (Def && Def->isMoveImmediate() &&
3892 !isInlineConstant(Def->getOperand(1)))
3893 return false;
3894
3895 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3896 if (pseudoToMCOpcode(NewOpc) == -1)
3897 return false;
3898
3899 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3900 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3901
3902 // FIXME: This would be a lot easier if we could return a new instruction
3903 // instead of having to modify in place.
3904
3905 Register SrcReg = RegSrc->getReg();
3906 unsigned SrcSubReg = RegSrc->getSubReg();
3907 Src0->setReg(SrcReg);
3908 Src0->setSubReg(SrcSubReg);
3909 Src0->setIsKill(RegSrc->isKill());
3910
3911 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3912 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3913 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3914 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3915 UseMI.untieRegOperand(
3916 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3917
3918 Src1->ChangeToImmediate(*SubRegImm);
3919
3921 UseMI.setDesc(get(NewOpc));
3922
3923 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3924 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3925 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3926 Register Tmp = MRI->createVirtualRegister(NewRC);
3927 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3928 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3929 UseMI.getOperand(0).getReg())
3930 .addReg(Tmp, RegState::Kill);
3931 UseMI.getOperand(0).setReg(Tmp);
3932 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3933 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3934 }
3935
3936 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3937 if (DeleteDef)
3938 DefMI.eraseFromParent();
3939
3940 return true;
3941 }
3942
3943 // Added part is the constant: Use v_madak_{f16, f32}.
3944 if (Src2->isReg() && Src2->getReg() == Reg) {
3945 if (ST.getConstantBusLimit(Opc) < 2) {
3946 // Not allowed to use constant bus for another operand.
3947 // We can however allow an inline immediate as src0.
3948 bool Src0Inlined = false;
3949 if (Src0->isReg()) {
3950 // Try to inline constant if possible.
3951 // If the Def moves immediate and the use is single
3952 // We are saving VGPR here.
3953 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3954 if (Def && Def->isMoveImmediate() &&
3955 isInlineConstant(Def->getOperand(1)) &&
3956 MRI->hasOneNonDBGUse(Src0->getReg())) {
3957 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3958 Src0Inlined = true;
3959 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3960 RI.isSGPRReg(*MRI, Src0->getReg())) {
3961 return false;
3962 }
3963 // VGPR is okay as Src0 - fallthrough
3964 }
3965
3966 if (Src1->isReg() && !Src0Inlined) {
3967 // We have one slot for inlinable constant so far - try to fill it
3968 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3969 if (Def && Def->isMoveImmediate() &&
3970 isInlineConstant(Def->getOperand(1)) &&
3971 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3972 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3973 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3974 return false;
3975 // VGPR is okay as Src1 - fallthrough
3976 }
3977 }
3978
3979 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3980 if (pseudoToMCOpcode(NewOpc) == -1)
3981 return false;
3982
3983 // FIXME: This would be a lot easier if we could return a new instruction
3984 // instead of having to modify in place.
3985
3986 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3987 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3988 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3989 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3990 UseMI.untieRegOperand(
3991 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3992
3993 const std::optional<int64_t> SubRegImm =
3994 extractSubregFromImm(Imm, Src2->getSubReg());
3995
3996 // ChangingToImmediate adds Src2 back to the instruction.
3997 Src2->ChangeToImmediate(*SubRegImm);
3998
3999 // These come before src2.
4001 UseMI.setDesc(get(NewOpc));
4002
4003 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
4004 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
4005 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
4006 Register Tmp = MRI->createVirtualRegister(NewRC);
4007 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
4008 UseMI.getDebugLoc(), get(AMDGPU::COPY),
4009 UseMI.getOperand(0).getReg())
4010 .addReg(Tmp, RegState::Kill);
4011 UseMI.getOperand(0).setReg(Tmp);
4012 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
4013 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
4014 }
4015
4016 // It might happen that UseMI was commuted
4017 // and we now have SGPR as SRC1. If so 2 inlined
4018 // constant and SGPR are illegal.
4020
4021 bool DeleteDef = MRI->use_nodbg_empty(Reg);
4022 if (DeleteDef)
4023 DefMI.eraseFromParent();
4024
4025 return true;
4026 }
4027 }
4028
4029 return false;
4030}
4031
4032static bool
4035 if (BaseOps1.size() != BaseOps2.size())
4036 return false;
4037 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
4038 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
4039 return false;
4040 }
4041 return true;
4042}
4043
4044static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
4045 LocationSize WidthB, int OffsetB) {
4046 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
4047 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
4048 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
4049 return LowWidth.hasValue() &&
4050 LowOffset + (int)LowWidth.getValue() <= HighOffset;
4051}
4052
4053bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
4054 const MachineInstr &MIb) const {
4055 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
4056 int64_t Offset0, Offset1;
4057 LocationSize Dummy0 = LocationSize::precise(0);
4058 LocationSize Dummy1 = LocationSize::precise(0);
4059 bool Offset0IsScalable, Offset1IsScalable;
4060 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
4061 Dummy0, &RI) ||
4062 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
4063 Dummy1, &RI))
4064 return false;
4065
4066 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
4067 return false;
4068
4069 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
4070 // FIXME: Handle ds_read2 / ds_write2.
4071 return false;
4072 }
4073 LocationSize Width0 = MIa.memoperands().front()->getSize();
4074 LocationSize Width1 = MIb.memoperands().front()->getSize();
4075 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
4076}
4077
                                                  const MachineInstr &MIb) const {
  assert(MIa.mayLoadOrStore() &&
         "MIa must load from or modify a memory location");
  assert(MIb.mayLoadOrStore() &&
         "MIb must load from or modify a memory location");

  // NOTE(review): the guard condition for this early-out was lost in this
  // extracted view (presumably a check such as unmodeled side effects on
  // either instruction) — confirm against the full file.
    return false;

  // XXX - Can we relax this between address spaces?
  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // LDS DMA touches both an LDS and a VMEM location; treat it as potentially
  // aliasing everything.
  if (isLDSDMA(MIa) || isLDSDMA(MIb))
    return false;

  // The per-encoding checks below do not look inside bundles; be conservative.
  if (MIa.isBundle() || MIb.isBundle())
    return false;

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
  if (isDS(MIa)) {
    if (isDS(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    // DS only aliases a non-DS access if that access is a FLAT op that may
    // reach LDS (i.e. is not segment specific).
    return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
  }

  if (isMUBUF(MIa) || isMTBUF(MIa)) {
    if (isMUBUF(MIb) || isMTBUF(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    if (isFLAT(MIb))
      return isFLATScratch(MIb);

    return !isSMRD(MIb);
  }

  if (isSMRD(MIa)) {
    if (isSMRD(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    if (isFLAT(MIb))
      return isFLATScratch(MIb);

    return !isMUBUF(MIb) && !isMTBUF(MIb);
  }

  if (isFLAT(MIa)) {
    if (isFLAT(MIb)) {
      // Scratch and global address spaces are disjoint by definition.
      if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
          (isFLATGlobal(MIa) && isFLATScratch(MIb)))
        return true;

      return checkInstOffsetsDoNotOverlap(MIa, MIb);
    }

    return false;
  }

  return false;
}
4144
4146 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4147 if (Reg.isPhysical())
4148 return false;
4149 auto *Def = MRI.getUniqueVRegDef(Reg);
4150 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4151 Imm = Def->getOperand(1).getImm();
4152 if (DefMI)
4153 *DefMI = Def;
4154 return true;
4155 }
4156 return false;
4157}
4158
4159static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4160 MachineInstr **DefMI = nullptr) {
4161 if (!MO->isReg())
4162 return false;
4163 const MachineFunction *MF = MO->getParent()->getMF();
4164 const MachineRegisterInfo &MRI = MF->getRegInfo();
4165 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4166}
4167
4169 MachineInstr &NewMI) {
4170 if (LV) {
4171 unsigned NumOps = MI.getNumOperands();
4172 for (unsigned I = 1; I < NumOps; ++I) {
4173 MachineOperand &Op = MI.getOperand(I);
4174 if (Op.isReg() && Op.isKill())
4175 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4176 }
4177 }
4178}
4179
4180static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4181 switch (Opc) {
4182 case AMDGPU::V_MAC_F16_e32:
4183 case AMDGPU::V_MAC_F16_e64:
4184 return AMDGPU::V_MAD_F16_e64;
4185 case AMDGPU::V_MAC_F32_e32:
4186 case AMDGPU::V_MAC_F32_e64:
4187 return AMDGPU::V_MAD_F32_e64;
4188 case AMDGPU::V_MAC_LEGACY_F32_e32:
4189 case AMDGPU::V_MAC_LEGACY_F32_e64:
4190 return AMDGPU::V_MAD_LEGACY_F32_e64;
4191 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4192 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4193 return AMDGPU::V_FMA_LEGACY_F32_e64;
4194 case AMDGPU::V_FMAC_F16_e32:
4195 case AMDGPU::V_FMAC_F16_e64:
4196 case AMDGPU::V_FMAC_F16_t16_e64:
4197 case AMDGPU::V_FMAC_F16_fake16_e64:
4198 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4199 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4200 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4201 : AMDGPU::V_FMA_F16_gfx9_e64;
4202 case AMDGPU::V_FMAC_F32_e32:
4203 case AMDGPU::V_FMAC_F32_e64:
4204 return AMDGPU::V_FMA_F32_e64;
4205 case AMDGPU::V_FMAC_F64_e32:
4206 case AMDGPU::V_FMAC_F64_e64:
4207 return AMDGPU::V_FMA_F64_e64;
4208 default:
4209 llvm_unreachable("invalid instruction");
4210 }
4211}
4212
4213/// Helper struct for the implementation of 3-address conversion to communicate
4214/// updates made to instruction operands.
4216 /// Other instruction whose def is no longer used by the converted
4217 /// instruction.
4219};
4220
                                                 LiveVariables *LV,
                                                 LiveIntervals *LIS) const {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineInstr *CandidateMI = &MI;

  // For a single-instruction bundle, the bundled instruction (not the BUNDLE
  // header) is what gets converted.
  if (MI.isBundle()) {
    // This is a temporary placeholder for bundle handling that enables us to
    // exercise the relevant code paths in the two-address instruction pass.
    if (MI.getBundleSize() != 1)
      return nullptr;
    CandidateMI = MI.getNextNode();
  }

  // NOTE(review): the declaration of `U` (presumably `ThreeAddressUpdates U;`)
  // was lost in this extracted view — confirm against the full file.
  MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
  if (!NewMI)
    return nullptr;

  if (MI.isBundle()) {
    // The converted instruction replaces the bundled one; remove it and untie
    // any tied defs left on the bundle header.
    CandidateMI->eraseFromBundle();

    for (MachineOperand &MO : MI.all_defs()) {
      if (MO.isTied())
        MI.untieRegOperand(MO.getOperandNo());
    }
  } else {
    updateLiveVariables(LV, MI, *NewMI);
    if (LIS) {
      LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
      // SlotIndex of defs needs to be updated when converting to early-clobber
      MachineOperand &Def = NewMI->getOperand(0);
      if (Def.isEarlyClobber() && Def.isReg() &&
          LIS->hasInterval(Def.getReg())) {
        SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
        SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
        auto &LI = LIS->getInterval(Def.getReg());
        // Move the def's start (and its value number) from the normal slot to
        // the early-clobber slot, in the main range and all subranges.
        auto UpdateDefIndex = [&](LiveRange &LR) {
          auto *S = LR.find(OldIndex);
          if (S != LR.end() && S->start == OldIndex) {
            assert(S->valno && S->valno->def == OldIndex);
            S->start = NewIndex;
            S->valno->def = NewIndex;
          }
        };
        UpdateDefIndex(LI);
        for (auto &SR : LI.subranges())
          UpdateDefIndex(SR);
      }
    }
  }

  // If the conversion folded an immediate and made its defining instruction's
  // result unused, clean up that def and the surrounding liveness info.
  if (U.RemoveMIUse) {
    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    // The only user is the instruction which will be killed.
    Register DefReg = U.RemoveMIUse->getOperand(0).getReg();

    if (MRI.hasOneNonDBGUse(DefReg)) {
      // We cannot just remove the DefMI here, calling pass will crash.
      U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
      U.RemoveMIUse->getOperand(0).setIsDead(true);
      // Strip every operand except the (now implicit) def.
      for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
        U.RemoveMIUse->removeOperand(I);
      if (LV)
        LV->getVarInfo(DefReg).AliveBlocks.clear();
    }

    if (MI.isBundle()) {
      // Drop the bundle header's use of DefReg if nothing inside the bundle
      // still reads or writes it.
      VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
      if (!VRI.Reads && !VRI.Writes) {
        for (MachineOperand &MO : MI.all_uses()) {
          if (MO.isReg() && MO.getReg() == DefReg) {
            assert(MO.getSubReg() == 0 &&
                   "tied sub-registers in bundles currently not supported");
            MI.removeOperand(MO.getOperandNo());
            break;
          }
        }

        if (LIS)
          LIS->shrinkToUses(&LIS->getInterval(DefReg));
      }
    } else if (LIS) {
      LiveInterval &DefLI = LIS->getInterval(DefReg);

      // We cannot delete the original instruction here, so hack out the use
      // in the original instruction with a dummy register so we can use
      // shrinkToUses to deal with any multi-use edge cases. Other targets do
      // not have the complexity of deleting a use to consider here.
      Register DummyReg = MRI.cloneVirtualRegister(DefReg);
      for (MachineOperand &MIOp : MI.uses()) {
        if (MIOp.isReg() && MIOp.getReg() == DefReg) {
          MIOp.setIsUndef(true);
          MIOp.setReg(DummyReg);
        }
      }

      // NOTE(review): this nested isBundle() branch appears unreachable in
      // this view (outer branch is the non-bundle path) — confirm against the
      // full file.
      if (MI.isBundle()) {
        VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
        if (!VRI.Reads && !VRI.Writes) {
          for (MachineOperand &MIOp : MI.uses()) {
            if (MIOp.isReg() && MIOp.getReg() == DefReg) {
              MIOp.setIsUndef(true);
              MIOp.setReg(DummyReg);
            }
          }
        }

        MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
                                                false, /*isUndef=*/true));
      }

      LIS->shrinkToUses(&DefLI);
    }
  }

  // For bundles the (updated) bundle header is what the caller keeps.
  return MI.isBundle() ? &MI : NewMI;
}
4339
// Core of 3-address conversion: build the replacement instruction for MFMA,
// WMMA and MAC/FMAC forms, preferring fmaak/fmamk when an immediate can be
// folded. Returns null when no conversion applies.
// NOTE(review): the `MachineInstr *` return-type line preceding this was lost
// in this extracted view.
SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
                                       ThreeAddressUpdates &U) const {
  MachineBasicBlock &MBB = *MI.getParent();
  unsigned Opc = MI.getOpcode();

  // Handle MFMA.
  int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
  if (NewMFMAOpc != -1) {
    // NOTE(review): the `MachineInstrBuilder MIB =` line was lost in this
    // extracted view — confirm against the full file.
        BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
    for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
      MIB.add(MI.getOperand(I));
    return MIB;
  }

  if (SIInstrInfo::isWMMA(MI)) {
    unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
    MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                                  .setMIFlags(MI.getFlags());
    for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
      MIB->addOperand(MI.getOperand(I));
    return MIB;
  }

  assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
         Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
         "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
         "present pre-RA");

  // Handle MAC/FMAC.
  bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
  bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
                  Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
                  Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
                  Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
  bool Src0Literal = false;

  switch (Opc) {
  default:
    return nullptr;
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_t16_e64:
  case AMDGPU::V_FMAC_F16_fake16_e64:
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAC_LEGACY_F32_e64:
  case AMDGPU::V_FMAC_F32_e64:
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
  case AMDGPU::V_FMAC_F64_e64:
    break;
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_MAC_F32_e32:
  case AMDGPU::V_MAC_LEGACY_F32_e32:
  case AMDGPU::V_FMAC_F32_e32:
  case AMDGPU::V_FMAC_LEGACY_F32_e32:
  case AMDGPU::V_FMAC_F64_e32: {
    // VOP2 forms: note whether src0 is a non-inlinable literal; that restricts
    // which conversions are legal below.
    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::src0);
    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
    if (!Src0->isReg() && !Src0->isImm())
      return nullptr;

    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
      Src0Literal = true;

    break;
  }
  }

  MachineInstrBuilder MIB;
  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
  const MachineOperand *Src0Mods =
      getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mods =
      getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
  const MachineOperand *Src2Mods =
      getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
  const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);

  // fmaak/fmamk have no modifier operands; only try them when none are used
  // and the constant bus restriction cannot be violated.
  if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
      (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
      // If we have an SGPR input, we will violate the constant bus restriction.
      (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
       !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
    MachineInstr *DefMI;

    int64_t Imm;
    // src2 is a foldable immediate: use the "addend is a constant" form.
    if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
      unsigned NewOpc = getNewFMAAKInst(ST, Opc);
      if (pseudoToMCOpcode(NewOpc) != -1) {
        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src0)
                  .add(*Src1)
                  .addImm(Imm)
                  .setMIFlags(MI.getFlags());
        U.RemoveMIUse = DefMI;
        return MIB;
      }
    }
    // src1 is a foldable immediate: use the "multiplier is a constant" form.
    unsigned NewOpc = getNewFMAMKInst(ST, Opc);
    if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
      if (pseudoToMCOpcode(NewOpc) != -1) {
        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src0)
                  .addImm(Imm)
                  .add(*Src2)
                  .setMIFlags(MI.getFlags());
        U.RemoveMIUse = DefMI;
        return MIB;
      }
    }
    // src0 immediate: commute it into the fmamk constant slot if src1 is legal
    // as src0.
    if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
      if (Src0Literal) {
        Imm = Src0->getImm();
        DefMI = nullptr;
      }
      // NOTE(review): an `isOperandLegal(` call line was lost in this
      // extracted view — confirm against the full file.
      if (pseudoToMCOpcode(NewOpc) != -1 &&
              MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
              Src1)) {
        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src1)
                  .addImm(Imm)
                  .add(*Src2)
                  .setMIFlags(MI.getFlags());
        U.RemoveMIUse = DefMI;
        return MIB;
      }
    }
  }

  // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
  // if VOP3 does not allow a literal operand.
  if (Src0Literal && !ST.hasVOP3Literal())
    return nullptr;

  // Fall back to the generic VOP3 mad/fma form, carrying all modifiers over.
  unsigned NewOpc = getNewFMAInst(ST, Opc);

  if (pseudoToMCOpcode(NewOpc) == -1)
    return nullptr;

  MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
            .add(*Dst)
            .addImm(Src0Mods ? Src0Mods->getImm() : 0)
            .add(*Src0)
            .addImm(Src1Mods ? Src1Mods->getImm() : 0)
            .add(*Src1)
            .addImm(Src2Mods ? Src2Mods->getImm() : 0)
            .add(*Src2)
            .addImm(Clamp ? Clamp->getImm() : 0)
            .addImm(Omod ? Omod->getImm() : 0)
            .setMIFlags(MI.getFlags());
  if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
    MIB.addImm(OpSel ? OpSel->getImm() : 0);
  return MIB;
}
4506
4507// It's not generally safe to move VALU instructions across these since it will
4508// start using the register as a base index rather than directly.
4509// XXX - Why isn't hasSideEffects sufficient for these?
4511 switch (MI.getOpcode()) {
4512 case AMDGPU::S_SET_GPR_IDX_ON:
4513 case AMDGPU::S_SET_GPR_IDX_MODE:
4514 case AMDGPU::S_SET_GPR_IDX_OFF:
4515 return true;
4516 default:
4517 return false;
4518 }
4519}
4520
4522 const MachineBasicBlock *MBB,
4523 const MachineFunction &MF) const {
4524 // Skipping the check for SP writes in the base implementation. The reason it
4525 // was added was apparently due to compile time concerns.
4526 //
4527 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4528 // but is probably avoidable.
4529
4530 // Copied from base implementation.
4531 // Terminators and labels can't be scheduled around.
4532 if (MI.isTerminator() || MI.isPosition())
4533 return true;
4534
4535 // INLINEASM_BR can jump to another block
4536 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4537 return true;
4538
4539 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4540 return true;
4541
4542 // Target-independent instructions do not have an implicit-use of EXEC, even
4543 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4544 // boundaries prevents incorrect movements of such instructions.
4545 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4546 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4547 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4548 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4549 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4551}
4552
4554 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4555 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4556 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4557}
4558
4560 // Instructions that access scratch use FLAT encoding or BUF encodings.
4561 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4562 return false;
4563
4564 // SCRATCH instructions always access scratch.
4565 if (isFLATScratch(MI))
4566 return true;
4567
4568 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4569 // via the aperture.
4570 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4571 return false;
4572
4573 // If there are no memory operands then conservatively assume the flat
4574 // operation may access scratch.
4575 if (MI.memoperands_empty())
4576 return true;
4577
4578 // See if any memory operand specifies an address space that involves scratch.
4579 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4580 unsigned AS = Memop->getAddrSpace();
4581 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4582 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4583 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4584 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4585 }
4586 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4587 });
4588}
4589
  assert(isFLAT(MI));

  // All flat instructions use the VMEM counter except prefetch.
  if (!usesVM_CNT(MI))
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access VMEM.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
  // (GDS) address space is not supported by flat operations. Therefore, simply
  // return true unless only the LDS address space is found.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    // NOTE(review): a line was lost here in this extracted view (presumably
    // an assert that AS is not the REGION/GDS address space, per the comment
    // above) — confirm against the full file.
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      return true;
  }

  return false;
}
4616
  assert(isFLAT(MI));

  // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
  if (!usesLGKM_CNT(MI))
    return false;

  // If in tgsplit mode then there can be no use of LDS.
  if (ST.isTgSplitEnabled())
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access LDS.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves LDS.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    // NOTE(review): the condition guarding this return was lost in this
    // extracted view (presumably a check that AS is the LOCAL or FLAT address
    // space) — confirm against the full file.
      return true;
  }

  return false;
}
4642
4644 // Skip the full operand and register alias search modifiesRegister
4645 // does. There's only a handful of instructions that touch this, it's only an
4646 // implicit def, and doesn't alias any other registers.
4647 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4648}
4649
4651 unsigned Opcode = MI.getOpcode();
4652
4653 if (MI.mayStore() && isSMRD(MI))
4654 return true; // scalar store or atomic
4655
4656 // This will terminate the function when other lanes may need to continue.
4657 if (MI.isReturn())
4658 return true;
4659
4660 // These instructions cause shader I/O that may cause hardware lockups
4661 // when executed with an empty EXEC mask.
4662 //
4663 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4664 // EXEC = 0, but checking for that case here seems not worth it
4665 // given the typical code patterns.
4666 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4667 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4668 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
4669 Opcode == AMDGPU::S_SETHALT)
4670 return true;
4671
4672 if (MI.isCall() || MI.isInlineAsm())
4673 return true; // conservative assumption
4674
4675 // Assume that barrier interactions are only intended with active lanes.
4676 if (isBarrier(Opcode))
4677 return true;
4678
4679 // A mode change is a scalar operation that influences vector instructions.
4681 return true;
4682
4683 // These are like SALU instructions in terms of effects, so it's questionable
4684 // whether we should return true for those.
4685 //
4686 // However, executing them with EXEC = 0 causes them to operate on undefined
4687 // data, which we avoid by returning true here.
4688 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4689 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4690 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4691 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4692 return true;
4693
4694 return false;
4695}
4696
4698 const MachineInstr &MI) const {
4699 if (MI.isMetaInstruction())
4700 return false;
4701
4702 // This won't read exec if this is an SGPR->SGPR copy.
4703 if (MI.isCopyLike()) {
4704 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4705 return true;
4706
4707 // Make sure this isn't copying exec as a normal operand
4708 return MI.readsRegister(AMDGPU::EXEC, &RI);
4709 }
4710
4711 // Make a conservative assumption about the callee.
4712 if (MI.isCall())
4713 return true;
4714
4715 // Be conservative with any unhandled generic opcodes.
4716 if (!isTargetSpecificOpcode(MI.getOpcode()))
4717 return true;
4718
4719 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4720}
4721
4722bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4723 switch (Imm.getBitWidth()) {
4724 case 1: // This likely will be a condition code mask.
4725 return true;
4726
4727 case 32:
4728 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4729 ST.hasInv2PiInlineImm());
4730 case 64:
4731 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4732 ST.hasInv2PiInlineImm());
4733 case 16:
4734 return ST.has16BitInsts() &&
4735 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4736 ST.hasInv2PiInlineImm());
4737 default:
4738 llvm_unreachable("invalid bitwidth");
4739 }
4740}
4741
  // Decide inlinability from the raw bit pattern, dispatching on the float
  // semantics of the immediate.
  APInt IntImm = Imm.bitcastToAPInt();
  int64_t IntImmVal = IntImm.getSExtValue();
  bool HasInv2Pi = ST.hasInv2PiInlineImm();
  switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
  default:
    llvm_unreachable("invalid fltSemantics");
  // NOTE(review): case labels were lost here in this extracted view
  // (presumably the single/double-precision semantics, deferring to the APInt
  // overload) — confirm against the full file.
    return isInlineConstant(IntImm);
  // NOTE(review): case label lost in this view (presumably bfloat).
    return ST.has16BitInsts() &&
           AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
  // NOTE(review): case label lost in this view (presumably IEEE half).
    return ST.has16BitInsts() &&
           AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
  }
}
4760
bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
  // MachineOperand provides no way to tell the true operand size, since it only
  // records a 64-bit value. We need to know the size to determine if a 32-bit
  // floating point immediate bit pattern is legal for an integer immediate. It
  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
  // NOTE(review): the AMDGPU::OPERAND_* case labels throughout this switch
  // were lost in this extracted view — confirm each group against the full
  // file.
  switch (OperandType) {
  // (32-bit operand-type cases:)
    int32_t Trunc = static_cast<int32_t>(Imm);
    return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
  }
  // (64-bit operand-type cases:)
    return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
  // (packed 16-bit FP operand-type cases:)
    // We would expect inline immediates to not be concerned with an integer/fp
    // distinction. However, in the case of 16-bit integer operations, the
    // "floating point" values appear to not work. It seems read the low 16-bits
    // of 32-bit immediates, which happens to always work for the integer
    // values.
    //
    // See llvm bugzilla 46302.
    //
    // TODO: Theoretically we could use op-sel to use the high bits of the
    // 32-bit FP values.
    return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
  // (operand types that never take inline constants:)
    return false;
  // (16-bit FP operand-type cases:)
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      // A few special case instructions have 16-bit operands on subtargets
      // where 16-bit instructions are not legal.
      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
      // constants in these cases
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
    }

    return false;
  }
  // (16-bit BF16 operand-type cases:)
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
    }
    return false;
  }
  // (operand types that never inline:)
    return false;
  // (64-bit AV pseudo operand case:)
    return isLegalAV64PseudoImm(Imm);
  // (KIMM-style operand cases:)
    // Always embedded in the instruction for free.
    return true;
  // (remaining operand cases:)
    // Just ignore anything else.
    return true;
  default:
    llvm_unreachable("invalid operand type");
  }
}
4859
// Compare two MachineOperands for equality: operands of different kinds never
// compare equal; registers compare by register, immediates by value.
4860static bool compareMachineOp(const MachineOperand &Op0,
4861 const MachineOperand &Op1) {
4862 if (Op0.getType() != Op1.getType())
4863 return false;
4864
4865 switch (Op0.getType()) {
// Register operands: equal when they name the same register.
4867 return Op0.getReg() == Op1.getReg();
// Immediate operands: equal when the immediate values match.
4869 return Op0.getImm() == Op1.getImm();
4870 default:
4871 llvm_unreachable("Didn't expect to be comparing these operand types");
4872 }
4873}
4874
// Whether an operand described by \p OpInfo may be encoded as a literal
// constant: plain MC immediates always may; otherwise the operand type must
// accept literals, and an SI source operand on a VOP3 instruction additionally
// requires subtarget support for VOP3 literals.
4876 const MCOperandInfo &OpInfo) const {
4877 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4878 return true;
4879
4880 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4881 return false;
4882
// Non-VOP3 (or non-SI-source) operands that can take literals need no
// further subtarget check.
4883 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4884 return true;
4885
4886 return ST.hasVOP3Literal();
4887}
4888
4889bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4890 int64_t ImmVal) const {
4891 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4892 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4893 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4894 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4895 AMDGPU::OpName::src2))
4896 return false;
4897 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4898 }
4899
4900 return isLiteralOperandLegal(InstDesc, OpInfo);
4901}
4902
4903bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4904 const MachineOperand &MO) const {
4905 if (MO.isImm())
4906 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4907
4908 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4909 "unexpected imm-like operand kind");
4910 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4911 return isLiteralOperandLegal(InstDesc, OpInfo);
4912}
4913
// A 64-bit immediate is legal for the AV 64-bit pseudo when each 32-bit half
// is independently an inlinable 32-bit literal.
4915 // 2 32-bit inline constants packed into one.
4916 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4917 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4918}
4919
4920bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4921 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4922 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4923 return false;
4924
4925 int Op32 = AMDGPU::getVOPe32(Opcode);
4926 if (Op32 == -1)
4927 return false;
4928
4929 return pseudoToMCOpcode(Op32) != -1;
4930}
4931
4932bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4933 // The src0_modifier operand is present on all instructions
4934 // that have modifiers.
4935
4936 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4937}
4938
// Return true when \p MI has the named modifier operand and its immediate
// value is non-zero (i.e. the modifier is actually set).
4940 AMDGPU::OpName OpName) const {
4941 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4942 return Mods && Mods->getImm();
4943}
4944
// Return true when any modifier operand listed in ModifierOpNames is set on
// \p MI.
4946 return any_of(ModifierOpNames,
4947 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4948}
4949
// Whether this instruction can be shrunk to its 32-bit VOP encoding:
// a src2 operand is only tolerated for a small set of opcodes; src1 and src2
// must be plain VGPRs without modifiers; src0 must carry no modifiers; a
// valid 32-bit opcode must exist; and no output modifiers may be set.
4951 const MachineRegisterInfo &MRI) const {
4952 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4953 // Can't shrink instruction with three operands.
4954 if (Src2) {
4955 switch (MI.getOpcode()) {
4956 default: return false;
4957
// Carry ops keep src2 (the carry-in) as an implicit VCC use after
// shrinking; src1 must still be a VGPR.
4958 case AMDGPU::V_ADDC_U32_e64:
4959 case AMDGPU::V_SUBB_U32_e64:
4960 case AMDGPU::V_SUBBREV_U32_e64: {
4961 const MachineOperand *Src1
4962 = getNamedOperand(MI, AMDGPU::OpName::src1)
4963 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4964 return false;
4965 // Additional verification is needed for sdst/src2.
4966 return true;
4967 }
// MAC/FMAC forms: src2 must be a bare VGPR with no modifiers.
4968 case AMDGPU::V_MAC_F16_e64:
4969 case AMDGPU::V_MAC_F32_e64:
4970 case AMDGPU::V_MAC_LEGACY_F32_e64:
4971 case AMDGPU::V_FMAC_F16_e64:
4972 case AMDGPU::V_FMAC_F16_t16_e64:
4973 case AMDGPU::V_FMAC_F16_fake16_e64:
4974 case AMDGPU::V_FMAC_F32_e64:
4975 case AMDGPU::V_FMAC_F64_e64:
4976 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4977 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4978 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4979 return false;
4980 break;
4981
4982 case AMDGPU::V_CNDMASK_B32_e64:
4983 break;
4984 }
4985 }
4986
4987 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4988 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4989 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4990 return false;
4991
4992 // We don't need to check src0, all input types are legal, so just make sure
4993 // src0 isn't using any modifiers.
4994 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4995 return false;
4996
4997 // Can it be shrunk to a valid 32 bit opcode?
4998 if (!hasVALU32BitEncoding(MI.getOpcode()))
4999 return false;
5000
5001 // Check output modifiers
5002 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
5003 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
5004 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
5005 // TODO: Can we avoid checking bound_ctrl/fi here?
5006 // They are only used by permlane*_swap special case.
5007 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
5008 !hasModifiersSet(MI, AMDGPU::OpName::fi);
5009}
5010
5011// Set VCC operand with all flags from \p Orig, except for setting it as
5012// implicit.
// Only the undef and kill flags are transferred, onto the first implicit
// VCC or VCC_LO use of \p MI; the operand itself stays implicit.
5014 const MachineOperand &Orig) {
5015
5016 for (MachineOperand &Use : MI.implicit_operands()) {
5017 if (Use.isUse() &&
5018 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
5019 Use.setIsUndef(Orig.isUndef());
5020 Use.setIsKill(Orig.isKill());
5021 return;
5022 }
5023 }
5024}
5025
// Build a shrunk (32-bit encoded) copy of \p MI with opcode \p Op32,
// inserted before \p MI. Defs are copied in order (the shrunk opcode drops
// the trailing def per the comment below), then the explicit uses; a src2
// with no slot in the 32-bit form becomes an implicit VCC read.
5027 unsigned Op32) const {
5028 MachineBasicBlock *MBB = MI.getParent();
5029
5030 const MCInstrDesc &Op32Desc = get(Op32);
5031 MachineInstrBuilder Inst32 =
5032 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
5033 .setMIFlags(MI.getFlags());
5034
5035 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
5036 // For VOPC instructions, this is replaced by an implicit def of vcc.
5037
5038 // We assume the defs of the shrunk opcode are in the same order, and the
5039 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
5040 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
5041 Inst32.add(MI.getOperand(I));
5042
5043 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
5044
// Walk the explicit uses, tracking the operand type from the descriptor.
5045 int Idx = MI.getNumExplicitDefs();
5046 for (const MachineOperand &Use : MI.explicit_uses()) {
5047 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
5049 continue;
5050
5051 if (&Use == Src2) {
5052 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
5053 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
5054 // replaced with an implicit read of vcc or vcc_lo. The implicit read
5055 // of vcc was already added during the initial BuildMI, but we
5056 // 1) may need to change vcc to vcc_lo to preserve the original register
5057 // 2) have to preserve the original flags.
5058 copyFlagsToImplicitVCC(*Inst32, *Src2);
5059 continue;
5060 }
5061 }
5062
5063 Inst32.add(Use);
5064 }
5065
5066 // FIXME: Losing implicit operands
5067 fixImplicitOperands(*Inst32);
5068 return Inst32;
5069}
5070
// Whether a use of the physical register operand \p RegOp consumes a
// constant-bus slot: the null register is free; implicit uses count only for
// VCC/VCC_LO/M0; otherwise any 32- or 64-bit SGPR counts.
5072 // Null is free
5073 Register Reg = RegOp.getReg();
5074 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5075 return false;
5076
5077 // SGPRs use the constant bus
5078
5079 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5080 // physical register operands should also count, except for exec.
5081 if (RegOp.isImplicit())
5082 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5083
5084 // SGPRs use the constant bus
5085 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5086 AMDGPU::SReg_64RegClass.contains(Reg);
5087}
5088
// Whether a use of register operand \p RegOp consumes a constant-bus slot:
// virtual registers count when they have an SGPR register class; physical
// registers defer to physRegUsesConstantBus().
5090 const MachineRegisterInfo &MRI) const {
5091 Register Reg = RegOp.getReg();
5092 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5093 : physRegUsesConstantBus(RegOp);
5094}
5095
// Whether operand \p MO consumes a constant-bus slot: non-register operands
// count unless they qualify as inline constants; virtual registers count
// when SGPR-classed.
5097 const MachineOperand &MO,
5098 const MCOperandInfo &OpInfo) const {
5099 // Literal constants use the constant bus.
5100 if (!MO.isReg())
5101 return !isInlineConstant(MO, OpInfo);
5102
5103 Register Reg = MO.getReg();
5104 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5106}
5107
// Return the first implicitly read special scalar register on \p MI
// (VCC, VCC_LO, VCC_HI, M0, or FLAT_SCR), or an invalid Register if none.
5109 for (const MachineOperand &MO : MI.implicit_operands()) {
5110 // We only care about reads.
5111 if (MO.isDef())
5112 continue;
5113
5114 switch (MO.getReg()) {
5115 case AMDGPU::VCC:
5116 case AMDGPU::VCC_LO:
5117 case AMDGPU::VCC_HI:
5118 case AMDGPU::M0:
5119 case AMDGPU::FLAT_SCR:
5120 return MO.getReg();
5121
5122 default:
5123 break;
5124 }
5125 }
5126
// No special SGPR read found.
5127 return Register();
5128}
5129
// Whether \p MI is expected to carry an implicit read of the exec mask:
// VALU instructions do, except the lane-access opcodes listed below;
// pre-ISel and generic opcodes (among others) do not.
5130static bool shouldReadExec(const MachineInstr &MI) {
5131 if (SIInstrInfo::isVALU(MI)) {
5132 switch (MI.getOpcode()) {
// Lane-access instructions operate independently of the exec mask.
5133 case AMDGPU::V_READLANE_B32:
5134 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5135 case AMDGPU::V_WRITELANE_B32:
5136 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5137 return false;
5138 }
5139
5140 return true;
5141 }
5142
5143 if (MI.isPreISelOpcode() ||
5144 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5147 return false;
5148
5149 return true;
5150}
5151
5152static bool isRegOrFI(const MachineOperand &MO) {
5153 return MO.isReg() || MO.isFI();
5154}
5155
5156static bool isSubRegOf(const SIRegisterInfo &TRI,
5157 const MachineOperand &SuperVec,
5158 const MachineOperand &SubReg) {
5159 if (SubReg.getReg().isPhysical())
5160 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5161
5162 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5163 SubReg.getReg() == SuperVec.getReg();
5164}
5165
5166// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5167bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5168 const MachineRegisterInfo &MRI,
5169 StringRef &ErrInfo) const {
5170 Register DstReg = MI.getOperand(0).getReg();
5171 Register SrcReg = MI.getOperand(1).getReg();
5172 // This is a check for copy from vector register to SGPR
5173 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5174 ErrInfo = "illegal copy from vector register to SGPR";
5175 return false;
5176 }
5177 return true;
5178}
5179
5181 StringRef &ErrInfo) const {
5182 uint32_t Opcode = MI.getOpcode();
5183 const MachineFunction *MF = MI.getMF();
5184 const MachineRegisterInfo &MRI = MF->getRegInfo();
5185
5186 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5187 // Find a better property to recognize the point where instruction selection
5188 // is just done.
5189 // We can only enforce this check after SIFixSGPRCopies pass so that the
5190 // illegal copies are legalized and thereafter we don't expect a pass
5191 // inserting similar copies.
5192 if (!MRI.isSSA() && MI.isCopy())
5193 return verifyCopy(MI, MRI, ErrInfo);
5194
5195 if (SIInstrInfo::isGenericOpcode(Opcode))
5196 return true;
5197
5198 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5199 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5200 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5201 int Src3Idx = -1;
5202 if (Src0Idx == -1) {
5203 // VOPD V_DUAL_* instructions use different operand names.
5204 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5205 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5206 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5207 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5208 }
5209
5210 // Make sure the number of operands is correct.
5211 const MCInstrDesc &Desc = get(Opcode);
5212 if (!Desc.isVariadic() &&
5213 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5214 ErrInfo = "Instruction has wrong number of operands.";
5215 return false;
5216 }
5217
5218 if (MI.isInlineAsm()) {
5219 // Verify register classes for inlineasm constraints.
5220 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5221 I != E; ++I) {
5222 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5223 if (!RC)
5224 continue;
5225
5226 const MachineOperand &Op = MI.getOperand(I);
5227 if (!Op.isReg())
5228 continue;
5229
5230 Register Reg = Op.getReg();
5231 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5232 ErrInfo = "inlineasm operand has incorrect register class.";
5233 return false;
5234 }
5235 }
5236
5237 return true;
5238 }
5239
5240 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5241 ErrInfo = "missing memory operand from image instruction.";
5242 return false;
5243 }
5244
5245 // Make sure the register classes are correct.
5246 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5247 const MachineOperand &MO = MI.getOperand(i);
5248 if (MO.isFPImm()) {
5249 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5250 "all fp values to integers.";
5251 return false;
5252 }
5253
5254 const MCOperandInfo &OpInfo = Desc.operands()[i];
5255 int16_t RegClass = getOpRegClassID(OpInfo);
5256
5257 switch (OpInfo.OperandType) {
5259 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5260 ErrInfo = "Illegal immediate value for operand.";
5261 return false;
5262 }
5263 break;
5277 break;
5279 break;
5280 break;
5294 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5295 ErrInfo = "Illegal immediate value for operand.";
5296 return false;
5297 }
5298 break;
5299 }
5302 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5303 ErrInfo = "Expected inline constant for operand.";
5304 return false;
5305 }
5306 break;
5309 break;
5314 // Check if this operand is an immediate.
5315 // FrameIndex operands will be replaced by immediates, so they are
5316 // allowed.
5317 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5318 ErrInfo = "Expected immediate, but got non-immediate";
5319 return false;
5320 }
5321 break;
5325 break;
5326 default:
5327 if (OpInfo.isGenericType())
5328 continue;
5329 break;
5330 }
5331
5332 if (!MO.isReg())
5333 continue;
5334 Register Reg = MO.getReg();
5335 if (!Reg)
5336 continue;
5337
5338 // FIXME: Ideally we would have separate instruction definitions with the
5339 // aligned register constraint.
5340 // FIXME: We do not verify inline asm operands, but custom inline asm
5341 // verification is broken anyway
5342 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5343 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5344 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5345 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5346 if (const TargetRegisterClass *SubRC =
5347 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5348 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5349 if (RC)
5350 RC = SubRC;
5351 }
5352 }
5353
5354 // Check that this is the aligned version of the class.
5355 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5356 ErrInfo = "Subtarget requires even aligned vector registers";
5357 return false;
5358 }
5359 }
5360
5361 if (RegClass != -1) {
5362 if (Reg.isVirtual())
5363 continue;
5364
5365 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5366 if (!RC->contains(Reg)) {
5367 ErrInfo = "Operand has incorrect register class.";
5368 return false;
5369 }
5370 }
5371 }
5372
5373 // Verify SDWA
5374 if (isSDWA(MI)) {
5375 if (!ST.hasSDWA()) {
5376 ErrInfo = "SDWA is not supported on this target";
5377 return false;
5378 }
5379
5380 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5381 AMDGPU::OpName::dst_sel}) {
5382 const MachineOperand *MO = getNamedOperand(MI, Op);
5383 if (!MO)
5384 continue;
5385 int64_t Imm = MO->getImm();
5386 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5387 ErrInfo = "Invalid SDWA selection";
5388 return false;
5389 }
5390 }
5391
5392 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5393
5394 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5395 if (OpIdx == -1)
5396 continue;
5397 const MachineOperand &MO = MI.getOperand(OpIdx);
5398
5399 if (!ST.hasSDWAScalar()) {
5400 // Only VGPRS on VI
5401 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5402 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5403 return false;
5404 }
5405 } else {
5406 // No immediates on GFX9
5407 if (!MO.isReg()) {
5408 ErrInfo =
5409 "Only reg allowed as operands in SDWA instructions on GFX9+";
5410 return false;
5411 }
5412 }
5413 }
5414
5415 if (!ST.hasSDWAOmod()) {
5416 // No omod allowed on VI
5417 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5418 if (OMod != nullptr &&
5419 (!OMod->isImm() || OMod->getImm() != 0)) {
5420 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5421 return false;
5422 }
5423 }
5424
5425 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5426 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5427 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5428 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5429 const MachineOperand *Src0ModsMO =
5430 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5431 unsigned Mods = Src0ModsMO->getImm();
5432 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5433 Mods & SISrcMods::SEXT) {
5434 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5435 return false;
5436 }
5437 }
5438
5439 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5440 if (isVOPC(BasicOpcode)) {
5441 if (!ST.hasSDWASdst() && DstIdx != -1) {
5442 // Only vcc allowed as dst on VI for VOPC
5443 const MachineOperand &Dst = MI.getOperand(DstIdx);
5444 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5445 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5446 return false;
5447 }
5448 } else if (!ST.hasSDWAOutModsVOPC()) {
5449 // No clamp allowed on GFX9 for VOPC
5450 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5451 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5452 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5453 return false;
5454 }
5455
5456 // No omod allowed on GFX9 for VOPC
5457 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5458 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5459 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5460 return false;
5461 }
5462 }
5463 }
5464
5465 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5466 if (DstUnused && DstUnused->isImm() &&
5467 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5468 const MachineOperand &Dst = MI.getOperand(DstIdx);
5469 if (!Dst.isReg() || !Dst.isTied()) {
5470 ErrInfo = "Dst register should have tied register";
5471 return false;
5472 }
5473
5474 const MachineOperand &TiedMO =
5475 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5476 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5477 ErrInfo =
5478 "Dst register should be tied to implicit use of preserved register";
5479 return false;
5480 }
5481 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5482 ErrInfo = "Dst register should use same physical register as preserved";
5483 return false;
5484 }
5485 }
5486 }
5487
5488 // Verify MIMG / VIMAGE / VSAMPLE
5489 if (isImage(Opcode) && !MI.mayStore()) {
5490 // Ensure that the return type used is large enough for all the options
5491 // being used TFE/LWE require an extra result register.
5492 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5493 if (DMask) {
5494 uint64_t DMaskImm = DMask->getImm();
5495 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5496 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5497 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5498 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5499
5500 // Adjust for packed 16 bit values
5501 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5502 RegCount = divideCeil(RegCount, 2);
5503
5504 // Adjust if using LWE or TFE
5505 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5506 RegCount += 1;
5507
5508 const uint32_t DstIdx =
5509 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5510 const MachineOperand &Dst = MI.getOperand(DstIdx);
5511 if (Dst.isReg()) {
5512 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5513 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5514 if (RegCount > DstSize) {
5515 ErrInfo = "Image instruction returns too many registers for dst "
5516 "register class";
5517 return false;
5518 }
5519 }
5520 }
5521 }
5522
5523 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5524 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5525 unsigned ConstantBusCount = 0;
5526 bool UsesLiteral = false;
5527 const MachineOperand *LiteralVal = nullptr;
5528
5529 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5530 if (ImmIdx != -1) {
5531 ++ConstantBusCount;
5532 UsesLiteral = true;
5533 LiteralVal = &MI.getOperand(ImmIdx);
5534 }
5535
5536 SmallVector<Register, 2> SGPRsUsed;
5537 Register SGPRUsed;
5538
5539 // Only look at the true operands. Only a real operand can use the constant
5540 // bus, and we don't want to check pseudo-operands like the source modifier
5541 // flags.
5542 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5543 if (OpIdx == -1)
5544 continue;
5545 const MachineOperand &MO = MI.getOperand(OpIdx);
5546 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5547 if (MO.isReg()) {
5548 SGPRUsed = MO.getReg();
5549 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5550 ++ConstantBusCount;
5551 SGPRsUsed.push_back(SGPRUsed);
5552 }
5553 } else if (!MO.isFI()) { // Treat FI like a register.
5554 if (!UsesLiteral) {
5555 ++ConstantBusCount;
5556 UsesLiteral = true;
5557 LiteralVal = &MO;
5558 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5559 assert(isVOP2(MI) || isVOP3(MI));
5560 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5561 return false;
5562 }
5563 }
5564 }
5565 }
5566
5567 SGPRUsed = findImplicitSGPRRead(MI);
5568 if (SGPRUsed) {
5569 // Implicit uses may safely overlap true operands
5570 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5571 return !RI.regsOverlap(SGPRUsed, SGPR);
5572 })) {
5573 ++ConstantBusCount;
5574 SGPRsUsed.push_back(SGPRUsed);
5575 }
5576 }
5577
5578 // v_writelane_b32 is an exception from constant bus restriction:
5579 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5580 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5581 Opcode != AMDGPU::V_WRITELANE_B32) {
5582 ErrInfo = "VOP* instruction violates constant bus restriction";
5583 return false;
5584 }
5585
5586 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5587 ErrInfo = "VOP3 instruction uses literal";
5588 return false;
5589 }
5590 }
5591
5592 // Special case for writelane - this can break the multiple constant bus rule,
5593 // but still can't use more than one SGPR register
5594 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5595 unsigned SGPRCount = 0;
5596 Register SGPRUsed;
5597
5598 for (int OpIdx : {Src0Idx, Src1Idx}) {
5599 if (OpIdx == -1)
5600 break;
5601
5602 const MachineOperand &MO = MI.getOperand(OpIdx);
5603
5604 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5605 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5606 if (MO.getReg() != SGPRUsed)
5607 ++SGPRCount;
5608 SGPRUsed = MO.getReg();
5609 }
5610 }
5611 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5612 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5613 return false;
5614 }
5615 }
5616 }
5617
5618 // Verify misc. restrictions on specific instructions.
5619 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5620 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5621 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5622 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5623 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5624 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5625 if (!compareMachineOp(Src0, Src1) &&
5626 !compareMachineOp(Src0, Src2)) {
5627 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5628 return false;
5629 }
5630 }
5631 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5632 SISrcMods::ABS) ||
5633 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5634 SISrcMods::ABS) ||
5635 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5636 SISrcMods::ABS)) {
5637 ErrInfo = "ABS not allowed in VOP3B instructions";
5638 return false;
5639 }
5640 }
5641
5642 if (isSOP2(MI) || isSOPC(MI)) {
5643 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5644 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5645
5646 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5647 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5648 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5649 !Src0.isIdenticalTo(Src1)) {
5650 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5651 return false;
5652 }
5653 }
5654
5655 if (isSOPK(MI)) {
5656 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5657 if (Desc.isBranch()) {
5658 if (!Op->isMBB()) {
5659 ErrInfo = "invalid branch target for SOPK instruction";
5660 return false;
5661 }
5662 } else {
5663 uint64_t Imm = Op->getImm();
5664 if (sopkIsZext(Opcode)) {
5665 if (!isUInt<16>(Imm)) {
5666 ErrInfo = "invalid immediate for SOPK instruction";
5667 return false;
5668 }
5669 } else {
5670 if (!isInt<16>(Imm)) {
5671 ErrInfo = "invalid immediate for SOPK instruction";
5672 return false;
5673 }
5674 }
5675 }
5676 }
5677
5678 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5679 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5680 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5681 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5682 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5683 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5684
5685 const unsigned StaticNumOps =
5686 Desc.getNumOperands() + Desc.implicit_uses().size();
5687 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5688
5689 // Require additional implicit operands. This allows a fixup done by the
5690 // post RA scheduler where the main implicit operand is killed and
5691 // implicit-defs are added for sub-registers that remain live after this
5692 // instruction.
5693 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5694 ErrInfo = "missing implicit register operands";
5695 return false;
5696 }
5697
5698 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5699 if (IsDst) {
5700 if (!Dst->isUse()) {
5701 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5702 return false;
5703 }
5704
5705 unsigned UseOpIdx;
5706 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5707 UseOpIdx != StaticNumOps + 1) {
5708 ErrInfo = "movrel implicit operands should be tied";
5709 return false;
5710 }
5711 }
5712
5713 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5714 const MachineOperand &ImpUse
5715 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5716 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5717 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5718 ErrInfo = "src0 should be subreg of implicit vector use";
5719 return false;
5720 }
5721 }
5722
5723 // Make sure we aren't losing exec uses in the td files. This mostly requires
5724 // being careful when using let Uses to try to add other use registers.
5725 if (shouldReadExec(MI)) {
5726 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5727 ErrInfo = "VALU instruction does not implicitly read exec mask";
5728 return false;
5729 }
5730 }
5731
5732 if (isSMRD(MI)) {
5733 if (MI.mayStore() &&
5734 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5735 // The register offset form of scalar stores may only use m0 as the
5736 // soffset register.
5737 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5738 if (Soff && Soff->getReg() != AMDGPU::M0) {
5739 ErrInfo = "scalar stores must use m0 as offset register";
5740 return false;
5741 }
5742 }
5743 }
5744
5745 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5746 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5747 if (Offset->getImm() != 0) {
5748 ErrInfo = "subtarget does not support offsets in flat instructions";
5749 return false;
5750 }
5751 }
5752
5753 if (isDS(MI) && !ST.hasGDS()) {
5754 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5755 if (GDSOp && GDSOp->getImm() != 0) {
5756 ErrInfo = "GDS is not supported on this subtarget";
5757 return false;
5758 }
5759 }
5760
5761 if (isImage(MI)) {
5762 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5763 if (DimOp) {
5764 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5765 AMDGPU::OpName::vaddr0);
5766 AMDGPU::OpName RSrcOpName =
5767 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5768 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5769 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5770 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5771 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5772 const AMDGPU::MIMGDimInfo *Dim =
5774
5775 if (!Dim) {
5776 ErrInfo = "dim is out of range";
5777 return false;
5778 }
5779
5780 bool IsA16 = false;
5781 if (ST.hasR128A16()) {
5782 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5783 IsA16 = R128A16->getImm() != 0;
5784 } else if (ST.hasA16()) {
5785 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5786 IsA16 = A16->getImm() != 0;
5787 }
5788
5789 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5790
5791 unsigned AddrWords =
5792 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5793
5794 unsigned VAddrWords;
5795 if (IsNSA) {
5796 VAddrWords = RsrcIdx - VAddr0Idx;
5797 if (ST.hasPartialNSAEncoding() &&
5798 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5799 unsigned LastVAddrIdx = RsrcIdx - 1;
5800 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5801 }
5802 } else {
5803 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5804 if (AddrWords > 12)
5805 AddrWords = 16;
5806 }
5807
5808 if (VAddrWords != AddrWords) {
5809 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5810 << " but got " << VAddrWords << "\n");
5811 ErrInfo = "bad vaddr size";
5812 return false;
5813 }
5814 }
5815 }
5816
5817 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5818 if (DppCt) {
5819 using namespace AMDGPU::DPP;
5820
5821 unsigned DC = DppCt->getImm();
5822 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5823 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5824 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5825 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5826 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5827 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5828 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5829 ErrInfo = "Invalid dpp_ctrl value";
5830 return false;
5831 }
5832 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5833 !ST.hasDPPWavefrontShifts()) {
5834 ErrInfo = "Invalid dpp_ctrl value: "
5835 "wavefront shifts are not supported on GFX10+";
5836 return false;
5837 }
5838 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5839 !ST.hasDPPBroadcasts()) {
5840 ErrInfo = "Invalid dpp_ctrl value: "
5841 "broadcasts are not supported on GFX10+";
5842 return false;
5843 }
5844 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5845 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5846 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5847 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5848 !ST.hasGFX90AInsts()) {
5849 ErrInfo = "Invalid dpp_ctrl value: "
5850 "row_newbroadcast/row_share is not supported before "
5851 "GFX90A/GFX10";
5852 return false;
5853 }
5854 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5855 ErrInfo = "Invalid dpp_ctrl value: "
5856 "row_share and row_xmask are not supported before GFX10";
5857 return false;
5858 }
5859 }
5860
5861 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5863 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5864 ErrInfo = "Invalid dpp_ctrl value: "
5865 "DP ALU dpp only support row_newbcast";
5866 return false;
5867 }
5868 }
5869
5870 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5871 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5872 AMDGPU::OpName DataName =
5873 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5874 const MachineOperand *Data = getNamedOperand(MI, DataName);
5875 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5876 if (Data && !Data->isReg())
5877 Data = nullptr;
5878
5879 if (ST.hasGFX90AInsts()) {
5880 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5881 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5882 ErrInfo = "Invalid register class: "
5883 "vdata and vdst should be both VGPR or AGPR";
5884 return false;
5885 }
5886 if (Data && Data2 &&
5887 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5888 ErrInfo = "Invalid register class: "
5889 "both data operands should be VGPR or AGPR";
5890 return false;
5891 }
5892 } else {
5893 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5894 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5895 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5896 ErrInfo = "Invalid register class: "
5897 "agpr loads and stores not supported on this GPU";
5898 return false;
5899 }
5900 }
5901 }
5902
5903 if (ST.needsAlignedVGPRs()) {
5904 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5906 if (!Op)
5907 return true;
5908 Register Reg = Op->getReg();
5909 if (Reg.isPhysical())
5910 return !(RI.getHWRegIndex(Reg) & 1);
5911 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5912 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5913 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5914 };
5915
5916 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5917 Opcode == AMDGPU::DS_GWS_BARRIER) {
5918
5919 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5920 ErrInfo = "Subtarget requires even aligned vector registers "
5921 "for DS_GWS instructions";
5922 return false;
5923 }
5924 }
5925
5926 if (isMIMG(MI)) {
5927 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5928 ErrInfo = "Subtarget requires even aligned vector registers "
5929 "for vaddr operand of image instructions";
5930 return false;
5931 }
5932 }
5933 }
5934
5935 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5936 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5937 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5938 ErrInfo = "Invalid register class: "
5939 "v_accvgpr_write with an SGPR is not supported on this GPU";
5940 return false;
5941 }
5942 }
5943
5944 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5945 const MachineOperand &SrcOp = MI.getOperand(1);
5946 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5947 ErrInfo = "pseudo expects only physical SGPRs";
5948 return false;
5949 }
5950 }
5951
5952 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5953 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5954 if (!ST.hasScaleOffset()) {
5955 ErrInfo = "Subtarget does not support offset scaling";
5956 return false;
5957 }
5958 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5959 ErrInfo = "Instruction does not support offset scaling";
5960 return false;
5961 }
5962 }
5963 }
5964
5965 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5966 // information.
5967 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5968 for (unsigned I = 0; I < 3; ++I) {
5970 return false;
5971 }
5972 }
5973
5974 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5975 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5976 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5977 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5978 &AMDGPU::SReg_64RegClass) ||
5979 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5980 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5981 return false;
5982 }
5983 }
5984
5985 return true;
5986}
5987
5989 if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
5990 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5991 return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg())
5992 ? AMDGPU::COPY
5993 : AMDGPU::V_MOV_B32_e32;
5994 }
5995 return getVALUOp(MI.getOpcode());
5996}
5997
5998// It is more readable to list mapped opcodes on the same line.
5999// clang-format off
6000
6001unsigned SIInstrInfo::getVALUOp(unsigned Opc) const {
6002 switch (Opc) {
6003 default: return AMDGPU::INSTRUCTION_LIST_END;
6004 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
6005 case AMDGPU::COPY: return AMDGPU::COPY;
6006 case AMDGPU::PHI: return AMDGPU::PHI;
6007 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
6008 case AMDGPU::WQM: return AMDGPU::WQM;
6009 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
6010 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
6011 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
6012 case AMDGPU::S_ADD_I32:
6013 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
6014 case AMDGPU::S_ADDC_U32:
6015 return AMDGPU::V_ADDC_U32_e32;
6016 case AMDGPU::S_SUB_I32:
6017 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
6018 // FIXME: These are not consistently handled, and selected when the carry is
6019 // used.
6020 case AMDGPU::S_ADD_U32:
6021 return AMDGPU::V_ADD_CO_U32_e32;
6022 case AMDGPU::S_SUB_U32:
6023 return AMDGPU::V_SUB_CO_U32_e32;
6024 case AMDGPU::S_ADD_U64_PSEUDO:
6025 return AMDGPU::V_ADD_U64_PSEUDO;
6026 case AMDGPU::S_SUB_U64_PSEUDO:
6027 return AMDGPU::V_SUB_U64_PSEUDO;
6028 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
6029 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
6030 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
6031 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
6032 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
6033 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
6034 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
6035 case AMDGPU::S_XNOR_B32:
6036 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
6037 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
6038 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
6039 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
6040 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
6041 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
6042 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
6043 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
6044 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
6045 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
6046 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
6047 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
6048 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
6049 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
6050 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
6051 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
6052 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
6053 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
6054 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
6055 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
6056 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
6057 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
6058 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
6059 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
6060 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
6061 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
6062 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
6063 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
6064 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
6065 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
6066 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
6067 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
6068 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
6069 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
6070 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
6071 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
6072 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
6073 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
6074 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
6075 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6076 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6077 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6078 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
6079 case AMDGPU::S_CVT_F32_F16:
6080 case AMDGPU::S_CVT_HI_F32_F16:
6081 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6082 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6083 case AMDGPU::S_CVT_F16_F32:
6084 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6085 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6086 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6087 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6088 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6089 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6090 case AMDGPU::S_CEIL_F16:
6091 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6092 : AMDGPU::V_CEIL_F16_fake16_e64;
6093 case AMDGPU::S_FLOOR_F16:
6094 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6095 : AMDGPU::V_FLOOR_F16_fake16_e64;
6096 case AMDGPU::S_TRUNC_F16:
6097 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6098 : AMDGPU::V_TRUNC_F16_fake16_e64;
6099 case AMDGPU::S_RNDNE_F16:
6100 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6101 : AMDGPU::V_RNDNE_F16_fake16_e64;
6102 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6103 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6104 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6105 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6106 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6107 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6108 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6109 case AMDGPU::S_ADD_F16:
6110 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6111 : AMDGPU::V_ADD_F16_fake16_e64;
6112 case AMDGPU::S_SUB_F16:
6113 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6114 : AMDGPU::V_SUB_F16_fake16_e64;
6115 case AMDGPU::S_MIN_F16:
6116 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6117 : AMDGPU::V_MIN_F16_fake16_e64;
6118 case AMDGPU::S_MAX_F16:
6119 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6120 : AMDGPU::V_MAX_F16_fake16_e64;
6121 case AMDGPU::S_MINIMUM_F16:
6122 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6123 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6124 case AMDGPU::S_MAXIMUM_F16:
6125 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6126 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6127 case AMDGPU::S_MUL_F16:
6128 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6129 : AMDGPU::V_MUL_F16_fake16_e64;
6130 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6131 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6132 case AMDGPU::S_FMAC_F16:
6133 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6134 : AMDGPU::V_FMAC_F16_fake16_e64;
6135 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6136 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6137 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6138 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6139 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6140 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6141 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6142 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6143 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6144 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6145 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6146 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6147 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6148 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6149 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6150 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6151 case AMDGPU::S_CMP_LT_F16:
6152 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6153 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6154 case AMDGPU::S_CMP_EQ_F16:
6155 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6156 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6157 case AMDGPU::S_CMP_LE_F16:
6158 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6159 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6160 case AMDGPU::S_CMP_GT_F16:
6161 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6162 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6163 case AMDGPU::S_CMP_LG_F16:
6164 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6165 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6166 case AMDGPU::S_CMP_GE_F16:
6167 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6168 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6169 case AMDGPU::S_CMP_O_F16:
6170 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6171 : AMDGPU::V_CMP_O_F16_fake16_e64;
6172 case AMDGPU::S_CMP_U_F16:
6173 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6174 : AMDGPU::V_CMP_U_F16_fake16_e64;
6175 case AMDGPU::S_CMP_NGE_F16:
6176 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6177 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6178 case AMDGPU::S_CMP_NLG_F16:
6179 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6180 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6181 case AMDGPU::S_CMP_NGT_F16:
6182 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6183 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6184 case AMDGPU::S_CMP_NLE_F16:
6185 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6186 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6187 case AMDGPU::S_CMP_NEQ_F16:
6188 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6189 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6190 case AMDGPU::S_CMP_NLT_F16:
6191 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6192 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6193 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6194 case AMDGPU::V_S_EXP_F16_e64:
6195 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6196 : AMDGPU::V_EXP_F16_fake16_e64;
6197 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6198 case AMDGPU::V_S_LOG_F16_e64:
6199 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6200 : AMDGPU::V_LOG_F16_fake16_e64;
6201 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6202 case AMDGPU::V_S_RCP_F16_e64:
6203 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6204 : AMDGPU::V_RCP_F16_fake16_e64;
6205 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6206 case AMDGPU::V_S_RSQ_F16_e64:
6207 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6208 : AMDGPU::V_RSQ_F16_fake16_e64;
6209 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6210 case AMDGPU::V_S_SQRT_F16_e64:
6211 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6212 : AMDGPU::V_SQRT_F16_fake16_e64;
6213 }
6215 "Unexpected scalar opcode without corresponding vector one!");
6216}
6217
6218// clang-format on
6219
6223 const DebugLoc &DL, Register Reg,
6224 bool IsSCCLive,
6225 SlotIndexes *Indexes) const {
6226 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6227 const SIInstrInfo *TII = ST.getInstrInfo();
6229 if (IsSCCLive) {
6230 // Insert two move instructions, one to save the original value of EXEC and
6231 // the other to turn on all bits in EXEC. This is required as we can't use
6232 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6233 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6235 auto FlipExecMI =
6236 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6237 if (Indexes) {
6238 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6239 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6240 }
6241 } else {
6242 auto SaveExec =
6243 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6244 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6245 if (Indexes)
6246 Indexes->insertMachineInstrInMaps(*SaveExec);
6247 }
6248}
6249
6252 const DebugLoc &DL, Register Reg,
6253 SlotIndexes *Indexes) const {
6255 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6256 .addReg(Reg, RegState::Kill);
6257 if (Indexes)
6258 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6259}
6260
6264 "Not a whole wave func");
6265 MachineBasicBlock &MBB = *MF.begin();
6266 for (MachineInstr &MI : MBB)
6267 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6268 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6269 return &MI;
6270
6271 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6272}
6273
6275 unsigned OpNo) const {
6276 const MCInstrDesc &Desc = get(MI.getOpcode());
6277 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6278 Desc.operands()[OpNo].RegClass == -1) {
6279 Register Reg = MI.getOperand(OpNo).getReg();
6280
6281 if (Reg.isVirtual()) {
6282 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6283 return MRI.getRegClass(Reg);
6284 }
6285 return RI.getPhysRegBaseClass(Reg);
6286 }
6287
6288 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6289 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6290}
6291
6294 MachineBasicBlock *MBB = MI.getParent();
6295 MachineOperand &MO = MI.getOperand(OpIdx);
6296 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6297 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6298 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6299 unsigned Size = RI.getRegSizeInBits(*RC);
6300 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6301 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6302 : AMDGPU::V_MOV_B32_e32;
6303 if (MO.isReg())
6304 Opcode = AMDGPU::COPY;
6305 else if (RI.isSGPRClass(RC))
6306 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6307
6308 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6309 Register Reg = MRI.createVirtualRegister(VRC);
6310 DebugLoc DL = MBB->findDebugLoc(I);
6311 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6312 MO.ChangeToRegister(Reg, false);
6313}
6314
6317 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6318 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6319 if (!SuperReg.getReg().isVirtual())
6320 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6321
6322 MachineBasicBlock *MBB = MI->getParent();
6323 const DebugLoc &DL = MI->getDebugLoc();
6324 Register SubReg = MRI.createVirtualRegister(SubRC);
6325
6326 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6327 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6328 .addReg(SuperReg.getReg(), {}, NewSubIdx);
6329 return SubReg;
6330}
6331
6334 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6335 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6336 if (Op.isImm()) {
6337 if (SubIdx == AMDGPU::sub0)
6338 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6339 if (SubIdx == AMDGPU::sub1)
6340 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6341
6342 llvm_unreachable("Unhandled register index for immediate");
6343 }
6344
6345 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6346 SubIdx, SubRC);
6347 return MachineOperand::CreateReg(SubReg, false);
6348}
6349
6350// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6351void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6352 assert(Inst.getNumExplicitOperands() == 3);
6353 MachineOperand Op1 = Inst.getOperand(1);
6354 Inst.removeOperand(1);
6355 Inst.addOperand(Op1);
6356}
6357
6359 const MCOperandInfo &OpInfo,
6360 const MachineOperand &MO) const {
6361 if (!MO.isReg())
6362 return false;
6363
6364 Register Reg = MO.getReg();
6365
6366 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6367 if (Reg.isPhysical())
6368 return DRC->contains(Reg);
6369
6370 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6371
6372 if (MO.getSubReg()) {
6373 const MachineFunction *MF = MO.getParent()->getMF();
6374 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6375 if (!SuperRC)
6376 return false;
6377 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6378 }
6379
6380 return RI.getCommonSubClass(DRC, RC) != nullptr;
6381}
6382
6384 const MachineOperand &MO) const {
6385 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6386 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6387 unsigned Opc = MI.getOpcode();
6388
6389 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6390 // information.
6391 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6392 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6393 constexpr AMDGPU::OpName OpNames[] = {
6394 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6395
6396 for (auto [I, OpName] : enumerate(OpNames)) {
6397 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6398 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6400 return false;
6401 }
6402 }
6403
6404 if (!isLegalRegOperand(MRI, OpInfo, MO))
6405 return false;
6406
6407 // check Accumulate GPR operand
6408 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6409 if (IsAGPR && !ST.hasMAIInsts())
6410 return false;
6411 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6412 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6413 return false;
6414 // Atomics should have both vdst and vdata either vgpr or agpr.
6415 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6416 const int DataIdx = AMDGPU::getNamedOperandIdx(
6417 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6418 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6419 MI.getOperand(DataIdx).isReg() &&
6420 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6421 return false;
6422 if ((int)OpIdx == DataIdx) {
6423 if (VDstIdx != -1 &&
6424 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6425 return false;
6426 // DS instructions with 2 src operands also must have tied RC.
6427 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6428 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6429 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6430 return false;
6431 }
6432
6433 // Check V_ACCVGPR_WRITE_B32_e64
6434 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6435 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6436 RI.isSGPRReg(MRI, MO.getReg()))
6437 return false;
6438
6439 if (ST.hasFlatScratchHiInB64InstHazard() &&
6440 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6441 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6442 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6443 64)
6444 return false;
6445 }
6446 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6447 return false;
6448 }
6449
6450 return true;
6451}
6452
6454 const MCOperandInfo &OpInfo,
6455 const MachineOperand &MO) const {
6456 if (MO.isReg())
6457 return isLegalRegOperand(MRI, OpInfo, MO);
6458
6459 // Handle non-register types that are treated like immediates.
6460 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6461 return true;
6462}
6463
6465 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6466 const MachineOperand *MO) const {
6467 constexpr unsigned NumOps = 3;
6468 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6469 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6470 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6471 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6472
6473 assert(SrcN < NumOps);
6474
6475 if (!MO) {
6476 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6477 if (SrcIdx == -1)
6478 return true;
6479 MO = &MI.getOperand(SrcIdx);
6480 }
6481
6482 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6483 return true;
6484
6485 int ModsIdx =
6486 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6487 if (ModsIdx == -1)
6488 return true;
6489
6490 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6491 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6492 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6493
6494 return !OpSel && !OpSelHi;
6495}
6496
6498 const MachineOperand *MO) const {
6499 const MachineFunction &MF = *MI.getMF();
6500 const MachineRegisterInfo &MRI = MF.getRegInfo();
6501 const MCInstrDesc &InstDesc = MI.getDesc();
6502 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6503 int64_t RegClass = getOpRegClassID(OpInfo);
6504 const TargetRegisterClass *DefinedRC =
6505 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6506 if (!MO)
6507 MO = &MI.getOperand(OpIdx);
6508
6509 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6510
6511 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6512 const MachineOperand *UsedLiteral = nullptr;
6513
6514 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6515 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6516
6517 // TODO: Be more permissive with frame indexes.
6518 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6519 if (!LiteralLimit--)
6520 return false;
6521
6522 UsedLiteral = MO;
6523 }
6524
6526 if (MO->isReg())
6527 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6528
6529 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6530 if (i == OpIdx)
6531 continue;
6532 const MachineOperand &Op = MI.getOperand(i);
6533 if (Op.isReg()) {
6534 if (Op.isUse()) {
6535 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6536 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6537 if (--ConstantBusLimit <= 0)
6538 return false;
6539 }
6540 }
6541 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6542 !isInlineConstant(Op, InstDesc.operands()[i])) {
6543 // The same literal may be used multiple times.
6544 if (!UsedLiteral)
6545 UsedLiteral = &Op;
6546 else if (UsedLiteral->isIdenticalTo(Op))
6547 continue;
6548
6549 if (!LiteralLimit--)
6550 return false;
6551 if (--ConstantBusLimit <= 0)
6552 return false;
6553 }
6554 }
6555 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6556 // There can be at most one literal operand, but it can be repeated.
6557 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6558 if (i == OpIdx)
6559 continue;
6560 const MachineOperand &Op = MI.getOperand(i);
6561 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6562 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6563 !Op.isIdenticalTo(*MO))
6564 return false;
6565
6566 // Do not fold a non-inlineable and non-register operand into an
6567 // instruction that already has a frame index. The frame index handling
6568 // code could not handle well when a frame index co-exists with another
6569 // non-register operand, unless that operand is an inlineable immediate.
6570 if (Op.isFI())
6571 return false;
6572 }
6573 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6574 isF16PseudoScalarTrans(MI.getOpcode())) {
6575 return false;
6576 }
6577
6578 if (MO->isReg()) {
6579 if (!DefinedRC)
6580 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6581 return isLegalRegOperand(MI, OpIdx, *MO);
6582 }
6583
6584 if (MO->isImm()) {
6585 uint64_t Imm = MO->getImm();
6586 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6587 bool Is64BitOp = Is64BitFPOp ||
6588 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6589 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6590 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6591 if (Is64BitOp &&
6592 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6593 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6594 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6595 return false;
6596
6597 // FIXME: We can use sign extended 64-bit literals, but only for signed
6598 // operands. At the moment we do not know if an operand is signed.
6599 // Such operand will be encoded as its low 32 bits and then either
6600 // correctly sign extended or incorrectly zero extended by HW.
6601 // If 64-bit literals are supported and the literal will be encoded
6602 // as full 64 bit we still can use it.
6603 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6604 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6605 return false;
6606 }
6607 }
6608
6609 // Handle non-register types that are treated like immediates.
6610 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6611
6612 if (!DefinedRC) {
6613 // This operand expects an immediate.
6614 return true;
6615 }
6616
6617 return isImmOperandLegal(MI, OpIdx, *MO);
6618}
6619
6621 bool IsGFX950Only = ST.hasGFX950Insts();
6622 bool IsGFX940Only = ST.hasGFX940Insts();
6623
6624 if (!IsGFX950Only && !IsGFX940Only)
6625 return false;
6626
6627 if (!isVALU(MI))
6628 return false;
6629
6630 // V_COS, V_EXP, V_RCP, etc.
6631 if (isTRANS(MI))
6632 return true;
6633
6634 // DOT2, DOT2C, DOT4, etc.
6635 if (isDOT(MI))
6636 return true;
6637
6638 // MFMA, SMFMA
6639 if (isMFMA(MI))
6640 return true;
6641
6642 unsigned Opcode = MI.getOpcode();
6643 switch (Opcode) {
6644 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6645 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6646 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6647 case AMDGPU::V_MQSAD_U32_U8_e64:
6648 case AMDGPU::V_PK_ADD_F16:
6649 case AMDGPU::V_PK_ADD_F32:
6650 case AMDGPU::V_PK_ADD_I16:
6651 case AMDGPU::V_PK_ADD_U16:
6652 case AMDGPU::V_PK_ASHRREV_I16:
6653 case AMDGPU::V_PK_FMA_F16:
6654 case AMDGPU::V_PK_FMA_F32:
6655 case AMDGPU::V_PK_FMAC_F16_e32:
6656 case AMDGPU::V_PK_FMAC_F16_e64:
6657 case AMDGPU::V_PK_LSHLREV_B16:
6658 case AMDGPU::V_PK_LSHRREV_B16:
6659 case AMDGPU::V_PK_MAD_I16:
6660 case AMDGPU::V_PK_MAD_U16:
6661 case AMDGPU::V_PK_MAX_F16:
6662 case AMDGPU::V_PK_MAX_I16:
6663 case AMDGPU::V_PK_MAX_U16:
6664 case AMDGPU::V_PK_MIN_F16:
6665 case AMDGPU::V_PK_MIN_I16:
6666 case AMDGPU::V_PK_MIN_U16:
6667 case AMDGPU::V_PK_MOV_B32:
6668 case AMDGPU::V_PK_MUL_F16:
6669 case AMDGPU::V_PK_MUL_F32:
6670 case AMDGPU::V_PK_MUL_LO_U16:
6671 case AMDGPU::V_PK_SUB_I16:
6672 case AMDGPU::V_PK_SUB_U16:
6673 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6674 return true;
6675 default:
6676 return false;
6677 }
6678}
6679
6681 MachineInstr &MI) const {
6682 unsigned Opc = MI.getOpcode();
6683 const MCInstrDesc &InstrDesc = get(Opc);
6684
6685 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6686 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6687
6688 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6689 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6690
6691 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6692 // we need to only have one constant bus use before GFX10.
6693 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6694 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6695 RI.isSGPRReg(MRI, Src0.getReg()))
6696 legalizeOpWithMove(MI, Src0Idx);
6697
6698 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6699 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6700 // src0/src1 with V_READFIRSTLANE.
6701 if (Opc == AMDGPU::V_WRITELANE_B32) {
6702 const DebugLoc &DL = MI.getDebugLoc();
6703 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6704 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6705 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6706 .add(Src0);
6707 Src0.ChangeToRegister(Reg, false);
6708 }
6709 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6710 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6711 const DebugLoc &DL = MI.getDebugLoc();
6712 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6713 .add(Src1);
6714 Src1.ChangeToRegister(Reg, false);
6715 }
6716 return;
6717 }
6718
6719 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6720 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6721 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6722 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6723 legalizeOpWithMove(MI, Src2Idx);
6724 }
6725
6726 // VOP2 src0 instructions support all operand types, so we don't need to check
6727 // their legality. If src1 is already legal, we don't need to do anything.
6728 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6729 return;
6730
6731 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6732 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6733 // select is uniform.
6734 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6735 RI.isVGPR(MRI, Src1.getReg())) {
6736 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6737 const DebugLoc &DL = MI.getDebugLoc();
6738 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6739 .add(Src1);
6740 Src1.ChangeToRegister(Reg, false);
6741 return;
6742 }
6743
6744 // We do not use commuteInstruction here because it is too aggressive and will
6745 // commute if it is possible. We only want to commute here if it improves
6746 // legality. This can be called a fairly large number of times so don't waste
6747 // compile time pointlessly swapping and checking legality again.
6748 if (HasImplicitSGPR || !MI.isCommutable()) {
6749 legalizeOpWithMove(MI, Src1Idx);
6750 return;
6751 }
6752
6753 // If src0 can be used as src1, commuting will make the operands legal.
6754 // Otherwise we have to give up and insert a move.
6755 //
6756 // TODO: Other immediate-like operand kinds could be commuted if there was a
6757 // MachineOperand::ChangeTo* for them.
6758 if ((!Src1.isImm() && !Src1.isReg()) ||
6759 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6760 legalizeOpWithMove(MI, Src1Idx);
6761 return;
6762 }
6763
6764 int CommutedOpc = commuteOpcode(MI);
6765 if (CommutedOpc == -1) {
6766 legalizeOpWithMove(MI, Src1Idx);
6767 return;
6768 }
6769
6770 MI.setDesc(get(CommutedOpc));
6771
6772 Register Src0Reg = Src0.getReg();
6773 unsigned Src0SubReg = Src0.getSubReg();
6774 bool Src0Kill = Src0.isKill();
6775
6776 if (Src1.isImm())
6777 Src0.ChangeToImmediate(Src1.getImm());
6778 else if (Src1.isReg()) {
6779 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6780 Src0.setSubReg(Src1.getSubReg());
6781 } else
6782 llvm_unreachable("Should only have register or immediate operands");
6783
6784 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6785 Src1.setSubReg(Src0SubReg);
6787}
6788
// Legalize VOP3 operands. All operand types are supported for any operand
// but only one literal constant and only starting from GFX10.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  // Operand indices of src0/src1/src2; -1 when the operand is absent.
  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
      Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
      Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
      Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
      Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
      Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
      Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
    // src1 and src2 must be scalar: insert V_READFIRSTLANE_B32 before MI for
    // any non-SGPR register source.
    MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
    const DebugLoc &DL = MI.getDebugLoc();
    if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
          .add(Src1);
      Src1.ChangeToRegister(Reg, false);
    }
    if (VOP3Idx[2] != -1) {
      MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
      if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
        Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
            .add(Src2);
        Src2.ChangeToRegister(Reg, false);
      }
    }
  }

  // Find the one SGPR operand we are allowed to use.
  int ConstantBusLimit = ST.getConstantBusLimit(Opc);
  int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
  SmallDenseSet<unsigned> SGPRsUsed;
  Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
  if (SGPRReg) {
    // An SGPR that must stay in place consumes one constant-bus slot up front.
    SGPRsUsed.insert(SGPRReg);
    --ConstantBusLimit;
  }

  for (int Idx : VOP3Idx) {
    if (Idx == -1)
      break;
    MachineOperand &MO = MI.getOperand(Idx);

    if (!MO.isReg()) {
      // Immediate operand: inline constants are always free; a literal
      // consumes both a literal slot and a constant-bus slot.
      if (isInlineConstant(MO, get(Opc).operands()[Idx]))
        continue;

      if (LiteralLimit > 0 && ConstantBusLimit > 0) {
        --LiteralLimit;
        --ConstantBusLimit;
        continue;
      }

      // Budget exhausted: move the literal into a register instead.
      --LiteralLimit;
      --ConstantBusLimit;
      legalizeOpWithMove(MI, Idx);
      continue;
    }

    if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
      continue; // VGPRs are legal

    // We can use one SGPR in each VOP3 instruction prior to GFX10
    // and two starting from GFX10.
    if (SGPRsUsed.count(MO.getReg()))
      continue; // Re-reading the same SGPR is free.
    if (ConstantBusLimit > 0) {
      SGPRsUsed.insert(MO.getReg());
      --ConstantBusLimit;
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }

  // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
  if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
      !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
    legalizeOpWithMove(MI, VOP3Idx[2]);

  // Fix the register class of packed FP32 instructions on gfx12+. See
  // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
  // NOTE(review): the guard line of this block was lost in extraction;
  // restored from upstream — confirm the exact predicate against trunk.
  if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
    for (unsigned I = 0; I < 3; ++I) {
      if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
        legalizeOpWithMove(MI, VOP3Idx[I]);
    }
  }
}
6891
// Read a (possibly multi-dword) vector-register value into a freshly created
// SGPR of the equivalent scalar register class, emitting one
// V_READFIRSTLANE_B32 per 32-bit channel before \p UseMI and reassembling
// multi-dword results with REG_SEQUENCE. Returns the new SGPR.
// \p DstRC, when non-null, further constrains the destination class via
// getCommonSubClass.
Register SIInstrInfo::readlaneVGPRToSGPR(
    Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
    const TargetRegisterClass *DstRC /*=nullptr*/) const {
  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
  if (DstRC)
    SRC = RI.getCommonSubClass(SRC, DstRC);

  Register DstReg = MRI.createVirtualRegister(SRC);
  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;

  // AGPR source: copy it into the equivalent VGPR class first, then
  // readfirstlane from the VGPR copy.
  if (RI.hasAGPRs(VRC)) {
    VRC = RI.getEquivalentVGPRClass(VRC);
    Register NewSrcReg = MRI.createVirtualRegister(VRC);
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(TargetOpcode::COPY), NewSrcReg)
        .addReg(SrcReg);
    SrcReg = NewSrcReg;
  }

  // Single dword: one readfirstlane suffices.
  if (SubRegs == 1) {
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
        .addReg(SrcReg);
    return DstReg;
  }

  // Multi-dword: readfirstlane each 32-bit channel into its own SGPR.
  // NOTE(review): this declaration line was dropped by extraction; restored.
  SmallVector<Register, 8> SRegs;
  for (unsigned i = 0; i < SubRegs; ++i) {
    Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
        .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
    SRegs.push_back(SGPR);
  }

  // Reassemble the pieces into the destination SGPR tuple.
  MachineInstrBuilder MIB =
      BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
              get(AMDGPU::REG_SEQUENCE), DstReg);
  for (unsigned i = 0; i < SubRegs; ++i) {
    MIB.addReg(SRegs[i]);
    MIB.addImm(RI.getSubRegFromChannel(i));
  }
  return DstReg;
}
6937
6939 MachineInstr &MI) const {
6940
6941 // If the pointer is store in VGPRs, then we need to move them to
6942 // SGPRs using v_readfirstlane. This is safe because we only select
6943 // loads with uniform pointers to SMRD instruction so we know the
6944 // pointer value is uniform.
6945 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6946 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6947 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6948 SBase->setReg(SGPR);
6949 }
6950 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6951 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6952 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6953 SOff->setReg(SGPR);
6954 }
6955}
6956
// Rewrite a FLAT instruction whose saddr operand ended up in a VGPR into the
// corresponding vaddr-addressed opcode, moving the pointer from the saddr slot
// to the vaddr slot in place. Returns true if the rewrite succeeded. The
// instruction object itself is preserved so callers' iterators stay valid.
bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
  unsigned Opc = Inst.getOpcode();
  int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (OldSAddrIdx < 0)
    return false; // No saddr operand; nothing to rewrite.

  assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));

  int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
  if (NewOpc < 0)
    // NOTE(review): the statement on this line was dropped by extraction;
    // restored from upstream (scratch SV-from-SS fallback) — confirm.
    NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
  if (NewOpc < 0)
    return false;

  MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
  MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
  if (RI.isSGPRReg(MRI, SAddr.getReg()))
    return false; // saddr is already scalar; nothing to do.

  int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
  if (NewVAddrIdx < 0)
    return false;

  int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);

  // Check vaddr, it shall be zero or absent.
  MachineInstr *VAddrDef = nullptr;
  if (OldVAddrIdx >= 0) {
    MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
    VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
    if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
        !VAddrDef->getOperand(1).isImm() ||
        VAddrDef->getOperand(1).getImm() != 0)
      return false;
  }

  const MCInstrDesc &NewDesc = get(NewOpc);
  Inst.setDesc(NewDesc);

  // Callers expect iterator to be valid after this call, so modify the
  // instruction in place.
  if (OldVAddrIdx == NewVAddrIdx) {
    MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
    // Clear use list from the old vaddr holding a zero register.
    MRI.removeRegOperandFromUseList(&NewVAddr);
    // Shift the saddr operand into the vaddr slot, dropping old saddr.
    MRI.moveOperands(&NewVAddr, &SAddr, 1);
    Inst.removeOperand(OldSAddrIdx);
    // Update the use list with the pointer we have just moved from vaddr to
    // saddr position. Otherwise new vaddr will be missing from the use list.
    MRI.removeRegOperandFromUseList(&NewVAddr);
    MRI.addRegOperandToUseList(&NewVAddr);
  } else {
    assert(OldSAddrIdx == NewVAddrIdx);

    if (OldVAddrIdx >= 0) {
      int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
                                                 AMDGPU::OpName::vdst_in);

      // removeOperand doesn't try to fixup tied operand indexes at it goes, so
      // it asserts. Untie the operands for now and retie them afterwards.
      if (NewVDstIn != -1) {
        int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
        Inst.untieRegOperand(OldVDstIn);
      }

      Inst.removeOperand(OldVAddrIdx);

      if (NewVDstIn != -1) {
        int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
        Inst.tieOperands(NewVDst, NewVDstIn);
      }
    }
  }

  // The zero-materializing def of the old vaddr may now be dead; clean it up.
  if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
    VAddrDef->eraseFromParent();

  return true;
}
7036
// FIXME: Remove this when SelectionDAG is obsoleted.
// Legalize a FLAT instruction whose saddr operand was selected into a VGPR:
// first try rewriting to the vaddr-addressed form, otherwise insert
// v_readfirstlane copies to move the pointer into SGPRs.
void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
    return;

  // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
  // thinks they are uniform, so a readfirstlane should be valid.
  MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
  if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
    return;

  // Prefer switching to the vaddr form over inserting readfirstlanes.
  // NOTE(review): this guard line was dropped by extraction; restored from
  // upstream — confirm.
  if (moveFlatAddrToVGPR(MI))
    return;

  // Constrain the readfirstlane result to the class the opcode declares for
  // the saddr slot.
  const TargetRegisterClass *DeclaredRC =
      getRegClass(MI.getDesc(), SAddr->getOperandNo());

  Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
  SAddr->setReg(ToSGPR);
}
7058
// Insert a COPY of operand \p Op into a fresh register of class \p DstRC
// before iterator \p I in \p InsertMBB, and rewrite \p Op to use the copy.
// Attempts to fold an immediate-defined source into the copy, and marks the
// copy with an implicit EXEC use when the destination is a vector class (and
// the value is not ultimately an IMPLICIT_DEF).
void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
                                         MachineBasicBlock::iterator I,
                                         const TargetRegisterClass *DstRC,
                                         MachineOperand &Op,
                                         MachineRegisterInfo &MRI,
                                         const DebugLoc &DL) const {
  Register OpReg = Op.getReg();
  unsigned OpSubReg = Op.getSubReg();

  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
      RI.getRegClassForReg(MRI, OpReg), OpSubReg);

  // Check if operand is already the correct register class.
  if (DstRC == OpRC)
    return;

  Register DstReg = MRI.createVirtualRegister(DstRC);
  auto Copy =
      BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
  Op.setReg(DstReg);

  MachineInstr *Def = MRI.getVRegDef(OpReg);
  if (!Def)
    return;

  // Try to eliminate the copy if it is copying an immediate value.
  if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
    foldImmediate(*Copy, *Def, OpReg, &MRI);

  // Walk trivial virtual-register copy chains to see whether the value
  // originates from an IMPLICIT_DEF.
  bool ImpDef = Def->isImplicitDef();
  while (!ImpDef && Def && Def->isCopy()) {
    if (Def->getOperand(1).getReg().isPhysical())
      break; // Stop at physical-register sources; their def is untracked here.
    Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
    ImpDef = Def && Def->isImplicitDef();
  }
  // Non-SGPR destination: record the EXEC dependence as an implicit use,
  // unless the copy already reads EXEC or the source is implicit-def.
  if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
      !ImpDef)
    Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
}
7099
// Emit the actual waterfall loop, executing the wrapped instruction for each
// unique value of \p ScalarOps across all lanes. In the best case we execute 1
// iteration, in the worst case we execute 64 (once per lane).
static void
emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
                              MachineBasicBlock &LoopBB,
                              MachineBasicBlock &BodyBB,
                              const DebugLoc &DL,
                              ArrayRef<MachineOperand *> ScalarOps) {
  MachineFunction &MF = *LoopBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // NOTE(review): this line was dropped by extraction; restored from upstream
  // (lane-mask constants abstract over wave32/wave64 opcodes) — confirm.
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();

  // Insertion point for the loop header; restored line (see note above).
  MachineBasicBlock::iterator I = LoopBB.begin();
  Register CondReg;

  for (MachineOperand *ScalarOp : ScalarOps) {
    unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
    unsigned NumSubRegs = RegSize / 32;
    Register VScalarOp = ScalarOp->getReg();

    if (NumSubRegs == 1) {
      // Single dword: readfirstlane the value, then select the lanes whose
      // VGPR holds that same value.
      Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

      BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
          .addReg(VScalarOp);

      Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);

      BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
          .addReg(CurReg)
          .addReg(VScalarOp);

      // Combine the comparison results with AND.
      if (!CondReg) // First.
        CondReg = NewCondReg;
      else { // If not the first, we create an AND.
        Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
        BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
            .addReg(CondReg)
            .addReg(NewCondReg);
        CondReg = AndReg;
      }

      // Update ScalarOp operand to use the SGPR ScalarOp.
      ScalarOp->setReg(CurReg);
      ScalarOp->setIsKill();
    } else {
      SmallVector<Register, 8> ReadlanePieces;
      RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
      assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
             "Unhandled register size");

      // Multi-dword operand: readfirstlane two dwords at a time and compare
      // each 64-bit chunk.
      for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
        Register CurRegLo =
            MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        Register CurRegHi =
            MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

        // Read the next variant <- also loop target.
        BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
            .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));

        // Read the next variant <- also loop target.
        BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
            .addReg(VScalarOp, VScalarOpUndef,
                    TRI->getSubRegFromChannel(Idx + 1));

        ReadlanePieces.push_back(CurRegLo);
        ReadlanePieces.push_back(CurRegHi);

        // Comparison is to be done as 64-bit.
        Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
        BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
            .addReg(CurRegLo)
            .addImm(AMDGPU::sub0)
            .addReg(CurRegHi)
            .addImm(AMDGPU::sub1);

        Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
        auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
                           NewCondReg)
                       .addReg(CurReg);
        if (NumSubRegs <= 2)
          Cmp.addReg(VScalarOp);
        else
          Cmp.addReg(VScalarOp, VScalarOpUndef,
                     TRI->getSubRegFromChannel(Idx, 2));

        // Combine the comparison results with AND.
        if (!CondReg) // First.
          CondReg = NewCondReg;
        else { // If not the first, we create an AND.
          Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
          BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
              .addReg(CondReg)
              .addReg(NewCondReg);
          CondReg = AndReg;
        }
      } // End for loop.

      const auto *SScalarOpRC =
          TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
      Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);

      // Build scalar ScalarOp.
      auto Merge =
          BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
      unsigned Channel = 0;
      for (Register Piece : ReadlanePieces) {
        Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
      }

      // Update ScalarOp operand to use the SGPR ScalarOp.
      ScalarOp->setReg(SScalarOp);
      ScalarOp->setIsKill();
    }
  }

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  MRI.setSimpleHint(SaveExec, CondReg);

  // Update EXEC to matching lanes, saving original to SaveExec.
  BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
      .addReg(CondReg, RegState::Kill);

  // The original instruction is here; we insert the terminators after it.
  I = BodyBB.end();

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
      .addReg(LMC.ExecReg)
      .addReg(SaveExec);

  BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
}
7239
// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
// with SGPRs by iterating over all unique values across all lanes.
// Returns the loop basic block that now contains \p MI.
// \p Begin/\p End optionally widen the range of instructions moved into the
// loop body (default: just \p MI).
static MachineBasicBlock *
loadScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
                           ArrayRef<MachineOperand *> ScalarOps,
                           MachineDominatorTree *MDT,
                           MachineBasicBlock::iterator Begin = nullptr,
                           MachineBasicBlock::iterator End = nullptr) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  if (!Begin.isValid())
    Begin = &MI;
  if (!End.isValid()) {
    End = &MI;
    ++End;
  }
  const DebugLoc &DL = MI.getDebugLoc();
  // NOTE(review): this line was dropped by extraction; restored from
  // upstream — confirm.
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();

  // Save SCC. Waterfall Loop may overwrite SCC.
  Register SaveSCCReg;

  // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
  // rather than unlimited scan everywhere
  bool SCCNotDead =
      MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
                                  std::numeric_limits<unsigned>::max()) !=
      MachineBasicBlock::LQR_Dead;
  if (SCCNotDead) {
    // Materialize SCC as 1/0 in an SGPR so it can be re-established after
    // the loop with a compare.
    SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
        .addImm(1)
        .addImm(0);
  }

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);

  // Save the EXEC mask
  BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);

  // Killed uses in the instruction we are waterfalling around will be
  // incorrect due to the added control-flow.
  MachineBasicBlock::iterator AfterMI = MI;
  ++AfterMI;
  for (auto I = Begin; I != AfterMI; I++) {
    for (auto &MO : I->all_uses())
      MRI.clearKillFlags(MO.getReg());
  }

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, BodyBB);
  MF.insert(MBBI, RemainderBB);

  // Loop structure: LoopBB reads the scalar values, BodyBB runs MI and either
  // branches back to LoopBB or falls through to RemainderBB.
  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(LoopBB);
  BodyBB->addSuccessor(RemainderBB);

  // Move Begin to MI to the BodyBB, and the remainder of the block to
  // RemainderBB.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
  BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());

  MBB.addSuccessor(LoopBB);

  // Update dominators. We know that MBB immediately dominates LoopBB, that
  // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
  // RemainderBB. RemainderBB immediately dominates all of the successors
  // transferred to it from MBB that MBB used to properly dominate.
  if (MDT) {
    MDT->addNewBlock(LoopBB, &MBB);
    MDT->addNewBlock(BodyBB, LoopBB);
    MDT->addNewBlock(RemainderBB, BodyBB);
    for (auto &Succ : RemainderBB->successors()) {
      if (MDT->properlyDominates(&MBB, Succ)) {
        MDT->changeImmediateDominator(Succ, RemainderBB);
      }
    }
  }

  emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  // Restore SCC
  if (SCCNotDead) {
    BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
        .addReg(SaveSCCReg, RegState::Kill)
        .addImm(0);
  }

  // Restore the EXEC mask
  BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
      .addReg(SaveExec);
  return BodyBB;
}
7348
// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
// Returns {RsrcPtr, NewSRsrc}: the 64-bit pointer extracted from the first two
// dwords of \p Rsrc, and a new SGPR_128 descriptor whose base is zero and
// whose upper dwords hold the default resource data format.
static std::tuple<unsigned, unsigned>
extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // Extract the ptr from the resource descriptor.
  unsigned RsrcPtr =
      TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
                             AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

  // Create an empty resource descriptor
  Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
  uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();

  // Zero64 = 0
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
      .addImm(0);

  // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
      .addImm(Lo_32(RsrcDataFormat));

  // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
      .addImm(Hi_32(RsrcDataFormat));

  // NewSRsrc = {Zero64, SRsrcFormat}
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
      .addReg(Zero64)
      .addImm(AMDGPU::sub0_sub1)
      .addReg(SRsrcFormatLo)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcFormatHi)
      .addImm(AMDGPU::sub3);

  return std::tuple(RsrcPtr, NewSRsrc);
}
7391
7394 MachineDominatorTree *MDT) const {
7395 MachineFunction &MF = *MI.getMF();
7396 MachineRegisterInfo &MRI = MF.getRegInfo();
7397 MachineBasicBlock *CreatedBB = nullptr;
7398
7399 // Legalize VOP2
7400 if (isVOP2(MI) || isVOPC(MI)) {
7402 return CreatedBB;
7403 }
7404
7405 // Legalize VOP3
7406 if (isVOP3(MI)) {
7408 return CreatedBB;
7409 }
7410
7411 // Legalize SMRD
7412 if (isSMRD(MI)) {
7414 return CreatedBB;
7415 }
7416
7417 // Legalize FLAT
7418 if (isFLAT(MI)) {
7420 return CreatedBB;
7421 }
7422
7423 // Legalize PHI
7424 // The register class of the operands must be the same type as the register
7425 // class of the output.
7426 if (MI.getOpcode() == AMDGPU::PHI) {
7427 const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
7428 assert(!RI.isSGPRClass(VRC));
7429
7430 // Update all the operands so they have the same type.
7431 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7432 MachineOperand &Op = MI.getOperand(I);
7433 if (!Op.isReg() || !Op.getReg().isVirtual())
7434 continue;
7435
7436 // MI is a PHI instruction.
7437 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7439
7440 // Avoid creating no-op copies with the same src and dst reg class. These
7441 // confuse some of the machine passes.
7442 legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
7443 }
7444 }
7445
7446 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7447 // VGPR dest type and SGPR sources, insert copies so all operands are
7448 // VGPRs. This seems to help operand folding / the register coalescer.
7449 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7450 MachineBasicBlock *MBB = MI.getParent();
7451 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7452 if (RI.hasVGPRs(DstRC)) {
7453 // Update all the operands so they are VGPR register classes. These may
7454 // not be the same register class because REG_SEQUENCE supports mixing
7455 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7456 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7457 MachineOperand &Op = MI.getOperand(I);
7458 if (!Op.isReg() || !Op.getReg().isVirtual())
7459 continue;
7460
7461 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7462 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7463 if (VRC == OpRC)
7464 continue;
7465
7466 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7467 Op.setIsKill();
7468 }
7469 }
7470
7471 return CreatedBB;
7472 }
7473
7474 // Legalize INSERT_SUBREG
7475 // src0 must have the same register class as dst
7476 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7477 Register Dst = MI.getOperand(0).getReg();
7478 Register Src0 = MI.getOperand(1).getReg();
7479 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7480 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7481 if (DstRC != Src0RC) {
7482 MachineBasicBlock *MBB = MI.getParent();
7483 MachineOperand &Op = MI.getOperand(1);
7484 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7485 }
7486 return CreatedBB;
7487 }
7488
7489 // Legalize SI_INIT_M0
7490 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7491 MachineOperand &Src = MI.getOperand(0);
7492 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7493 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7494 return CreatedBB;
7495 }
7496
7497 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7498 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7499 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7500 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7501 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7502 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7503 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7504 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7505 MachineOperand &Src = MI.getOperand(1);
7506 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7507 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7508 return CreatedBB;
7509 }
7510
7511 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7512 //
7513 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7514 // scratch memory access. In both cases, the legalization never involves
7515 // conversion to the addr64 form.
7517 (isMUBUF(MI) || isMTBUF(MI)))) {
7518 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7519 ? AMDGPU::OpName::rsrc
7520 : AMDGPU::OpName::srsrc;
7521 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7522 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7523 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7524
7525 AMDGPU::OpName SampOpName =
7526 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7527 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7528 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7529 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7530
7531 return CreatedBB;
7532 }
7533
7534 // Legalize SI_CALL
7535 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7536 MachineOperand *Dest = &MI.getOperand(0);
7537 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7538 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
7539 // following copies, we also need to move copies from and to physical
7540 // registers into the loop block.
7541 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7542 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7543
7544 // Also move the copies to physical registers into the loop block
7545 MachineBasicBlock &MBB = *MI.getParent();
7547 while (Start->getOpcode() != FrameSetupOpcode)
7548 --Start;
7550 while (End->getOpcode() != FrameDestroyOpcode)
7551 ++End;
7552 // Also include following copies of the return value
7553 ++End;
7554 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7555 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7556 ++End;
7557 CreatedBB =
7558 loadScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7559 }
7560 }
7561
7562 // Legalize s_sleep_var.
7563 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7564 const DebugLoc &DL = MI.getDebugLoc();
7565 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7566 int Src0Idx =
7567 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7568 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7569 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7570 .add(Src0);
7571 Src0.ChangeToRegister(Reg, false);
7572 return nullptr;
7573 }
7574
7575 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
7576 // operands are scalar.
7577 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7578 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7579 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7580 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7581 for (MachineOperand &Src : MI.explicit_operands()) {
7582 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7583 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7584 }
7585 return CreatedBB;
7586 }
7587
7588 // Legalize MUBUF instructions.
7589 bool isSoffsetLegal = true;
7590 int SoffsetIdx =
7591 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7592 if (SoffsetIdx != -1) {
7593 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7594 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7595 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7596 isSoffsetLegal = false;
7597 }
7598 }
7599
7600 bool isRsrcLegal = true;
7601 int RsrcIdx =
7602 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7603 if (RsrcIdx != -1) {
7604 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7605 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7606 isRsrcLegal = false;
7607 }
7608
7609 // The operands are legal.
7610 if (isRsrcLegal && isSoffsetLegal)
7611 return CreatedBB;
7612
7613 if (!isRsrcLegal) {
7614 // Legalize a VGPR Rsrc
7615 //
7616 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7617 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7618 // a zero-value SRsrc.
7619 //
7620 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7621 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7622 // above.
7623 //
7624 // Otherwise we are on non-ADDR64 hardware, and/or we have
7625 // idxen/offen/bothen and we fall back to a waterfall loop.
7626
7627 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7628 MachineBasicBlock &MBB = *MI.getParent();
7629
7630 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7631 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7632 // This is already an ADDR64 instruction so we need to add the pointer
7633 // extracted from the resource descriptor to the current value of VAddr.
7634 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7635 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7636 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7637
7638 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7639 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7640 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7641
7642 unsigned RsrcPtr, NewSRsrc;
7643 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7644
7645 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7646 const DebugLoc &DL = MI.getDebugLoc();
7647 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7648 .addDef(CondReg0)
7649 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7650 .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
7651 .addImm(0);
7652
7653 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7654 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7655 .addDef(CondReg1, RegState::Dead)
7656 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7657 .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
7658 .addReg(CondReg0, RegState::Kill)
7659 .addImm(0);
7660
7661 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7662 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7663 .addReg(NewVAddrLo)
7664 .addImm(AMDGPU::sub0)
7665 .addReg(NewVAddrHi)
7666 .addImm(AMDGPU::sub1);
7667
7668 VAddr->setReg(NewVAddr);
7669 Rsrc->setReg(NewSRsrc);
7670 } else if (!VAddr && ST.hasAddr64()) {
7671 // This instructions is the _OFFSET variant, so we need to convert it to
7672 // ADDR64.
7673 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7674 "FIXME: Need to emit flat atomics here");
7675
7676 unsigned RsrcPtr, NewSRsrc;
7677 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7678
7679 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7680 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7681 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7682 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7683 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7684
7685 // Atomics with return have an additional tied operand and are
7686 // missing some of the special bits.
7687 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7688 MachineInstr *Addr64;
7689
7690 if (!VDataIn) {
7691 // Regular buffer load / store.
7693 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7694 .add(*VData)
7695 .addReg(NewVAddr)
7696 .addReg(NewSRsrc)
7697 .add(*SOffset)
7698 .add(*Offset);
7699
7700 if (const MachineOperand *CPol =
7701 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7702 MIB.addImm(CPol->getImm());
7703 }
7704
7705 if (const MachineOperand *TFE =
7706 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7707 MIB.addImm(TFE->getImm());
7708 }
7709
7710 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7711
7712 MIB.cloneMemRefs(MI);
7713 Addr64 = MIB;
7714 } else {
7715 // Atomics with return.
7716 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7717 .add(*VData)
7718 .add(*VDataIn)
7719 .addReg(NewVAddr)
7720 .addReg(NewSRsrc)
7721 .add(*SOffset)
7722 .add(*Offset)
7723 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7724 .cloneMemRefs(MI);
7725 }
7726
7727 MI.removeFromParent();
7728
7729 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7730 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7731 NewVAddr)
7732 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7733 .addImm(AMDGPU::sub0)
7734 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7735 .addImm(AMDGPU::sub1);
7736 } else {
7737 // Legalize a VGPR Rsrc and soffset together.
7738 if (!isSoffsetLegal) {
7739 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7740 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7741 return CreatedBB;
7742 }
7743 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7744 return CreatedBB;
7745 }
7746 }
7747
7748 // Legalize a VGPR soffset.
7749 if (!isSoffsetLegal) {
7750 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7751 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7752 return CreatedBB;
7753 }
7754 return CreatedBB;
7755}
7756
  // Queue \p MI for VALU lowering.
  InstrList.insert(MI);
  // Additionally add MUBUF instructions (identified by the presence of an
  // srsrc operand) to the deferred list: they are legalized only after the
  // main worklist has fully drained (see moveToVALU).
  int RsrcIdx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
  if (RsrcIdx != -1) {
    DeferredList.insert(MI);
  }
}
7766
  // Returns true if \p MI was routed to the deferred (MUBUF) list by insert().
  return DeferredList.contains(MI);
}
7770
// Legalize size mismatches between 16bit and 32bit registers in v2s copy
// lowering (change sgpr to vgpr).
// This is mainly caused by 16bit SALU and 16bit VALU using reg with different
// size. Need to legalize the size of the operands during the vgpr lowering
// chain. This can be removed after we have sgpr16 in place
                                          MachineRegisterInfo &MRI) const {
  // Only relevant when true16 instructions (16-bit VGPR operands) are in use.
  if (!ST.useRealTrue16Insts())
    return;

  unsigned Opcode = MI.getOpcode();
  MachineBasicBlock *MBB = MI.getParent();
  // Legalize operands and check for size mismatch.
  // Skip operand 0 (the def), out-of-range indices, and operands with no
  // register-class constraint in the instruction description.
  if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
      OpIdx >= get(Opcode).getNumOperands() ||
      get(Opcode).operands()[OpIdx].RegClass == -1)
    return;

  MachineOperand &Op = MI.getOperand(OpIdx);
  if (!Op.isReg() || !Op.getReg().isVirtual())
    return;

  const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
  if (!RI.isVGPRClass(CurrRC))
    return;

  int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
  const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
  if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
    // 32-bit register feeding a 16-bit operand: refer to its low half.
    Op.setSubReg(AMDGPU::lo16);
  } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
    // 16-bit register feeding a 32-bit operand: widen it via a REG_SEQUENCE
    // whose high half is an IMPLICIT_DEF.
    const DebugLoc &DL = MI.getDebugLoc();
    Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
    BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
    BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
        .addReg(Op.getReg())
        .addImm(AMDGPU::lo16)
        .addReg(Undef)
        .addImm(AMDGPU::hi16);
    Op.setReg(NewDstReg);
  }
}
                                          MachineRegisterInfo &MRI) const {
  // Apply the per-operand t16 legalization to every explicit use operand;
  // operand 0 is the def and is intentionally skipped.
  for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
}
7819
                             MachineDominatorTree *MDT) const {

  // Drain the worklist, lowering each scalar instruction to its VALU form.
  while (!Worklist.empty()) {
    MachineInstr &Inst = *Worklist.top();
    Worklist.erase_top();
    // Skip MachineInstr in the deferred list.
    if (Worklist.isDeferred(&Inst))
      continue;
    moveToVALUImpl(Worklist, MDT, Inst);
  }

  // Deferred list of instructions will be processed once
  // all the MachineInstr in the worklist are done.
  for (MachineInstr *Inst : Worklist.getDeferredList()) {
    moveToVALUImpl(Worklist, MDT, *Inst);
    assert(Worklist.empty() &&
           "Deferred MachineInstr are not supposed to re-populate worklist");
  }
}
7840
                                 MachineInstr &Inst) const {

  // Instruction may already have been removed from its block; nothing to do.
  if (!MBB)
    return;
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  unsigned Opcode = Inst.getOpcode();
  unsigned NewOpcode = getVALUOp(Inst);
  const DebugLoc &DL = Inst.getDebugLoc();

  // Handle some special cases
  switch (Opcode) {
  default:
    break;
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32: {
    // FIXME: The u32 versions currently selected use the carry.
    bool Changed;
    MachineBasicBlock *CreatedBBTmp = nullptr;
    std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
    if (Changed)
      return;

    // Default handling
    break;
  }

  case AMDGPU::S_MUL_U64:
    if (ST.hasVectorMulU64()) {
      NewOpcode = AMDGPU::V_MUL_U64_e64;
      break;
    }
    // Split s_mul_u64 in 32-bit vector multiplications.
    splitScalarSMulU64(Worklist, Inst, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_MUL_U64_U32_PSEUDO:
  case AMDGPU::S_MUL_I64_I32_PSEUDO:
    // This is a special case of s_mul_u64 where all the operands are either
    // zero extended or sign extended.
    splitScalarSMulPseudo(Worklist, Inst, MDT);
    Inst.eraseFromParent();
    return;

  // 64-bit scalar bitwise ops are lowered by splitting into two 32-bit
  // halves; the original instruction is erased after the split.
  case AMDGPU::S_AND_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_OR_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_XOR_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_NAND_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_NOR_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_XNOR_B64:
    if (ST.hasDLInsts())
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
    else
      splitScalar64BitXnor(Worklist, Inst, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_ANDN2_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_ORN2_B64:
    splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_BREV_B64:
    splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_NOT_B64:
    splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_BCNT1_I32_B64:
    splitScalar64BitBCNT(Worklist, Inst);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_BFE_I64:
    splitScalar64BitBFE(Worklist, Inst);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_FLBIT_I32_B64:
    splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
    Inst.eraseFromParent();
    return;
  case AMDGPU::S_FF1_I32_B64:
    splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
    Inst.eraseFromParent();
    return;

  // Subtargets with only "reversed" VALU shifts take the shift amount in
  // src0, so the operands are swapped when converting.
  case AMDGPU::S_LSHL_B32:
    if (ST.hasOnlyRevVALUShifts()) {
      NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
      swapOperands(Inst);
    }
    break;
  case AMDGPU::S_ASHR_I32:
    if (ST.hasOnlyRevVALUShifts()) {
      NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
      swapOperands(Inst);
    }
    break;
  case AMDGPU::S_LSHR_B32:
    if (ST.hasOnlyRevVALUShifts()) {
      NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
      swapOperands(Inst);
    }
    break;
  case AMDGPU::S_LSHL_B64:
    if (ST.hasOnlyRevVALUShifts()) {
      NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
                      ? AMDGPU::V_LSHLREV_B64_pseudo_e64
                      : AMDGPU::V_LSHLREV_B64_e64;
      swapOperands(Inst);
    }
    break;
  case AMDGPU::S_ASHR_I64:
    if (ST.hasOnlyRevVALUShifts()) {
      NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
      swapOperands(Inst);
    }
    break;
  case AMDGPU::S_LSHR_B64:
    if (ST.hasOnlyRevVALUShifts()) {
      NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
      swapOperands(Inst);
    }
    break;

  case AMDGPU::S_ABS_I32:
    lowerScalarAbs(Worklist, Inst);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_ABSDIFF_I32:
    lowerScalarAbsDiff(Worklist, Inst);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_CBRANCH_SCC0:
  case AMDGPU::S_CBRANCH_SCC1: {
    // Clear unused bits of vcc
    Register CondReg = Inst.getOperand(1).getReg();
    bool IsSCC = CondReg == AMDGPU::SCC;
    BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
        .addReg(LMC.ExecReg)
        .addReg(IsSCC ? LMC.VccReg : CondReg);
    Inst.removeOperand(1);
  } break;

  case AMDGPU::S_BFE_U64:
  case AMDGPU::S_BFM_B64:
    llvm_unreachable("Moving this op to VALU not implemented");

  case AMDGPU::S_PACK_LL_B32_B16:
  case AMDGPU::S_PACK_LH_B32_B16:
  case AMDGPU::S_PACK_HL_B32_B16:
  case AMDGPU::S_PACK_HH_B32_B16:
    movePackToVALU(Worklist, MRI, Inst);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_XNOR_B32:
    lowerScalarXnor(Worklist, Inst);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_NAND_B32:
    splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_NOR_B32:
    splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_ANDN2_B32:
    splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
    Inst.eraseFromParent();
    return;

  case AMDGPU::S_ORN2_B32:
    splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
    Inst.eraseFromParent();
    return;

  // TODO: remove as soon as everything is ready
  // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
  // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
  // can only be selected from the uniform SDNode.
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::V_ADDC_U32_e64
                       : AMDGPU::V_SUBB_U32_e64;
    const auto *CarryRC = RI.getWaveMaskRegClass();

    // Constrain the incoming carry to the wave-mask class, inserting a COPY
    // if the constraint cannot be applied in place.
    Register CarryInReg = Inst.getOperand(4).getReg();
    if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
      Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
      BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
          .addReg(CarryInReg);
    }

    Register CarryOutReg = Inst.getOperand(1).getReg();

    Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
        MRI.getRegClass(Inst.getOperand(0).getReg())));
    MachineInstr *CarryOp =
        BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
            .addReg(CarryOutReg, RegState::Define)
            .add(Inst.getOperand(2))
            .add(Inst.getOperand(3))
            .addReg(CarryInReg)
            .addImm(0);
    legalizeOperands(*CarryOp);
    MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
    addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
    Inst.eraseFromParent();
  }
    return;
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    MachineOperand &Dest0 = Inst.getOperand(0);
    MachineOperand &Dest1 = Inst.getOperand(1);
    MachineOperand &Src0 = Inst.getOperand(2);
    MachineOperand &Src1 = Inst.getOperand(3);

    unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::V_ADD_CO_U32_e64
                       : AMDGPU::V_SUB_CO_U32_e64;
    const TargetRegisterClass *NewRC =
        RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
    Register DestReg = MRI.createVirtualRegister(NewRC);
    MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
                                 .addReg(Dest1.getReg(), RegState::Define)
                                 .add(Src0)
                                 .add(Src1)
                                 .addImm(0); // clamp bit

    legalizeOperands(*NewInstr, MDT);
    MRI.replaceRegWith(Dest0.getReg(), DestReg);
    addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
    Inst.eraseFromParent();
  }
    return;
  case AMDGPU::S_LSHL1_ADD_U32:
  case AMDGPU::S_LSHL2_ADD_U32:
  case AMDGPU::S_LSHL3_ADD_U32:
  case AMDGPU::S_LSHL4_ADD_U32: {
    MachineOperand &Dest = Inst.getOperand(0);
    MachineOperand &Src0 = Inst.getOperand(1);
    MachineOperand &Src1 = Inst.getOperand(2);
    // The shift amount is encoded in the opcode itself (1..4).
    unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32   ? 1
                         : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
                         : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
                                                             : 4);

    const TargetRegisterClass *NewRC =
        RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
    Register DestReg = MRI.createVirtualRegister(NewRC);
    MachineInstr *NewInstr =
        BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
            .add(Src0)
            .addImm(ShiftAmt)
            .add(Src1);

    legalizeOperands(*NewInstr, MDT);
    MRI.replaceRegWith(Dest.getReg(), DestReg);
    addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
    Inst.eraseFromParent();
  }
    return;
  case AMDGPU::S_CSELECT_B32:
  case AMDGPU::S_CSELECT_B64:
    lowerSelect(Worklist, Inst, MDT);
    Inst.eraseFromParent();
    return;
  case AMDGPU::S_CMP_EQ_I32:
  case AMDGPU::S_CMP_LG_I32:
  case AMDGPU::S_CMP_GT_I32:
  case AMDGPU::S_CMP_GE_I32:
  case AMDGPU::S_CMP_LT_I32:
  case AMDGPU::S_CMP_LE_I32:
  case AMDGPU::S_CMP_EQ_U32:
  case AMDGPU::S_CMP_LG_U32:
  case AMDGPU::S_CMP_GT_U32:
  case AMDGPU::S_CMP_GE_U32:
  case AMDGPU::S_CMP_LT_U32:
  case AMDGPU::S_CMP_LE_U32:
  case AMDGPU::S_CMP_EQ_U64:
  case AMDGPU::S_CMP_LG_U64:
  case AMDGPU::S_CMP_LT_F32:
  case AMDGPU::S_CMP_EQ_F32:
  case AMDGPU::S_CMP_LE_F32:
  case AMDGPU::S_CMP_GT_F32:
  case AMDGPU::S_CMP_LG_F32:
  case AMDGPU::S_CMP_GE_F32:
  case AMDGPU::S_CMP_O_F32:
  case AMDGPU::S_CMP_U_F32:
  case AMDGPU::S_CMP_NGE_F32:
  case AMDGPU::S_CMP_NLG_F32:
  case AMDGPU::S_CMP_NGT_F32:
  case AMDGPU::S_CMP_NLE_F32:
  case AMDGPU::S_CMP_NEQ_F32:
  case AMDGPU::S_CMP_NLT_F32: {
    // SCC-producing compares become VCC-producing V_CMPs; SCC users are
    // rewritten to read CondReg instead.
    Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
    auto NewInstr =
        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
            .setMIFlags(Inst.getFlags());
    if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
        0) {
      NewInstr
          .addImm(0)               // src0_modifiers
          .add(Inst.getOperand(0)) // src0
          .addImm(0)               // src1_modifiers
          .add(Inst.getOperand(1)) // src1
          .addImm(0);              // clamp
    } else {
      NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
    }
    legalizeOperands(*NewInstr, MDT);
    int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
    const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
    addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
    Inst.eraseFromParent();
    return;
  }
  case AMDGPU::S_CMP_LT_F16:
  case AMDGPU::S_CMP_EQ_F16:
  case AMDGPU::S_CMP_LE_F16:
  case AMDGPU::S_CMP_GT_F16:
  case AMDGPU::S_CMP_LG_F16:
  case AMDGPU::S_CMP_GE_F16:
  case AMDGPU::S_CMP_O_F16:
  case AMDGPU::S_CMP_U_F16:
  case AMDGPU::S_CMP_NGE_F16:
  case AMDGPU::S_CMP_NLG_F16:
  case AMDGPU::S_CMP_NGT_F16:
  case AMDGPU::S_CMP_NLE_F16:
  case AMDGPU::S_CMP_NEQ_F16:
  case AMDGPU::S_CMP_NLT_F16: {
    Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
    auto NewInstr =
        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
            .setMIFlags(Inst.getFlags());
    if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
      NewInstr
          .addImm(0)               // src0_modifiers
          .add(Inst.getOperand(0)) // src0
          .addImm(0)               // src1_modifiers
          .add(Inst.getOperand(1)) // src1
          .addImm(0);              // clamp
      if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
        NewInstr.addImm(0); // op_sel0
    } else {
      NewInstr
          .add(Inst.getOperand(0))
          .add(Inst.getOperand(1));
    }
    legalizeOperandsVALUt16(*NewInstr, MRI);
    legalizeOperands(*NewInstr, MDT);
    int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
    const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
    addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
    Inst.eraseFromParent();
    return;
  }
  case AMDGPU::S_CVT_HI_F32_F16: {
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    if (ST.useRealTrue16Insts()) {
      // True16: read the high half directly via the hi16 subregister.
      BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
          .add(Inst.getOperand(1));
      BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
          .addImm(0) // src0_modifiers
          .addReg(TmpReg, {}, AMDGPU::hi16)
          .addImm(0)  // clamp
          .addImm(0)  // omod
          .addImm(0); // op_sel0
    } else {
      // Otherwise shift the high 16 bits down before converting.
      BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
          .addImm(16)
          .add(Inst.getOperand(1));
      BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
          .addImm(0) // src0_modifiers
          .addReg(TmpReg)
          .addImm(0)  // clamp
          .addImm(0); // omod
    }

    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
    Inst.eraseFromParent();
    return;
  }
  case AMDGPU::S_MINIMUM_F32:
  case AMDGPU::S_MAXIMUM_F32: {
    Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
                                 .addImm(0) // src0_modifiers
                                 .add(Inst.getOperand(1))
                                 .addImm(0) // src1_modifiers
                                 .add(Inst.getOperand(2))
                                 .addImm(0)  // clamp
                                 .addImm(0); // omod
    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);

    legalizeOperands(*NewInstr, MDT);
    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
    Inst.eraseFromParent();
    return;
  }
  case AMDGPU::S_MINIMUM_F16:
  case AMDGPU::S_MAXIMUM_F16: {
    // True16 subtargets use a 16-bit VGPR destination, others a 32-bit one.
    Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
                                                    ? &AMDGPU::VGPR_16RegClass
                                                    : &AMDGPU::VGPR_32RegClass);
    MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
                                 .addImm(0) // src0_modifiers
                                 .add(Inst.getOperand(1))
                                 .addImm(0) // src1_modifiers
                                 .add(Inst.getOperand(2))
                                 .addImm(0)  // clamp
                                 .addImm(0)  // omod
                                 .addImm(0); // opsel0
    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
    legalizeOperandsVALUt16(*NewInstr, MRI);
    legalizeOperands(*NewInstr, MDT);
    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
    Inst.eraseFromParent();
    return;
  }
  case AMDGPU::V_S_EXP_F16_e64:
  case AMDGPU::V_S_LOG_F16_e64:
  case AMDGPU::V_S_RCP_F16_e64:
  case AMDGPU::V_S_RSQ_F16_e64:
  case AMDGPU::V_S_SQRT_F16_e64: {
    Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
                                                    ? &AMDGPU::VGPR_16RegClass
                                                    : &AMDGPU::VGPR_32RegClass);
    auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
                        .add(Inst.getOperand(1)) // src0_modifiers
                        .add(Inst.getOperand(2))
                        .add(Inst.getOperand(3)) // clamp
                        .add(Inst.getOperand(4)) // omod
                        .setMIFlags(Inst.getFlags());
    if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
      NewInstr.addImm(0); // opsel0
    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
    legalizeOperandsVALUt16(*NewInstr, MRI);
    legalizeOperands(*NewInstr, MDT);
    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
    Inst.eraseFromParent();
    return;
  }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
    // We cannot move this instruction to the VALU, so we should try to
    // legalize its operands instead.
    legalizeOperands(Inst, MDT);
    return;
  }
  // Handle converting generic instructions like COPY-to-SGPR into
  // COPY-to-VGPR.
  if (NewOpcode == Opcode) {
    Register DstReg = Inst.getOperand(0).getReg();
    const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);

    // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
    // hope for the best.
    if (Inst.isCopy() && DstReg.isPhysical() &&
        RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
      Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
              get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
          .add(Inst.getOperand(1));
      BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
              DstReg)
          .addReg(NewDst);

      Inst.eraseFromParent();
      return;
    }

    if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
      Register NewDstReg = Inst.getOperand(1).getReg();
      const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
      if (const TargetRegisterClass *CommonRC =
              RI.getCommonSubClass(NewDstRC, SrcRC)) {
        // Instead of creating a copy where src and dst are the same register
        // class, we just replace all uses of dst with src. These kinds of
        // copies interfere with the heuristics MachineSink uses to decide
        // whether or not to split a critical edge. Since the pass assumes
        // that copies will end up as machine instructions and not be
        // eliminated.
        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
        MRI.replaceRegWith(DstReg, NewDstReg);
        MRI.clearKillFlags(NewDstReg);
        Inst.getOperand(0).setReg(DstReg);

        if (!MRI.constrainRegClass(NewDstReg, CommonRC))
          llvm_unreachable("failed to constrain register");

        Inst.eraseFromParent();

        for (MachineOperand &UseMO :
             make_early_inc_range(MRI.use_operands(NewDstReg))) {
          MachineInstr &UseMI = *UseMO.getParent();

          // Legalize t16 operands since replaceReg is called after
          // addUsersToVALU.

          unsigned OpIdx = UseMI.getOperandNo(&UseMO);
          if (const TargetRegisterClass *OpRC =
                  getRegClass(UseMI.getDesc(), OpIdx))
            MRI.constrainRegClass(NewDstReg, OpRC);
        }

        return;
      }
    }

    // If this is a v2s copy between 16bit and 32bit reg,
    // replace vgpr copy to reg_sequence/extract_subreg
    // This can be remove after we have sgpr16 in place
    if (ST.useRealTrue16Insts() && Inst.isCopy() &&
        Inst.getOperand(1).getReg().isVirtual() &&
        RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
      const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
      if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
        // 16-bit source into 32-bit dest: widen with an undef high half.
        Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
        Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
                get(AMDGPU::IMPLICIT_DEF), Undef);
        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
                get(AMDGPU::REG_SEQUENCE), NewDstReg)
            .addReg(Inst.getOperand(1).getReg())
            .addImm(AMDGPU::lo16)
            .addReg(Undef)
            .addImm(AMDGPU::hi16);
        Inst.eraseFromParent();
        MRI.replaceRegWith(DstReg, NewDstReg);
        addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
        return;
      } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
                                             AMDGPU::lo16)) {
        // 32-bit source into 16-bit dest: read only the low half.
        Inst.getOperand(1).setSubReg(AMDGPU::lo16);
        Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
        MRI.replaceRegWith(DstReg, NewDstReg);
        addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
        return;
      }
    }

    Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
    MRI.replaceRegWith(DstReg, NewDstReg);
    legalizeOperands(Inst, MDT);
    addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
    return;
  }

  // Use the new VALU Opcode.
  auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
                      .setMIFlags(Inst.getFlags());
  if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
    // Intersperse VOP3 modifiers among the SALU operands.
    NewInstr->addOperand(Inst.getOperand(0));
    if (AMDGPU::getNamedOperandIdx(NewOpcode,
                                   AMDGPU::OpName::src0_modifiers) >= 0)
      NewInstr.addImm(0);
    if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
      const MachineOperand &Src = Inst.getOperand(1);
      NewInstr->addOperand(Src);
    }

    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
      // We are converting these to a BFE, so we need to add the missing
      // operands for the size and offset.
      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
      NewInstr.addImm(0);
      NewInstr.addImm(Size);
    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
      // The VALU version adds the second operand to the result, so insert an
      // extra 0 operand.
      NewInstr.addImm(0);
    } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
      const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
      // If we need to move this to VGPRs, we need to unpack the second
      // operand back into the 2 separate ones for bit offset and width.
      assert(OffsetWidthOp.isImm() &&
             "Scalar BFE is only implemented for constant width and offset");
      uint32_t Imm = OffsetWidthOp.getImm();

      uint32_t Offset = Imm & 0x3f;               // Extract bits [5:0].
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
      NewInstr.addImm(Offset);
      NewInstr.addImm(BitWidth);
    } else {
      // Generic VOP3 form: add each named operand/modifier the new opcode
      // expects, defaulting modifiers, clamp, omod and op_sel to 0.
      if (AMDGPU::getNamedOperandIdx(NewOpcode,
                                     AMDGPU::OpName::src1_modifiers) >= 0)
        NewInstr.addImm(0);
      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
        NewInstr->addOperand(Inst.getOperand(2));
      if (AMDGPU::getNamedOperandIdx(NewOpcode,
                                     AMDGPU::OpName::src2_modifiers) >= 0)
        NewInstr.addImm(0);
      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
        NewInstr->addOperand(Inst.getOperand(3));
      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
        NewInstr.addImm(0);
      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
        NewInstr.addImm(0);
      if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
        NewInstr.addImm(0);
    }
  } else {
    // Just copy the SALU operands.
    for (const MachineOperand &Op : Inst.explicit_operands())
      NewInstr->addOperand(Op);
  }

  // Remove any references to SCC. Vector instructions can't read from it, and
  // We're just about to add the implicit use / defs of VCC, and we don't want
  // both.
  for (MachineOperand &Op : Inst.implicit_operands()) {
    if (Op.getReg() == AMDGPU::SCC) {
      // Only propagate through live-def of SCC.
      if (Op.isDef() && !Op.isDead())
        addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
      if (Op.isUse())
        addSCCDefsToVALUWorklist(NewInstr, Worklist);
    }
  }
  Inst.eraseFromParent();
  Register NewDstReg;
  if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
    Register DstReg = NewInstr->getOperand(0).getReg();
    assert(DstReg.isVirtual());
    // Update the destination register class.
    const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
    assert(NewDstRC);
    NewDstReg = MRI.createVirtualRegister(NewDstRC);
    MRI.replaceRegWith(DstReg, NewDstReg);
  }
  fixImplicitOperands(*NewInstr);

  legalizeOperandsVALUt16(*NewInstr, MRI);

  // Legalize the operands
  legalizeOperands(*NewInstr, MDT);
  if (NewDstReg)
    addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
}
8528
// Add/sub require special handling to deal with carry outs.
//
// Returns {true, NewBB} when the instruction could be rewritten in place to a
// carry-less VALU add/sub (NewBB is whatever block legalizeOperands left the
// instruction in), or {false, nullptr} when the subtarget lacks no-carry adds
// and the caller must lower the carry-out explicitly.
std::pair<bool, MachineBasicBlock *>
SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
                              MachineDominatorTree *MDT) const {
  if (ST.hasAddNoCarryInsts()) {
    // Assume there is no user of scc since we don't select this in that case.
    // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
    // is used.

    MachineBasicBlock &MBB = *Inst.getParent();
    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

    Register OldDstReg = Inst.getOperand(0).getReg();
    // The moved result must live in a VGPR.
    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    unsigned Opc = Inst.getOpcode();
    assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);

    unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
      AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;

    // Drop the implicit SCC def; the no-carry VALU form does not define it.
    assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
    Inst.removeOperand(3);

    // Mutate the instruction in place rather than building a new one.
    Inst.setDesc(get(NewOpc));
    Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
    Inst.addImplicitDefUseOperands(*MBB.getParent());
    MRI.replaceRegWith(OldDstReg, ResultReg);
    MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);

    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return std::pair(true, NewBB);
  }

  return std::pair(false, nullptr);
}
8565
// Lower a scalar select (S_CSELECT_B32, or the 64-bit form otherwise) to a
// VALU V_CNDMASK. The SCC condition is first rematerialized as a wave-mask
// register so the vector conditional move can consume it.
void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
                              MachineDominatorTree *MDT) const {

  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  MachineOperand &Cond = Inst.getOperand(3);

  Register CondReg = Cond.getReg();
  bool IsSCC = (CondReg == AMDGPU::SCC);

  // If this is a trivial select where the condition is effectively not SCC
  // (CondReg is a source of copy to SCC), then the select is semantically
  // equivalent to copying CondReg. Hence, there is no need to create
  // V_CNDMASK, we can just use that and bail out.
  if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
      (Src1.getImm() == 0)) {
    MRI.replaceRegWith(Dest.getReg(), CondReg);
    return;
  }

  Register NewCondReg = CondReg;
  if (IsSCC) {
    const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
    NewCondReg = MRI.createVirtualRegister(TC);

    // Now look for the closest SCC def if it is a copy
    // replacing the CondReg with the COPY source register.
    // Walk backwards from the select to the nearest instruction that defines
    // SCC; only that def is relevant here.
    bool CopyFound = false;
    for (MachineInstr &CandI :
         make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
                    Inst.getParent()->rend())) {
      if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
          -1) {
        if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
          BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
              .addReg(CandI.getOperand(1).getReg());
          CopyFound = true;
        }
        // Stop at the first SCC def either way.
        break;
      }
    }
    if (!CopyFound) {
      // SCC def is not a copy
      // Insert a trivial select instead of creating a copy, because a copy from
      // SCC would semantically mean just copying a single bit, but we may need
      // the result to be a vector condition mask that needs preserving.
      unsigned Opcode =
          ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
      auto NewSelect =
          BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
      // Propagate undef-ness of the original condition onto the implicit SCC
      // use of the new select.
      NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
    }
  }

  Register NewDestReg = MRI.createVirtualRegister(
      RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
  MachineInstr *NewInst;
  if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
    NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
                  .addImm(0)
                  .add(Src1) // False
                  .addImm(0)
                  .add(Src0) // True
                  .addReg(NewCondReg);
  } else {
    NewInst =
        BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
            .add(Src1) // False
            .add(Src0) // True
            .addReg(NewCondReg);
  }
  MRI.replaceRegWith(Dest.getReg(), NewDestReg);
  legalizeOperands(*NewInst, MDT);
  addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
}
8647
8648void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8649 MachineInstr &Inst) const {
8650 MachineBasicBlock &MBB = *Inst.getParent();
8651 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8652 MachineBasicBlock::iterator MII = Inst;
8653 const DebugLoc &DL = Inst.getDebugLoc();
8654
8655 MachineOperand &Dest = Inst.getOperand(0);
8656 MachineOperand &Src = Inst.getOperand(1);
8657 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8658 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8659
8660 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8661 : AMDGPU::V_SUB_CO_U32_e32;
8662
8663 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8664 .addImm(0)
8665 .addReg(Src.getReg());
8666
8667 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8668 .addReg(Src.getReg())
8669 .addReg(TmpReg);
8670
8671 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8672 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8673}
8674
8675void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8676 MachineInstr &Inst) const {
8677 MachineBasicBlock &MBB = *Inst.getParent();
8678 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8679 MachineBasicBlock::iterator MII = Inst;
8680 const DebugLoc &DL = Inst.getDebugLoc();
8681
8682 MachineOperand &Dest = Inst.getOperand(0);
8683 MachineOperand &Src1 = Inst.getOperand(1);
8684 MachineOperand &Src2 = Inst.getOperand(2);
8685 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8686 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8687 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8688
8689 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8690 : AMDGPU::V_SUB_CO_U32_e32;
8691
8692 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8693 .addReg(Src1.getReg())
8694 .addReg(Src2.getReg());
8695
8696 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8697
8698 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8699 .addReg(SubResultReg)
8700 .addReg(TmpReg);
8701
8702 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8703 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8704}
8705
// Lower a 32-bit scalar XNOR. On subtargets with DL instructions this maps
// directly to V_XNOR_B32; otherwise it is rewritten as NOT + XOR using the
// identity !(x ^ y) == (!x ^ y) == (x ^ !y), keeping as much work as
// possible on the scalar unit.
void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
                                  MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  if (ST.hasDLInsts()) {
    // Native V_XNOR available: move both sources into VGPRs and emit it.
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
        .add(Src0)
        .add(Src1);

    MRI.replaceRegWith(Dest.getReg(), NewDest);
    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  } else {
    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
    // invert either source and then perform the XOR. If either source is a
    // scalar register, then we can leave the inversion on the scalar unit to
    // achieve a better distribution of scalar and vector instructions.
    bool Src0IsSGPR = Src0.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
    bool Src1IsSGPR = Src1.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
    MachineInstr *Xor;
    Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    // Build a pair of scalar instructions and add them to the work list.
    // The next iteration over the work list will lower these to the vector
    // unit as necessary.
    if (Src0IsSGPR) {
      // NewDest = ~Src0 ^ Src1
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
                .addReg(Temp)
                .add(Src1);
    } else if (Src1IsSGPR) {
      // NewDest = Src0 ^ ~Src1
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
                .add(Src0)
                .addReg(Temp);
    } else {
      // Neither source is scalar: XOR first, then invert the result. The NOT
      // consumes the XOR's result, so it must be queued for lowering too.
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
                .add(Src0)
                .add(Src1);
      MachineInstr *Not =
          BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
      Worklist.insert(Not);
    }

    MRI.replaceRegWith(Dest.getReg(), NewDest);

    Worklist.insert(Xor);

    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  }
}
8770
8771void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8772 MachineInstr &Inst,
8773 unsigned Opcode) const {
8774 MachineBasicBlock &MBB = *Inst.getParent();
8775 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8776 MachineBasicBlock::iterator MII = Inst;
8777 const DebugLoc &DL = Inst.getDebugLoc();
8778
8779 MachineOperand &Dest = Inst.getOperand(0);
8780 MachineOperand &Src0 = Inst.getOperand(1);
8781 MachineOperand &Src1 = Inst.getOperand(2);
8782
8783 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8784 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8785
8786 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8787 .add(Src0)
8788 .add(Src1);
8789
8790 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8791 .addReg(Interm);
8792
8793 Worklist.insert(&Op);
8794 Worklist.insert(&Not);
8795
8796 MRI.replaceRegWith(Dest.getReg(), NewDest);
8797 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8798}
8799
8800void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8801 MachineInstr &Inst,
8802 unsigned Opcode) const {
8803 MachineBasicBlock &MBB = *Inst.getParent();
8804 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8805 MachineBasicBlock::iterator MII = Inst;
8806 const DebugLoc &DL = Inst.getDebugLoc();
8807
8808 MachineOperand &Dest = Inst.getOperand(0);
8809 MachineOperand &Src0 = Inst.getOperand(1);
8810 MachineOperand &Src1 = Inst.getOperand(2);
8811
8812 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8813 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8814
8815 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8816 .add(Src1);
8817
8818 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8819 .add(Src0)
8820 .addReg(Interm);
8821
8822 Worklist.insert(&Not);
8823 Worklist.insert(&Op);
8824
8825 MRI.replaceRegWith(Dest.getReg(), NewDest);
8826 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8827}
8828
// Split a 64-bit scalar unary op into two ops (given by \p Opcode) on the
// sub0/sub1 halves, recombined with a REG_SEQUENCE. If \p Swap is set the
// two result halves are exchanged before recombination. Both half ops are
// queued on the worklist for their own VALU lowering.
void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
                                          MachineInstr &Inst, unsigned Opcode,
                                          bool Swap) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // An immediate source has no register class; treat it as a 32-bit SGPR.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC =
      RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);

  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);

  if (Swap)
    std::swap(DestSub0, DestSub1);

  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
8887
// There is not a vector equivalent of s_mul_u64. For this reason, we need to
// split the s_mul_u64 in 32-bit vector multiplications.
void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
                                     MachineInstr &Inst,
                                     MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  // The extracted halves must be VGPRs since the multiplies below are VALU
  // instructions.
  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src0SubRC))
    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
  const TargetRegisterClass *Src1SubRC =
      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src1SubRC))
    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);

  // First, we extract the low 32-bit and high 32-bit values from each of the
  // operands.
  MachineOperand Op0L =
      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
  MachineOperand Op1L =
      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
  MachineOperand Op0H =
      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
  MachineOperand Op1H =
      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

  // The multiplication is done as follows:
  //
  //                            Op1H  Op1L
  //                          * Op0H  Op0L
  //                       --------------------
  //                       Op1H*Op0L  Op1L*Op0L
  //          + Op1H*Op0H  Op1L*Op0H
  // -----------------------------------------
  // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
  //
  //  We drop Op1H*Op0H because the result of the multiplication is a 64-bit
  //  value and that would overflow.
  //  The low 32-bit value is Op1L*Op0L.
  //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).

  Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Op1L_Op0H =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
          .add(Op1L)
          .add(Op0H);

  Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Op1H_Op0L =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
          .add(Op1H)
          .add(Op0L);

  // The carry into the high half is the high 32 bits of Op1L*Op0L.
  Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Carry =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
          .add(Op1L)
          .add(Op0L);

  MachineInstr *LoHalf =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
          .add(Op1L)
          .add(Op0L);

  Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
                          .addReg(Op1L_Op0H_Reg)
                          .addReg(Op1H_Op0L_Reg);

  MachineInstr *HiHalf =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
          .addReg(AddReg)
          .addReg(CarryReg);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*Op1L_Op0H, MDT);
  legalizeOperands(*Op1H_Op0L, MDT);
  legalizeOperands(*Carry, MDT);
  legalizeOperands(*LoHalf, MDT);
  legalizeOperands(*Add, MDT);
  legalizeOperands(*HiHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
8996
// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
// multiplications.
//
// Only the low 32 bits of each 64-bit source are read: the low result half is
// V_MUL_LO of the two low halves, and the high half is V_MUL_HI (unsigned or
// signed, matching the pseudo) of the same two values.
void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
                                        MachineInstr &Inst,
                                        MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  // The extracted halves must be VGPRs since the multiplies below are VALU
  // instructions.
  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src0SubRC))
    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
  const TargetRegisterClass *Src1SubRC =
      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src1SubRC))
    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);

  // First, we extract the low 32-bit and high 32-bit values from each of the
  // operands.
  MachineOperand Op0L =
      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
  MachineOperand Op1L =
      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);

  unsigned Opc = Inst.getOpcode();
  unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
                        ? AMDGPU::V_MUL_HI_U32_e64
                        : AMDGPU::V_MUL_HI_I32_e64;
  MachineInstr *HiHalf =
      BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);

  MachineInstr *LoHalf =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
          .add(Op1L)
          .add(Op0L);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*HiHalf, MDT);
  legalizeOperands(*LoHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
9061
// Split a 64-bit scalar binary op into two ops (given by \p Opcode) applied
// pairwise to the sub0/sub1 halves of the sources, recombined with a
// REG_SEQUENCE. Both half ops are queued for their own VALU lowering.
void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
                                           MachineInstr &Inst, unsigned Opcode,
                                           MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // An immediate source has no register class; treat it as a 32-bit SGPR.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC =
      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);
  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC =
      RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);

  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
                              .add(SrcReg0Sub0)
                              .add(SrcReg1Sub0);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
                              .add(SrcReg0Sub1)
                              .add(SrcReg1Sub1);

  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
9128
9129void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9130 MachineInstr &Inst,
9131 MachineDominatorTree *MDT) const {
9132 MachineBasicBlock &MBB = *Inst.getParent();
9133 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9134
9135 MachineOperand &Dest = Inst.getOperand(0);
9136 MachineOperand &Src0 = Inst.getOperand(1);
9137 MachineOperand &Src1 = Inst.getOperand(2);
9138 const DebugLoc &DL = Inst.getDebugLoc();
9139
9140 MachineBasicBlock::iterator MII = Inst;
9141
9142 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9143
9144 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9145
9146 MachineOperand* Op0;
9147 MachineOperand* Op1;
9148
9149 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9150 Op0 = &Src0;
9151 Op1 = &Src1;
9152 } else {
9153 Op0 = &Src1;
9154 Op1 = &Src0;
9155 }
9156
9157 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9158 .add(*Op0);
9159
9160 Register NewDest = MRI.createVirtualRegister(DestRC);
9161
9162 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9163 .addReg(Interm)
9164 .add(*Op1);
9165
9166 MRI.replaceRegWith(Dest.getReg(), NewDest);
9167
9168 Worklist.insert(&Xor);
9169}
9170
// Split a 64-bit scalar population count into two 32-bit V_BCNT_U32_B32 ops.
// The second op takes the first's result as its accumulator operand, so
// ResultReg = bcnt(sub1) + bcnt(sub0).
void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
                                       MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  // An immediate source has no register class; treat it as a 32-bit SGPR.
  const TargetRegisterClass *SrcRC = Src.isReg() ?
    MRI.getRegClass(Src.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC =
      RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  // MidReg = bcnt(lo) + 0
  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);

  // ResultReg = bcnt(hi) + MidReg
  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  // We don't need to legalize operands here. src0 for either instruction can be
  // an SGPR, and the second input is unused or determined here.
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
9208
// Split a 64-bit scalar bitfield extract into 32-bit VALU pieces. Per the
// assert below, only the S_BFE_I64 sign-extend-in-register form is handled
// (offset 0, width <= 32).
void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
                                      MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  // The BFE control immediate packs offset and width together.
  uint32_t Imm = Inst.getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset;

  // Only sext_inreg cases handled.
  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
         Offset == 0 && "Not implemented");

  if (BitWidth < 32) {
    // Sign-extend the low field with V_BFE, then derive the high half by
    // broadcasting the sign bit (arithmetic shift right by 31).
    Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
        .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
        .addImm(0)
        .addImm(BitWidth);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // BitWidth == 32: the low half is unchanged; the high half is just the
  // sign bit of the low half replicated.
  MachineOperand &Src = Inst.getOperand(1);
  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
      .addImm(31)
      .addReg(Src.getReg(), {}, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(Src.getReg(), {}, AMDGPU::sub0)
      .addImm(AMDGPU::sub0)
      .addReg(TmpReg)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
9269
// Split a 64-bit scalar count-leading/trailing-zeros into 32-bit VALU ops.
void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
                                          MachineInstr &Inst, unsigned Opcode,
                                          MachineDominatorTree *MDT) const {
  // (S_FLBIT_I32_B64 hi:lo) ->
  // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
  // (S_FF1_I32_B64 hi:lo) ->
  // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))

  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);

  const MCInstrDesc &InstDesc = get(Opcode);

  bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
  unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
                                               : AMDGPU::V_ADD_CO_U32_e32;

  // An immediate source has no register class; treat it as a 32-bit SGPR.
  const TargetRegisterClass *SrcRC =
      Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
  const TargetRegisterClass *SrcSubRC =
      RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 =
      buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 =
      buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);

  Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // Per-half counts: MidReg1 counts the low half, MidReg2 the high half.
  BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);

  BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);

  // Bias the count of the half that may be skipped by 32. The clamp bit makes
  // this a saturating add (uaddsat), so an all-zero half (count of -1/0xffffffff
  // from ffbh/ffbl) stays at the maximum rather than wrapping.
  BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
      .addReg(IsCtlz ? MidReg1 : MidReg2)
      .addImm(32)
      .addImm(1); // enable clamp

  BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
      .addReg(MidReg3)
      .addReg(IsCtlz ? MidReg2 : MidReg1);

  MRI.replaceRegWith(Dest.getReg(), MidReg4);

  addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
}
9324
// After \p DstReg has been moved to the VALU (and so becomes a vector
// register), visit every user of it: users whose operand class cannot accept
// vector registers are queued for their own move to the VALU; users that can
// are re-legalized for true16 operand constraints instead.
void SIInstrInfo::addUsersToMoveToVALUWorklist(
    Register DstReg, MachineRegisterInfo &MRI,
    SIInstrWorklist &Worklist) const {
  // early_inc_range: legalizeOperandsVALUt16 below may mutate the use list
  // while we iterate it.
  for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
    MachineInstr &UseMI = *MO.getParent();

    unsigned OpNo = 0;

    switch (UseMI.getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::WQM:
    case AMDGPU::SOFT_WQM:
    case AMDGPU::STRICT_WWM:
    case AMDGPU::STRICT_WQM:
    case AMDGPU::REG_SEQUENCE:
    case AMDGPU::PHI:
    case AMDGPU::INSERT_SUBREG:
      // For these generic/pseudo instructions, check the class of the
      // destination operand (operand 0) rather than the using operand.
      break;
    default:
      OpNo = MO.getOperandNo();
      break;
    }

    const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
    MRI.constrainRegClass(DstReg, OpRC);

    if (!RI.hasVectorRegisters(OpRC))
      Worklist.insert(&UseMI);
    else
      // Legalization could change user list.
      legalizeOperandsVALUt16(UseMI, OpNo, MRI);
  }
}
9358
9359void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9361 MachineInstr &Inst) const {
9362 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9363 MachineBasicBlock *MBB = Inst.getParent();
9364 MachineOperand &Src0 = Inst.getOperand(1);
9365 MachineOperand &Src1 = Inst.getOperand(2);
9366 const DebugLoc &DL = Inst.getDebugLoc();
9367
9368 if (ST.useRealTrue16Insts()) {
9369 Register SrcReg0, SrcReg1;
9370 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9371 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9372 BuildMI(*MBB, Inst, DL,
9373 get(Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg0)
9374 .add(Src0);
9375 } else {
9376 SrcReg0 = Src0.getReg();
9377 }
9378
9379 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9380 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9381 BuildMI(*MBB, Inst, DL,
9382 get(Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg1)
9383 .add(Src1);
9384 } else {
9385 SrcReg1 = Src1.getReg();
9386 }
9387
9388 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9389 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9390
9391 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9392 switch (Inst.getOpcode()) {
9393 case AMDGPU::S_PACK_LL_B32_B16:
9394 NewMI
9395 .addReg(SrcReg0, {},
9396 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9397 .addImm(AMDGPU::lo16)
9398 .addReg(SrcReg1, {},
9399 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9400 .addImm(AMDGPU::hi16);
9401 break;
9402 case AMDGPU::S_PACK_LH_B32_B16:
9403 NewMI
9404 .addReg(SrcReg0, {},
9405 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9406 .addImm(AMDGPU::lo16)
9407 .addReg(SrcReg1, {}, AMDGPU::hi16)
9408 .addImm(AMDGPU::hi16);
9409 break;
9410 case AMDGPU::S_PACK_HL_B32_B16:
9411 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9412 .addImm(AMDGPU::lo16)
9413 .addReg(SrcReg1, {},
9414 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9415 .addImm(AMDGPU::hi16);
9416 break;
9417 case AMDGPU::S_PACK_HH_B32_B16:
9418 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9419 .addImm(AMDGPU::lo16)
9420 .addReg(SrcReg1, {}, AMDGPU::hi16)
9421 .addImm(AMDGPU::hi16);
9422 break;
9423 default:
9424 llvm_unreachable("unhandled s_pack_* instruction");
9425 }
9426
9427 MachineOperand &Dest = Inst.getOperand(0);
9428 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9429 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9430 return;
9431 }
9432
9433 switch (Inst.getOpcode()) {
9434 case AMDGPU::S_PACK_LL_B32_B16: {
9435 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9436 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9437
9438 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9439 // 0.
9440 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9441 .addImm(0xffff);
9442
9443 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9444 .addReg(ImmReg, RegState::Kill)
9445 .add(Src0);
9446
9447 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9448 .add(Src1)
9449 .addImm(16)
9450 .addReg(TmpReg, RegState::Kill);
9451 break;
9452 }
9453 case AMDGPU::S_PACK_LH_B32_B16: {
9454 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9455 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9456 .addImm(0xffff);
9457 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9458 .addReg(ImmReg, RegState::Kill)
9459 .add(Src0)
9460 .add(Src1);
9461 break;
9462 }
9463 case AMDGPU::S_PACK_HL_B32_B16: {
9464 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9465 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9466 .addImm(16)
9467 .add(Src0);
9468 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9469 .add(Src1)
9470 .addImm(16)
9471 .addReg(TmpReg, RegState::Kill);
9472 break;
9473 }
9474 case AMDGPU::S_PACK_HH_B32_B16: {
9475 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9476 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9477 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9478 .addImm(16)
9479 .add(Src0);
9480 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9481 .addImm(0xffff0000);
9482 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9483 .add(Src1)
9484 .addReg(ImmReg, RegState::Kill)
9485 .addReg(TmpReg, RegState::Kill);
9486 break;
9487 }
9488 default:
9489 llvm_unreachable("unhandled s_pack_* instruction");
9490 }
9491
9492 MachineOperand &Dest = Inst.getOperand(0);
9493 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9494 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9495}
9496
// Rewire all users of the SCC value defined by \p SCCDefInst: copies out of
// SCC are replaced with \p NewCond and deleted; any other SCC user has its
// SCC operand retargeted to \p NewCond (when valid) and is queued for VALU
// conversion. The scan stops at the next SCC definition.
void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
                                               MachineInstr &SCCDefInst,
                                               SIInstrWorklist &Worklist,
                                               Register NewCond) const {

  // Ensure that def inst defines SCC, which is still live.
  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
         !Op.isDead() && Op.getParent() == &SCCDefInst);
  SmallVector<MachineInstr *, 4> CopyToDelete;
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineInstr &MI : // Skip the def inst itself.
       make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
                  SCCDefInst.getParent()->end())) {
    // Check if SCC is used first.
    int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
    if (SCCIdx != -1) {
      if (MI.isCopy()) {
        // A plain copy out of SCC: forward NewCond to the copy's users and
        // remember the copy for deletion.
        MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
        Register DestReg = MI.getOperand(0).getReg();

        MRI.replaceRegWith(DestReg, NewCond);
        CopyToDelete.push_back(&MI);
      } else {

        if (NewCond.isValid())
          MI.getOperand(SCCIdx).setReg(NewCond);

        // The SCC user itself must be converted to a VALU instruction.
        Worklist.insert(&MI);
      }
    }
    // Exit if we find another SCC def.
    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
      break;
  }
  // Erasing inside the range-based loop would invalidate the iterator, so do
  // it afterwards.
  for (auto &Copy : CopyToDelete)
    Copy->eraseFromParent();
}
9535
9536// Instructions that use SCC may be converted to VALU instructions. When that
9537// happens, the SCC register is changed to VCC_LO. The instruction that defines
9538// SCC must be changed to an instruction that defines VCC. This function makes
9539// sure that the instruction that defines SCC is added to the moveToVALU
9540// worklist.
9541void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9542 SIInstrWorklist &Worklist) const {
9543 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9544 // then there is nothing to do because the defining instruction has been
9545 // converted to a VALU already. If SCC then that instruction needs to be
9546 // converted to a VALU.
9547 for (MachineInstr &MI :
9548 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9549 SCCUseInst->getParent()->rend())) {
9550 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9551 break;
9552 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9553 Worklist.insert(&MI);
9554 break;
9555 }
9556 }
9557}
9558
// Return the vector-equivalent register class the destination of \p Inst must
// use once the instruction is moved to the VALU (AGPR-equivalent for some
// opcodes with AGPR sources), or nullptr if no class change is needed.
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
    const MachineInstr &Inst) const {
  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);

  switch (Inst.getOpcode()) {
  // For target instructions, getOpRegClass just returns the virtual register
  // class associated with the operand, so we need to find an equivalent VGPR
  // register class in order to move the instruction to the VALU.
  case AMDGPU::COPY:
  case AMDGPU::PHI:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::INSERT_SUBREG:
  case AMDGPU::WQM:
  case AMDGPU::SOFT_WQM:
  case AMDGPU::STRICT_WWM:
  case AMDGPU::STRICT_WQM: {
    const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
    if (RI.isAGPRClass(SrcRC)) {
      // Destination is already in the AGPR file: nothing to change.
      if (RI.isAGPRClass(NewDstRC))
        return nullptr;

      switch (Inst.getOpcode()) {
      case AMDGPU::PHI:
      case AMDGPU::REG_SEQUENCE:
      case AMDGPU::INSERT_SUBREG:
        // These can stay in the AGPR bank, matching the source.
        NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
        break;
      default:
        NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
      }

      if (!NewDstRC)
        return nullptr;
    } else {
      // Already a VGPR class (or the special VReg_1): nothing to change.
      if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
        return nullptr;

      NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
      if (!NewDstRC)
        return nullptr;
    }

    return NewDstRC;
  }
  default:
    return NewDstRC;
  }
}
9607
9608// Find the one SGPR operand we are allowed to use.
9609Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9610 int OpIndices[3]) const {
9611 const MCInstrDesc &Desc = MI.getDesc();
9612
9613 // Find the one SGPR operand we are allowed to use.
9614 //
9615 // First we need to consider the instruction's operand requirements before
9616 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9617 // of VCC, but we are still bound by the constant bus requirement to only use
9618 // one.
9619 //
9620 // If the operand's class is an SGPR, we can never move it.
9621
9622 Register SGPRReg = findImplicitSGPRRead(MI);
9623 if (SGPRReg)
9624 return SGPRReg;
9625
9626 Register UsedSGPRs[3] = {Register()};
9627 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9628
9629 for (unsigned i = 0; i < 3; ++i) {
9630 int Idx = OpIndices[i];
9631 if (Idx == -1)
9632 break;
9633
9634 const MachineOperand &MO = MI.getOperand(Idx);
9635 if (!MO.isReg())
9636 continue;
9637
9638 // Is this operand statically required to be an SGPR based on the operand
9639 // constraints?
9640 const TargetRegisterClass *OpRC =
9641 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9642 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9643 if (IsRequiredSGPR)
9644 return MO.getReg();
9645
9646 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9647 Register Reg = MO.getReg();
9648 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9649 if (RI.isSGPRClass(RegRC))
9650 UsedSGPRs[i] = Reg;
9651 }
9652
9653 // We don't have a required SGPR operand, so we have a bit more freedom in
9654 // selecting operands to move.
9655
9656 // Try to select the most used SGPR. If an SGPR is equal to one of the
9657 // others, we choose that.
9658 //
9659 // e.g.
9660 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9661 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9662
9663 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9664 // prefer those.
9665
9666 if (UsedSGPRs[0]) {
9667 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9668 SGPRReg = UsedSGPRs[0];
9669 }
9670
9671 if (!SGPRReg && UsedSGPRs[1]) {
9672 if (UsedSGPRs[1] == UsedSGPRs[2])
9673 SGPRReg = UsedSGPRs[1];
9674 }
9675
9676 return SGPRReg;
9677}
9678
9680 AMDGPU::OpName OperandName) const {
9681 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9682 return nullptr;
9683
9684 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9685 if (Idx == -1)
9686 return nullptr;
9687
9688 return &MI.getOperand(Idx);
9689}
9690
9692 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9693 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9696 return (Format << 44) |
9697 (1ULL << 56) | // RESOURCE_LEVEL = 1
9698 (3ULL << 60); // OOB_SELECT = 3
9699 }
9700
9701 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9702 if (ST.isAmdHsaOS()) {
9703 // Set ATC = 1. GFX9 doesn't have this bit.
9704 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9705 RsrcDataFormat |= (1ULL << 56);
9706
9707 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9708 // BTW, it disables TC L2 and therefore decreases performance.
9709 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9710 RsrcDataFormat |= (2ULL << 59);
9711 }
9712
9713 return RsrcDataFormat;
9714}
9715
9719 0xffffffff; // Size;
9720
9721 // GFX9 doesn't have ELEMENT_SIZE.
9722 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9723 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9724 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9725 }
9726
9727 // IndexStride = 64 / 32.
9728 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9729 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9730
9731 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9732 // Clear them unless we want a huge stride.
9733 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9734 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9735 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9736
9737 return Rsrc23;
9738}
9739
9741 unsigned Opc = MI.getOpcode();
9742
9743 return isSMRD(Opc);
9744}
9745
9747 return get(Opc).mayLoad() &&
9748 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9749}
9750
9752 int &FrameIndex) const {
9753 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9754 if (!Addr || !Addr->isFI())
9755 return Register();
9756
9757 assert(!MI.memoperands_empty() &&
9758 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9759
9760 FrameIndex = Addr->getIndex();
9761 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9762}
9763
9765 int &FrameIndex) const {
9766 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9767 assert(Addr && Addr->isFI());
9768 FrameIndex = Addr->getIndex();
9769 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9770}
9771
9773 int &FrameIndex) const {
9774 if (!MI.mayLoad())
9775 return Register();
9776
9777 if (isMUBUF(MI) || isVGPRSpill(MI))
9778 return isStackAccess(MI, FrameIndex);
9779
9780 if (isSGPRSpill(MI))
9781 return isSGPRStackAccess(MI, FrameIndex);
9782
9783 return Register();
9784}
9785
9787 int &FrameIndex) const {
9788 if (!MI.mayStore())
9789 return Register();
9790
9791 if (isMUBUF(MI) || isVGPRSpill(MI))
9792 return isStackAccess(MI, FrameIndex);
9793
9794 if (isSGPRSpill(MI))
9795 return isSGPRStackAccess(MI, FrameIndex);
9796
9797 return Register();
9798}
9799
9801 unsigned Size = 0;
9803 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9804 while (++I != E && I->isInsideBundle()) {
9805 assert(!I->isBundle() && "No nested bundle!");
9807 }
9808
9809 return Size;
9810}
9811
9813 unsigned Opc = MI.getOpcode();
9815 unsigned DescSize = Desc.getSize();
9816
9817 // If we have a definitive size, we can use it. Otherwise we need to inspect
9818 // the operands to know the size.
9819 if (isFixedSize(MI)) {
9820 unsigned Size = DescSize;
9821
9822 // If we hit the buggy offset, an extra nop will be inserted in MC so
9823 // estimate the worst case.
9824 if (MI.isBranch() && ST.hasOffset3fBug())
9825 Size += 4;
9826
9827 return Size;
9828 }
9829
9830 // Instructions may have a 32-bit literal encoded after them. Check
9831 // operands that could ever be literals.
9832 if (isVALU(MI) || isSALU(MI)) {
9833 if (isDPP(MI))
9834 return DescSize;
9835 bool HasLiteral = false;
9836 unsigned LiteralSize = 4;
9837 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9838 const MachineOperand &Op = MI.getOperand(I);
9839 const MCOperandInfo &OpInfo = Desc.operands()[I];
9840 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9841 HasLiteral = true;
9842 if (ST.has64BitLiterals()) {
9843 switch (OpInfo.OperandType) {
9844 default:
9845 break;
9847 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9848 LiteralSize = 8;
9849 break;
9851 // A 32-bit literal is only valid when the value fits in BOTH signed
9852 // and unsigned 32-bit ranges [0, 2^31-1], matching the MC code
9853 // emitter's getLit64Encoding logic. This is because of the lack of
9854 // abilility to tell signedness of the literal, therefore we need to
9855 // be conservative and assume values outside this range require a
9856 // 64-bit literal encoding (8 bytes).
9857 if (!Op.isImm() || !isInt<32>(Op.getImm()) ||
9858 !isUInt<32>(Op.getImm()))
9859 LiteralSize = 8;
9860 break;
9861 }
9862 }
9863 break;
9864 }
9865 }
9866 return HasLiteral ? DescSize + LiteralSize : DescSize;
9867 }
9868
9869 // Check whether we have extra NSA words.
9870 if (isMIMG(MI)) {
9871 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9872 if (VAddr0Idx < 0)
9873 return 8;
9874
9875 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9876 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9877 }
9878
9879 switch (Opc) {
9880 case TargetOpcode::BUNDLE:
9881 return getInstBundleSize(MI);
9882 case TargetOpcode::INLINEASM:
9883 case TargetOpcode::INLINEASM_BR: {
9884 const MachineFunction *MF = MI.getMF();
9885 const char *AsmStr = MI.getOperand(0).getSymbolName();
9886 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9887 }
9888 default:
9889 if (MI.isMetaInstruction())
9890 return 0;
9891
9892 // If D16 Pseudo inst, get correct MC code size
9893 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9894 if (D16Info) {
9895 // Assume d16_lo/hi inst are always in same size
9896 unsigned LoInstOpcode = D16Info->LoOp;
9897 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9898 DescSize = Desc.getSize();
9899 }
9900
9901 // If FMA Pseudo inst, get correct MC code size
9902 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9903 // All potential lowerings are the same size; arbitrarily pick one.
9904 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9905 DescSize = Desc.getSize();
9906 }
9907
9908 return DescSize;
9909 }
9910}
9911
9913 if (!isFLAT(MI))
9914 return false;
9915
9916 if (MI.memoperands_empty())
9917 return true;
9918
9919 for (const MachineMemOperand *MMO : MI.memoperands()) {
9920 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9921 return true;
9922 }
9923 return false;
9924}
9925
9928 static const std::pair<int, const char *> TargetIndices[] = {
9929 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9930 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9931 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9932 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9933 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9934 return ArrayRef(TargetIndices);
9935}
9936
9937/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9938/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9944
9945/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9946/// pass.
9952
9953// Called during:
9954// - pre-RA scheduling and post-RA scheduling
9957 const ScheduleDAGMI *DAG) const {
9958 // Borrowed from Arm Target
9959 // We would like to restrict this hazard recognizer to only
9960 // post-RA scheduling; we can tell that we're post-RA because we don't
9961 // track VRegLiveness.
9962 if (!DAG->hasVRegLiveness())
9963 return new GCNHazardRecognizer(DAG->MF);
9965}
9966
9967std::pair<unsigned, unsigned>
9969 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9970}
9971
9974 static const std::pair<unsigned, const char *> TargetFlags[] = {
9975 {MO_GOTPCREL, "amdgpu-gotprel"},
9976 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9977 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9978 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9979 {MO_REL32_LO, "amdgpu-rel32-lo"},
9980 {MO_REL32_HI, "amdgpu-rel32-hi"},
9981 {MO_REL64, "amdgpu-rel64"},
9982 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9983 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9984 {MO_ABS64, "amdgpu-abs64"},
9985 };
9986
9987 return ArrayRef(TargetFlags);
9988}
9989
9992 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9993 {
9994 {MONoClobber, "amdgpu-noclobber"},
9995 {MOLastUse, "amdgpu-last-use"},
9996 {MOCooperative, "amdgpu-cooperative"},
9997 {MOThreadPrivate, "amdgpu-thread-private"},
9998 };
9999
10000 return ArrayRef(TargetFlags);
10001}
10002
10004 const MachineFunction &MF) const {
10006 assert(SrcReg.isVirtual());
10007 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
10008 return AMDGPU::WWM_COPY;
10009
10010 return AMDGPU::COPY;
10011}
10012
10014 uint32_t Opcode = MI.getOpcode();
10015 // Check if it is SGPR spill or wwm-register spill Opcode.
10016 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
10017 return true;
10018
10019 const MachineFunction *MF = MI.getMF();
10020 const MachineRegisterInfo &MRI = MF->getRegInfo();
10022
10023 // See if this is Liverange split instruction inserted for SGPR or
10024 // wwm-register. The implicit def inserted for wwm-registers should also be
10025 // included as they can appear at the bb begin.
10026 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
10027 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
10028 return false;
10029
10030 Register Reg = MI.getOperand(0).getReg();
10031 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
10032 return IsLRSplitInst;
10033
10034 return MFI->isWWMReg(Reg);
10035}
10036
10038 Register Reg) const {
10039 // We need to handle instructions which may be inserted during register
10040 // allocation to handle the prolog. The initial prolog instruction may have
10041 // been separated from the start of the block by spills and copies inserted
10042 // needed by the prolog. However, the insertions for scalar registers can
10043 // always be placed at the BB top as they are independent of the exec mask
10044 // value.
10045 bool IsNullOrVectorRegister = true;
10046 if (Reg) {
10047 const MachineFunction *MF = MI.getMF();
10048 const MachineRegisterInfo &MRI = MF->getRegInfo();
10049 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
10050 }
10051
10052 return IsNullOrVectorRegister &&
10053 (canAddToBBProlog(MI) ||
10054 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10055 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
10056}
10057
10061 const DebugLoc &DL,
10062 Register DestReg) const {
10063 if (ST.hasAddNoCarryInsts())
10064 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
10065
10066 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10067 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
10068 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
10069
10070 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10071 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10072}
10073
10076 const DebugLoc &DL,
10077 Register DestReg,
10078 RegScavenger &RS) const {
10079 if (ST.hasAddNoCarryInsts())
10080 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
10081
10082 // If available, prefer to use vcc.
10083 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
10084 ? Register(RI.getVCC())
10085 : RS.scavengeRegisterBackwards(
10086 *RI.getBoolRC(), I, /* RestoreAfter */ false,
10087 0, /* AllowSpill */ false);
10088
10089 // TODO: Users need to deal with this.
10090 if (!UnusedCarry.isValid())
10091 return MachineInstrBuilder();
10092
10093 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10094 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10095}
10096
10097bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10098 switch (Opcode) {
10099 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10100 case AMDGPU::SI_KILL_I1_TERMINATOR:
10101 return true;
10102 default:
10103 return false;
10104 }
10105}
10106
10108 switch (Opcode) {
10109 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10110 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10111 case AMDGPU::SI_KILL_I1_PSEUDO:
10112 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10113 default:
10114 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10115 }
10116}
10117
10118bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10119 return Imm <= getMaxMUBUFImmOffset(ST);
10120}
10121
10123 // GFX12 field is non-negative 24-bit signed byte offset.
10124 const unsigned OffsetBits =
10125 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10126 return (1 << OffsetBits) - 1;
10127}
10128
10130 if (!ST.isWave32())
10131 return;
10132
10133 if (MI.isInlineAsm())
10134 return;
10135
10136 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10137 return;
10138
10139 for (auto &Op : MI.implicit_operands()) {
10140 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10141 Op.setReg(AMDGPU::VCC_LO);
10142 }
10143}
10144
10146 if (!isSMRD(MI))
10147 return false;
10148
10149 // Check that it is using a buffer resource.
10150 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10151 if (Idx == -1) // e.g. s_memtime
10152 return false;
10153
10154 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10155 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10156}
10157
10158// Given Imm, split it into the values to put into the SOffset and ImmOffset
10159// fields in an MUBUF instruction. Return false if it is not possible (due to a
10160// hardware bug needing a workaround).
10161//
10162// The required alignment ensures that individual address components remain
10163// aligned if they are aligned to begin with. It also ensures that additional
10164// offsets within the given alignment can be added to the resulting ImmOffset.
10166 uint32_t &ImmOffset, Align Alignment) const {
10167 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10168 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10169 uint32_t Overflow = 0;
10170
10171 if (Imm > MaxImm) {
10172 if (Imm <= MaxImm + 64) {
10173 // Use an SOffset inline constant for 4..64
10174 Overflow = Imm - MaxImm;
10175 Imm = MaxImm;
10176 } else {
10177 // Try to keep the same value in SOffset for adjacent loads, so that
10178 // the corresponding register contents can be re-used.
10179 //
10180 // Load values with all low-bits (except for alignment bits) set into
10181 // SOffset, so that a larger range of values can be covered using
10182 // s_movk_i32.
10183 //
10184 // Atomic operations fail to work correctly when individual address
10185 // components are unaligned, even if their sum is aligned.
10186 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10187 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10188 Imm = Low;
10189 Overflow = High - Alignment.value();
10190 }
10191 }
10192
10193 if (Overflow > 0) {
10194 // There is a hardware bug in SI and CI which prevents address clamping in
10195 // MUBUF instructions from working correctly with SOffsets. The immediate
10196 // offset is unaffected.
10197 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10198 return false;
10199
10200 // It is not possible to set immediate in SOffset field on some targets.
10201 if (ST.hasRestrictedSOffset())
10202 return false;
10203 }
10204
10205 ImmOffset = Imm;
10206 SOffset = Overflow;
10207 return true;
10208}
10209
10210// Depending on the used address space and instructions, some immediate offsets
10211// are allowed and some are not.
10212// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10213// scratch instruction offsets can also be negative. On GFX12, offsets can be
10214// negative for all variants.
10215//
10216// There are several bugs related to these offsets:
10217// On gfx10.1, flat instructions that go into the global address space cannot
10218// use an offset.
10219//
10220// For scratch instructions, the address can be either an SGPR or a VGPR.
10221// The following offsets can be used, depending on the architecture (x means
10222// cannot be used):
10223// +----------------------------+------+------+
10224// | Address-Mode | SGPR | VGPR |
10225// +----------------------------+------+------+
10226// | gfx9 | | |
10227// | negative, 4-aligned offset | x | ok |
10228// | negative, unaligned offset | x | ok |
10229// +----------------------------+------+------+
10230// | gfx10 | | |
10231// | negative, 4-aligned offset | ok | ok |
10232// | negative, unaligned offset | ok | x |
10233// +----------------------------+------+------+
10234// | gfx10.3 | | |
10235// | negative, 4-aligned offset | ok | ok |
10236// | negative, unaligned offset | ok | ok |
10237// +----------------------------+------+------+
10238//
10239// This function ignores the addressing mode, so if an offset cannot be used in
10240// one addressing mode, it is considered illegal.
10241bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10242 uint64_t FlatVariant) const {
10243 // TODO: Should 0 be special cased?
10244 if (!ST.hasFlatInstOffsets())
10245 return false;
10246
10247 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10248 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10249 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10250 return false;
10251
10252 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10253 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10254 (Offset % 4) != 0) {
10255 return false;
10256 }
10257
10258 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10259 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10260 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10261}
10262
10263// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10264std::pair<int64_t, int64_t>
10265SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10266 uint64_t FlatVariant) const {
10267 int64_t RemainderOffset = COffsetVal;
10268 int64_t ImmField = 0;
10269
10270 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10271 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10272
10273 if (AllowNegative) {
10274 // Use signed division by a power of two to truncate towards 0.
10275 int64_t D = 1LL << NumBits;
10276 RemainderOffset = (COffsetVal / D) * D;
10277 ImmField = COffsetVal - RemainderOffset;
10278
10279 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10280 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10281 (ImmField % 4) != 0) {
10282 // Make ImmField a multiple of 4
10283 RemainderOffset += ImmField % 4;
10284 ImmField -= ImmField % 4;
10285 }
10286 } else if (COffsetVal >= 0) {
10287 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10288 RemainderOffset = COffsetVal - ImmField;
10289 }
10290
10291 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10292 assert(RemainderOffset + ImmField == COffsetVal);
10293 return {ImmField, RemainderOffset};
10294}
10295
10297 if (ST.hasNegativeScratchOffsetBug() &&
10298 FlatVariant == SIInstrFlags::FlatScratch)
10299 return false;
10300
10301 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10302}
10303
10304static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10305 switch (ST.getGeneration()) {
10306 default:
10307 break;
10310 return SIEncodingFamily::SI;
10313 return SIEncodingFamily::VI;
10317 return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
10320 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10324 }
10325 llvm_unreachable("Unknown subtarget generation!");
10326}
10327
10328bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10329 switch(MCOp) {
10330 // These opcodes use indirect register addressing so
10331 // they need special handling by codegen (currently missing).
10332 // Therefore it is too risky to allow these opcodes
10333 // to be selected by dpp combiner or sdwa peepholer.
10334 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10335 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10336 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10337 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10338 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10339 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10340 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10341 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10342 return true;
10343 default:
10344 return false;
10345 }
10346}
10347
// Expand to case labels for all five encoding variants of an opcode
// (dpp, e32, e64, e64_dpp, sdwa).
#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
  case OPCODE##_dpp: \
  case OPCODE##_e32: \
  case OPCODE##_e64: \
  case OPCODE##_e64_dpp: \
  case OPCODE##_sdwa:

// Return true if \p Opcode is one of the pseudos whose MC name differs on
// GFX9; pseudoToMCOpcode consults this to pick a different encoding family
// for GFX9.
static bool isRenamedInGFX9(int Opcode) {
  switch (Opcode) {
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
  //
  case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
  case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
  case AMDGPU::V_FMA_F16_gfx9_e64:
  case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
  case AMDGPU::V_INTERP_P2_F16:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
    return true;
  default:
    return false;
  }
}
10380
// Translate pseudo-instruction \p Opcode into the real MC opcode for the
// current subtarget's encoding family.  Returns \p Opcode unchanged when it
// is already a native instruction (getMCOpcode yields -1), and returns -1
// when the instruction has no encoding on this subtarget or is an
// assembler-only opcode.
// NOTE(review): this doxygen-extracted listing dropped several hyperlinked
// lines (the Gen re-assignments for GFX9/GFX80/SDWA and the remap-table
// lookups after the GFX11_7/GFX1250/GFX940 checks); surviving lines are
// reproduced verbatim — consult upstream SIInstrInfo.cpp before editing.
10381 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10382 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10383 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10384
10385 unsigned Gen = subtargetEncodingFamily(ST);
10386
// On GFX9, renamed opcodes need the dedicated GFX9 encoding family.
// NOTE(review): the then-branch assignment on the next source line was
// dropped by extraction (presumably `Gen = SIEncodingFamily::GFX9;`).
10387 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10389
10390 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10391 // subtarget has UnpackedD16VMem feature.
10392 // TODO: remove this when we discard GFX80 encoding.
10393 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10395
// SDWA instructions select a generation-specific SDWA family.
// NOTE(review): the per-generation assignments inside this switch were
// dropped by extraction.
10396 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10397 switch (ST.getGeneration()) {
10398 default:
10400 break;
10403 break;
10406 break;
10407 }
10408 }
10409
// MFMA pseudos may have an early-clobber variant that must be encoded
// instead; swap Opcode before the table lookup.
10410 if (isMAI(Opcode)) {
10411 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10412 if (MFMAOp != -1)
10413 Opcode = MFMAOp;
10414 }
10415
10416 int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10417
// Fall back to related encoding families when the primary lookup failed.
10418 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())
10420
10421 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
10423
10424 // -1 means that Opcode is already a native instruction.
10425 if (MCOp == -1)
10426 return Opcode;
10427
// GFX90A/GFX940 use remap tables to pick their own variants of shared
// opcodes; the result wins only if a remapping exists.
10428 if (ST.hasGFX90AInsts()) {
10429 uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
10430 if (ST.hasGFX940Insts())
10432 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10434 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10436 if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
10437 MCOp = NMCOp;
10438 }
10439
10440 // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
10441 // encoding in the given subtarget generation.
10442 if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
10443 return -1;
10444
10445 if (isAsmOnlyOpcode(MCOp))
10446 return -1;
10447
10448 return MCOp;
10449}
10450
10451static
10453 assert(RegOpnd.isReg());
10454 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10455 getRegSubRegPair(RegOpnd);
10456}
10457
10460 assert(MI.isRegSequence());
10461 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10462 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10463 auto &RegOp = MI.getOperand(1 + 2 * I);
10464 return getRegOrUndef(RegOp);
10465 }
10467}
10468
10469// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10470// Following a subreg of reg:subreg isn't supported
10473 if (!RSR.SubReg)
10474 return false;
10475 switch (MI.getOpcode()) {
10476 default: break;
10477 case AMDGPU::REG_SEQUENCE:
10478 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10479 return true;
10480 // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg
10481 case AMDGPU::INSERT_SUBREG:
10482 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10483 // inserted the subreg we're looking for
10484 RSR = getRegOrUndef(MI.getOperand(2));
10485 else { // the subreg in the rest of the reg
10486 auto R1 = getRegOrUndef(MI.getOperand(1));
10487 if (R1.SubReg) // subreg of subreg isn't supported
10488 return false;
10489 RSR.Reg = R1.Reg;
10490 }
10491 return true;
10492 }
10493 return false;
10494}
10495
10497 const MachineRegisterInfo &MRI) {
10498 assert(MRI.isSSA());
10499 if (!P.Reg.isVirtual())
10500 return nullptr;
10501
10502 auto RSR = P;
10503 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10504 while (auto *MI = DefInst) {
10505 DefInst = nullptr;
10506 switch (MI->getOpcode()) {
10507 case AMDGPU::COPY:
10508 case AMDGPU::V_MOV_B32_e32: {
10509 auto &Op1 = MI->getOperand(1);
10510 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10511 if (Op1.isUndef())
10512 return nullptr;
10513 RSR = getRegSubRegPair(Op1);
10514 DefInst = MRI.getVRegDef(RSR.Reg);
10515 }
10516 break;
10517 }
10518 default:
10519 if (followSubRegDef(*MI, RSR)) {
10520 if (!RSR.Reg)
10521 return nullptr;
10522 DefInst = MRI.getVRegDef(RSR.Reg);
10523 }
10524 }
10525 if (!DefInst)
10526 return MI;
10527 }
10528 return nullptr;
10529}
10530
10532 Register VReg,
10533 const MachineInstr &DefMI,
10534 const MachineInstr &UseMI) {
10535 assert(MRI.isSSA() && "Must be run on SSA");
10536
10537 auto *TRI = MRI.getTargetRegisterInfo();
10538 auto *DefBB = DefMI.getParent();
10539
10540 // Don't bother searching between blocks, although it is possible this block
10541 // doesn't modify exec.
10542 if (UseMI.getParent() != DefBB)
10543 return true;
10544
10545 const int MaxInstScan = 20;
10546 int NumInst = 0;
10547
10548 // Stop scan at the use.
10549 auto E = UseMI.getIterator();
10550 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10551 if (I->isDebugInstr())
10552 continue;
10553
10554 if (++NumInst > MaxInstScan)
10555 return true;
10556
10557 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10558 return true;
10559 }
10560
10561 return false;
10562}
10563
10565 Register VReg,
10566 const MachineInstr &DefMI) {
10567 assert(MRI.isSSA() && "Must be run on SSA");
10568
10569 auto *TRI = MRI.getTargetRegisterInfo();
10570 auto *DefBB = DefMI.getParent();
10571
10572 const int MaxUseScan = 10;
10573 int NumUse = 0;
10574
10575 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10576 auto &UseInst = *Use.getParent();
10577 // Don't bother searching between blocks, although it is possible this block
10578 // doesn't modify exec.
10579 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10580 return true;
10581
10582 if (++NumUse > MaxUseScan)
10583 return true;
10584 }
10585
10586 if (NumUse == 0)
10587 return false;
10588
10589 const int MaxInstScan = 20;
10590 int NumInst = 0;
10591
10592 // Stop scan when we have seen all the uses.
10593 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10594 assert(I != DefBB->end());
10595
10596 if (I->isDebugInstr())
10597 continue;
10598
10599 if (++NumInst > MaxInstScan)
10600 return true;
10601
10602 for (const MachineOperand &Op : I->operands()) {
10603 // We don't check reg masks here as they're used only on calls:
10604 // 1. EXEC is only considered const within one BB
10605 // 2. Call should be a terminator instruction if present in a BB
10606
10607 if (!Op.isReg())
10608 continue;
10609
10610 Register Reg = Op.getReg();
10611 if (Op.isUse()) {
10612 if (Reg == VReg && --NumUse == 0)
10613 return false;
10614 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10615 return true;
10616 }
10617 }
10618}
10619
10622 const DebugLoc &DL, Register Src, Register Dst) const {
10623 auto Cur = MBB.begin();
10624 if (Cur != MBB.end())
10625 do {
10626 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10627 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10628 ++Cur;
10629 } while (Cur != MBB.end() && Cur != LastPHIIt);
10630
10631 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10632 Dst);
10633}
10634
10637 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10638 if (InsPt != MBB.end() &&
10639 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10640 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10641 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10642 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10643 InsPt++;
10644 return BuildMI(MBB, InsPt, DL,
10645 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10646 .addReg(Src, {}, SrcSubReg)
10647 .addReg(AMDGPU::EXEC, RegState::Implicit);
10648 }
10649 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10650 Dst);
10651}
10652
10653bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10654
10657 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10658 VirtRegMap *VRM) const {
10659 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10660 //
10661 // %0:sreg_32 = COPY $m0
10662 //
10663 // We explicitly chose SReg_32 for the virtual register so such a copy might
10664 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10665 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10666 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10667 // TargetInstrInfo::foldMemoryOperand() is going to try.
10668 // A similar issue also exists with spilling and reloading $exec registers.
10669 //
10670 // To prevent that, constrain the %0 register class here.
10671 if (isFullCopyInstr(MI)) {
10672 Register DstReg = MI.getOperand(0).getReg();
10673 Register SrcReg = MI.getOperand(1).getReg();
10674 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10675 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10676 MachineRegisterInfo &MRI = MF.getRegInfo();
10677 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10678 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10679 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10680 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10681 return nullptr;
10682 }
10683 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10684 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10685 return nullptr;
10686 }
10687 }
10688 }
10689
10690 return nullptr;
10691}
10692
10694 const MachineInstr &MI,
10695 unsigned *PredCost) const {
10696 if (MI.isBundle()) {
10698 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10699 unsigned Lat = 0, Count = 0;
10700 for (++I; I != E && I->isBundledWithPred(); ++I) {
10701 ++Count;
10702 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10703 }
10704 return Lat + Count - 1;
10705 }
10706
10707 return SchedModel.computeInstrLatency(&MI);
10708}
10709
10710const MachineOperand &
10712 if (const MachineOperand *CallAddrOp =
10713 getNamedOperand(MI, AMDGPU::OpName::src0))
10714 return *CallAddrOp;
10716}
10717
10720 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10721 unsigned Opcode = MI.getOpcode();
10722
10723 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10724 Register Dst = MI.getOperand(0).getReg();
10725 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10726 : MI.getOperand(1).getReg();
10727 LLT DstTy = MRI.getType(Dst);
10728 LLT SrcTy = MRI.getType(Src);
10729 unsigned DstAS = DstTy.getAddressSpace();
10730 unsigned SrcAS = SrcTy.getAddressSpace();
10731 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10732 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10733 ST.hasGloballyAddressableScratch()
10736 };
10737
10738 // If the target supports globally addressable scratch, the mapping from
10739 // scratch memory to the flat aperture changes therefore an address space cast
10740 // is no longer uniform.
10741 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10742 return HandleAddrSpaceCast(MI);
10743
10744 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10745 auto IID = GI->getIntrinsicID();
10750
10751 switch (IID) {
10752 case Intrinsic::amdgcn_addrspacecast_nonnull:
10753 return HandleAddrSpaceCast(MI);
10754 case Intrinsic::amdgcn_if:
10755 case Intrinsic::amdgcn_else:
10756 // FIXME: Uniform if second result
10757 break;
10758 }
10759
10761 }
10762
10763 // Loads from the private and flat address spaces are divergent, because
10764 // threads can execute the load instruction with the same inputs and get
10765 // different results.
10766 //
10767 // All other loads are not divergent, because if threads issue loads with the
10768 // same arguments, they will always get the same result.
10769 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10770 Opcode == AMDGPU::G_SEXTLOAD) {
10771 if (MI.memoperands_empty())
10772 return InstructionUniformity::NeverUniform; // conservative assumption
10773
10774 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10775 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10776 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10777 })) {
10778 // At least one MMO in a non-global address space.
10780 }
10782 }
10783
10784 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10785 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10786 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10787 AMDGPU::isGenericAtomic(Opcode)) {
10789 }
10791}
10792
10794 if (!Formatter)
10795 Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
10796 return Formatter.get();
10797}
10798
10801
10802 if (isNeverUniform(MI))
10804
10805 unsigned opcode = MI.getOpcode();
10806 if (opcode == AMDGPU::V_READLANE_B32 ||
10807 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10808 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10810
10811 if (isCopyInstr(MI)) {
10812 const MachineOperand &srcOp = MI.getOperand(1);
10813 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10814 const TargetRegisterClass *regClass =
10815 RI.getPhysRegBaseClass(srcOp.getReg());
10816 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10818 }
10820 }
10821
10822 // GMIR handling
10823 if (MI.isPreISelOpcode())
10825
10826 // Atomics are divergent because they are executed sequentially: when an
10827 // atomic operation refers to the same address in each thread, then each
10828 // thread after the first sees the value written by the previous thread as
10829 // original value.
10830
10831 if (isAtomic(MI))
10833
10834 // Loads from the private and flat address spaces are divergent, because
10835 // threads can execute the load instruction with the same inputs and get
10836 // different results.
10837 if (isFLAT(MI) && MI.mayLoad()) {
10838 if (MI.memoperands_empty())
10839 return InstructionUniformity::NeverUniform; // conservative assumption
10840
10841 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10842 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10843 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10844 })) {
10845 // At least one MMO in a non-global address space.
10847 }
10848
10850 }
10851
10852 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10853 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10854
10855 // FIXME: It's conceptually broken to report this for an instruction, and not
10856 // a specific def operand. For inline asm in particular, there could be mixed
10857 // uniform and divergent results.
10858 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10859 const MachineOperand &SrcOp = MI.getOperand(I);
10860 if (!SrcOp.isReg())
10861 continue;
10862
10863 Register Reg = SrcOp.getReg();
10864 if (!Reg || !SrcOp.readsReg())
10865 continue;
10866
10867 // If RegBank is null, this is unassigned or an unallocatable special
10868 // register, which are all scalars.
10869 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10870 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10872 }
10873
10874 // TODO: Uniformity check condtions above can be rearranged for more
10875 // redability
10876
10877 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10878 // currently turned into no-op COPYs by SelectionDAG ISel and are
10879 // therefore no longer recognizable.
10880
10882}
10883
10885 switch (MF.getFunction().getCallingConv()) {
10887 return 1;
10889 return 2;
10891 return 3;
10895 const Function &F = MF.getFunction();
10896 F.getContext().diagnose(DiagnosticInfoUnsupported(
10897 F, "ds_ordered_count unsupported for this calling conv"));
10898 [[fallthrough]];
10899 }
10902 case CallingConv::C:
10903 case CallingConv::Fast:
10904 default:
10905 // Assume other calling conventions are various compute callable functions
10906 return 0;
10907 }
10908}
10909
10911 Register &SrcReg2, int64_t &CmpMask,
10912 int64_t &CmpValue) const {
10913 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10914 return false;
10915
10916 switch (MI.getOpcode()) {
10917 default:
10918 break;
10919 case AMDGPU::S_CMP_EQ_U32:
10920 case AMDGPU::S_CMP_EQ_I32:
10921 case AMDGPU::S_CMP_LG_U32:
10922 case AMDGPU::S_CMP_LG_I32:
10923 case AMDGPU::S_CMP_LT_U32:
10924 case AMDGPU::S_CMP_LT_I32:
10925 case AMDGPU::S_CMP_GT_U32:
10926 case AMDGPU::S_CMP_GT_I32:
10927 case AMDGPU::S_CMP_LE_U32:
10928 case AMDGPU::S_CMP_LE_I32:
10929 case AMDGPU::S_CMP_GE_U32:
10930 case AMDGPU::S_CMP_GE_I32:
10931 case AMDGPU::S_CMP_EQ_U64:
10932 case AMDGPU::S_CMP_LG_U64:
10933 SrcReg = MI.getOperand(0).getReg();
10934 if (MI.getOperand(1).isReg()) {
10935 if (MI.getOperand(1).getSubReg())
10936 return false;
10937 SrcReg2 = MI.getOperand(1).getReg();
10938 CmpValue = 0;
10939 } else if (MI.getOperand(1).isImm()) {
10940 SrcReg2 = Register();
10941 CmpValue = MI.getOperand(1).getImm();
10942 } else {
10943 return false;
10944 }
10945 CmpMask = ~0;
10946 return true;
10947 case AMDGPU::S_CMPK_EQ_U32:
10948 case AMDGPU::S_CMPK_EQ_I32:
10949 case AMDGPU::S_CMPK_LG_U32:
10950 case AMDGPU::S_CMPK_LG_I32:
10951 case AMDGPU::S_CMPK_LT_U32:
10952 case AMDGPU::S_CMPK_LT_I32:
10953 case AMDGPU::S_CMPK_GT_U32:
10954 case AMDGPU::S_CMPK_GT_I32:
10955 case AMDGPU::S_CMPK_LE_U32:
10956 case AMDGPU::S_CMPK_LE_I32:
10957 case AMDGPU::S_CMPK_GE_U32:
10958 case AMDGPU::S_CMPK_GE_I32:
10959 SrcReg = MI.getOperand(0).getReg();
10960 SrcReg2 = Register();
10961 CmpValue = MI.getOperand(1).getImm();
10962 CmpMask = ~0;
10963 return true;
10964 }
10965
10966 return false;
10967}
10968
10970 for (MachineBasicBlock *S : MBB->successors()) {
10971 if (S->isLiveIn(AMDGPU::SCC))
10972 return false;
10973 }
10974 return true;
10975}
10976
10977// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
10978// (incoming SCC) = !(SCC defined by SCCDef).
10979// Return true if all uses can be re-written, false otherwise.
10980bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
10981 MachineBasicBlock *MBB = SCCDef->getParent();
10982 SmallVector<MachineInstr *> InvertInstr;
10983 bool SCCIsDead = false;
10984
10985 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
10986 constexpr unsigned ScanLimit = 12;
10987 unsigned Count = 0;
10988 for (MachineInstr &MI :
10989 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
10990 if (++Count > ScanLimit)
10991 return false;
10992 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
10993 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
10994 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
10995 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10996 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
10997 InvertInstr.push_back(&MI);
10998 else
10999 return false;
11000 }
11001 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
11002 SCCIsDead = true;
11003 break;
11004 }
11005 }
11006 if (!SCCIsDead && isSCCDeadOnExit(MBB))
11007 SCCIsDead = true;
11008
11009 // SCC may have more uses. Can't invert all of them.
11010 if (!SCCIsDead)
11011 return false;
11012
11013 // Invert uses
11014 for (MachineInstr *MI : InvertInstr) {
11015 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
11016 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
11017 swapOperands(*MI);
11018 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11019 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
11020 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
11021 ? AMDGPU::S_CBRANCH_SCC1
11022 : AMDGPU::S_CBRANCH_SCC0));
11023 } else {
11024 llvm_unreachable("SCC used but no inversion handling");
11025 }
11026 }
11027 return true;
11028}
11029
11030// SCC is already valid after SCCValid.
11031// SCCRedefine will redefine SCC to the same value already available after
11032// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
11033// update kill/dead flags if necessary.
11034bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
11035 bool NeedInversion) const {
11036 MachineInstr *KillsSCC = nullptr;
11037 if (SCCValid->getParent() != SCCRedefine->getParent())
11038 return false;
11039 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
11040 SCCRedefine->getIterator())) {
11041 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
11042 return false;
11043 if (MI.killsRegister(AMDGPU::SCC, &RI))
11044 KillsSCC = &MI;
11045 }
11046 if (NeedInversion && !invertSCCUse(SCCRedefine))
11047 return false;
11048 if (MachineOperand *SccDef =
11049 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
11050 SccDef->setIsDead(false);
11051 if (KillsSCC)
11052 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
11053 SCCRedefine->eraseFromParent();
11054 return true;
11055}
11056
11057static bool foldableSelect(const MachineInstr &Def) {
11058 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
11059 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
11060 return false;
11061 bool Op1IsNonZeroImm =
11062 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
11063 bool Op2IsZeroImm =
11064 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
11065 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
11066 return false;
11067 return true;
11068}
11069
11070static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11071 unsigned &NewDefOpc) {
11072 // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0.
11073 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
11074 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11075 Def.getOpcode() != AMDGPU::S_ADD_U32)
11076 return false;
11077 const MachineOperand &AddSrc1 = Def.getOperand(1);
11078 const MachineOperand &AddSrc2 = Def.getOperand(2);
11079 int64_t addend;
11080
11081 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11082 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11083 (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
11084 (!getFoldableImm(&AddSrc2, addend) || addend != 1))
11085 return false;
11086
11087 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11088 const MachineOperand *SccDef =
11089 Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
11090 if (!SccDef->isDead())
11091 return false;
11092 NewDefOpc = AMDGPU::S_ADD_U32;
11093 }
11094 NeedInversion = !NeedInversion;
11095 return true;
11096}
11097
11099 Register SrcReg2, int64_t CmpMask,
11100 int64_t CmpValue,
11101 const MachineRegisterInfo *MRI) const {
11102 if (!SrcReg || SrcReg.isPhysical())
11103 return false;
11104
11105 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
11106 return false;
11107
11108 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11109 this](bool NeedInversion) -> bool {
11110 if (CmpValue != 0)
11111 return false;
11112
11113 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11114 if (!Def)
11115 return false;
11116
11117 // For S_OP that set SCC = DST!=0, do the transformation
11118 //
11119 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11120 //
11121 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11122 // do the transformation:
11123 //
11124 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11125 //
11126 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11127 // for S_CSELECT* already has the same value that will be calculated by
11128 // s_cmp_lg_*
11129 //
11130 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11131 // (non-zero imm), 0)
11132
11133 unsigned NewDefOpc = Def->getOpcode();
11134 if (!setsSCCIfResultIsNonZero(*Def) &&
11135 !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
11136 !foldableSelect(*Def))
11137 return false;
11138
11139 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11140 return false;
11141
11142 if (NewDefOpc != Def->getOpcode())
11143 Def->setDesc(get(NewDefOpc));
11144
11145 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11146 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11147 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11148 // sX = s_cselect_b64 (non-zero imm), 0
11149 // sLo = copy sX.sub0
11150 // sHi = copy sX.sub1
11151 // sY = s_or_b32 sLo, sHi
11152 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11153 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11154 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11155 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11156 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11157 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11158 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
11159 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11160 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11161 Def2->getOperand(1).isReg() &&
11162 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11163 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11164 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11165 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
11166 if (Select && foldableSelect(*Select))
11167 optimizeSCC(Select, Def, /*NeedInversion=*/false);
11168 }
11169 }
11170 }
11171 return true;
11172 };
11173
11174 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11175 this](int64_t ExpectedValue, unsigned SrcSize,
11176 bool IsReversible, bool IsSigned) -> bool {
11177 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11178 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11179 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11180 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11181 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11182 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11183 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11184 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11185 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11186 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11187 //
11188 // Signed ge/gt are not used for the sign bit.
11189 //
11190 // If result of the AND is unused except in the compare:
11191 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11192 //
11193 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11194 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11195 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11196 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11197 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11198 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11199
11200 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11201 if (!Def)
11202 return false;
11203
11204 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11205 Def->getOpcode() != AMDGPU::S_AND_B64)
11206 return false;
11207
11208 int64_t Mask;
11209 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11210 if (MO->isImm())
11211 Mask = MO->getImm();
11212 else if (!getFoldableImm(MO, Mask))
11213 return false;
11214 Mask &= maxUIntN(SrcSize);
11215 return isPowerOf2_64(Mask);
11216 };
11217
11218 MachineOperand *SrcOp = &Def->getOperand(1);
11219 if (isMask(SrcOp))
11220 SrcOp = &Def->getOperand(2);
11221 else if (isMask(&Def->getOperand(2)))
11222 SrcOp = &Def->getOperand(1);
11223 else
11224 return false;
11225
11226 // A valid Mask is required to have a single bit set, hence a non-zero and
11227 // power-of-two value. This verifies that we will not do 64-bit shift below.
11228 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11229 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11230 if (IsSigned && BitNo == SrcSize - 1)
11231 return false;
11232
11233 ExpectedValue <<= BitNo;
11234
11235 bool IsReversedCC = false;
11236 if (CmpValue != ExpectedValue) {
11237 if (!IsReversible)
11238 return false;
11239 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11240 if (!IsReversedCC)
11241 return false;
11242 }
11243
11244 Register DefReg = Def->getOperand(0).getReg();
11245 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11246 return false;
11247
11248 if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
11249 return false;
11250
11251 if (!MRI->use_nodbg_empty(DefReg)) {
11252 assert(!IsReversedCC);
11253 return true;
11254 }
11255
11256 // Replace AND with unused result with a S_BITCMP.
11257 MachineBasicBlock *MBB = Def->getParent();
11258
11259 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11260 : AMDGPU::S_BITCMP1_B32
11261 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11262 : AMDGPU::S_BITCMP1_B64;
11263
11264 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11265 .add(*SrcOp)
11266 .addImm(BitNo);
11267 Def->eraseFromParent();
11268
11269 return true;
11270 };
11271
11272 switch (CmpInstr.getOpcode()) {
11273 default:
11274 break;
11275 case AMDGPU::S_CMP_EQ_U32:
11276 case AMDGPU::S_CMP_EQ_I32:
11277 case AMDGPU::S_CMPK_EQ_U32:
11278 case AMDGPU::S_CMPK_EQ_I32:
11279 return optimizeCmpAnd(1, 32, true, false) ||
11280 optimizeCmpSelect(/*NeedInversion=*/true);
11281 case AMDGPU::S_CMP_GE_U32:
11282 case AMDGPU::S_CMPK_GE_U32:
11283 return optimizeCmpAnd(1, 32, false, false);
11284 case AMDGPU::S_CMP_GE_I32:
11285 case AMDGPU::S_CMPK_GE_I32:
11286 return optimizeCmpAnd(1, 32, false, true);
11287 case AMDGPU::S_CMP_EQ_U64:
11288 return optimizeCmpAnd(1, 64, true, false);
11289 case AMDGPU::S_CMP_LG_U32:
11290 case AMDGPU::S_CMP_LG_I32:
11291 case AMDGPU::S_CMPK_LG_U32:
11292 case AMDGPU::S_CMPK_LG_I32:
11293 return optimizeCmpAnd(0, 32, true, false) ||
11294 optimizeCmpSelect(/*NeedInversion=*/false);
11295 case AMDGPU::S_CMP_GT_U32:
11296 case AMDGPU::S_CMPK_GT_U32:
11297 return optimizeCmpAnd(0, 32, false, false);
11298 case AMDGPU::S_CMP_GT_I32:
11299 case AMDGPU::S_CMPK_GT_I32:
11300 return optimizeCmpAnd(0, 32, false, true);
11301 case AMDGPU::S_CMP_LG_U64:
11302 return optimizeCmpAnd(0, 64, true, false) ||
11303 optimizeCmpSelect(/*NeedInversion=*/false);
11304 }
11305
11306 return false;
11307}
11308
11310 AMDGPU::OpName OpName) const {
11311 if (!ST.needsAlignedVGPRs())
11312 return;
11313
11314 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11315 if (OpNo < 0)
11316 return;
11317 MachineOperand &Op = MI.getOperand(OpNo);
11318 if (getOpSize(MI, OpNo) > 4)
11319 return;
11320
11321 // Add implicit aligned super-reg to force alignment on the data operand.
11322 const DebugLoc &DL = MI.getDebugLoc();
11323 MachineBasicBlock *BB = MI.getParent();
11325 Register DataReg = Op.getReg();
11326 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11328 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11329 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11330 Register NewVR =
11331 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11332 : &AMDGPU::VReg_64_Align2RegClass);
11333 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11334 .addReg(DataReg, {}, Op.getSubReg())
11335 .addImm(AMDGPU::sub0)
11336 .addReg(Undef)
11337 .addImm(AMDGPU::sub1);
11338 Op.setReg(NewVR);
11339 Op.setSubReg(AMDGPU::sub0);
11340 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11341}
11342
11344 if (isIGLP(*MI))
11345 return false;
11346
11348}
11349
11351 if (!isWMMA(MI) && !isSWMMAC(MI))
11352 return false;
11353
11354 if (ST.hasGFX1250Insts())
11355 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11356
11357 return true;
11358}
11359
11361 unsigned Opcode = MI.getOpcode();
11362
11363 if (AMDGPU::isGFX12Plus(ST))
11364 return isDOT(MI) || isXDLWMMA(MI);
11365
11366 if (!isMAI(MI) || isDGEMM(Opcode) ||
11367 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11368 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11369 return false;
11370
11371 if (!ST.hasGFX940Insts())
11372 return true;
11373
11374 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11375}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static MachineBasicBlock * loadScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, unsigned &NewDefOpc)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:145
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
MIRFormatter - Interface to format MIR operands based on the target.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI void moveOperands(MachineOperand *Dst, MachineOperand *Src, unsigned NumOps)
Move NumOps operands from Src to Dst, updating use-def lists as needed.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
LLVM_ABI void clearVirtRegs()
clearVirtRegs - Remove all virtual registers (after physreg assignment).
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(Register Reg) const
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
iterator_range< use_iterator > use_operands(Register Reg) const
LLVM_ABI void removeRegOperandFromUseList(MachineOperand *MO)
Remove MO from its use-def list.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI void addRegOperandToUseList(MachineOperand *MO)
Add MO to the linked list of operands for its register.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
bool isSpill(uint32_t Opcode) const
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool setsSCCIfResultIsNonZero(const MachineInstr &MI)
const MIRFormatter * getMIRFormatter() const override
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const final
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
static bool isMAI(const MCInstrDesc &Desc)
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const override
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
bool isAlwaysGDS(uint32_t Opcode) const
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool isWWMRegSpillOpcode(uint32_t Opcode)
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
std::optional< int64_t > getImmOrMaterializedImm(MachineOperand &Op) const
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
unsigned getScratchReservedForDynamicVGPRs() const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination lets target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination lets target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int32_t getCommuteRev(uint32_t Opcode)
LLVM_READONLY int32_t getCommuteOrig(uint32_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READONLY int32_t getGlobalVaddrOp(uint32_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getIfAddr64Inst(uint32_t Opcode)
Check if Opcode is an Addr64 opcode.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
LLVM_READONLY int32_t getAddr64Inst(uint32_t Opcode)
int32_t getMCOpcode(uint32_t Opcode, unsigned Gen)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:204
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:227
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:213
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:203
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:209
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:212
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:219
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:214
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:228
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:240
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:215
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:251
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:206
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:226
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:245
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:216
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:241
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:223
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:205
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:231
LLVM_READONLY int32_t getBasicFromSDWAOp(uint32_t Opcode)
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:598
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:600
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:597
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:599
@ TI_CONSTDATA_START
Definition AMDGPU.h:596
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if an signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr RegState getUndefRegState(bool B)
MachineCycleInfo::CycleT MachineCycle
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:57
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
constexpr bool all() const
Definition LaneBitmask.h:54
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:61
MachineInstr * top() const
Definition SIInstrInfo.h:66
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:85
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.