1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
59static cl::opt<bool> Fix16BitCopies(
60  "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
63  cl::ReallyHidden);
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66    : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
67 RI(ST), ST(ST) {
68 SchedModel.init(&ST);
69}
70
71//===----------------------------------------------------------------------===//
72// TargetInstrInfo callbacks
73//===----------------------------------------------------------------------===//
74
75static unsigned getNumOperandsNoGlue(SDNode *Node) {
76 unsigned N = Node->getNumOperands();
77 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
78 --N;
79 return N;
80}
81
82/// Returns true if both nodes have the same value for the given
83/// operand \p Op, or if both nodes do not have this operand.
84static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
85                                      AMDGPU::OpName OpName) {
86 unsigned Opc0 = N0->getMachineOpcode();
87 unsigned Opc1 = N1->getMachineOpcode();
88
89 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91
92 if (Op0Idx == -1 && Op1Idx == -1)
93 return true;
94
95
96 if ((Op0Idx == -1 && Op1Idx != -1) ||
97 (Op1Idx == -1 && Op0Idx != -1))
98 return false;
99
100 // getNamedOperandIdx returns the index for the MachineInstr's operands,
101 // which includes the result as the first operand. We are indexing into the
102 // MachineSDNode's operands, so we need to skip the result operand to get
103 // the real index.
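  // Illustrative example (hypothetical operand layout, not from this file):
  // if a load's MachineInstr operands are (vdst, vaddr, offset), then
  // getNamedOperandIdx(Opc, OpName::offset) == 2, but the MachineSDNode
  // operand list omits the vdst result, so the matching SDNode operand
  // index is 1.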
104 --Op0Idx;
105 --Op1Idx;
106
107 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108}
109
110static bool canRemat(const MachineInstr &MI) {
111
112  if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
113      SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
114      SIInstrInfo::isSALU(MI))
115    return true;
116
117 if (SIInstrInfo::isSMRD(MI)) {
118 return !MI.memoperands_empty() &&
119 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
120 return MMO->isLoad() && MMO->isInvariant();
121 });
122 }
123
124 return false;
125}
126
127bool SIInstrInfo::isReallyTriviallyReMaterializable(
128    const MachineInstr &MI) const {
129
130 if (canRemat(MI)) {
131    // Normally a VALU use of exec would block rematerialization, but an
132    // implicit exec read is OK here since every VALU instruction has one.
133    // We want all of the generic rematerialization logic except for this.
134
135 // Another potential implicit use is mode register. The core logic of
136 // the RA will not attempt rematerialization if mode is set anywhere
137 // in the function, otherwise it is safe since mode is not changed.
138
139    // This differs from the generic method, which does not allow
140    // rematerialization if there are any virtual register uses. We allow it,
141    // so this method covers SOP instructions as well.
142 if (!MI.hasImplicitDef() &&
143 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
144 !MI.mayRaiseFPException())
145 return true;
146 }
147
148  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
149}
150
151// Returns true if the scalar result of a VALU instruction depends on exec.
152bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
153 // Ignore comparisons which are only used masked with exec.
154 // This allows some hoisting/sinking of VALU comparisons.
155 if (MI.isCompare()) {
156 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
157 if (!Dst)
158 return true;
159
160 Register DstReg = Dst->getReg();
161 if (!DstReg.isVirtual())
162 return true;
163
164 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
165 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
166 switch (Use.getOpcode()) {
167 case AMDGPU::S_AND_SAVEEXEC_B32:
168 case AMDGPU::S_AND_SAVEEXEC_B64:
169 break;
170 case AMDGPU::S_AND_B32:
171 case AMDGPU::S_AND_B64:
172 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
173 return true;
174 break;
175 default:
176 return true;
177 }
178 }
179 return false;
180 }
181
182 switch (MI.getOpcode()) {
183 default:
184 break;
185 case AMDGPU::V_READFIRSTLANE_B32:
186 return true;
187 }
188
189 return false;
190}
191
192bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
193  // Any implicit use of exec by VALU is not a real register read.
194 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
195 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
196}
197
198bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
199                               MachineBasicBlock *SuccToSinkTo,
200 MachineCycleInfo *CI) const {
201 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
202 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
203 return true;
204
205 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
206  // Check if sinking MI would create a temporally divergent use.
207 for (auto Op : MI.uses()) {
208 if (Op.isReg() && Op.getReg().isVirtual() &&
209 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
210 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
211
212 // SgprDef defined inside cycle
213 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
214 if (FromCycle == nullptr)
215 continue;
216
217 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
218 // Check if there is a FromCycle that contains SgprDef's basic block but
219 // does not contain SuccToSinkTo and also has divergent exit condition.
220 while (FromCycle && !FromCycle->contains(ToCycle)) {
221        SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
222        FromCycle->getExitingBlocks(ExitingBlocks);
223
224 // FromCycle has divergent exit condition.
225 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
226 if (hasDivergentBranch(ExitingBlock))
227 return false;
228 }
229
230 FromCycle = FromCycle->getParentCycle();
231 }
232 }
233 }
234
235 return true;
236}
237
238bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
239                                          int64_t &Offset0,
240 int64_t &Offset1) const {
241 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
242 return false;
243
244 unsigned Opc0 = Load0->getMachineOpcode();
245 unsigned Opc1 = Load1->getMachineOpcode();
246
247 // Make sure both are actually loads.
248 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
249 return false;
250
251 // A mayLoad instruction without a def is not a load. Likely a prefetch.
252 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
253 return false;
254
255 if (isDS(Opc0) && isDS(Opc1)) {
256
257 // FIXME: Handle this case:
258 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
259 return false;
260
261 // Check base reg.
262 if (Load0->getOperand(0) != Load1->getOperand(0))
263 return false;
264
265 // Skip read2 / write2 variants for simplicity.
266    // TODO: We should report true if the used offsets are adjacent (excluding
267    // the st64 versions).
268 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
269 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
270 if (Offset0Idx == -1 || Offset1Idx == -1)
271 return false;
272
273 // XXX - be careful of dataless loads
274 // getNamedOperandIdx returns the index for MachineInstrs. Since they
275 // include the output in the operand list, but SDNodes don't, we need to
276 // subtract the index by one.
277 Offset0Idx -= get(Opc0).NumDefs;
278 Offset1Idx -= get(Opc1).NumDefs;
279 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
280 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
281 return true;
282 }
283
284 if (isSMRD(Opc0) && isSMRD(Opc1)) {
285 // Skip time and cache invalidation instructions.
286 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
287 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
288 return false;
289
290 unsigned NumOps = getNumOperandsNoGlue(Load0);
291 if (NumOps != getNumOperandsNoGlue(Load1))
292 return false;
293
294 // Check base reg.
295 if (Load0->getOperand(0) != Load1->getOperand(0))
296 return false;
297
298 // Match register offsets, if both register and immediate offsets present.
299 assert(NumOps == 4 || NumOps == 5);
300 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
301 return false;
302
303    const ConstantSDNode *Load0Offset =
304        dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
305    const ConstantSDNode *Load1Offset =
306        dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
307
308 if (!Load0Offset || !Load1Offset)
309 return false;
310
311 Offset0 = Load0Offset->getZExtValue();
312 Offset1 = Load1Offset->getZExtValue();
313 return true;
314 }
315
316 // MUBUF and MTBUF can access the same addresses.
317 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
318
319 // MUBUF and MTBUF have vaddr at different indices.
320 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
321 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
323 return false;
324
325 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
326 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
327
328 if (OffIdx0 == -1 || OffIdx1 == -1)
329 return false;
330
331 // getNamedOperandIdx returns the index for MachineInstrs. Since they
332 // include the output in the operand list, but SDNodes don't, we need to
333 // subtract the index by one.
334 OffIdx0 -= get(Opc0).NumDefs;
335 OffIdx1 -= get(Opc1).NumDefs;
336
337 SDValue Off0 = Load0->getOperand(OffIdx0);
338 SDValue Off1 = Load1->getOperand(OffIdx1);
339
340 // The offset might be a FrameIndexSDNode.
341 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
342 return false;
343
344 Offset0 = Off0->getAsZExtVal();
345 Offset1 = Off1->getAsZExtVal();
346 return true;
347 }
348
349 return false;
350}
351
352static bool isStride64(unsigned Opc) {
353 switch (Opc) {
354 case AMDGPU::DS_READ2ST64_B32:
355 case AMDGPU::DS_READ2ST64_B64:
356 case AMDGPU::DS_WRITE2ST64_B32:
357 case AMDGPU::DS_WRITE2ST64_B64:
358 return true;
359 default:
360 return false;
361 }
362}
363
364bool SIInstrInfo::getMemOperandsWithOffsetWidth(
365    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
366    int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
367 const TargetRegisterInfo *TRI) const {
368 if (!LdSt.mayLoadOrStore())
369 return false;
370
371 unsigned Opc = LdSt.getOpcode();
372 OffsetIsScalable = false;
373 const MachineOperand *BaseOp, *OffsetOp;
374 int DataOpIdx;
375
376 if (isDS(LdSt)) {
377 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
378 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
379 if (OffsetOp) {
380 // Normal, single offset LDS instruction.
381 if (!BaseOp) {
382 // DS_CONSUME/DS_APPEND use M0 for the base address.
383 // TODO: find the implicit use operand for M0 and use that as BaseOp?
384 return false;
385 }
386 BaseOps.push_back(BaseOp);
387 Offset = OffsetOp->getImm();
388 // Get appropriate operand, and compute width accordingly.
389 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
390 if (DataOpIdx == -1)
391 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
392 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
393 Width = LocationSize::precise(64);
394 else
395 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
396 } else {
397 // The 2 offset instructions use offset0 and offset1 instead. We can treat
398 // these as a load with a single offset if the 2 offsets are consecutive.
399 // We will use this for some partially aligned loads.
400 const MachineOperand *Offset0Op =
401 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
402 const MachineOperand *Offset1Op =
403 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
404
405 unsigned Offset0 = Offset0Op->getImm() & 0xff;
406 unsigned Offset1 = Offset1Op->getImm() & 0xff;
407 if (Offset0 + 1 != Offset1)
408 return false;
409
410 // Each of these offsets is in element sized units, so we need to convert
411 // to bytes of the individual reads.
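      // Worked example (illustrative): for a read2 of 32-bit elements the
      // destination register is 64 bits wide, so EltSize = 64 / 16 = 4 bytes.
      // For the st64 variants the stride is further scaled by 64 below,
      // giving a 256-byte unit, so offset0 == 1 maps to a byte offset of 256.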
412
413 unsigned EltSize;
414 if (LdSt.mayLoad())
415 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
416 else {
417 assert(LdSt.mayStore());
418 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
419 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
420 }
421
422 if (isStride64(Opc))
423 EltSize *= 64;
424
425 BaseOps.push_back(BaseOp);
426 Offset = EltSize * Offset0;
427 // Get appropriate operand(s), and compute width accordingly.
428 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
429 if (DataOpIdx == -1) {
430 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
431 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
432 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
433 Width = LocationSize::precise(
434 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
435 } else {
436 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
437 }
438 }
439 return true;
440 }
441
442 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
443 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
444 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
445 return false;
446 BaseOps.push_back(RSrc);
447 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
448 if (BaseOp && !BaseOp->isFI())
449 BaseOps.push_back(BaseOp);
450 const MachineOperand *OffsetImm =
451 getNamedOperand(LdSt, AMDGPU::OpName::offset);
452 Offset = OffsetImm->getImm();
453 const MachineOperand *SOffset =
454 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
455 if (SOffset) {
456 if (SOffset->isReg())
457 BaseOps.push_back(SOffset);
458 else
459 Offset += SOffset->getImm();
460 }
461 // Get appropriate operand, and compute width accordingly.
462 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
463 if (DataOpIdx == -1)
464 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
465 if (DataOpIdx == -1) // LDS DMA
466 return false;
467 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
468 return true;
469 }
470
471 if (isImage(LdSt)) {
472 auto RsrcOpName =
473 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
474 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
475 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
476 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
477 if (VAddr0Idx >= 0) {
478 // GFX10 possible NSA encoding.
479 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
480 BaseOps.push_back(&LdSt.getOperand(I));
481 } else {
482 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
483 }
484 Offset = 0;
485 // Get appropriate operand, and compute width accordingly.
486 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
487 if (DataOpIdx == -1)
488 return false; // no return sampler
489 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
490 return true;
491 }
492
493 if (isSMRD(LdSt)) {
494 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
495 if (!BaseOp) // e.g. S_MEMTIME
496 return false;
497 BaseOps.push_back(BaseOp);
498 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
499 Offset = OffsetOp ? OffsetOp->getImm() : 0;
500 // Get appropriate operand, and compute width accordingly.
501 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
502 if (DataOpIdx == -1)
503 return false;
504 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
505 return true;
506 }
507
508 if (isFLAT(LdSt)) {
509 // Instructions have either vaddr or saddr or both or none.
510 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
511 if (BaseOp)
512 BaseOps.push_back(BaseOp);
513 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
514 if (BaseOp)
515 BaseOps.push_back(BaseOp);
516 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
517 // Get appropriate operand, and compute width accordingly.
518 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
519 if (DataOpIdx == -1)
520 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
521 if (DataOpIdx == -1) // LDS DMA
522 return false;
523 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
524 return true;
525 }
526
527 return false;
528}
529
530static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
531                                  ArrayRef<const MachineOperand *> BaseOps1,
532                                  const MachineInstr &MI2,
533                                  ArrayRef<const MachineOperand *> BaseOps2) {
534 // Only examine the first "base" operand of each instruction, on the
535 // assumption that it represents the real base address of the memory access.
536 // Other operands are typically offsets or indices from this base address.
537 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
538 return true;
539
540 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
541 return false;
542
543 auto *MO1 = *MI1.memoperands_begin();
544 auto *MO2 = *MI2.memoperands_begin();
545 if (MO1->getAddrSpace() != MO2->getAddrSpace())
546 return false;
547
548 const auto *Base1 = MO1->getValue();
549 const auto *Base2 = MO2->getValue();
550 if (!Base1 || !Base2)
551 return false;
552 Base1 = getUnderlyingObject(Base1);
553 Base2 = getUnderlyingObject(Base2);
554
555 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
556 return false;
557
558 return Base1 == Base2;
559}
560
561bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
562                                      int64_t Offset1, bool OffsetIsScalable1,
563                                      ArrayRef<const MachineOperand *> BaseOps2,
564                                      int64_t Offset2, bool OffsetIsScalable2,
565 unsigned ClusterSize,
566 unsigned NumBytes) const {
567 // If the mem ops (to be clustered) do not have the same base ptr, then they
568 // should not be clustered
569 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
570 if (!BaseOps1.empty() && !BaseOps2.empty()) {
571 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
572 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
573 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
574 return false;
575
576 const SIMachineFunctionInfo *MFI =
577 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
578 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
579 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
580 // If only one base op is empty, they do not have the same base ptr
581 return false;
582 }
583
584  // To avoid register pressure, on average the number of DWORDS
585 // loaded together by all clustered mem ops should not exceed
586 // MaxMemoryClusterDWords. This is an empirical value based on certain
587 // observations and performance related experiments.
588  // The good thing about this heuristic is that it avoids clustering too many
589 // sub-word loads, and also avoids clustering of wide loads. Below is the
590 // brief summary of how the heuristic behaves for various `LoadSize` when
591 // MaxMemoryClusterDWords is 8.
592 //
593 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
594 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
595 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
596 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
597 // (5) LoadSize >= 17: do not cluster
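  //
  // Worked example (illustrative, with MaxMemoryClusterDWords == 8):
  // clustering four 8-byte loads gives NumBytes = 32, LoadSize = 32 / 4 = 8,
  // and NumDWords = ((8 + 3) / 4) * 4 = 8, which is allowed; a fifth such
  // load would raise NumDWords to 10 and stop the clustering.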
598 const unsigned LoadSize = NumBytes / ClusterSize;
599 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
600 return NumDWords <= MaxMemoryClusterDWords;
601}
602
603// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
604// the first 16 loads will be interleaved with the stores, and the next 16 will
605// be clustered as expected. It should really split into 2 16 store batches.
606//
607// Loads are clustered until this returns false, rather than trying to schedule
608// groups of stores. This also means we have to deal with saying different
609// address space loads should be clustered, and ones which might cause bank
610// conflicts.
611//
612// This might be deprecated so it might not be worth that much effort to fix.
613bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
614                                          int64_t Offset0, int64_t Offset1,
615 unsigned NumLoads) const {
616 assert(Offset1 > Offset0 &&
617 "Second offset should be larger than first offset!");
618 // If we have less than 16 loads in a row, and the offsets are within 64
619 // bytes, then schedule together.
620
621 // A cacheline is 64 bytes (for global memory).
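  // For instance, two loads at offsets 0 and 48 are scheduled together
  // (48 - 0 < 64), while a pair at offsets 0 and 64 is not.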
622 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
623}
624
625static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
626                              MachineBasicBlock::iterator MI,
627                              const DebugLoc &DL, MCRegister DestReg,
628 MCRegister SrcReg, bool KillSrc,
629 const char *Msg = "illegal VGPR to SGPR copy") {
630 MachineFunction *MF = MBB.getParent();
631
632  LLVMContext &C = MF->getFunction().getContext();
633  C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
634
635 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
636 .addReg(SrcReg, getKillRegState(KillSrc));
637}
638
639/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
640/// possible to have a direct copy in these cases on GFX908, so an intermediate
641/// VGPR copy is required.
642static void indirectCopyToAGPR(const SIInstrInfo &TII,
643                               MachineBasicBlock &MBB,
644                               MachineBasicBlock::iterator MI,
645                               const DebugLoc &DL, MCRegister DestReg,
646 MCRegister SrcReg, bool KillSrc,
647 RegScavenger &RS, bool RegsOverlap,
648 Register ImpDefSuperReg = Register(),
649 Register ImpUseSuperReg = Register()) {
650 assert((TII.getSubtarget().hasMAIInsts() &&
651 !TII.getSubtarget().hasGFX90AInsts()) &&
652 "Expected GFX908 subtarget.");
653
654 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
655 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
656 "Source register of the copy should be either an SGPR or an AGPR.");
657
658 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
659 "Destination register of the copy should be an AGPR.");
660
661 const SIRegisterInfo &RI = TII.getRegisterInfo();
662
663 // First try to find defining accvgpr_write to avoid temporary registers.
664 // In the case of copies of overlapping AGPRs, we conservatively do not
665 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
666 // an accvgpr_write used for this same copy due to implicit-defs
667 if (!RegsOverlap) {
668 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
669 --Def;
670
671 if (!Def->modifiesRegister(SrcReg, &RI))
672 continue;
673
674 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
675 Def->getOperand(0).getReg() != SrcReg)
676 break;
677
678 MachineOperand &DefOp = Def->getOperand(1);
679 assert(DefOp.isReg() || DefOp.isImm());
680
681 if (DefOp.isReg()) {
682 bool SafeToPropagate = true;
683 // Check that register source operand is not clobbered before MI.
684 // Immediate operands are always safe to propagate.
685 for (auto I = Def; I != MI && SafeToPropagate; ++I)
686 if (I->modifiesRegister(DefOp.getReg(), &RI))
687 SafeToPropagate = false;
688
689 if (!SafeToPropagate)
690 break;
691
692 for (auto I = Def; I != MI; ++I)
693 I->clearRegisterKills(DefOp.getReg(), &RI);
694 }
695
696 MachineInstrBuilder Builder =
697 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
698 .add(DefOp);
699 if (ImpDefSuperReg)
700 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
701
702 if (ImpUseSuperReg) {
703        Builder.addReg(ImpUseSuperReg,
704                       getKillRegState(KillSrc) | RegState::Implicit);
705      }
706
707 return;
708 }
709 }
710
711 RS.enterBasicBlockEnd(MBB);
712 RS.backward(std::next(MI));
713
714 // Ideally we want to have three registers for a long reg_sequence copy
715 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
716 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
717 *MBB.getParent());
718
719 // Registers in the sequence are allocated contiguously so we can just
720 // use register number to pick one of three round-robin temps.
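  // Illustrative example: a copy into AGPR7 gives RegNo = 7 % 3 = 1, so the
  // loop below attempts at most one backward scavenge for a spare VGPR; if
  // none is free, the VGPR reserved for AGPR copies is used.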
721 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
722 Register Tmp =
723 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
724 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
725 "VGPR used for an intermediate copy should have been reserved.");
726
727 // Only loop through if there are any free registers left. We don't want to
728 // spill.
729 while (RegNo--) {
730 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
731 /* RestoreAfter */ false, 0,
732 /* AllowSpill */ false);
733 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
734 break;
735 Tmp = Tmp2;
736 RS.setRegUsed(Tmp);
737 }
738
739 // Insert copy to temporary VGPR.
740 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
741 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
742 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
743 } else {
744 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
745 }
746
747 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
748 .addReg(SrcReg, getKillRegState(KillSrc));
749 if (ImpUseSuperReg) {
750    UseBuilder.addReg(ImpUseSuperReg,
751                      getKillRegState(KillSrc) | RegState::Implicit);
752  }
753
754 MachineInstrBuilder DefBuilder
755 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
756 .addReg(Tmp, RegState::Kill);
757
758 if (ImpDefSuperReg)
759 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
760}
761
762static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
763                           MachineBasicBlock::iterator I, const DebugLoc &DL,
764                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
765 const TargetRegisterClass *RC, bool Forward) {
766 const SIRegisterInfo &RI = TII.getRegisterInfo();
767 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
769 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
770
771 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
772 int16_t SubIdx = BaseIndices[Idx];
773 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
774 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
775 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
776 unsigned Opcode = AMDGPU::S_MOV_B32;
777
778 // Is SGPR aligned? If so try to combine with next.
779 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
780 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
781 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
782 // Can use SGPR64 copy
783 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
784 SubIdx = RI.getSubRegFromChannel(Channel, 2);
785 DestSubReg = RI.getSubReg(DestReg, SubIdx);
786 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
787 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
788 Opcode = AMDGPU::S_MOV_B64;
789 Idx++;
790 }
791
792 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
793 .addReg(SrcSubReg)
794 .addReg(SrcReg, RegState::Implicit);
795
796 if (!FirstMI)
797 FirstMI = LastMI;
798
799 if (!Forward)
800 I--;
801 }
802
803 assert(FirstMI && LastMI);
804 if (!Forward)
805 std::swap(FirstMI, LastMI);
806
807 FirstMI->addOperand(
808 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
809
810 if (KillSrc)
811 LastMI->addRegisterKilled(SrcReg, &RI);
812}
813
814void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
815                              MachineBasicBlock::iterator MI,
816                              const DebugLoc &DL, Register DestReg,
817 Register SrcReg, bool KillSrc, bool RenamableDest,
818 bool RenamableSrc) const {
819 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
820 unsigned Size = RI.getRegSizeInBits(*RC);
821 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
822 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
823
824 // The rest of copyPhysReg assumes Src and Dst size are the same size.
825 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
826 // we remove Fix16BitCopies and this code block?
827 if (Fix16BitCopies) {
828 if (((Size == 16) != (SrcSize == 16))) {
829 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
830 assert(ST.useRealTrue16Insts());
831 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
832 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
833 RegToFix = SubReg;
834
835 if (DestReg == SrcReg) {
836 // Identity copy. Insert empty bundle since ExpandPostRA expects an
837 // instruction here.
838 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
839 return;
840 }
841 RC = RI.getPhysRegBaseClass(DestReg);
842 Size = RI.getRegSizeInBits(*RC);
843 SrcRC = RI.getPhysRegBaseClass(SrcReg);
844 SrcSize = RI.getRegSizeInBits(*SrcRC);
845 }
846 }
847
848 if (RC == &AMDGPU::VGPR_32RegClass) {
849 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
850 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
851 AMDGPU::AGPR_32RegClass.contains(SrcReg));
852 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
853 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
854 BuildMI(MBB, MI, DL, get(Opc), DestReg)
855 .addReg(SrcReg, getKillRegState(KillSrc));
856 return;
857 }
858
859 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
860 RC == &AMDGPU::SReg_32RegClass) {
861 if (SrcReg == AMDGPU::SCC) {
862 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
863 .addImm(1)
864 .addImm(0);
865 return;
866 }
867
868 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
869 if (DestReg == AMDGPU::VCC_LO) {
870 // FIXME: Hack until VReg_1 removed.
871 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
872 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
873 .addImm(0)
874 .addReg(SrcReg, getKillRegState(KillSrc));
875 return;
876 }
877
878 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
879 return;
880 }
881
882 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
883 .addReg(SrcReg, getKillRegState(KillSrc));
884 return;
885 }
886
887 if (RC == &AMDGPU::SReg_64RegClass) {
888 if (SrcReg == AMDGPU::SCC) {
889 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
890 .addImm(1)
891 .addImm(0);
892 return;
893 }
894
895 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
896 if (DestReg == AMDGPU::VCC) {
897 // FIXME: Hack until VReg_1 removed.
898 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
899 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
900 .addImm(0)
901 .addReg(SrcReg, getKillRegState(KillSrc));
902 return;
903 }
904
905 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
906 return;
907 }
908
909 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
910 .addReg(SrcReg, getKillRegState(KillSrc));
911 return;
912 }
913
914 if (DestReg == AMDGPU::SCC) {
915 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
916 // but SelectionDAG emits such copies for i1 sources.
917 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
918 // This copy can only be produced by patterns
919 // with explicit SCC, which are known to be enabled
920 // only for subtargets with S_CMP_LG_U64 present.
921 assert(ST.hasScalarCompareEq64());
922 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
923 .addReg(SrcReg, getKillRegState(KillSrc))
924 .addImm(0);
925 } else {
926 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
927 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
928 .addReg(SrcReg, getKillRegState(KillSrc))
929 .addImm(0);
930 }
931
932 return;
933 }
934
935 if (RC == &AMDGPU::AGPR_32RegClass) {
936 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
937 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
938 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
939 .addReg(SrcReg, getKillRegState(KillSrc));
940 return;
941 }
942
943 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
944 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
945 .addReg(SrcReg, getKillRegState(KillSrc));
946 return;
947 }
948
949 // FIXME: Pass should maintain scavenger to avoid scan through the block on
950 // every AGPR spill.
951 RegScavenger RS;
952 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
953 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
954 return;
955 }
956
957 if (Size == 16) {
958 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
959 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
960 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
961
962 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
963 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
964 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
965 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
966 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
967 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
968 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
969 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
970
971 if (IsSGPRDst) {
972 if (!IsSGPRSrc) {
973 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
974 return;
975 }
976
977 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
978 .addReg(NewSrcReg, getKillRegState(KillSrc));
979 return;
980 }
981
982 if (IsAGPRDst || IsAGPRSrc) {
983 if (!DstLow || !SrcLow) {
984 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
985 "Cannot use hi16 subreg with an AGPR!");
986 }
987
988 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
989 return;
990 }
991
992 if (ST.useRealTrue16Insts()) {
993 if (IsSGPRSrc) {
994 assert(SrcLow);
995 SrcReg = NewSrcReg;
996 }
997 // Use the smaller instruction encoding if possible.
998 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
999 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1000 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1001 .addReg(SrcReg);
1002 } else {
1003 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1004 .addImm(0) // src0_modifiers
1005 .addReg(SrcReg)
1006 .addImm(0); // op_sel
1007 }
1008 return;
1009 }
1010
1011 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1012 if (!DstLow || !SrcLow) {
1013 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1014 "Cannot use hi16 subreg on VI!");
1015 }
1016
1017 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1018 .addReg(NewSrcReg, getKillRegState(KillSrc));
1019 return;
1020 }
1021
1022 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1023 .addImm(0) // src0_modifiers
1024 .addReg(NewSrcReg)
1025                 .addImm(0) // clamp
1026                 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1027                                : AMDGPU::SDWA::SdwaSel::WORD_1)   // dst_sel
1028                 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) // dst_unused
1029                 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1030                                : AMDGPU::SDWA::SdwaSel::WORD_1)   // src0_sel
1031                 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1032 // First implicit operand is $exec.
1033 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1034 return;
1035 }
1036
1037 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1038 if (ST.hasMovB64()) {
1039 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1040 .addReg(SrcReg, getKillRegState(KillSrc));
1041 return;
1042 }
1043 if (ST.hasPkMovB32()) {
1044      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1045        .addImm(SISrcMods::OP_SEL_1)                        // src0_modifiers
1046        .addReg(SrcReg)
1047        .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)  // src1_modifiers
1048        .addReg(SrcReg)
1049 .addImm(0) // op_sel_lo
1050 .addImm(0) // op_sel_hi
1051 .addImm(0) // neg_lo
1052 .addImm(0) // neg_hi
1053 .addImm(0) // clamp
1054 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1055 return;
1056 }
1057 }
1058
1059 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1060 if (RI.isSGPRClass(RC)) {
1061 if (!RI.isSGPRClass(SrcRC)) {
1062 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1063 return;
1064 }
1065 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1066 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1067 Forward);
1068 return;
1069 }
1070
1071 unsigned EltSize = 4;
1072 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1073 if (RI.isAGPRClass(RC)) {
1074 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1075 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1076 else if (RI.hasVGPRs(SrcRC) ||
1077 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1078 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1079 else
1080 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1081 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1082 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1083 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1084 (RI.isProperlyAlignedRC(*RC) &&
1085 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1086 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1087 if (ST.hasMovB64()) {
1088 Opcode = AMDGPU::V_MOV_B64_e32;
1089 EltSize = 8;
1090 } else if (ST.hasPkMovB32()) {
1091 Opcode = AMDGPU::V_PK_MOV_B32;
1092 EltSize = 8;
1093 }
1094 }
1095
1096 // For the cases where we need an intermediate instruction/temporary register
1097 // (destination is an AGPR), we need a scavenger.
1098 //
1099 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1100 // whole block for every handled copy.
1101 std::unique_ptr<RegScavenger> RS;
1102 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1103 RS = std::make_unique<RegScavenger>();
1104
1105 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1106
1107 // If there is an overlap, we can't kill the super-register on the last
1108 // instruction, since it will also kill the components made live by this def.
1109 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1110 const bool CanKillSuperReg = KillSrc && !Overlap;
1111
1112 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1113 unsigned SubIdx;
1114 if (Forward)
1115 SubIdx = SubIndices[Idx];
1116 else
1117 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1118 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1119 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1120 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1121
1122 bool IsFirstSubreg = Idx == 0;
1123 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1124
1125 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1126 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1127 Register ImpUseSuper = SrcReg;
1128 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1129 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1130    } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1131      MachineInstrBuilder MIB =
1132          BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1133              .addImm(SISrcMods::OP_SEL_1)                        // src0_modifiers
1134              .addReg(SrcSubReg)
1135              .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)  // src1_modifiers
1136              .addReg(SrcSubReg)
1137 .addImm(0) // op_sel_lo
1138 .addImm(0) // op_sel_hi
1139 .addImm(0) // neg_lo
1140 .addImm(0) // neg_hi
1141 .addImm(0) // clamp
1142 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1143      if (IsFirstSubreg)
1144        MIB.addReg(DestReg, RegState::Implicit | RegState::Define);
1145    } else {
1146 MachineInstrBuilder Builder =
1147 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1148 if (IsFirstSubreg)
1149 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1150
1151 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1152 }
1153 }
1154}
1155
1156int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1157 int NewOpc;
1158
1159 // Try to map original to commuted opcode
1160 NewOpc = AMDGPU::getCommuteRev(Opcode);
1161 if (NewOpc != -1)
1162 // Check if the commuted (REV) opcode exists on the target.
1163 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1164
1165 // Try to map commuted to original opcode
1166 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1167 if (NewOpc != -1)
1168 // Check if the original (non-REV) opcode exists on the target.
1169 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1170
1171 return Opcode;
1172}
1173
1174const TargetRegisterClass *
1176 return &AMDGPU::VGPR_32RegClass;
1177}
1178
1179void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1180                                     MachineBasicBlock::iterator I,
1181                                     const DebugLoc &DL, Register DstReg,
1182                                     ArrayRef<MachineOperand> Cond,
1183                                     Register TrueReg,
1184 Register FalseReg) const {
1185 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1186 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1188 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1189 "Not a VGPR32 reg");
1190
1191 if (Cond.size() == 1) {
1192 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1193 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1194 .add(Cond[0]);
1195 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1196 .addImm(0)
1197 .addReg(FalseReg)
1198 .addImm(0)
1199 .addReg(TrueReg)
1200 .addReg(SReg);
1201 } else if (Cond.size() == 2) {
1202 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1203 switch (Cond[0].getImm()) {
1204 case SIInstrInfo::SCC_TRUE: {
1205 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1206 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1207 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1208 .addImm(0)
1209 .addReg(FalseReg)
1210 .addImm(0)
1211 .addReg(TrueReg)
1212 .addReg(SReg);
1213 break;
1214 }
1215 case SIInstrInfo::SCC_FALSE: {
1216 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1217 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1218 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1219 .addImm(0)
1220 .addReg(FalseReg)
1221 .addImm(0)
1222 .addReg(TrueReg)
1223 .addReg(SReg);
1224 break;
1225 }
1226 case SIInstrInfo::VCCNZ: {
1227 MachineOperand RegOp = Cond[1];
1228 RegOp.setImplicit(false);
1229 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1230 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1231 .add(RegOp);
1232 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1233 .addImm(0)
1234 .addReg(FalseReg)
1235 .addImm(0)
1236 .addReg(TrueReg)
1237 .addReg(SReg);
1238 break;
1239 }
1240 case SIInstrInfo::VCCZ: {
1241 MachineOperand RegOp = Cond[1];
1242 RegOp.setImplicit(false);
1243 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1244 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1245 .add(RegOp);
1246 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1247 .addImm(0)
1248 .addReg(TrueReg)
1249 .addImm(0)
1250 .addReg(FalseReg)
1251 .addReg(SReg);
1252 break;
1253 }
1254 case SIInstrInfo::EXECNZ: {
1255 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1256 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1257 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1258 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1259 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1260 .addImm(0)
1261 .addReg(FalseReg)
1262 .addImm(0)
1263 .addReg(TrueReg)
1264 .addReg(SReg);
1265 break;
1266 }
1267 case SIInstrInfo::EXECZ: {
1268 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1269 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1270 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1271 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1272 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1273 .addImm(0)
1274 .addReg(FalseReg)
1275 .addImm(0)
1276 .addReg(TrueReg)
1277 .addReg(SReg);
1278 llvm_unreachable("Unhandled branch predicate EXECZ");
1279 break;
1280 }
1281 default:
1282 llvm_unreachable("invalid branch predicate");
1283 }
1284 } else {
1285 llvm_unreachable("Can only handle Cond size 1 or 2");
1286 }
1287}
1288
1289Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1290                               MachineBasicBlock::iterator I,
1291                               const DebugLoc &DL,
1292 Register SrcReg, int Value) const {
1293 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1294 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1295 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1296 .addImm(Value)
1297 .addReg(SrcReg);
1298
1299 return Reg;
1300}
1301
1302Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1303                               MachineBasicBlock::iterator I,
1304                               const DebugLoc &DL,
1305 Register SrcReg, int Value) const {
1306 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1307 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1308 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1309 .addImm(Value)
1310 .addReg(SrcReg);
1311
1312 return Reg;
1313}
1314
1315bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1316                                          const Register Reg,
1317 int64_t &ImmVal) const {
1318 switch (MI.getOpcode()) {
1319 case AMDGPU::V_MOV_B32_e32:
1320 case AMDGPU::S_MOV_B32:
1321 case AMDGPU::S_MOVK_I32:
1322 case AMDGPU::S_MOV_B64:
1323 case AMDGPU::V_MOV_B64_e32:
1324 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1325 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1326 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1327 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1328 case AMDGPU::V_MOV_B64_PSEUDO: {
1329 const MachineOperand &Src0 = MI.getOperand(1);
1330 if (Src0.isImm()) {
1331 ImmVal = Src0.getImm();
1332 return MI.getOperand(0).getReg() == Reg;
1333 }
1334
1335 return false;
1336 }
1337 case AMDGPU::S_BREV_B32:
1338 case AMDGPU::V_BFREV_B32_e32:
1339 case AMDGPU::V_BFREV_B32_e64: {
1340 const MachineOperand &Src0 = MI.getOperand(1);
1341 if (Src0.isImm()) {
1342 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1343 return MI.getOperand(0).getReg() == Reg;
1344 }
1345
1346 return false;
1347 }
1348 case AMDGPU::S_NOT_B32:
1349 case AMDGPU::V_NOT_B32_e32:
1350 case AMDGPU::V_NOT_B32_e64: {
1351 const MachineOperand &Src0 = MI.getOperand(1);
1352 if (Src0.isImm()) {
1353 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1354 return MI.getOperand(0).getReg() == Reg;
1355 }
1356
1357 return false;
1358 }
1359 default:
1360 return false;
1361 }
1362}
1363
1364unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1365
1366 if (RI.isAGPRClass(DstRC))
1367 return AMDGPU::COPY;
1368 if (RI.getRegSizeInBits(*DstRC) == 16) {
1369 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1370 // before RA.
1371 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1372 }
1373 if (RI.getRegSizeInBits(*DstRC) == 32)
1374 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1375 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1376 return AMDGPU::S_MOV_B64;
1377 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1378 return AMDGPU::V_MOV_B64_PSEUDO;
1379 return AMDGPU::COPY;
1380}
1381
1382const MCInstrDesc &
1384 bool IsIndirectSrc) const {
1385 if (IsIndirectSrc) {
1386 if (VecSize <= 32) // 4 bytes
1387 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1388 if (VecSize <= 64) // 8 bytes
1389 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1390 if (VecSize <= 96) // 12 bytes
1391 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1392 if (VecSize <= 128) // 16 bytes
1393 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1394 if (VecSize <= 160) // 20 bytes
1395 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1396 if (VecSize <= 256) // 32 bytes
1397 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1398 if (VecSize <= 288) // 36 bytes
1399 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1400 if (VecSize <= 320) // 40 bytes
1401 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1402 if (VecSize <= 352) // 44 bytes
1403 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1404 if (VecSize <= 384) // 48 bytes
1405 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1406 if (VecSize <= 512) // 64 bytes
1407 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1408 if (VecSize <= 1024) // 128 bytes
1409 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1410
1411 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1412 }
1413
1414 if (VecSize <= 32) // 4 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1416 if (VecSize <= 64) // 8 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1418 if (VecSize <= 96) // 12 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1420 if (VecSize <= 128) // 16 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1422 if (VecSize <= 160) // 20 bytes
1423 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1424 if (VecSize <= 256) // 32 bytes
1425 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1426 if (VecSize <= 288) // 36 bytes
1427 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1428 if (VecSize <= 320) // 40 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1430 if (VecSize <= 352) // 44 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1432 if (VecSize <= 384) // 48 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1434 if (VecSize <= 512) // 64 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1436 if (VecSize <= 1024) // 128 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1438
1439 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1440}
1441
1442static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1443 if (VecSize <= 32) // 4 bytes
1444 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1445 if (VecSize <= 64) // 8 bytes
1446 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1447 if (VecSize <= 96) // 12 bytes
1448 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1449 if (VecSize <= 128) // 16 bytes
1450 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1451 if (VecSize <= 160) // 20 bytes
1452 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1453 if (VecSize <= 256) // 32 bytes
1454 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1455 if (VecSize <= 288) // 36 bytes
1456 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1457 if (VecSize <= 320) // 40 bytes
1458 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1459 if (VecSize <= 352) // 44 bytes
1460 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1461 if (VecSize <= 384) // 48 bytes
1462 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1463 if (VecSize <= 512) // 64 bytes
1464 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1465 if (VecSize <= 1024) // 128 bytes
1466 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1467
1468 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1469}
1470
1471static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1472 if (VecSize <= 32) // 4 bytes
1473 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1474 if (VecSize <= 64) // 8 bytes
1475 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1476 if (VecSize <= 96) // 12 bytes
1477 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1478 if (VecSize <= 128) // 16 bytes
1479 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1480 if (VecSize <= 160) // 20 bytes
1481 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1482 if (VecSize <= 256) // 32 bytes
1483 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1484 if (VecSize <= 288) // 36 bytes
1485 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1486 if (VecSize <= 320) // 40 bytes
1487 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1488 if (VecSize <= 352) // 44 bytes
1489 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1490 if (VecSize <= 384) // 48 bytes
1491 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1492 if (VecSize <= 512) // 64 bytes
1493 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1494 if (VecSize <= 1024) // 128 bytes
1495 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1496
1497 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1498}
1499
1500static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1501 if (VecSize <= 64) // 8 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1503 if (VecSize <= 128) // 16 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1505 if (VecSize <= 256) // 32 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1507 if (VecSize <= 512) // 64 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1509 if (VecSize <= 1024) // 128 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1511
1512 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1513}
1514
1515const MCInstrDesc &
1516SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1517 bool IsSGPR) const {
1518 if (IsSGPR) {
1519 switch (EltSize) {
1520 case 32:
1521 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1522 case 64:
1523 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1524 default:
1525 llvm_unreachable("invalid reg indexing elt size");
1526 }
1527 }
1528
1529 assert(EltSize == 32 && "invalid reg indexing elt size");
1530  return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1531}
1532
1533static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1534 switch (Size) {
1535 case 4:
1536 return AMDGPU::SI_SPILL_S32_SAVE;
1537 case 8:
1538 return AMDGPU::SI_SPILL_S64_SAVE;
1539 case 12:
1540 return AMDGPU::SI_SPILL_S96_SAVE;
1541 case 16:
1542 return AMDGPU::SI_SPILL_S128_SAVE;
1543 case 20:
1544 return AMDGPU::SI_SPILL_S160_SAVE;
1545 case 24:
1546 return AMDGPU::SI_SPILL_S192_SAVE;
1547 case 28:
1548 return AMDGPU::SI_SPILL_S224_SAVE;
1549 case 32:
1550 return AMDGPU::SI_SPILL_S256_SAVE;
1551 case 36:
1552 return AMDGPU::SI_SPILL_S288_SAVE;
1553 case 40:
1554 return AMDGPU::SI_SPILL_S320_SAVE;
1555 case 44:
1556 return AMDGPU::SI_SPILL_S352_SAVE;
1557 case 48:
1558 return AMDGPU::SI_SPILL_S384_SAVE;
1559 case 64:
1560 return AMDGPU::SI_SPILL_S512_SAVE;
1561 case 128:
1562 return AMDGPU::SI_SPILL_S1024_SAVE;
1563 default:
1564 llvm_unreachable("unknown register size");
1565 }
1566}
1567
1568static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1569 switch (Size) {
1570 case 2:
1571 return AMDGPU::SI_SPILL_V16_SAVE;
1572 case 4:
1573 return AMDGPU::SI_SPILL_V32_SAVE;
1574 case 8:
1575 return AMDGPU::SI_SPILL_V64_SAVE;
1576 case 12:
1577 return AMDGPU::SI_SPILL_V96_SAVE;
1578 case 16:
1579 return AMDGPU::SI_SPILL_V128_SAVE;
1580 case 20:
1581 return AMDGPU::SI_SPILL_V160_SAVE;
1582 case 24:
1583 return AMDGPU::SI_SPILL_V192_SAVE;
1584 case 28:
1585 return AMDGPU::SI_SPILL_V224_SAVE;
1586 case 32:
1587 return AMDGPU::SI_SPILL_V256_SAVE;
1588 case 36:
1589 return AMDGPU::SI_SPILL_V288_SAVE;
1590 case 40:
1591 return AMDGPU::SI_SPILL_V320_SAVE;
1592 case 44:
1593 return AMDGPU::SI_SPILL_V352_SAVE;
1594 case 48:
1595 return AMDGPU::SI_SPILL_V384_SAVE;
1596 case 64:
1597 return AMDGPU::SI_SPILL_V512_SAVE;
1598 case 128:
1599 return AMDGPU::SI_SPILL_V1024_SAVE;
1600 default:
1601 llvm_unreachable("unknown register size");
1602 }
1603}
1604
1605static unsigned getAVSpillSaveOpcode(unsigned Size) {
1606 switch (Size) {
1607 case 4:
1608 return AMDGPU::SI_SPILL_AV32_SAVE;
1609 case 8:
1610 return AMDGPU::SI_SPILL_AV64_SAVE;
1611 case 12:
1612 return AMDGPU::SI_SPILL_AV96_SAVE;
1613 case 16:
1614 return AMDGPU::SI_SPILL_AV128_SAVE;
1615 case 20:
1616 return AMDGPU::SI_SPILL_AV160_SAVE;
1617 case 24:
1618 return AMDGPU::SI_SPILL_AV192_SAVE;
1619 case 28:
1620 return AMDGPU::SI_SPILL_AV224_SAVE;
1621 case 32:
1622 return AMDGPU::SI_SPILL_AV256_SAVE;
1623 case 36:
1624 return AMDGPU::SI_SPILL_AV288_SAVE;
1625 case 40:
1626 return AMDGPU::SI_SPILL_AV320_SAVE;
1627 case 44:
1628 return AMDGPU::SI_SPILL_AV352_SAVE;
1629 case 48:
1630 return AMDGPU::SI_SPILL_AV384_SAVE;
1631 case 64:
1632 return AMDGPU::SI_SPILL_AV512_SAVE;
1633 case 128:
1634 return AMDGPU::SI_SPILL_AV1024_SAVE;
1635 default:
1636 llvm_unreachable("unknown register size");
1637 }
1638}
1639
1640static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1641 bool IsVectorSuperClass) {
1642  // Currently, only 32-bit WWM register spills are needed.
1643 if (Size != 4)
1644 llvm_unreachable("unknown wwm register spill size");
1645
1646 if (IsVectorSuperClass)
1647 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1648
1649 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1650}
1651
1652unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1653    Register Reg, const TargetRegisterClass *RC, unsigned Size,
1654 const SIMachineFunctionInfo &MFI) const {
1655 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1656
1657 // Choose the right opcode if spilling a WWM register.
1658  if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1659    return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1660
1661 // TODO: Check if AGPRs are available
1662 if (ST.hasMAIInsts())
1663 return getAVSpillSaveOpcode(Size);
1664
1665  return getVGPRSpillSaveOpcode(Size);
1666}
1667
1668void SIInstrInfo::storeRegToStackSlot(
1669    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1670    bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1671 const TargetRegisterInfo *TRI, Register VReg,
1672 MachineInstr::MIFlag Flags) const {
1673 MachineFunction *MF = MBB.getParent();
1674  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1675  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1676 const DebugLoc &DL = MBB.findDebugLoc(MI);
1677
1678 MachinePointerInfo PtrInfo
1679 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1680  MachineMemOperand *MMO = MF->getMachineMemOperand(
1681      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1682 FrameInfo.getObjectAlign(FrameIndex));
1683 unsigned SpillSize = TRI->getSpillSize(*RC);
1684
1685  MachineRegisterInfo &MRI = MF->getRegInfo();
1686  if (RI.isSGPRClass(RC)) {
1687 MFI->setHasSpilledSGPRs();
1688 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1689 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1690 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1691
1692 // We are only allowed to create one new instruction when spilling
1693    // registers, so we need to use a pseudo instruction for spilling SGPRs.
1694 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1695
1696    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1697    // need to make sure we are using the correct register class.
1698 if (SrcReg.isVirtual() && SpillSize == 4) {
1699 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1700 }
1701
1702 BuildMI(MBB, MI, DL, OpDesc)
1703 .addReg(SrcReg, getKillRegState(isKill)) // data
1704 .addFrameIndex(FrameIndex) // addr
1705 .addMemOperand(MMO)
1706        .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1707
1708 if (RI.spillSGPRToVGPR())
1709 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1710 return;
1711 }
1712
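 // Vector register spills also go through pseudo instructions; they are
 // lowered into real scratch stores later, when frame indices are eliminated
 // (see SIRegisterInfo::eliminateFrameIndex).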
1713 unsigned Opcode =
1714 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1715 MFI->setHasSpilledVGPRs();
1716
1717 BuildMI(MBB, MI, DL, get(Opcode))
1718 .addReg(SrcReg, getKillRegState(isKill)) // data
1719 .addFrameIndex(FrameIndex) // addr
1720 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1721 .addImm(0) // offset
1722 .addMemOperand(MMO);
1723}
1724
1725static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1726 switch (Size) {
1727 case 4:
1728 return AMDGPU::SI_SPILL_S32_RESTORE;
1729 case 8:
1730 return AMDGPU::SI_SPILL_S64_RESTORE;
1731 case 12:
1732 return AMDGPU::SI_SPILL_S96_RESTORE;
1733 case 16:
1734 return AMDGPU::SI_SPILL_S128_RESTORE;
1735 case 20:
1736 return AMDGPU::SI_SPILL_S160_RESTORE;
1737 case 24:
1738 return AMDGPU::SI_SPILL_S192_RESTORE;
1739 case 28:
1740 return AMDGPU::SI_SPILL_S224_RESTORE;
1741 case 32:
1742 return AMDGPU::SI_SPILL_S256_RESTORE;
1743 case 36:
1744 return AMDGPU::SI_SPILL_S288_RESTORE;
1745 case 40:
1746 return AMDGPU::SI_SPILL_S320_RESTORE;
1747 case 44:
1748 return AMDGPU::SI_SPILL_S352_RESTORE;
1749 case 48:
1750 return AMDGPU::SI_SPILL_S384_RESTORE;
1751 case 64:
1752 return AMDGPU::SI_SPILL_S512_RESTORE;
1753 case 128:
1754 return AMDGPU::SI_SPILL_S1024_RESTORE;
1755 default:
1756 llvm_unreachable("unknown register size");
1757 }
1758}
1759
1760static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1761 switch (Size) {
1762 case 2:
1763 return AMDGPU::SI_SPILL_V16_RESTORE;
1764 case 4:
1765 return AMDGPU::SI_SPILL_V32_RESTORE;
1766 case 8:
1767 return AMDGPU::SI_SPILL_V64_RESTORE;
1768 case 12:
1769 return AMDGPU::SI_SPILL_V96_RESTORE;
1770 case 16:
1771 return AMDGPU::SI_SPILL_V128_RESTORE;
1772 case 20:
1773 return AMDGPU::SI_SPILL_V160_RESTORE;
1774 case 24:
1775 return AMDGPU::SI_SPILL_V192_RESTORE;
1776 case 28:
1777 return AMDGPU::SI_SPILL_V224_RESTORE;
1778 case 32:
1779 return AMDGPU::SI_SPILL_V256_RESTORE;
1780 case 36:
1781 return AMDGPU::SI_SPILL_V288_RESTORE;
1782 case 40:
1783 return AMDGPU::SI_SPILL_V320_RESTORE;
1784 case 44:
1785 return AMDGPU::SI_SPILL_V352_RESTORE;
1786 case 48:
1787 return AMDGPU::SI_SPILL_V384_RESTORE;
1788 case 64:
1789 return AMDGPU::SI_SPILL_V512_RESTORE;
1790 case 128:
1791 return AMDGPU::SI_SPILL_V1024_RESTORE;
1792 default:
1793 llvm_unreachable("unknown register size");
1794 }
1795}
1796
1797static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1798 switch (Size) {
1799 case 4:
1800 return AMDGPU::SI_SPILL_AV32_RESTORE;
1801 case 8:
1802 return AMDGPU::SI_SPILL_AV64_RESTORE;
1803 case 12:
1804 return AMDGPU::SI_SPILL_AV96_RESTORE;
1805 case 16:
1806 return AMDGPU::SI_SPILL_AV128_RESTORE;
1807 case 20:
1808 return AMDGPU::SI_SPILL_AV160_RESTORE;
1809 case 24:
1810 return AMDGPU::SI_SPILL_AV192_RESTORE;
1811 case 28:
1812 return AMDGPU::SI_SPILL_AV224_RESTORE;
1813 case 32:
1814 return AMDGPU::SI_SPILL_AV256_RESTORE;
1815 case 36:
1816 return AMDGPU::SI_SPILL_AV288_RESTORE;
1817 case 40:
1818 return AMDGPU::SI_SPILL_AV320_RESTORE;
1819 case 44:
1820 return AMDGPU::SI_SPILL_AV352_RESTORE;
1821 case 48:
1822 return AMDGPU::SI_SPILL_AV384_RESTORE;
1823 case 64:
1824 return AMDGPU::SI_SPILL_AV512_RESTORE;
1825 case 128:
1826 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1827 default:
1828 llvm_unreachable("unknown register size");
1829 }
1830}
1831
1832static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1833 bool IsVectorSuperClass) {
1834 // Currently, only 32-bit WWM register spills are needed.
1835 if (Size != 4)
1836 llvm_unreachable("unknown wwm register spill size");
1837
1838 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1839 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1840
1841 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1842}
1843
1844unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1845 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1846 const SIMachineFunctionInfo &MFI) const {
1847 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1848
1849 // Choose the right opcode if restoring a WWM register.
1850 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1851 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1852
1853 // TODO: Check if AGPRs are available
1854 if (ST.hasMAIInsts())
1855 return getAVSpillRestoreOpcode(Size);
1856
1857 assert(!RI.isAGPRClass(RC));
1858 return getVGPRSpillRestoreOpcode(Size);
1859}
1860
1861void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1862 MachineBasicBlock::iterator MI,
1863 Register DestReg, int FrameIndex,
1864 const TargetRegisterClass *RC,
1865 const TargetRegisterInfo *TRI,
1866 Register VReg,
1867 MachineInstr::MIFlag Flags) const {
1868 MachineFunction *MF = MBB.getParent();
1869 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1870 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1871 const DebugLoc &DL = MBB.findDebugLoc(MI);
1872 unsigned SpillSize = TRI->getSpillSize(*RC);
1873
1874 MachinePointerInfo PtrInfo
1875 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1876
1877 MachineMemOperand *MMO = MF->getMachineMemOperand(
1878 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1879 FrameInfo.getObjectAlign(FrameIndex));
1880
1881 if (RI.isSGPRClass(RC)) {
1882 MFI->setHasSpilledSGPRs();
1883 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1884 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1885 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1886
1887 // FIXME: Maybe this should not include a memoperand because it will be
1888 // lowered to non-memory instructions.
1889 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1890 if (DestReg.isVirtual() && SpillSize == 4) {
1891 MachineRegisterInfo &MRI = MF->getRegInfo();
1892 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1893 }
1894
1895 if (RI.spillSGPRToVGPR())
1896 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1897 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1898 .addFrameIndex(FrameIndex) // addr
1899 .addMemOperand(MMO)
1900 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1901
1902 return;
1903 }
1904
1905 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1906 SpillSize, *MFI);
1907 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1908 .addFrameIndex(FrameIndex) // vaddr
1909 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1910 .addImm(0) // offset
1911 .addMemOperand(MMO);
1912}
1913
1914void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1915 MachineBasicBlock::iterator MI) const {
1916 insertNoops(MBB, MI, 1);
1917}
1918
1919void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1920 MachineBasicBlock::iterator MI,
1921 unsigned Quantity) const {
1922 DebugLoc DL = MBB.findDebugLoc(MI);
1923 unsigned MaxSNopCount = 1u << ST.getSNopBits();
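  // S_NOP's immediate encodes (number of wait states - 1), so one instruction
  // can cover at most MaxSNopCount waits; keep emitting S_NOPs until the
  // requested Quantity is reached.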
1924 while (Quantity > 0) {
1925 unsigned Arg = std::min(Quantity, MaxSNopCount);
1926 Quantity -= Arg;
1927 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1928 }
1929}
1930
1931void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1932 auto *MF = MBB.getParent();
1933 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1934
1935 assert(Info->isEntryFunction());
1936
1937 if (MBB.succ_empty()) {
1938 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1939 if (HasNoTerminator) {
1940 if (Info->returnsVoid()) {
1941 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1942 } else {
1943 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1944 }
1945 }
1946 }
1947}
1948
1949MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1950 MachineBasicBlock &MBB,
1951 MachineInstr &MI,
1952 const DebugLoc &DL) const {
1953 MachineFunction *MF = MBB.getParent();
1954 constexpr unsigned DoorbellIDMask = 0x3ff;
1955 constexpr unsigned ECQueueWaveAbort = 0x400;
1956
1957 MachineBasicBlock *TrapBB = &MBB;
1958 MachineBasicBlock *ContBB = &MBB;
1959 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1960
1961 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1962 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1963 TrapBB = MF->CreateMachineBasicBlock();
1964 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1965 MF->push_back(TrapBB);
1966 MBB.addSuccessor(TrapBB);
1967 }
1968
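  // Simulated trap sequence: raise the trap, read this queue's doorbell ID
  // (preserving m0 in ttmp2), set the queue-wave-abort bit and send the
  // interrupt message, restore m0, then park the wave in an endless
  // s_sethalt loop.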
1969 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1970 // this will be a nop.
1971 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1972 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1973 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1974 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1975 DoorbellReg)
1976 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
1977 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
1978 .addUse(AMDGPU::M0);
1979 Register DoorbellRegMasked =
1980 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1981 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
1982 .addUse(DoorbellReg)
1983 .addImm(DoorbellIDMask);
1984 Register SetWaveAbortBit =
1985 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1986 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
1987 .addUse(DoorbellRegMasked)
1988 .addImm(ECQueueWaveAbort);
1989 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1990 .addUse(SetWaveAbortBit);
1991 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
1992 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
1993 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1994 .addUse(AMDGPU::TTMP2);
1995 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
1996 TrapBB->addSuccessor(HaltLoopBB);
1997
1998 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
1999 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2000 .addMBB(HaltLoopBB);
2001 MF->push_back(HaltLoopBB);
2002 HaltLoopBB->addSuccessor(HaltLoopBB);
2003
2004 return ContBB;
2005}
2006
2007unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2008 switch (MI.getOpcode()) {
2009 default:
2010 if (MI.isMetaInstruction())
2011 return 0;
2012 return 1; // FIXME: Do wait states equal cycles?
2013
2014 case AMDGPU::S_NOP:
2015 return MI.getOperand(0).getImm() + 1;
2016 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2017 // hazard, even if one exists, won't really be visible. Should we handle it?
2018 }
2019}
2020
2021bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2022 MachineBasicBlock &MBB = *MI.getParent();
2023 DebugLoc DL = MBB.findDebugLoc(MI);
2025 switch (MI.getOpcode()) {
2026 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2027 case AMDGPU::S_MOV_B64_term:
2028 // This is only a terminator to get the correct spill code placement during
2029 // register allocation.
2030 MI.setDesc(get(AMDGPU::S_MOV_B64));
2031 break;
2032
2033 case AMDGPU::S_MOV_B32_term:
2034 // This is only a terminator to get the correct spill code placement during
2035 // register allocation.
2036 MI.setDesc(get(AMDGPU::S_MOV_B32));
2037 break;
2038
2039 case AMDGPU::S_XOR_B64_term:
2040 // This is only a terminator to get the correct spill code placement during
2041 // register allocation.
2042 MI.setDesc(get(AMDGPU::S_XOR_B64));
2043 break;
2044
2045 case AMDGPU::S_XOR_B32_term:
2046 // This is only a terminator to get the correct spill code placement during
2047 // register allocation.
2048 MI.setDesc(get(AMDGPU::S_XOR_B32));
2049 break;
2050 case AMDGPU::S_OR_B64_term:
2051 // This is only a terminator to get the correct spill code placement during
2052 // register allocation.
2053 MI.setDesc(get(AMDGPU::S_OR_B64));
2054 break;
2055 case AMDGPU::S_OR_B32_term:
2056 // This is only a terminator to get the correct spill code placement during
2057 // register allocation.
2058 MI.setDesc(get(AMDGPU::S_OR_B32));
2059 break;
2060
2061 case AMDGPU::S_ANDN2_B64_term:
2062 // This is only a terminator to get the correct spill code placement during
2063 // register allocation.
2064 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2065 break;
2066
2067 case AMDGPU::S_ANDN2_B32_term:
2068 // This is only a terminator to get the correct spill code placement during
2069 // register allocation.
2070 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2071 break;
2072
2073 case AMDGPU::S_AND_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(AMDGPU::S_AND_B64));
2077 break;
2078
2079 case AMDGPU::S_AND_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_AND_B32));
2083 break;
2084
2085 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2089 break;
2090
2091 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2095 break;
2096
2097 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2098 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2099 break;
2100
2101 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2102 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2103 break;
2104 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2105 Register Dst = MI.getOperand(0).getReg();
2106 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2107 MI.setDesc(
2108 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2109 break;
2110 }
2111 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2112 Register Dst = MI.getOperand(0).getReg();
2113 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2114 int64_t Imm = MI.getOperand(1).getImm();
2115
2116 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2117 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2118 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2119 .addImm(SignExtend64<32>(Imm))
2120 .addReg(Dst, RegState::Implicit | RegState::Define);
2121 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2122 .addImm(SignExtend64<32>(Imm >> 32))
2123 .addReg(Dst, RegState::Implicit | RegState::Define);
2124 MI.eraseFromParent();
2125 break;
2126 }
2127
2128 [[fallthrough]];
2129 }
2130 case AMDGPU::V_MOV_B64_PSEUDO: {
2131 Register Dst = MI.getOperand(0).getReg();
2132 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2133 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2134
2135 const MachineOperand &SrcOp = MI.getOperand(1);
2136 // FIXME: Will this work for 64-bit floating point immediates?
2137 assert(!SrcOp.isFPImm());
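    // Expansion strategy: use a real v_mov_b64 when the subtarget has one and
    // the operand is encodable; otherwise try a single v_pk_mov_b32 (register
    // source, or an immediate whose two halves match an inline constant); and
    // finally fall back to two v_mov_b32 writes of the sub0/sub1 halves.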
2138 if (ST.hasMovB64()) {
2139 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2140 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2141 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2142 break;
2143 }
2144 if (SrcOp.isImm()) {
2145 APInt Imm(64, SrcOp.getImm());
2146 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2147 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2148 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2149 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2150 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2151 .addImm(Lo.getSExtValue())
2152 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2153 .addImm(Lo.getSExtValue())
2154 .addImm(0) // op_sel_lo
2155 .addImm(0) // op_sel_hi
2156 .addImm(0) // neg_lo
2157 .addImm(0) // neg_hi
2158 .addImm(0); // clamp
2159 } else {
2160 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2161 .addImm(Lo.getSExtValue())
2162 .addReg(Dst, RegState::Implicit | RegState::Define);
2163 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2164 .addImm(Hi.getSExtValue())
2165 .addReg(Dst, RegState::Implicit | RegState::Define);
2166 }
2167 } else {
2168 assert(SrcOp.isReg());
2169 if (ST.hasPkMovB32() &&
2170 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2171 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2172 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2173 .addReg(SrcOp.getReg())
2174 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2175 .addReg(SrcOp.getReg())
2176 .addImm(0) // op_sel_lo
2177 .addImm(0) // op_sel_hi
2178 .addImm(0) // neg_lo
2179 .addImm(0) // neg_hi
2180 .addImm(0); // clamp
2181 } else {
2182 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2183 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2184 .addReg(Dst, RegState::Implicit | RegState::Define);
2185 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2186 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2187 .addReg(Dst, RegState::Implicit | RegState::Define);
2188 }
2189 }
2190 MI.eraseFromParent();
2191 break;
2192 }
2193 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2194 expandMovDPP64(MI);
2195 break;
2196 }
2197 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2198 const MachineOperand &SrcOp = MI.getOperand(1);
2199 assert(!SrcOp.isFPImm());
2200
2201 if (ST.has64BitLiterals()) {
2202 MI.setDesc(get(AMDGPU::S_MOV_B64));
2203 break;
2204 }
2205
2206 APInt Imm(64, SrcOp.getImm());
2207 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2208 MI.setDesc(get(AMDGPU::S_MOV_B64));
2209 break;
2210 }
2211
2212 Register Dst = MI.getOperand(0).getReg();
2213 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2214 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2215
2216 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2217 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2218 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2219 .addImm(Lo.getSExtValue())
2220 .addReg(Dst, RegState::Implicit | RegState::Define);
2221 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2222 .addImm(Hi.getSExtValue())
2223 .addReg(Dst, RegState::Implicit | RegState::Define);
2224 MI.eraseFromParent();
2225 break;
2226 }
2227 case AMDGPU::V_SET_INACTIVE_B32: {
2228 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2229 Register DstReg = MI.getOperand(0).getReg();
2230 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2231 .add(MI.getOperand(3))
2232 .add(MI.getOperand(4))
2233 .add(MI.getOperand(1))
2234 .add(MI.getOperand(2))
2235 .add(MI.getOperand(5));
2236 MI.eraseFromParent();
2237 break;
2238 }
2239 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2240 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2241 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2242 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2243 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2244 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2245 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2246 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2247 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2248 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2249 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2250 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2251 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2252 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2253 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2254 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2255 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2256 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2257 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2258 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2259 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2260 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2261 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2262 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2263 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2264 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2265 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2266 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2267 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2268 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2269
2270 unsigned Opc;
2271 if (RI.hasVGPRs(EltRC)) {
2272 Opc = AMDGPU::V_MOVRELD_B32_e32;
2273 } else {
2274 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2275 : AMDGPU::S_MOVRELD_B32;
2276 }
2277
2278 const MCInstrDesc &OpDesc = get(Opc);
2279 Register VecReg = MI.getOperand(0).getReg();
2280 bool IsUndef = MI.getOperand(1).isUndef();
2281 unsigned SubReg = MI.getOperand(3).getImm();
2282 assert(VecReg == MI.getOperand(1).getReg());
2283
2284 MachineInstrBuilder MIB =
2285 BuildMI(MBB, MI, DL, OpDesc)
2286 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2287 .add(MI.getOperand(2))
2288 .addReg(VecReg, RegState::ImplicitDefine)
2289 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2290
2291 const int ImpDefIdx =
2292 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2293 const int ImpUseIdx = ImpDefIdx + 1;
2294 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2295 MI.eraseFromParent();
2296 break;
2297 }
2298 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2305 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2306 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2307 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2308 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2309 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2310 assert(ST.useVGPRIndexMode());
2311 Register VecReg = MI.getOperand(0).getReg();
2312 bool IsUndef = MI.getOperand(1).isUndef();
2313 MachineOperand &Idx = MI.getOperand(3);
2314 Register SubReg = MI.getOperand(4).getImm();
2315
2316 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2317 .add(Idx)
2318 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2319 SetOn->getOperand(3).setIsUndef();
2320
2321 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2322 MachineInstrBuilder MIB =
2323 BuildMI(MBB, MI, DL, OpDesc)
2324 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2325 .add(MI.getOperand(2))
2326 .addReg(VecReg, RegState::ImplicitDefine)
2327 .addReg(VecReg,
2328 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2329
2330 const int ImpDefIdx =
2331 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2332 const int ImpUseIdx = ImpDefIdx + 1;
2333 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2334
2335 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2336
2337 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2338
2339 MI.eraseFromParent();
2340 break;
2341 }
2342 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2343 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2344 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2345 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2346 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2347 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2348 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2349 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2350 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2351 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2352 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2353 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2354 assert(ST.useVGPRIndexMode());
2355 Register Dst = MI.getOperand(0).getReg();
2356 Register VecReg = MI.getOperand(1).getReg();
2357 bool IsUndef = MI.getOperand(1).isUndef();
2358 Register Idx = MI.getOperand(2).getReg();
2359 Register SubReg = MI.getOperand(3).getImm();
2360
2361 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2362 .addReg(Idx)
2363 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2364 SetOn->getOperand(3).setIsUndef();
2365
2366 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2367 .addDef(Dst)
2368 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2369 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2370
2371 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2372
2373 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2374
2375 MI.eraseFromParent();
2376 break;
2377 }
2378 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2379 MachineFunction &MF = *MBB.getParent();
2380 Register Reg = MI.getOperand(0).getReg();
2381 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2382 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2383 MachineOperand OpLo = MI.getOperand(1);
2384 MachineOperand OpHi = MI.getOperand(2);
2385
2386 // Create a bundle so these instructions won't be re-ordered by the
2387 // post-RA scheduler.
2388 MIBundleBuilder Bundler(MBB, MI);
2389 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2390
2391 // What we want here is an offset from the value returned by s_getpc (which
2392 // is the address of the s_add_u32 instruction) to the global variable, but
2393 // since the encoding of $symbol starts 4 bytes after the start of the
2394 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2395 // small. This requires us to add 4 to the global variable offset in order
2396 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2397 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2398 // instruction.
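    // The emitted bundle is roughly (register numbers and relocation
    // spellings are illustrative only):
    //   s_getpc_b64 s[N:N+1]
    //   s_add_u32   sN,   sN,   sym@rel32@lo+4
    //   s_addc_u32  sN+1, sN+1, sym@rel32@hi+12
    // plus an s_sext_i32_i16 fixup on targets that zero-extend the PC.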
2399
2400 int64_t Adjust = 0;
2401 if (ST.hasGetPCZeroExtension()) {
2402 // Fix up hardware that does not sign-extend the 48-bit PC value by
2403 // inserting: s_sext_i32_i16 reghi, reghi
2404 Bundler.append(
2405 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2406 Adjust += 4;
2407 }
2408
2409 if (OpLo.isGlobal())
2410 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2411 Bundler.append(
2412 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2413
2414 if (OpHi.isGlobal())
2415 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2416 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2417 .addReg(RegHi)
2418 .add(OpHi));
2419
2420 finalizeBundle(MBB, Bundler.begin());
2421
2422 MI.eraseFromParent();
2423 break;
2424 }
2425 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2426 MachineFunction &MF = *MBB.getParent();
2427 Register Reg = MI.getOperand(0).getReg();
2428 MachineOperand Op = MI.getOperand(1);
2429
2430 // Create a bundle so these instructions won't be re-ordered by the
2431 // post-RA scheduler.
2432 MIBundleBuilder Bundler(MBB, MI);
2433 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2434 if (Op.isGlobal())
2435 Op.setOffset(Op.getOffset() + 4);
2436 Bundler.append(
2437 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2438
2439 finalizeBundle(MBB, Bundler.begin());
2440
2441 MI.eraseFromParent();
2442 break;
2443 }
2444 case AMDGPU::ENTER_STRICT_WWM: {
2445 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2446 // Whole Wave Mode is entered.
2447 MI.setDesc(get(LMC.OrSaveExecOpc));
2448 break;
2449 }
2450 case AMDGPU::ENTER_STRICT_WQM: {
2451 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2452 // STRICT_WQM is entered.
2453 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2454 .addReg(LMC.ExecReg);
2455 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2456
2457 MI.eraseFromParent();
2458 break;
2459 }
2460 case AMDGPU::EXIT_STRICT_WWM:
2461 case AMDGPU::EXIT_STRICT_WQM: {
2462 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2463 // WWM/STRICT_WQM is exited.
2464 MI.setDesc(get(LMC.MovOpc));
2465 break;
2466 }
2467 case AMDGPU::SI_RETURN: {
2468 const MachineFunction *MF = MBB.getParent();
2469 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2470 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2471 // Hiding the return address use with SI_RETURN may lead to extra kills in
2472 // the function and missing live-ins. We are fine in practice because callee
2473 // saved register handling ensures the register value is restored before
2474 // RET, but we need the undef flag here to appease the MachineVerifier
2475 // liveness checks.
2476 MachineInstrBuilder MIB =
2477 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2478 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2479
2480 MIB.copyImplicitOps(MI);
2481 MI.eraseFromParent();
2482 break;
2483 }
2484
2485 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2486 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2487 MI.setDesc(get(AMDGPU::S_MUL_U64));
2488 break;
2489
2490 case AMDGPU::S_GETPC_B64_pseudo:
2491 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2492 if (ST.hasGetPCZeroExtension()) {
2493 Register Dst = MI.getOperand(0).getReg();
2494 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2495 // Fix up hardware that does not sign-extend the 48-bit PC value by
2496 // inserting: s_sext_i32_i16 dsthi, dsthi
2497 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2498 DstHi)
2499 .addReg(DstHi);
2500 }
2501 break;
2502
2503 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2504 assert(ST.hasBF16PackedInsts());
2505 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2506 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2507 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2508 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2509 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2510 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2511 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2512 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2513 break;
2514 }
2515
2516 return true;
2517}
2518
2519void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2520 MachineBasicBlock::iterator I, Register DestReg,
2521 unsigned SubIdx, const MachineInstr &Orig,
2522 const TargetRegisterInfo &RI) const {
2523
2524 // Try shrinking the instruction to remat only the part needed for current
2525 // context.
2526 // TODO: Handle more cases.
2527 unsigned Opcode = Orig.getOpcode();
2528 switch (Opcode) {
2529 case AMDGPU::S_LOAD_DWORDX16_IMM:
2530 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2531 if (SubIdx != 0)
2532 break;
2533
2534 if (I == MBB.end())
2535 break;
2536
2537 if (I->isBundled())
2538 break;
2539
2540 // Look for a single use of the register that is also a subreg.
2541 Register RegToFind = Orig.getOperand(0).getReg();
2542 MachineOperand *UseMO = nullptr;
2543 for (auto &CandMO : I->operands()) {
2544 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2545 continue;
2546 if (UseMO) {
2547 UseMO = nullptr;
2548 break;
2549 }
2550 UseMO = &CandMO;
2551 }
2552 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2553 break;
2554
2555 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2556 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
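    // Note: getSubRegIdxOffset/getSubRegIdxSize return values in bits; they
    // are converted to bytes below when adjusting the load offset and the
    // memory operand size.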
2557
2558 MachineFunction *MF = MBB.getParent();
2559 MachineRegisterInfo &MRI = MF->getRegInfo();
2560 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2561
2562 unsigned NewOpcode = -1;
2563 if (SubregSize == 256)
2564 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2565 else if (SubregSize == 128)
2566 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2567 else
2568 break;
2569
2570 const MCInstrDesc &TID = get(NewOpcode);
2571 const TargetRegisterClass *NewRC =
2572 RI.getAllocatableClass(getRegClass(TID, 0, &RI));
2573 MRI.setRegClass(DestReg, NewRC);
2574
2575 UseMO->setReg(DestReg);
2576 UseMO->setSubReg(AMDGPU::NoSubRegister);
2577
2578 // Use a smaller load with the desired size, possibly with updated offset.
2579 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2580 MI->setDesc(TID);
2581 MI->getOperand(0).setReg(DestReg);
2582 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2583 if (Offset) {
2584 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2585 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2586 OffsetMO->setImm(FinalOffset);
2587 }
2588 SmallVector<MachineMemOperand *> NewMMOs;
2589 for (const MachineMemOperand *MemOp : Orig.memoperands())
2590 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2591 SubregSize / 8));
2592 MI->setMemRefs(*MF, NewMMOs);
2593
2594 MBB.insert(I, MI);
2595 return;
2596 }
2597
2598 default:
2599 break;
2600 }
2601
2602 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2603}
2604
2605std::pair<MachineInstr*, MachineInstr*>
2606SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2607 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2608
2609 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2611 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2612 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2613 return std::pair(&MI, nullptr);
2614 }
2615
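  // No single 64-bit DPP mov is usable here, so split the pseudo into two
  // V_MOV_B32_dpp instructions operating on sub0 and sub1, and recombine the
  // results with a REG_SEQUENCE for virtual destinations.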
2616 MachineBasicBlock &MBB = *MI.getParent();
2617 DebugLoc DL = MBB.findDebugLoc(MI);
2618 MachineFunction *MF = MBB.getParent();
2619 MachineRegisterInfo &MRI = MF->getRegInfo();
2620 Register Dst = MI.getOperand(0).getReg();
2621 unsigned Part = 0;
2622 MachineInstr *Split[2];
2623
2624 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2625 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2626 if (Dst.isPhysical()) {
2627 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2628 } else {
2629 assert(MRI.isSSA());
2630 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2631 MovDPP.addDef(Tmp);
2632 }
2633
2634 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2635 const MachineOperand &SrcOp = MI.getOperand(I);
2636 assert(!SrcOp.isFPImm());
2637 if (SrcOp.isImm()) {
2638 APInt Imm(64, SrcOp.getImm());
2639 Imm.ashrInPlace(Part * 32);
2640 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2641 } else {
2642 assert(SrcOp.isReg());
2643 Register Src = SrcOp.getReg();
2644 if (Src.isPhysical())
2645 MovDPP.addReg(RI.getSubReg(Src, Sub));
2646 else
2647 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2648 }
2649 }
2650
2651 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2652 MovDPP.addImm(MO.getImm());
2653
2654 Split[Part] = MovDPP;
2655 ++Part;
2656 }
2657
2658 if (Dst.isVirtual())
2659 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2660 .addReg(Split[0]->getOperand(0).getReg())
2661 .addImm(AMDGPU::sub0)
2662 .addReg(Split[1]->getOperand(0).getReg())
2663 .addImm(AMDGPU::sub1);
2664
2665 MI.eraseFromParent();
2666 return std::pair(Split[0], Split[1]);
2667}
2668
2669std::optional<DestSourcePair>
2670SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2671 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2672 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2673
2674 return std::nullopt;
2675}
2676
2677bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2678 AMDGPU::OpName Src0OpName,
2679 MachineOperand &Src1,
2680 AMDGPU::OpName Src1OpName) const {
2681 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2682 if (!Src0Mods)
2683 return false;
2684
2685 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2686 assert(Src1Mods &&
2687 "All commutable instructions have both src0 and src1 modifiers");
2688
2689 int Src0ModsVal = Src0Mods->getImm();
2690 int Src1ModsVal = Src1Mods->getImm();
2691
2692 Src1Mods->setImm(Src0ModsVal);
2693 Src0Mods->setImm(Src1ModsVal);
2694 return true;
2695}
2696
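// Turn RegOp into a copy of NonRegOp (immediate, frame index, or global
// address) and move the register, with its kill/dead/undef flags and subreg
// index, into NonRegOp. Returns nullptr for operand kinds we cannot swap.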
2697static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2698 MachineOperand &RegOp,
2699 MachineOperand &NonRegOp) {
2700 Register Reg = RegOp.getReg();
2701 unsigned SubReg = RegOp.getSubReg();
2702 bool IsKill = RegOp.isKill();
2703 bool IsDead = RegOp.isDead();
2704 bool IsUndef = RegOp.isUndef();
2705 bool IsDebug = RegOp.isDebug();
2706
2707 if (NonRegOp.isImm())
2708 RegOp.ChangeToImmediate(NonRegOp.getImm());
2709 else if (NonRegOp.isFI())
2710 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2711 else if (NonRegOp.isGlobal()) {
2712 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2713 NonRegOp.getTargetFlags());
2714 } else
2715 return nullptr;
2716
2717 // Make sure we don't reinterpret a subreg index in the target flags.
2718 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2719
2720 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2721 NonRegOp.setSubReg(SubReg);
2722
2723 return &MI;
2724}
2725
2726static MachineInstr *swapImmOperands(MachineInstr &MI,
2727 MachineOperand &NonRegOp1,
2728 MachineOperand &NonRegOp2) {
2729 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2730 int64_t NonRegVal = NonRegOp1.getImm();
2731
2732 NonRegOp1.setImm(NonRegOp2.getImm());
2733 NonRegOp2.setImm(NonRegVal);
2734 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2735 NonRegOp2.setTargetFlags(TargetFlags);
2736 return &MI;
2737}
2738
2739bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2740 unsigned OpIdx1) const {
2741 const MCInstrDesc &InstDesc = MI.getDesc();
2742 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2743 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2744
2745 unsigned Opc = MI.getOpcode();
2746 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2747
2748 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2749 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2750
2751 // Swapping must not breach the constant bus or literal limits.
2752 // It may move a literal to a position other than src0, which is not allowed
2753 // pre-gfx10. However, most test cases need literals in src0 for VOP.
2754 // FIXME: After gfx9, a literal can be placed somewhere other than src0.
2755 if (isVALU(MI)) {
2756 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2757 !isInlineConstant(MO0, OpInfo1))
2758 return false;
2759 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2760 !isInlineConstant(MO1, OpInfo0))
2761 return false;
2762 }
2763
2764 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2765 if (OpInfo1.RegClass == -1)
2766 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2767 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2768 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2769 }
2770 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2771 if (OpInfo0.RegClass == -1)
2772 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2773 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2774 isLegalRegOperand(MI, OpIdx0, MO1);
2775 }
2776
2777 // No need to check 64-bit literals, since swapping does not bring new
2778 // 64-bit literals into the current instruction to fold to 32 bits.
2779
2780 return isImmOperandLegal(MI, OpIdx1, MO0);
2781}
2782
2783MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2784 unsigned Src0Idx,
2785 unsigned Src1Idx) const {
2786 assert(!NewMI && "this should never be used");
2787
2788 unsigned Opc = MI.getOpcode();
2789 int CommutedOpcode = commuteOpcode(Opc);
2790 if (CommutedOpcode == -1)
2791 return nullptr;
2792
2793 if (Src0Idx > Src1Idx)
2794 std::swap(Src0Idx, Src1Idx);
2795
2796 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2797 static_cast<int>(Src0Idx) &&
2798 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2799 static_cast<int>(Src1Idx) &&
2800 "inconsistency with findCommutedOpIndices");
2801
2802 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2803 return nullptr;
2804
2805 MachineInstr *CommutedMI = nullptr;
2806 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2807 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2808 if (Src0.isReg() && Src1.isReg()) {
2809 // Be sure to copy the source modifiers to the right place.
2810 CommutedMI =
2811 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2812 } else if (Src0.isReg() && !Src1.isReg()) {
2813 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2814 } else if (!Src0.isReg() && Src1.isReg()) {
2815 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2816 } else if (Src0.isImm() && Src1.isImm()) {
2817 CommutedMI = swapImmOperands(MI, Src0, Src1);
2818 } else {
2819 // FIXME: Found two non-register operands to commute. This does happen.
2820 return nullptr;
2821 }
2822
2823 if (CommutedMI) {
2824 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2825 Src1, AMDGPU::OpName::src1_modifiers);
2826
2827 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2828 AMDGPU::OpName::src1_sel);
2829
2830 CommutedMI->setDesc(get(CommutedOpcode));
2831 }
2832
2833 return CommutedMI;
2834}
2835
2836// This needs to be implemented because the source modifiers may be inserted
2837// between the true commutable operands, and the base
2838// TargetInstrInfo::commuteInstruction uses it.
2839bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2840 unsigned &SrcOpIdx0,
2841 unsigned &SrcOpIdx1) const {
2842 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2843}
2844
2845bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2846 unsigned &SrcOpIdx0,
2847 unsigned &SrcOpIdx1) const {
2848 if (!Desc.isCommutable())
2849 return false;
2850
2851 unsigned Opc = Desc.getOpcode();
2852 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2853 if (Src0Idx == -1)
2854 return false;
2855
2856 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2857 if (Src1Idx == -1)
2858 return false;
2859
2860 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2861}
2862
2863bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2864 int64_t BrOffset) const {
2865 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2866 // because its dest block is unanalyzable.
2867 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2868
2869 // Convert to dwords.
2870 BrOffset /= 4;
2871
2872 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2873 // from the next instruction.
2874 BrOffset -= 1;
2875
2876 return isIntN(BranchOffsetBits, BrOffset);
2877}
2878
2879MachineBasicBlock *
2880SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2881 return MI.getOperand(0).getMBB();
2882}
2883
2884bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
2885 for (const MachineInstr &MI : MBB->terminators()) {
2886 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2887 MI.getOpcode() == AMDGPU::SI_LOOP)
2888 return true;
2889 }
2890 return false;
2891}
2892
2893void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2894 MachineBasicBlock &DestBB,
2895 MachineBasicBlock &RestoreBB,
2896 const DebugLoc &DL, int64_t BrOffset,
2897 RegScavenger *RS) const {
2898 assert(MBB.empty() &&
2899 "new block should be inserted for expanding unconditional branch");
2900 assert(MBB.pred_size() == 1);
2901 assert(RestoreBB.empty() &&
2902 "restore block should be inserted for restoring clobbered registers");
2903
2904 MachineFunction *MF = MBB.getParent();
2905 MachineRegisterInfo &MRI = MF->getRegInfo();
2906 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2907 auto I = MBB.end();
2908 auto &MCCtx = MF->getContext();
2909
2910 if (ST.hasAddPC64Inst()) {
2911 MCSymbol *Offset =
2912 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2913 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2914 .addSym(Offset, MO_FAR_BRANCH_OFFSET);
2915 MCSymbol *PostAddPCLabel =
2916 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2917 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2918 auto *OffsetExpr = MCBinaryExpr::createSub(
2919 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2920 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2921 Offset->setVariableValue(OffsetExpr);
2922 return;
2923 }
2924
2925 assert(RS && "RegScavenger required for long branching");
2926
2927 // FIXME: Virtual register workaround for RegScavenger not working with empty
2928 // blocks.
2929 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2930
2931 // Note: as this is used after the hazard recognizer, we need to apply some
2932 // hazard workarounds directly.
2932 // workarounds directly.
2933 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2934 ST.hasVALUReadSGPRHazard();
2935 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2936 if (FlushSGPRWrites)
2937 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2938 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2939 };
2940
2941 // We need to compute the offset relative to the instruction immediately after
2942 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2943 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2944 ApplyHazardWorkarounds();
2945
2946 MCSymbol *PostGetPCLabel =
2947 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2948 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2949
2950 MCSymbol *OffsetLo =
2951 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2952 MCSymbol *OffsetHi =
2953 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2954 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2955 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2956 .addReg(PCReg, 0, AMDGPU::sub0)
2957 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2958 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2959 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2960 .addReg(PCReg, 0, AMDGPU::sub1)
2961 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2962 ApplyHazardWorkarounds();
2963
2964 // Insert the indirect branch after the other terminator.
2965 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2966 .addReg(PCReg);
2967
2968 // If a spill is needed for the pc register pair, we need to insert a spill
2969 // restore block right before the destination block, and insert a short branch
2970 // into the old destination block's fallthrough predecessor.
2971 // e.g.:
2972 //
2973 // s_cbranch_scc0 skip_long_branch:
2974 //
2975 // long_branch_bb:
2976 // spill s[8:9]
2977 // s_getpc_b64 s[8:9]
2978 // s_add_u32 s8, s8, restore_bb
2979 // s_addc_u32 s9, s9, 0
2980 // s_setpc_b64 s[8:9]
2981 //
2982 // skip_long_branch:
2983 // foo;
2984 //
2985 // .....
2986 //
2987 // dest_bb_fallthrough_predecessor:
2988 // bar;
2989 // s_branch dest_bb
2990 //
2991 // restore_bb:
2992 // restore s[8:9]
2993 // fallthrough dest_bb
2994 ///
2995 // dest_bb:
2996 // buzz;
2997
2998 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2999 Register Scav;
3000
3001 // If we've previously reserved a register for long branches,
3002 // avoid running the scavenger and just use that register.
3003 if (LongBranchReservedReg) {
3004 RS->enterBasicBlock(MBB);
3005 Scav = LongBranchReservedReg;
3006 } else {
3007 RS->enterBasicBlockEnd(MBB);
3008 Scav = RS->scavengeRegisterBackwards(
3009 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3010 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3011 }
3012 if (Scav) {
3013 RS->setRegUsed(Scav);
3014 MRI.replaceRegWith(PCReg, Scav);
3015 MRI.clearVirtRegs();
3016 } else {
3017 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3018 // SGPR spill.
3019 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3020 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3021 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3022 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3023 MRI.clearVirtRegs();
3024 }
3025
3026 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3027 // Now, the distance could be defined.
3028 auto *Offset = MCBinaryExpr::createSub(
3029 MCSymbolRefExpr::create(DestLabel, MCCtx),
3030 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3031 // Add offset assignments.
3032 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3033 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3034 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3035 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3036}
3037
3038unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3039 switch (Cond) {
3040 case SIInstrInfo::SCC_TRUE:
3041 return AMDGPU::S_CBRANCH_SCC1;
3042 case SIInstrInfo::SCC_FALSE:
3043 return AMDGPU::S_CBRANCH_SCC0;
3044 case SIInstrInfo::VCCNZ:
3045 return AMDGPU::S_CBRANCH_VCCNZ;
3046 case SIInstrInfo::VCCZ:
3047 return AMDGPU::S_CBRANCH_VCCZ;
3048 case SIInstrInfo::EXECNZ:
3049 return AMDGPU::S_CBRANCH_EXECNZ;
3050 case SIInstrInfo::EXECZ:
3051 return AMDGPU::S_CBRANCH_EXECZ;
3052 default:
3053 llvm_unreachable("invalid branch predicate");
3054 }
3055}
3056
3057SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3058 switch (Opcode) {
3059 case AMDGPU::S_CBRANCH_SCC0:
3060 return SCC_FALSE;
3061 case AMDGPU::S_CBRANCH_SCC1:
3062 return SCC_TRUE;
3063 case AMDGPU::S_CBRANCH_VCCNZ:
3064 return VCCNZ;
3065 case AMDGPU::S_CBRANCH_VCCZ:
3066 return VCCZ;
3067 case AMDGPU::S_CBRANCH_EXECNZ:
3068 return EXECNZ;
3069 case AMDGPU::S_CBRANCH_EXECZ:
3070 return EXECZ;
3071 default:
3072 return INVALID_BR;
3073 }
3074}
3075
3076bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3077 MachineBasicBlock::iterator I,
3078 MachineBasicBlock *&TBB,
3079 MachineBasicBlock *&FBB,
3080 SmallVectorImpl<MachineOperand> &Cond,
3081 bool AllowModify) const {
3082 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3083 // Unconditional Branch
3084 TBB = I->getOperand(0).getMBB();
3085 return false;
3086 }
3087
3088 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3089 if (Pred == INVALID_BR)
3090 return true;
3091
3092 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3093 Cond.push_back(MachineOperand::CreateImm(Pred));
3094 Cond.push_back(I->getOperand(1)); // Save the branch register.
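    // Cond is encoded as { BranchPredicate immediate, condition register
    // (SCC/VCC/EXEC) }; reverseBranchCondition simply negates the immediate.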
3095
3096 ++I;
3097
3098 if (I == MBB.end()) {
3099 // Conditional branch followed by fall-through.
3100 TBB = CondBB;
3101 return false;
3102 }
3103
3104 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3105 TBB = CondBB;
3106 FBB = I->getOperand(0).getMBB();
3107 return false;
3108 }
3109
3110 return true;
3111}
3112
3113bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3114 MachineBasicBlock *&FBB,
3115 SmallVectorImpl<MachineOperand> &Cond,
3116 bool AllowModify) const {
3117 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3118 auto E = MBB.end();
3119 if (I == E)
3120 return false;
3121
3122 // Skip over the instructions that are artificially terminators for special
3123 // exec management.
3124 while (I != E && !I->isBranch() && !I->isReturn()) {
3125 switch (I->getOpcode()) {
3126 case AMDGPU::S_MOV_B64_term:
3127 case AMDGPU::S_XOR_B64_term:
3128 case AMDGPU::S_OR_B64_term:
3129 case AMDGPU::S_ANDN2_B64_term:
3130 case AMDGPU::S_AND_B64_term:
3131 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3132 case AMDGPU::S_MOV_B32_term:
3133 case AMDGPU::S_XOR_B32_term:
3134 case AMDGPU::S_OR_B32_term:
3135 case AMDGPU::S_ANDN2_B32_term:
3136 case AMDGPU::S_AND_B32_term:
3137 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3138 break;
3139 case AMDGPU::SI_IF:
3140 case AMDGPU::SI_ELSE:
3141 case AMDGPU::SI_KILL_I1_TERMINATOR:
3142 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3143 // FIXME: It's messy that these need to be considered here at all.
3144 return true;
3145 default:
3146 llvm_unreachable("unexpected non-branch terminator inst");
3147 }
3148
3149 ++I;
3150 }
3151
3152 if (I == E)
3153 return false;
3154
3155 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3156}
3157
3158unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3159 int *BytesRemoved) const {
3160 unsigned Count = 0;
3161 unsigned RemovedSize = 0;
3162 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3163 // Skip over artificial terminators when removing instructions.
3164 if (MI.isBranch() || MI.isReturn()) {
3165 RemovedSize += getInstSizeInBytes(MI);
3166 MI.eraseFromParent();
3167 ++Count;
3168 }
3169 }
3170
3171 if (BytesRemoved)
3172 *BytesRemoved = RemovedSize;
3173
3174 return Count;
3175}
3176
3177// Copy the flags onto the implicit condition register operand.
3178static void preserveCondRegFlags(MachineOperand &CondReg,
3179 const MachineOperand &OrigCond) {
3180 CondReg.setIsUndef(OrigCond.isUndef());
3181 CondReg.setIsKill(OrigCond.isKill());
3182}
3183
3184unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3185 MachineBasicBlock *TBB,
3186 MachineBasicBlock *FBB,
3187 ArrayRef<MachineOperand> Cond,
3188 const DebugLoc &DL,
3189 int *BytesAdded) const {
3190 if (!FBB && Cond.empty()) {
3191 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3192 .addMBB(TBB);
3193 if (BytesAdded)
3194 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3195 return 1;
3196 }
3197
3198 assert(TBB && Cond[0].isImm());
3199
3200 unsigned Opcode
3201 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3202
3203 if (!FBB) {
3204 MachineInstr *CondBr =
3205 BuildMI(&MBB, DL, get(Opcode))
3206 .addMBB(TBB);
3207
3208 // Copy the flags onto the implicit condition register operand.
3209 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3210 fixImplicitOperands(*CondBr);
3211
3212 if (BytesAdded)
3213 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3214 return 1;
3215 }
3216
3217 assert(TBB && FBB);
3218
3219 MachineInstr *CondBr =
3220 BuildMI(&MBB, DL, get(Opcode))
3221 .addMBB(TBB);
3222 fixImplicitOperands(*CondBr);
3223 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3224 .addMBB(FBB);
3225
3226 MachineOperand &CondReg = CondBr->getOperand(1);
3227 CondReg.setIsUndef(Cond[1].isUndef());
3228 CondReg.setIsKill(Cond[1].isKill());
3229
3230 if (BytesAdded)
3231 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3232
3233 return 2;
3234}
3235
3236bool SIInstrInfo::reverseBranchCondition(
3237 SmallVectorImpl<MachineOperand> &Cond) const {
3238 if (Cond.size() != 2) {
3239 return true;
3240 }
3241
3242 if (Cond[0].isImm()) {
3243 Cond[0].setImm(-Cond[0].getImm());
3244 return false;
3245 }
3246
3247 return true;
3248}
3249
3250bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3251 ArrayRef<MachineOperand> Cond,
3252 Register DstReg, Register TrueReg,
3253 Register FalseReg, int &CondCycles,
3254 int &TrueCycles, int &FalseCycles) const {
3255 switch (Cond[0].getImm()) {
3256 case VCCNZ:
3257 case VCCZ: {
3258 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3259 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3260 if (MRI.getRegClass(FalseReg) != RC)
3261 return false;
3262
3263 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3264 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3265
3266 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3267 return RI.hasVGPRs(RC) && NumInsts <= 6;
3268 }
3269 case SCC_TRUE:
3270 case SCC_FALSE: {
3271 // FIXME: We could insert for VGPRs if we could replace the original compare
3272 // with a vector one.
3273 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3274 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3275 if (MRI.getRegClass(FalseReg) != RC)
3276 return false;
3277
3278 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3279
3280 // Register widths that are a multiple of 8 bytes can use s_cselect_b64.
3281 if (NumInsts % 2 == 0)
3282 NumInsts /= 2;
3283
3284 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3285 return RI.isSGPRClass(RC);
3286 }
3287 default:
3288 return false;
3289 }
3290}
3291
3292void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3293 MachineBasicBlock::iterator I, const DebugLoc &DL,
3294 Register DstReg, ArrayRef<MachineOperand> Cond,
3295 Register TrueReg, Register FalseReg) const {
3296 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3297 if (Pred == VCCZ || Pred == SCC_FALSE) {
3298 Pred = static_cast<BranchPredicate>(-Pred);
3299 std::swap(TrueReg, FalseReg);
3300 }
3301
3302 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3303 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3304 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3305
3306 if (DstSize == 32) {
3307 MachineInstr *Select;
3308 if (Pred == SCC_TRUE) {
3309 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3310 .addReg(TrueReg)
3311 .addReg(FalseReg);
3312 } else {
3313 // Instruction's operands are backwards from what is expected.
3314 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3315 .addReg(FalseReg)
3316 .addReg(TrueReg);
3317 }
3318
3319 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3320 return;
3321 }
3322
3323 if (DstSize == 64 && Pred == SCC_TRUE) {
3324 MachineInstr *Select =
3325 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3326 .addReg(TrueReg)
3327 .addReg(FalseReg);
3328
3329 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3330 return;
3331 }
3332
3333 static const int16_t Sub0_15[] = {
3334 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3335 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3336 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3337 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3338 };
3339
3340 static const int16_t Sub0_15_64[] = {
3341 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3342 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3343 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3344 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3345 };
3346
3347 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3348 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3349 const int16_t *SubIndices = Sub0_15;
3350 int NElts = DstSize / 32;
3351
3352 // 64-bit select is only available for SALU.
3353 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3354 if (Pred == SCC_TRUE) {
3355 if (NElts % 2) {
3356 SelOp = AMDGPU::S_CSELECT_B32;
3357 EltRC = &AMDGPU::SGPR_32RegClass;
3358 } else {
3359 SelOp = AMDGPU::S_CSELECT_B64;
3360 EltRC = &AMDGPU::SGPR_64RegClass;
3361 SubIndices = Sub0_15_64;
3362 NElts /= 2;
3363 }
3364 }
3365
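  // Emit one select per element (or per 64-bit pair for SALU) and assemble
  // the full-width result with a REG_SEQUENCE.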
3366 MachineInstrBuilder MIB = BuildMI(
3367 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3368
3369 I = MIB->getIterator();
3370
3371 SmallVector<Register, 8> Regs;
3372 for (int Idx = 0; Idx != NElts; ++Idx) {
3373 Register DstElt = MRI.createVirtualRegister(EltRC);
3374 Regs.push_back(DstElt);
3375
3376 unsigned SubIdx = SubIndices[Idx];
3377
3378 MachineInstr *Select;
3379 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3380 Select =
3381 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3382 .addReg(FalseReg, 0, SubIdx)
3383 .addReg(TrueReg, 0, SubIdx);
3384 } else {
3385 Select =
3386 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3387 .addReg(TrueReg, 0, SubIdx)
3388 .addReg(FalseReg, 0, SubIdx);
3389 }
3390
3391 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3393
3394 MIB.addReg(DstElt)
3395 .addImm(SubIdx);
3396 }
3397}
3398
3399bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3400 switch (MI.getOpcode()) {
3401 case AMDGPU::V_MOV_B16_t16_e32:
3402 case AMDGPU::V_MOV_B16_t16_e64:
3403 case AMDGPU::V_MOV_B32_e32:
3404 case AMDGPU::V_MOV_B32_e64:
3405 case AMDGPU::V_MOV_B64_PSEUDO:
3406 case AMDGPU::V_MOV_B64_e32:
3407 case AMDGPU::V_MOV_B64_e64:
3408 case AMDGPU::S_MOV_B32:
3409 case AMDGPU::S_MOV_B64:
3410 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3411 case AMDGPU::COPY:
3412 case AMDGPU::WWM_COPY:
3413 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3414 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3415 case AMDGPU::V_ACCVGPR_MOV_B32:
3416 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3417 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3418 return true;
3419 default:
3420 return false;
3421 }
3422}
3423
3425 switch (MI.getOpcode()) {
3426 case AMDGPU::V_MOV_B16_t16_e32:
3427 case AMDGPU::V_MOV_B16_t16_e64:
3428 return 2;
3429 case AMDGPU::V_MOV_B32_e32:
3430 case AMDGPU::V_MOV_B32_e64:
3431 case AMDGPU::V_MOV_B64_PSEUDO:
3432 case AMDGPU::V_MOV_B64_e32:
3433 case AMDGPU::V_MOV_B64_e64:
3434 case AMDGPU::S_MOV_B32:
3435 case AMDGPU::S_MOV_B64:
3436 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3437 case AMDGPU::COPY:
3438 case AMDGPU::WWM_COPY:
3439 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3440 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3441 case AMDGPU::V_ACCVGPR_MOV_B32:
3442 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3443 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3444 return 1;
3445 default:
3446 llvm_unreachable("MI is not a foldable copy");
3447 }
3448}
3449
3450static constexpr AMDGPU::OpName ModifierOpNames[] = {
3451 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3452 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3453 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3454
3455void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3456 unsigned Opc = MI.getOpcode();
3457 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3458 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3459 if (Idx >= 0)
3460 MI.removeOperand(Idx);
3461 }
3462}
3463
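// Return the bits of the immediate selected by the given subregister index
// as a sign-extended 64-bit value, or std::nullopt for subregister indices
// that are not handled here.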
3464std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3465 unsigned SubRegIndex) {
3466 switch (SubRegIndex) {
3467 case AMDGPU::NoSubRegister:
3468 return Imm;
3469 case AMDGPU::sub0:
3470 return SignExtend64<32>(Imm);
3471 case AMDGPU::sub1:
3472 return SignExtend64<32>(Imm >> 32);
3473 case AMDGPU::lo16:
3474 return SignExtend64<16>(Imm);
3475 case AMDGPU::hi16:
3476 return SignExtend64<16>(Imm >> 16);
3477 case AMDGPU::sub1_lo16:
3478 return SignExtend64<16>(Imm >> 32);
3479 case AMDGPU::sub1_hi16:
3480 return SignExtend64<16>(Imm >> 48);
3481 default:
3482 return std::nullopt;
3483 }
3484
3485 llvm_unreachable("covered subregister switch");
3486}
3487
3488static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3489 switch (Opc) {
3490 case AMDGPU::V_MAC_F16_e32:
3491 case AMDGPU::V_MAC_F16_e64:
3492 case AMDGPU::V_MAD_F16_e64:
3493 return AMDGPU::V_MADAK_F16;
3494 case AMDGPU::V_MAC_F32_e32:
3495 case AMDGPU::V_MAC_F32_e64:
3496 case AMDGPU::V_MAD_F32_e64:
3497 return AMDGPU::V_MADAK_F32;
3498 case AMDGPU::V_FMAC_F32_e32:
3499 case AMDGPU::V_FMAC_F32_e64:
3500 case AMDGPU::V_FMA_F32_e64:
3501 return AMDGPU::V_FMAAK_F32;
3502 case AMDGPU::V_FMAC_F16_e32:
3503 case AMDGPU::V_FMAC_F16_e64:
3504 case AMDGPU::V_FMAC_F16_t16_e64:
3505 case AMDGPU::V_FMAC_F16_fake16_e64:
3506 case AMDGPU::V_FMA_F16_e64:
3507 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3508 ? AMDGPU::V_FMAAK_F16_t16
3509 : AMDGPU::V_FMAAK_F16_fake16
3510 : AMDGPU::V_FMAAK_F16;
3511 case AMDGPU::V_FMAC_F64_e32:
3512 case AMDGPU::V_FMAC_F64_e64:
3513 case AMDGPU::V_FMA_F64_e64:
3514 return AMDGPU::V_FMAAK_F64;
3515 default:
3516 llvm_unreachable("invalid instruction");
3517 }
3518}
3519
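// Pick the *MK ("multiplicand is a constant K") form for a given MAC/MAD/FMA
// opcode; e.g. v_fmamk_f32 dst, src0, K, src1 computes dst = src0 * K + src1,
// matching the "multiplied part is the constant" case in foldImmediate below.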
3520static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3521 switch (Opc) {
3522 case AMDGPU::V_MAC_F16_e32:
3523 case AMDGPU::V_MAC_F16_e64:
3524 case AMDGPU::V_MAD_F16_e64:
3525 return AMDGPU::V_MADMK_F16;
3526 case AMDGPU::V_MAC_F32_e32:
3527 case AMDGPU::V_MAC_F32_e64:
3528 case AMDGPU::V_MAD_F32_e64:
3529 return AMDGPU::V_MADMK_F32;
3530 case AMDGPU::V_FMAC_F32_e32:
3531 case AMDGPU::V_FMAC_F32_e64:
3532 case AMDGPU::V_FMA_F32_e64:
3533 return AMDGPU::V_FMAMK_F32;
3534 case AMDGPU::V_FMAC_F16_e32:
3535 case AMDGPU::V_FMAC_F16_e64:
3536 case AMDGPU::V_FMAC_F16_t16_e64:
3537 case AMDGPU::V_FMAC_F16_fake16_e64:
3538 case AMDGPU::V_FMA_F16_e64:
3539 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3540 ? AMDGPU::V_FMAMK_F16_t16
3541 : AMDGPU::V_FMAMK_F16_fake16
3542 : AMDGPU::V_FMAMK_F16;
3543 case AMDGPU::V_FMAC_F64_e32:
3544 case AMDGPU::V_FMAC_F64_e64:
3545 case AMDGPU::V_FMA_F64_e64:
3546 return AMDGPU::V_FMAMK_F64;
3547 default:
3548 llvm_unreachable("invalid instruction");
3549 }
3550}
3551
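// Fold the constant defined by DefMI in register Reg into UseMI, if possible:
// a COPY is rewritten into a direct move of the (possibly subreg-extracted)
// immediate, and a MAC/MAD/FMA is rewritten into its madak/madmk (fmaak/fmamk)
// form. Returns true iff UseMI was rewritten.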
3552bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3553 Register Reg, MachineRegisterInfo *MRI) const {
3554 int64_t Imm;
3555 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3556 return false;
3557
3558 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3559
3560 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3561
3562 unsigned Opc = UseMI.getOpcode();
3563 if (Opc == AMDGPU::COPY) {
3564 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3565
3566 Register DstReg = UseMI.getOperand(0).getReg();
3567 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3568
3569 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3570
3571 if (HasMultipleUses) {
3572 // TODO: This should fold in more cases with multiple use, but we need to
3573 // more carefully consider what those uses are.
3574 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3575
3576 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3577 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3578 return false;
3579
3580 // Most of the time folding a 32-bit inline constant is free (though this
3581 // might not be true if we can't later fold it into a real user).
3582 //
3583 // FIXME: This isInlineConstant check is imprecise if
3584 // getConstValDefinedInReg handled the tricky non-mov cases.
3585 if (ImmDefSize == 32 &&
3586 isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
3587 return false;
3588 }
3589
3590 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3591 RI.getSubRegIdxSize(UseSubReg) == 16;
3592
3593 if (Is16Bit) {
3594 if (RI.hasVGPRs(DstRC))
3595 return false; // Do not clobber vgpr_hi16
3596
3597 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3598 return false;
3599 }
3600
3601 MachineFunction *MF = UseMI.getMF();
3602
3603 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3604 MCRegister MovDstPhysReg =
3605 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3606
3607 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3608
3609 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3610 for (unsigned MovOp :
3611 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3612 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3613 const MCInstrDesc &MovDesc = get(MovOp);
3614
3615 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);
3616 if (Is16Bit) {
3617 // We just need to find a correctly sized register class, so the
3618 // subregister index compatibility doesn't matter since we're statically
3619 // extracting the immediate value.
3620 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3621 if (!MovDstRC)
3622 continue;
3623
3624 if (MovDstPhysReg) {
3625 // FIXME: We probably should not do this. If there is a live value in
3626 // the high half of the register, it will be corrupted.
3627 MovDstPhysReg =
3628 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3629 if (!MovDstPhysReg)
3630 continue;
3631 }
3632 }
3633
3634 // Result class isn't the right size, try the next instruction.
3635 if (MovDstPhysReg) {
3636 if (!MovDstRC->contains(MovDstPhysReg))
3637 return false;
3638 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3639 // TODO: This will be overly conservative in the case of 16-bit virtual
3640 // SGPRs. We could hack up the virtual register uses to use a compatible
3641 // 32-bit class.
3642 continue;
3643 }
3644
3645 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3646
3647 // Ensure the interpreted immediate value is a valid operand in the new
3648 // mov.
3649 //
3650 // FIXME: isImmOperandLegal should have form that doesn't require existing
3651 // MachineInstr or MachineOperand
3652 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3653 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3654 break;
3655
3656 NewOpc = MovOp;
3657 break;
3658 }
3659
3660 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3661 return false;
3662
3663 if (Is16Bit) {
3664 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3665 if (MovDstPhysReg)
3666 UseMI.getOperand(0).setReg(MovDstPhysReg);
3667 assert(UseMI.getOperand(1).getReg().isVirtual());
3668 }
3669
3670 const MCInstrDesc &NewMCID = get(NewOpc);
3671 UseMI.setDesc(NewMCID);
3672 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3673 UseMI.addImplicitDefUseOperands(*MF);
3674 return true;
3675 }
3676
3677 if (HasMultipleUses)
3678 return false;
3679
3680 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3681 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3682 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3683 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3684 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3685 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3686 Opc == AMDGPU::V_FMAC_F64_e64) {
3687 // Don't fold if we are using source or output modifiers. The new VOP2
3688 // instructions don't have them.
3689 if (hasAnyModifiersSet(UseMI))
3690 return false;
3691
3692 // If this is a free constant, there's no reason to do this.
3693 // TODO: We could fold this here instead of letting SIFoldOperands do it
3694 // later.
3695 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3696
3697 // Any src operand can be used for the legality check.
3698 if (isInlineConstant(UseMI, Src0Idx, Imm))
3699 return false;
3700
3701 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3702
3703 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3704 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3705
3706 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3707 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3708 (Src1->isReg() && Src1->getReg() == Reg)) {
3709 MachineOperand *RegSrc =
3710 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3711 if (!RegSrc->isReg())
3712 return false;
3713 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3714 ST.getConstantBusLimit(Opc) < 2)
3715 return false;
3716
3717 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3718 return false;
3719
3720 // If src2 is also a literal constant then we have to choose which one to
3721 // fold. In general it is better to choose madak so that the other literal
3722 // can be materialized in an sgpr instead of a vgpr:
3723 // s_mov_b32 s0, literal
3724 // v_madak_f32 v0, s0, v0, literal
3725 // Instead of:
3726 // v_mov_b32 v1, literal
3727 // v_madmk_f32 v0, v0, literal, v1
3728 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3729 if (Def && Def->isMoveImmediate() &&
3730 !isInlineConstant(Def->getOperand(1)))
3731 return false;
3732
3733 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3734 if (pseudoToMCOpcode(NewOpc) == -1)
3735 return false;
3736
3737 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3738 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3739 // restricting their register classes. For now just bail out.
3740 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3741 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3742 return false;
3743
3744 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3745 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3746
3747 // FIXME: This would be a lot easier if we could return a new instruction
3748 // instead of having to modify in place.
3749
3750 Register SrcReg = RegSrc->getReg();
3751 unsigned SrcSubReg = RegSrc->getSubReg();
3752 Src0->setReg(SrcReg);
3753 Src0->setSubReg(SrcSubReg);
3754 Src0->setIsKill(RegSrc->isKill());
3755
3756 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3757 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3758 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3759 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3760 UseMI.untieRegOperand(
3761 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3762
3763 Src1->ChangeToImmediate(*SubRegImm);
3764
3764
3765 removeModOperands(UseMI);
3766 UseMI.setDesc(get(NewOpc));
3767
3768 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3769 if (DeleteDef)
3770 DefMI.eraseFromParent();
3771
3772 return true;
3773 }
3774
3775 // Added part is the constant: Use v_madak_{f16, f32}.
3776 if (Src2->isReg() && Src2->getReg() == Reg) {
3777 if (ST.getConstantBusLimit(Opc) < 2) {
3778 // Not allowed to use constant bus for another operand.
3779 // We can however allow an inline immediate as src0.
3780 bool Src0Inlined = false;
3781 if (Src0->isReg()) {
3782 // Try to inline constant if possible.
3783 // If the Def moves an immediate and this is its only use,
3784 // we are saving a VGPR here.
3785 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3786 if (Def && Def->isMoveImmediate() &&
3787 isInlineConstant(Def->getOperand(1)) &&
3788 MRI->hasOneNonDBGUse(Src0->getReg())) {
3789 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3790 Src0Inlined = true;
3791 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3792 RI.isSGPRReg(*MRI, Src0->getReg())) {
3793 return false;
3794 }
3795 // VGPR is okay as Src0 - fallthrough
3796 }
3797
3798 if (Src1->isReg() && !Src0Inlined) {
3799 // We have one slot for inlinable constant so far - try to fill it
3800 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3801 if (Def && Def->isMoveImmediate() &&
3802 isInlineConstant(Def->getOperand(1)) &&
3803 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3804 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3805 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3806 return false;
3807 // VGPR is okay as Src1 - fallthrough
3808 }
3809 }
3810
3811 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3812 if (pseudoToMCOpcode(NewOpc) == -1)
3813 return false;
3814
3815 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3816 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3817 // restricting their register classes. For now just bail out.
3818 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3819 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3820 return false;
3821
3822 // FIXME: This would be a lot easier if we could return a new instruction
3823 // instead of having to modify in place.
3824
3825 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3826 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3827 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3828 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3829 UseMI.untieRegOperand(
3830 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3831
3832 const std::optional<int64_t> SubRegImm =
3833 extractSubregFromImm(Imm, Src2->getSubReg());
3834
3835 // ChangingToImmediate adds Src2 back to the instruction.
3836 Src2->ChangeToImmediate(*SubRegImm);
3837
3838 // These come before src2.
3839 removeModOperands(UseMI);
3840 UseMI.setDesc(get(NewOpc));
3841 // It might happen that UseMI was commuted
3842 // and we now have SGPR as SRC1. If so 2 inlined
3843 // constant and SGPR are illegal.
3844 legalizeOperands(UseMI);
3845
3846 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3847 if (DeleteDef)
3848 DefMI.eraseFromParent();
3849
3850 return true;
3851 }
3852 }
3853
3854 return false;
3855}
3856
3857static bool
3858memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3859 ArrayRef<const MachineOperand *> BaseOps2) {
3860 if (BaseOps1.size() != BaseOps2.size())
3861 return false;
3862 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3863 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3864 return false;
3865 }
3866 return true;
3867}
3868
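// Two accesses are disjoint when the lower one ends at or before the higher
// one begins; e.g. (offset 0, width 4) and (offset 4, width 8) do not overlap,
// while (offset 0, width 8) and (offset 4, width 4) do.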
3869static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3870 LocationSize WidthB, int OffsetB) {
3871 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3872 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3873 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3874 return LowWidth.hasValue() &&
3875 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3876}
3877
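// For two memory instructions whose base operands are identical, check whether
// their (offset, width) ranges are provably disjoint. Only instructions with a
// single memory operand are handled.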
3878bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3879 const MachineInstr &MIb) const {
3880 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3881 int64_t Offset0, Offset1;
3882 LocationSize Dummy0 = LocationSize::precise(0);
3883 LocationSize Dummy1 = LocationSize::precise(0);
3884 bool Offset0IsScalable, Offset1IsScalable;
3885 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3886 Dummy0, &RI) ||
3887 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3888 Dummy1, &RI))
3889 return false;
3890
3891 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3892 return false;
3893
3894 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3895 // FIXME: Handle ds_read2 / ds_write2.
3896 return false;
3897 }
3898 LocationSize Width0 = MIa.memoperands().front()->getSize();
3899 LocationSize Width1 = MIb.memoperands().front()->getSize();
3900 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3901}
3902
3903bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3904 const MachineInstr &MIb) const {
3905 assert(MIa.mayLoadOrStore() &&
3906 "MIa must load from or modify a memory location");
3907 assert(MIb.mayLoadOrStore() &&
3908 "MIb must load from or modify a memory location");
3909
3910 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3911 return false;
3912
3913 // XXX - Can we relax this between address spaces?
3914 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3915 return false;
3916
3917 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3918 return false;
3919
3920 // TODO: Should we check the address space from the MachineMemOperand? That
3921 // would allow us to distinguish objects we know don't alias based on the
3922 // underlying address space, even if it was lowered to a different one,
3923 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3924 // buffer.
3925 if (isDS(MIa)) {
3926 if (isDS(MIb))
3927 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3928
3929 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3930 }
3931
3932 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3933 if (isMUBUF(MIb) || isMTBUF(MIb))
3934 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3935
3936 if (isFLAT(MIb))
3937 return isFLATScratch(MIb);
3938
3939 return !isSMRD(MIb);
3940 }
3941
3942 if (isSMRD(MIa)) {
3943 if (isSMRD(MIb))
3944 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3945
3946 if (isFLAT(MIb))
3947 return isFLATScratch(MIb);
3948
3949 return !isMUBUF(MIb) && !isMTBUF(MIb);
3950 }
3951
3952 if (isFLAT(MIa)) {
3953 if (isFLAT(MIb)) {
3954 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3955 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3956 return true;
3957
3958 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3959 }
3960
3961 return false;
3962 }
3963
3964 return false;
3965}
3966
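// If the virtual register Reg has a unique def that is a foldable immediate
// move, return true with the value in Imm and, optionally, the defining
// instruction in *DefMI.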
3967static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3968 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3969 if (Reg.isPhysical())
3970 return false;
3971 auto *Def = MRI.getUniqueVRegDef(Reg);
3972 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3973 Imm = Def->getOperand(1).getImm();
3974 if (DefMI)
3975 *DefMI = Def;
3976 return true;
3977 }
3978 return false;
3979}
3980
3981static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3982 MachineInstr **DefMI = nullptr) {
3983 if (!MO->isReg())
3984 return false;
3985 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3986 const MachineRegisterInfo &MRI = MF->getRegInfo();
3987 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3988}
3989
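// When MI is being replaced by NewMI, transfer the kill-flag bookkeeping for
// MI's register uses from the old instruction to the new one in LiveVariables.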
3990static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3991 MachineInstr &NewMI) {
3992 if (LV) {
3993 unsigned NumOps = MI.getNumOperands();
3994 for (unsigned I = 1; I < NumOps; ++I) {
3995 MachineOperand &Op = MI.getOperand(I);
3996 if (Op.isReg() && Op.isKill())
3997 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3998 }
3999 }
4000}
4001
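// Map a two-address MAC/FMAC opcode to the corresponding three-address MAD/FMA
// opcode used by convertToThreeAddressImpl below.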
4002static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4003 switch (Opc) {
4004 case AMDGPU::V_MAC_F16_e32:
4005 case AMDGPU::V_MAC_F16_e64:
4006 return AMDGPU::V_MAD_F16_e64;
4007 case AMDGPU::V_MAC_F32_e32:
4008 case AMDGPU::V_MAC_F32_e64:
4009 return AMDGPU::V_MAD_F32_e64;
4010 case AMDGPU::V_MAC_LEGACY_F32_e32:
4011 case AMDGPU::V_MAC_LEGACY_F32_e64:
4012 return AMDGPU::V_MAD_LEGACY_F32_e64;
4013 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4014 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4015 return AMDGPU::V_FMA_LEGACY_F32_e64;
4016 case AMDGPU::V_FMAC_F16_e32:
4017 case AMDGPU::V_FMAC_F16_e64:
4018 case AMDGPU::V_FMAC_F16_t16_e64:
4019 case AMDGPU::V_FMAC_F16_fake16_e64:
4020 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4021 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4022 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4023 : AMDGPU::V_FMA_F16_gfx9_e64;
4024 case AMDGPU::V_FMAC_F32_e32:
4025 case AMDGPU::V_FMAC_F32_e64:
4026 return AMDGPU::V_FMA_F32_e64;
4027 case AMDGPU::V_FMAC_F64_e32:
4028 case AMDGPU::V_FMAC_F64_e64:
4029 return AMDGPU::V_FMA_F64_e64;
4030 default:
4031 llvm_unreachable("invalid instruction");
4032 }
4033}
4034
4035/// Helper struct for the implementation of 3-address conversion to communicate
4036/// updates made to instruction operands.
4037struct ThreeAddressUpdates {
4038 /// Other instruction whose def is no longer used by the converted
4039 /// instruction.
4040 MachineInstr *RemoveMIUse = nullptr;
4041};
4042
4043MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4044 LiveVariables *LV,
4045 LiveIntervals *LIS) const {
4046 MachineBasicBlock &MBB = *MI.getParent();
4047 ThreeAddressUpdates U;
4048 MachineInstr *NewMI = convertToThreeAddressImpl(MI, U);
4049
4050 if (NewMI) {
4051 updateLiveVariables(LV, MI, *NewMI);
4052 if (LIS) {
4053 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4054 // SlotIndex of defs needs to be updated when converting to early-clobber
4055 MachineOperand &Def = NewMI->getOperand(0);
4056 if (Def.isEarlyClobber() && Def.isReg() &&
4057 LIS->hasInterval(Def.getReg())) {
4058 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4059 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4060 auto &LI = LIS->getInterval(Def.getReg());
4061 auto UpdateDefIndex = [&](LiveRange &LR) {
4062 auto *S = LR.find(OldIndex);
4063 if (S != LR.end() && S->start == OldIndex) {
4064 assert(S->valno && S->valno->def == OldIndex);
4065 S->start = NewIndex;
4066 S->valno->def = NewIndex;
4067 }
4068 };
4069 UpdateDefIndex(LI);
4070 for (auto &SR : LI.subranges())
4071 UpdateDefIndex(SR);
4072 }
4073 }
4074 }
4075
4076 if (U.RemoveMIUse) {
4077 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4078 // The only user is the instruction which will be killed.
4079 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4080
4081 if (MRI.hasOneNonDBGUse(DefReg)) {
4082 // We cannot just remove the DefMI here, calling pass will crash.
4083 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4084 U.RemoveMIUse->getOperand(0).setIsDead(true);
4085 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4086 U.RemoveMIUse->removeOperand(I);
4087 if (LV)
4088 LV->getVarInfo(DefReg).AliveBlocks.clear();
4089 }
4090
4091 if (LIS) {
4092 LiveInterval &DefLI = LIS->getInterval(DefReg);
4093
4094 // We cannot delete the original instruction here, so hack out the use
4095 // in the original instruction with a dummy register so we can use
4096 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4097 // not have the complexity of deleting a use to consider here.
4098 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4099 for (MachineOperand &MIOp : MI.uses()) {
4100 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4101 MIOp.setIsUndef(true);
4102 MIOp.setReg(DummyReg);
4103 }
4104 }
4105
4106 LIS->shrinkToUses(&DefLI);
4107 }
4108 }
4109
4110 return NewMI;
4111}
4112
4113MachineInstr *
4114SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4115 ThreeAddressUpdates &U) const {
4116 MachineBasicBlock &MBB = *MI.getParent();
4117 unsigned Opc = MI.getOpcode();
4118
4119 // Handle MFMA.
4120 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4121 if (NewMFMAOpc != -1) {
4122 MachineInstrBuilder MIB =
4123 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4124 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4125 MIB.add(MI.getOperand(I));
4126 return MIB;
4127 }
4128
4129 if (SIInstrInfo::isWMMA(MI)) {
4130 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4131 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4132 .setMIFlags(MI.getFlags());
4133 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4134 MIB->addOperand(MI.getOperand(I));
4135 return MIB;
4136 }
4137
4138 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4139 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4140 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4141 "present pre-RA");
4142
4143 // Handle MAC/FMAC.
4144 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4145 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4146 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4147 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4148 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4149 bool Src0Literal = false;
4150
4151 switch (Opc) {
4152 default:
4153 return nullptr;
4154 case AMDGPU::V_MAC_F16_e64:
4155 case AMDGPU::V_FMAC_F16_e64:
4156 case AMDGPU::V_FMAC_F16_t16_e64:
4157 case AMDGPU::V_FMAC_F16_fake16_e64:
4158 case AMDGPU::V_MAC_F32_e64:
4159 case AMDGPU::V_MAC_LEGACY_F32_e64:
4160 case AMDGPU::V_FMAC_F32_e64:
4161 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4162 case AMDGPU::V_FMAC_F64_e64:
4163 break;
4164 case AMDGPU::V_MAC_F16_e32:
4165 case AMDGPU::V_FMAC_F16_e32:
4166 case AMDGPU::V_MAC_F32_e32:
4167 case AMDGPU::V_MAC_LEGACY_F32_e32:
4168 case AMDGPU::V_FMAC_F32_e32:
4169 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4170 case AMDGPU::V_FMAC_F64_e32: {
4171 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4172 AMDGPU::OpName::src0);
4173 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4174 if (!Src0->isReg() && !Src0->isImm())
4175 return nullptr;
4176
4177 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4178 Src0Literal = true;
4179
4180 break;
4181 }
4182 }
4183
4184 MachineInstrBuilder MIB;
4185 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4186 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4187 const MachineOperand *Src0Mods =
4188 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4189 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4190 const MachineOperand *Src1Mods =
4191 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4192 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4193 const MachineOperand *Src2Mods =
4194 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4195 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4196 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4197 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4198
4199 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4200 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4201 // If we have an SGPR input, we will violate the constant bus restriction.
4202 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4203 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4204 MachineInstr *DefMI;
4205
4206 int64_t Imm;
4207 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4208 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4209 if (pseudoToMCOpcode(NewOpc) != -1) {
4210 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4211 .add(*Dst)
4212 .add(*Src0)
4213 .add(*Src1)
4214 .addImm(Imm)
4215 .setMIFlags(MI.getFlags());
4216 U.RemoveMIUse = DefMI;
4217 return MIB;
4218 }
4219 }
4220 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4221 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4222 if (pseudoToMCOpcode(NewOpc) != -1) {
4223 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4224 .add(*Dst)
4225 .add(*Src0)
4226 .addImm(Imm)
4227 .add(*Src2)
4228 .setMIFlags(MI.getFlags());
4229 U.RemoveMIUse = DefMI;
4230 return MIB;
4231 }
4232 }
4233 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4234 if (Src0Literal) {
4235 Imm = Src0->getImm();
4236 DefMI = nullptr;
4237 }
4238 if (pseudoToMCOpcode(NewOpc) != -1 &&
4239 isOperandLegal(
4240 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4241 Src1)) {
4242 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4243 .add(*Dst)
4244 .add(*Src1)
4245 .addImm(Imm)
4246 .add(*Src2)
4247 .setMIFlags(MI.getFlags());
4248 U.RemoveMIUse = DefMI;
4249 return MIB;
4250 }
4251 }
4252 }
4253
4254 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4255 // if VOP3 does not allow a literal operand.
4256 if (Src0Literal && !ST.hasVOP3Literal())
4257 return nullptr;
4258
4259 unsigned NewOpc = getNewFMAInst(ST, Opc);
4260
4261 if (pseudoToMCOpcode(NewOpc) == -1)
4262 return nullptr;
4263
4264 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4265 .add(*Dst)
4266 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4267 .add(*Src0)
4268 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4269 .add(*Src1)
4270 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4271 .add(*Src2)
4272 .addImm(Clamp ? Clamp->getImm() : 0)
4273 .addImm(Omod ? Omod->getImm() : 0)
4274 .setMIFlags(MI.getFlags());
4275 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4276 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4277 return MIB;
4278}
4279
4280// It's not generally safe to move VALU instructions across these since it will
4281// start using the register as a base index rather than directly.
4282// XXX - Why isn't hasSideEffects sufficient for these?
4284 switch (MI.getOpcode()) {
4285 case AMDGPU::S_SET_GPR_IDX_ON:
4286 case AMDGPU::S_SET_GPR_IDX_MODE:
4287 case AMDGPU::S_SET_GPR_IDX_OFF:
4288 return true;
4289 default:
4290 return false;
4291 }
4292}
4293
4294bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4295 const MachineBasicBlock *MBB,
4296 const MachineFunction &MF) const {
4297 // Skipping the check for SP writes in the base implementation. The reason it
4298 // was added was apparently due to compile time concerns.
4299 //
4300 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4301 // but is probably avoidable.
4302
4303 // Copied from base implementation.
4304 // Terminators and labels can't be scheduled around.
4305 if (MI.isTerminator() || MI.isPosition())
4306 return true;
4307
4308 // INLINEASM_BR can jump to another block
4309 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4310 return true;
4311
4312 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4313 return true;
4314
4315 // Target-independent instructions do not have an implicit-use of EXEC, even
4316 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4317 // boundaries prevents incorrect movements of such instructions.
4318 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4319 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4320 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4321 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4322 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4323 changesVGPRIndexingMode(MI);
4324}
4325
4327 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4328 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4329 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4330}
4331
4332bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4333 if (!isFLAT(MI) || isFLATGlobal(MI))
4334 return false;
4335
4336 // If scratch is not initialized, we can never access it.
4337 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4338 return false;
4339
4340 // SCRATCH instructions always access scratch.
4341 if (isFLATScratch(MI))
4342 return true;
4343
4344 // If there are no memory operands then conservatively assume the flat
4345 // operation may access scratch.
4346 if (MI.memoperands_empty())
4347 return true;
4348
4349 // See if any memory operand specifies an address space that involves scratch.
4350 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4351 unsigned AS = Memop->getAddrSpace();
4352 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4353 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4354 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4355 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4356 }
4357 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4358 });
4359}
4360
4361bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4362 assert(isFLAT(MI));
4363
4364 // All flat instructions use the VMEM counter except prefetch.
4365 if (!usesVM_CNT(MI))
4366 return false;
4367
4368 // If there are no memory operands then conservatively assume the flat
4369 // operation may access VMEM.
4370 if (MI.memoperands_empty())
4371 return true;
4372
4373 // See if any memory operand specifies an address space that involves VMEM.
4374 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
4375 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4376 // (GDS) address space is not supported by flat operations. Therefore, simply
4377 // return true unless only the LDS address space is found.
4378 for (const MachineMemOperand *Memop : MI.memoperands()) {
4379 unsigned AS = Memop->getAddrSpace();
4381 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4382 return true;
4383 }
4384
4385 return false;
4386}
4387
4388bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4389 assert(isFLAT(MI));
4390
4391 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4392 if (!usesLGKM_CNT(MI))
4393 return false;
4394
4395 // If in tgsplit mode then there can be no use of LDS.
4396 if (ST.isTgSplitEnabled())
4397 return false;
4398
4399 // If there are no memory operands then conservatively assume the flat
4400 // operation may access LDS.
4401 if (MI.memoperands_empty())
4402 return true;
4403
4404 // See if any memory operand specifies an address space that involves LDS.
4405 for (const MachineMemOperand *Memop : MI.memoperands()) {
4406 unsigned AS = Memop->getAddrSpace();
4407 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4408 return true;
4409 }
4410
4411 return false;
4412}
4413
4414bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4415 // Skip the full operand and register alias search modifiesRegister
4416 // does. There's only a handful of instructions that touch this, it's only an
4417 // implicit def, and doesn't alias any other registers.
4418 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4419}
4420
4421bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4422 unsigned Opcode = MI.getOpcode();
4423
4424 if (MI.mayStore() && isSMRD(MI))
4425 return true; // scalar store or atomic
4426
4427 // This will terminate the function when other lanes may need to continue.
4428 if (MI.isReturn())
4429 return true;
4430
4431 // These instructions cause shader I/O that may cause hardware lockups
4432 // when executed with an empty EXEC mask.
4433 //
4434 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4435 // EXEC = 0, but checking for that case here seems not worth it
4436 // given the typical code patterns.
4437 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4438 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4439 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4440 return true;
4441
4442 if (MI.isCall() || MI.isInlineAsm())
4443 return true; // conservative assumption
4444
4445 // Assume that barrier interactions are only intended with active lanes.
4446 if (isBarrier(Opcode))
4447 return true;
4448
4449 // A mode change is a scalar operation that influences vector instructions.
4450 if (modifiesModeRegister(MI))
4451 return true;
4452
4453 // These are like SALU instructions in terms of effects, so it's questionable
4454 // whether we should return true for those.
4455 //
4456 // However, executing them with EXEC = 0 causes them to operate on undefined
4457 // data, which we avoid by returning true here.
4458 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4459 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4460 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4461 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4462 return true;
4463
4464 return false;
4465}
4466
4467bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4468 const MachineInstr &MI) const {
4469 if (MI.isMetaInstruction())
4470 return false;
4471
4472 // This won't read exec if this is an SGPR->SGPR copy.
4473 if (MI.isCopyLike()) {
4474 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4475 return true;
4476
4477 // Make sure this isn't copying exec as a normal operand
4478 return MI.readsRegister(AMDGPU::EXEC, &RI);
4479 }
4480
4481 // Make a conservative assumption about the callee.
4482 if (MI.isCall())
4483 return true;
4484
4485 // Be conservative with any unhandled generic opcodes.
4486 if (!isTargetSpecificOpcode(MI.getOpcode()))
4487 return true;
4488
4489 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4490}
4491
4492bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4493 switch (Imm.getBitWidth()) {
4494 case 1: // This likely will be a condition code mask.
4495 return true;
4496
4497 case 32:
4498 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4499 ST.hasInv2PiInlineImm());
4500 case 64:
4501 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4502 ST.hasInv2PiInlineImm());
4503 case 16:
4504 return ST.has16BitInsts() &&
4505 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4506 ST.hasInv2PiInlineImm());
4507 default:
4508 llvm_unreachable("invalid bitwidth");
4509 }
4510}
4511
4512bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4513 APInt IntImm = Imm.bitcastToAPInt();
4514 int64_t IntImmVal = IntImm.getSExtValue();
4515 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4516 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4517 default:
4518 llvm_unreachable("invalid fltSemantics");
4519 case APFloatBase::S_IEEEsingle:
4520 case APFloatBase::S_IEEEdouble:
4521 return isInlineConstant(IntImm);
4522 case APFloatBase::S_BFloat:
4523 return ST.has16BitInsts() &&
4524 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4525 case APFloatBase::S_IEEEhalf:
4526 return ST.has16BitInsts() &&
4527 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4528 }
4529}
4530
4531bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4532 // MachineOperand provides no way to tell the true operand size, since it only
4533 // records a 64-bit value. We need to know the size to determine if a 32-bit
4534 // floating point immediate bit pattern is legal for an integer immediate. It
4535 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4536 switch (OperandType) {
4546 int32_t Trunc = static_cast<int32_t>(Imm);
4547 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4548 }
4554 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4557 // We would expect inline immediates to not be concerned with an integer/fp
4558 // distinction. However, in the case of 16-bit integer operations, the
4559 // "floating point" values appear to not work. It seems read the low 16-bits
4560 // of 32-bit immediates, which happens to always work for the integer
4561 // values.
4562 //
4563 // See llvm bugzilla 46302.
4564 //
4565 // TODO: Theoretically we could use op-sel to use the high bits of the
4566 // 32-bit FP values.
4578 return false;
4581 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4582 // A few special case instructions have 16-bit operands on subtargets
4583 // where 16-bit instructions are not legal.
4584 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4585 // constants in these cases
4586 int16_t Trunc = static_cast<int16_t>(Imm);
4587 return ST.has16BitInsts() &&
4588 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4589 }
4590
4591 return false;
4592 }
4595 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4596 int16_t Trunc = static_cast<int16_t>(Imm);
4597 return ST.has16BitInsts() &&
4598 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4599 }
4600 return false;
4601 }
4605 return false;
4607 return isLegalAV64PseudoImm(Imm);
4610 // Always embedded in the instruction for free.
4611 return true;
4621 // Just ignore anything else.
4622 return true;
4623 default:
4624 llvm_unreachable("invalid operand type");
4625 }
4626}
4627
4628static bool compareMachineOp(const MachineOperand &Op0,
4629 const MachineOperand &Op1) {
4630 if (Op0.getType() != Op1.getType())
4631 return false;
4632
4633 switch (Op0.getType()) {
4634 case MachineOperand::MO_Register:
4635 return Op0.getReg() == Op1.getReg();
4636 case MachineOperand::MO_Immediate:
4637 return Op0.getImm() == Op1.getImm();
4638 default:
4639 llvm_unreachable("Didn't expect to be comparing these operand types");
4640 }
4641}
4642
4643bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4644 const MCOperandInfo &OpInfo) const {
4645 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4646 return true;
4647
4648 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4649 return false;
4650
4651 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4652 return true;
4653
4654 return ST.hasVOP3Literal();
4655}
4656
4657bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4658 int64_t ImmVal) const {
4659 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4660 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4661 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4662 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4663 AMDGPU::OpName::src2))
4664 return false;
4665 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4666 }
4667
4668 return isLiteralOperandLegal(InstDesc, OpInfo);
4669}
4670
4671bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4672 const MachineOperand &MO) const {
4673 if (MO.isImm())
4674 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4675
4676 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4677 "unexpected imm-like operand kind");
4678 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4679 return isLiteralOperandLegal(InstDesc, OpInfo);
4680}
4681
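// Checks the constraint used for the 64-bit AV mov immediate pseudo: the value
// is legal when each 32-bit half is itself an inline constant. Illustrative
// example: 0x3F80000040000000 (1.0f in the high half, 2.0f in the low half)
// qualifies, while an arbitrary 64-bit literal does not.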
4682bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4683 // 2 32-bit inline constants packed into one.
4684 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4685 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4686}
4687
4688bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4689 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4690 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4691 return false;
4692
4693 int Op32 = AMDGPU::getVOPe32(Opcode);
4694 if (Op32 == -1)
4695 return false;
4696
4697 return pseudoToMCOpcode(Op32) != -1;
4698}
4699
4700bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4701 // The src0_modifier operand is present on all instructions
4702 // that have modifiers.
4703
4704 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4705}
4706
4707bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4708 AMDGPU::OpName OpName) const {
4709 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4710 return Mods && Mods->getImm();
4711}
4712
4713bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4714 return any_of(ModifierOpNames,
4715 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4716}
4717
4718bool SIInstrInfo::canShrink(const MachineInstr &MI,
4719 const MachineRegisterInfo &MRI) const {
4720 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4721 // Can't shrink instruction with three operands.
4722 if (Src2) {
4723 switch (MI.getOpcode()) {
4724 default: return false;
4725
4726 case AMDGPU::V_ADDC_U32_e64:
4727 case AMDGPU::V_SUBB_U32_e64:
4728 case AMDGPU::V_SUBBREV_U32_e64: {
4729 const MachineOperand *Src1
4730 = getNamedOperand(MI, AMDGPU::OpName::src1);
4731 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4732 return false;
4733 // Additional verification is needed for sdst/src2.
4734 return true;
4735 }
4736 case AMDGPU::V_MAC_F16_e64:
4737 case AMDGPU::V_MAC_F32_e64:
4738 case AMDGPU::V_MAC_LEGACY_F32_e64:
4739 case AMDGPU::V_FMAC_F16_e64:
4740 case AMDGPU::V_FMAC_F16_t16_e64:
4741 case AMDGPU::V_FMAC_F16_fake16_e64:
4742 case AMDGPU::V_FMAC_F32_e64:
4743 case AMDGPU::V_FMAC_F64_e64:
4744 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4745 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4746 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4747 return false;
4748 break;
4749
4750 case AMDGPU::V_CNDMASK_B32_e64:
4751 break;
4752 }
4753 }
4754
4755 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4756 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4757 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4758 return false;
4759
4760 // We don't need to check src0, all input types are legal, so just make sure
4761 // src0 isn't using any modifiers.
4762 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4763 return false;
4764
4765 // Can it be shrunk to a valid 32 bit opcode?
4766 if (!hasVALU32BitEncoding(MI.getOpcode()))
4767 return false;
4768
4769 // Check output modifiers
4770 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4771 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4772 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4773 // TODO: Can we avoid checking bound_ctrl/fi here?
4774 // They are only used by permlane*_swap special case.
4775 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4776 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4777}
4778
4779// Set VCC operand with all flags from \p Orig, except for setting it as
4780// implicit.
4781static void copyFlagsToImplicitVCC(MachineInstr &MI,
4782 const MachineOperand &Orig) {
4783
4784 for (MachineOperand &Use : MI.implicit_operands()) {
4785 if (Use.isUse() &&
4786 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4787 Use.setIsUndef(Orig.isUndef());
4788 Use.setIsKill(Orig.isKill());
4789 return;
4790 }
4791 }
4792}
4793
4794MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4795 unsigned Op32) const {
4796 MachineBasicBlock *MBB = MI.getParent();
4797
4798 const MCInstrDesc &Op32Desc = get(Op32);
4799 MachineInstrBuilder Inst32 =
4800 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4801 .setMIFlags(MI.getFlags());
4802
4803 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4804 // For VOPC instructions, this is replaced by an implicit def of vcc.
4805
4806 // We assume the defs of the shrunk opcode are in the same order, and the
4807 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4808 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4809 Inst32.add(MI.getOperand(I));
4810
4811 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4812
4813 int Idx = MI.getNumExplicitDefs();
4814 for (const MachineOperand &Use : MI.explicit_uses()) {
4815 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4816 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4817 continue;
4818
4819 if (&Use == Src2) {
4820 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4821 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4822 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4823 // of vcc was already added during the initial BuildMI, but we
4824 // 1) may need to change vcc to vcc_lo to preserve the original register
4825 // 2) have to preserve the original flags.
4826 copyFlagsToImplicitVCC(*Inst32, *Src2);
4827 continue;
4828 }
4829 }
4830
4831 Inst32.add(Use);
4832 }
4833
4834 // FIXME: Losing implicit operands
4835 fixImplicitOperands(*Inst32);
4836 return Inst32;
4837}
4838
4839bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4840 // Null is free
4841 Register Reg = RegOp.getReg();
4842 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4843 return false;
4844
4845 // SGPRs use the constant bus
4846
4847 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4848 // physical register operands should also count, except for exec.
4849 if (RegOp.isImplicit())
4850 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4851
4852 // SGPRs use the constant bus
4853 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4854 AMDGPU::SReg_64RegClass.contains(Reg);
4855}
4856
4857bool SIInstrInfo::regUsesConstantBus(const MachineOperand &RegOp,
4858 const MachineRegisterInfo &MRI) const {
4859 Register Reg = RegOp.getReg();
4860 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4861 : physRegUsesConstantBus(RegOp);
4862}
4863
4864bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4865 const MachineOperand &MO,
4866 const MCOperandInfo &OpInfo) const {
4867 // Literal constants use the constant bus.
4868 if (!MO.isReg())
4869 return !isInlineConstant(MO, OpInfo);
4870
4871 Register Reg = MO.getReg();
4872 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4873 : physRegUsesConstantBus(MO);
4874}
4875
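// Return the first implicitly read SGPR (VCC/VCC_LO/VCC_HI, M0 or FLAT_SCR)
// of MI, or an invalid Register if there is none.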
4876static Register findImplicitSGPRRead(const MachineInstr &MI) {
4877 for (const MachineOperand &MO : MI.implicit_operands()) {
4878 // We only care about reads.
4879 if (MO.isDef())
4880 continue;
4881
4882 switch (MO.getReg()) {
4883 case AMDGPU::VCC:
4884 case AMDGPU::VCC_LO:
4885 case AMDGPU::VCC_HI:
4886 case AMDGPU::M0:
4887 case AMDGPU::FLAT_SCR:
4888 return MO.getReg();
4889
4890 default:
4891 break;
4892 }
4893 }
4894
4895 return Register();
4896}
4897
4898static bool shouldReadExec(const MachineInstr &MI) {
4899 if (SIInstrInfo::isVALU(MI)) {
4900 switch (MI.getOpcode()) {
4901 case AMDGPU::V_READLANE_B32:
4902 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4903 case AMDGPU::V_WRITELANE_B32:
4904 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4905 return false;
4906 }
4907
4908 return true;
4909 }
4910
4911 if (MI.isPreISelOpcode() ||
4912 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4913 SIInstrInfo::isSALU(MI) ||
4914 SIInstrInfo::isSMRD(MI))
4915 return false;
4916
4917 return true;
4918}
4919
4920static bool isRegOrFI(const MachineOperand &MO) {
4921 return MO.isReg() || MO.isFI();
4922}
4923
4924static bool isSubRegOf(const SIRegisterInfo &TRI,
4925 const MachineOperand &SuperVec,
4926 const MachineOperand &SubReg) {
4927 if (SubReg.getReg().isPhysical())
4928 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4929
4930 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4931 SubReg.getReg() == SuperVec.getReg();
4932}
4933
4934// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4935bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4936 const MachineRegisterInfo &MRI,
4937 StringRef &ErrInfo) const {
4938 Register DstReg = MI.getOperand(0).getReg();
4939 Register SrcReg = MI.getOperand(1).getReg();
4940 // This is a check for copy from vector register to SGPR
4941 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4942 ErrInfo = "illegal copy from vector register to SGPR";
4943 return false;
4944 }
4945 return true;
4946}
4947
4948bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4949 StringRef &ErrInfo) const {
4950 uint16_t Opcode = MI.getOpcode();
4951 const MachineFunction *MF = MI.getParent()->getParent();
4952 const MachineRegisterInfo &MRI = MF->getRegInfo();
4953
4954 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4955 // Find a better property to recognize the point where instruction selection
4956 // is just done.
4957 // We can only enforce this check after SIFixSGPRCopies pass so that the
4958 // illegal copies are legalized and thereafter we don't expect a pass
4959 // inserting similar copies.
4960 if (!MRI.isSSA() && MI.isCopy())
4961 return verifyCopy(MI, MRI, ErrInfo);
4962
4963 if (SIInstrInfo::isGenericOpcode(Opcode))
4964 return true;
4965
4966 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4967 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4968 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4969 int Src3Idx = -1;
4970 if (Src0Idx == -1) {
4971 // VOPD V_DUAL_* instructions use different operand names.
4972 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4973 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4974 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4975 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4976 }
4977
4978 // Make sure the number of operands is correct.
4979 const MCInstrDesc &Desc = get(Opcode);
4980 if (!Desc.isVariadic() &&
4981 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4982 ErrInfo = "Instruction has wrong number of operands.";
4983 return false;
4984 }
4985
4986 if (MI.isInlineAsm()) {
4987 // Verify register classes for inlineasm constraints.
4988 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4989 I != E; ++I) {
4990 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4991 if (!RC)
4992 continue;
4993
4994 const MachineOperand &Op = MI.getOperand(I);
4995 if (!Op.isReg())
4996 continue;
4997
4998 Register Reg = Op.getReg();
4999 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5000 ErrInfo = "inlineasm operand has incorrect register class.";
5001 return false;
5002 }
5003 }
5004
5005 return true;
5006 }
5007
5008 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5009 ErrInfo = "missing memory operand from image instruction.";
5010 return false;
5011 }
5012
5013 // Make sure the register classes are correct.
5014 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5015 const MachineOperand &MO = MI.getOperand(i);
5016 if (MO.isFPImm()) {
5017 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5018 "all fp values to integers.";
5019 return false;
5020 }
5021
5022 const MCOperandInfo &OpInfo = Desc.operands()[i];
5023 int16_t RegClass = getOpRegClassID(OpInfo);
5024
5025 switch (OpInfo.OperandType) {
5026 case MCOI::OPERAND_REGISTER:
5027 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5028 ErrInfo = "Illegal immediate value for operand.";
5029 return false;
5030 }
5031 break;
5044 break;
5046 break;
5047 break;
5061 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5062 ErrInfo = "Illegal immediate value for operand.";
5063 return false;
5064 }
5065 break;
5066 }
5068 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5069 ErrInfo = "Expected inline constant for operand.";
5070 return false;
5071 }
5072 break;
5076 break;
5081 // Check if this operand is an immediate.
5082 // FrameIndex operands will be replaced by immediates, so they are
5083 // allowed.
5084 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5085 ErrInfo = "Expected immediate, but got non-immediate";
5086 return false;
5087 }
5088 break;
5092 break;
5093 default:
5094 if (OpInfo.isGenericType())
5095 continue;
5096 break;
5097 }
5098
5099 if (!MO.isReg())
5100 continue;
5101 Register Reg = MO.getReg();
5102 if (!Reg)
5103 continue;
5104
5105 // FIXME: Ideally we would have separate instruction definitions with the
5106 // aligned register constraint.
5107 // FIXME: We do not verify inline asm operands, but custom inline asm
5108 // verification is broken anyway
5109 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5110 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5111 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5112 if (const TargetRegisterClass *SubRC =
5113 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5114 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5115 if (RC)
5116 RC = SubRC;
5117 }
5118 }
5119
5120 // Check that this is the aligned version of the class.
5121 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5122 ErrInfo = "Subtarget requires even aligned vector registers";
5123 return false;
5124 }
5125 }
5126
5127 if (RegClass != -1) {
5128 if (Reg.isVirtual())
5129 continue;
5130
5131 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5132 if (!RC->contains(Reg)) {
5133 ErrInfo = "Operand has incorrect register class.";
5134 return false;
5135 }
5136 }
5137 }
5138
5139 // Verify SDWA
5140 if (isSDWA(MI)) {
5141 if (!ST.hasSDWA()) {
5142 ErrInfo = "SDWA is not supported on this target";
5143 return false;
5144 }
5145
5146 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5147 AMDGPU::OpName::dst_sel}) {
5148 const MachineOperand *MO = getNamedOperand(MI, Op);
5149 if (!MO)
5150 continue;
5151 int64_t Imm = MO->getImm();
5152 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5153 ErrInfo = "Invalid SDWA selection";
5154 return false;
5155 }
5156 }
5157
5158 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5159
5160 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5161 if (OpIdx == -1)
5162 continue;
5163 const MachineOperand &MO = MI.getOperand(OpIdx);
5164
5165 if (!ST.hasSDWAScalar()) {
5166 // Only VGPRS on VI
5167 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5168 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5169 return false;
5170 }
5171 } else {
5172 // No immediates on GFX9
5173 if (!MO.isReg()) {
5174 ErrInfo =
5175 "Only reg allowed as operands in SDWA instructions on GFX9+";
5176 return false;
5177 }
5178 }
5179 }
5180
5181 if (!ST.hasSDWAOmod()) {
5182 // No omod allowed on VI
5183 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5184 if (OMod != nullptr &&
5185 (!OMod->isImm() || OMod->getImm() != 0)) {
5186 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5187 return false;
5188 }
5189 }
5190
5191 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5192 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5193 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5194 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5195 const MachineOperand *Src0ModsMO =
5196 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5197 unsigned Mods = Src0ModsMO->getImm();
5198 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5199 Mods & SISrcMods::SEXT) {
5200 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5201 return false;
5202 }
5203 }
5204
5205 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5206 if (isVOPC(BasicOpcode)) {
5207 if (!ST.hasSDWASdst() && DstIdx != -1) {
5208 // Only vcc allowed as dst on VI for VOPC
5209 const MachineOperand &Dst = MI.getOperand(DstIdx);
5210 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5211 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5212 return false;
5213 }
5214 } else if (!ST.hasSDWAOutModsVOPC()) {
5215 // No clamp allowed on GFX9 for VOPC
5216 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5217 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5218 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5219 return false;
5220 }
5221
5222 // No omod allowed on GFX9 for VOPC
5223 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5224 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5225 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5226 return false;
5227 }
5228 }
5229 }
5230
5231 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5232 if (DstUnused && DstUnused->isImm() &&
5233 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5234 const MachineOperand &Dst = MI.getOperand(DstIdx);
5235 if (!Dst.isReg() || !Dst.isTied()) {
5236 ErrInfo = "Dst register should have tied register";
5237 return false;
5238 }
5239
5240 const MachineOperand &TiedMO =
5241 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5242 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5243 ErrInfo =
5244 "Dst register should be tied to implicit use of preserved register";
5245 return false;
5246 }
5247 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5248 ErrInfo = "Dst register should use same physical register as preserved";
5249 return false;
5250 }
5251 }
5252 }
5253
5254 // Verify MIMG / VIMAGE / VSAMPLE
5255 if (isImage(Opcode) && !MI.mayStore()) {
5256 // Ensure that the return type used is large enough for all the options
5257 // being used TFE/LWE require an extra result register.
5258 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5259 if (DMask) {
5260 uint64_t DMaskImm = DMask->getImm();
5261 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5262 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5263 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5264 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5265
5266 // Adjust for packed 16 bit values
5267 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5268 RegCount = divideCeil(RegCount, 2);
5269
5270 // Adjust if using LWE or TFE
5271 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5272 RegCount += 1;
5273
5274 const uint32_t DstIdx =
5275 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5276 const MachineOperand &Dst = MI.getOperand(DstIdx);
5277 if (Dst.isReg()) {
5278 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5279 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5280 if (RegCount > DstSize) {
5281 ErrInfo = "Image instruction returns too many registers for dst "
5282 "register class";
5283 return false;
5284 }
5285 }
5286 }
5287 }
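// For illustration only (not part of the original source): with the check
// above, a gather4 load with tfe set and packed d16 works out to
//   RegCount = 4                 (gather4 always returns four channels)
//   RegCount = divideCeil(4, 2)  = 2 after D16 packing
//   RegCount = 2 + 1             = 3 with TFE/LWE
// so the vdata register class must cover at least three 32-bit registers.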
5288
5289 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5290 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5291 unsigned ConstantBusCount = 0;
5292 bool UsesLiteral = false;
5293 const MachineOperand *LiteralVal = nullptr;
5294
5295 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5296 if (ImmIdx != -1) {
5297 ++ConstantBusCount;
5298 UsesLiteral = true;
5299 LiteralVal = &MI.getOperand(ImmIdx);
5300 }
5301
5302 SmallVector<Register, 2> SGPRsUsed;
5303 Register SGPRUsed;
5304
5305 // Only look at the true operands. Only a real operand can use the constant
5306 // bus, and we don't want to check pseudo-operands like the source modifier
5307 // flags.
5308 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5309 if (OpIdx == -1)
5310 continue;
5311 const MachineOperand &MO = MI.getOperand(OpIdx);
5312 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5313 if (MO.isReg()) {
5314 SGPRUsed = MO.getReg();
5315 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5316 ++ConstantBusCount;
5317 SGPRsUsed.push_back(SGPRUsed);
5318 }
5319 } else if (!MO.isFI()) { // Treat FI like a register.
5320 if (!UsesLiteral) {
5321 ++ConstantBusCount;
5322 UsesLiteral = true;
5323 LiteralVal = &MO;
5324 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5325 assert(isVOP2(MI) || isVOP3(MI));
5326 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5327 return false;
5328 }
5329 }
5330 }
5331 }
5332
5333 SGPRUsed = findImplicitSGPRRead(MI);
5334 if (SGPRUsed) {
5335 // Implicit uses may safely overlap true operands
5336 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5337 return !RI.regsOverlap(SGPRUsed, SGPR);
5338 })) {
5339 ++ConstantBusCount;
5340 SGPRsUsed.push_back(SGPRUsed);
5341 }
5342 }
5343
5344 // v_writelane_b32 is an exception from constant bus restriction:
5345 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5346 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5347 Opcode != AMDGPU::V_WRITELANE_B32) {
5348 ErrInfo = "VOP* instruction violates constant bus restriction";
5349 return false;
5350 }
5351
5352 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5353 ErrInfo = "VOP3 instruction uses literal";
5354 return false;
5355 }
5356 }
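// For illustration only (not part of the original source): the counting above
// charges one constant-bus slot per distinct SGPR and at most one for a
// literal. A VOP3 add reading two different SGPRs therefore needs a constant
// bus limit of 2 (GFX10+); on older subtargets with a limit of 1 the verifier
// reports the violation.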
5357
5358 // Special case for writelane - this can break the multiple constant bus rule,
5359 // but still can't use more than one SGPR register
5360 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5361 unsigned SGPRCount = 0;
5362 Register SGPRUsed;
5363
5364 for (int OpIdx : {Src0Idx, Src1Idx}) {
5365 if (OpIdx == -1)
5366 break;
5367
5368 const MachineOperand &MO = MI.getOperand(OpIdx);
5369
5370 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5371 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5372 if (MO.getReg() != SGPRUsed)
5373 ++SGPRCount;
5374 SGPRUsed = MO.getReg();
5375 }
5376 }
5377 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5378 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5379 return false;
5380 }
5381 }
5382 }
5383
5384 // Verify misc. restrictions on specific instructions.
5385 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5386 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5387 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5388 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5389 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5390 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5391 if (!compareMachineOp(Src0, Src1) &&
5392 !compareMachineOp(Src0, Src2)) {
5393 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5394 return false;
5395 }
5396 }
5397 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5398 SISrcMods::ABS) ||
5399 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5400 SISrcMods::ABS) ||
5401 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5402 SISrcMods::ABS)) {
5403 ErrInfo = "ABS not allowed in VOP3B instructions";
5404 return false;
5405 }
5406 }
5407
5408 if (isSOP2(MI) || isSOPC(MI)) {
5409 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5410 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5411
5412 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5413 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5414 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5415 !Src0.isIdenticalTo(Src1)) {
5416 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5417 return false;
5418 }
5419 }
5420
5421 if (isSOPK(MI)) {
5422 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5423 if (Desc.isBranch()) {
5424 if (!Op->isMBB()) {
5425 ErrInfo = "invalid branch target for SOPK instruction";
5426 return false;
5427 }
5428 } else {
5429 uint64_t Imm = Op->getImm();
5430 if (sopkIsZext(Opcode)) {
5431 if (!isUInt<16>(Imm)) {
5432 ErrInfo = "invalid immediate for SOPK instruction";
5433 return false;
5434 }
5435 } else {
5436 if (!isInt<16>(Imm)) {
5437 ErrInfo = "invalid immediate for SOPK instruction";
5438 return false;
5439 }
5440 }
5441 }
5442 }
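// For illustration only (not part of the original source): a zero-extended
// SOPK opcode (sopkIsZext) accepts simm16 values in [0, 0xFFFF], while a
// sign-extended one accepts [-32768, 32767]; an immediate of 0x10000 fails
// both checks because it does not fit in 16 bits.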
5443
5444 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5445 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5446 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5447 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5448 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5449 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5450
5451 const unsigned StaticNumOps =
5452 Desc.getNumOperands() + Desc.implicit_uses().size();
5453 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5454
5455 // Allow additional implicit operands. This allows a fixup done by the post
5456 // RA scheduler where the main implicit operand is killed and implicit-defs
5457 // are added for sub-registers that remain live after this instruction.
5458 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5459 ErrInfo = "missing implicit register operands";
5460 return false;
5461 }
5462
5463 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5464 if (IsDst) {
5465 if (!Dst->isUse()) {
5466 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5467 return false;
5468 }
5469
5470 unsigned UseOpIdx;
5471 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5472 UseOpIdx != StaticNumOps + 1) {
5473 ErrInfo = "movrel implicit operands should be tied";
5474 return false;
5475 }
5476 }
5477
5478 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5479 const MachineOperand &ImpUse
5480 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5481 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5482 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5483 ErrInfo = "src0 should be subreg of implicit vector use";
5484 return false;
5485 }
5486 }
5487
5488 // Make sure we aren't losing exec uses in the td files. This mostly requires
5489 // being careful when using let Uses to try to add other use registers.
5490 if (shouldReadExec(MI)) {
5491 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5492 ErrInfo = "VALU instruction does not implicitly read exec mask";
5493 return false;
5494 }
5495 }
5496
5497 if (isSMRD(MI)) {
5498 if (MI.mayStore() &&
5499 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5500 // The register offset form of scalar stores may only use m0 as the
5501 // soffset register.
5502 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5503 if (Soff && Soff->getReg() != AMDGPU::M0) {
5504 ErrInfo = "scalar stores must use m0 as offset register";
5505 return false;
5506 }
5507 }
5508 }
5509
5510 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5511 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5512 if (Offset->getImm() != 0) {
5513 ErrInfo = "subtarget does not support offsets in flat instructions";
5514 return false;
5515 }
5516 }
5517
5518 if (isDS(MI) && !ST.hasGDS()) {
5519 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5520 if (GDSOp && GDSOp->getImm() != 0) {
5521 ErrInfo = "GDS is not supported on this subtarget";
5522 return false;
5523 }
5524 }
5525
5526 if (isImage(MI)) {
5527 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5528 if (DimOp) {
5529 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5530 AMDGPU::OpName::vaddr0);
5531 AMDGPU::OpName RSrcOpName =
5532 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5533 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5534 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5535 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5536 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5537 const AMDGPU::MIMGDimInfo *Dim =
5538 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5539
5540 if (!Dim) {
5541 ErrInfo = "dim is out of range";
5542 return false;
5543 }
5544
5545 bool IsA16 = false;
5546 if (ST.hasR128A16()) {
5547 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5548 IsA16 = R128A16->getImm() != 0;
5549 } else if (ST.hasA16()) {
5550 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5551 IsA16 = A16->getImm() != 0;
5552 }
5553
5554 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5555
5556 unsigned AddrWords =
5557 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5558
5559 unsigned VAddrWords;
5560 if (IsNSA) {
5561 VAddrWords = RsrcIdx - VAddr0Idx;
5562 if (ST.hasPartialNSAEncoding() &&
5563 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5564 unsigned LastVAddrIdx = RsrcIdx - 1;
5565 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5566 }
5567 } else {
5568 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5569 if (AddrWords > 12)
5570 AddrWords = 16;
5571 }
5572
5573 if (VAddrWords != AddrWords) {
5574 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5575 << " but got " << VAddrWords << "\n");
5576 ErrInfo = "bad vaddr size";
5577 return false;
5578 }
5579 }
5580 }
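// For illustration only (not part of the original source, and assuming a plain
// 2D sample with no extra arguments): AddrWords = 2 for the s/t coordinates.
// Under NSA each address word is its own vaddr operand, so RsrcIdx - VAddr0Idx
// gives VAddrWords = 2; with the packed encoding a single 64-bit vaddr0 gives
// getOpSize/4 = 2 as well, so both encodings satisfy the check above.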
5581
5582 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5583 if (DppCt) {
5584 using namespace AMDGPU::DPP;
5585
5586 unsigned DC = DppCt->getImm();
5587 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5588 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5589 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5590 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5591 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5592 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5593 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5594 ErrInfo = "Invalid dpp_ctrl value";
5595 return false;
5596 }
5597 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5598 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5599 ErrInfo = "Invalid dpp_ctrl value: "
5600 "wavefront shifts are not supported on GFX10+";
5601 return false;
5602 }
5603 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5604 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5605 ErrInfo = "Invalid dpp_ctrl value: "
5606 "broadcasts are not supported on GFX10+";
5607 return false;
5608 }
5609 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5610 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5611 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5612 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5613 !ST.hasGFX90AInsts()) {
5614 ErrInfo = "Invalid dpp_ctrl value: "
5615 "row_newbroadcast/row_share is not supported before "
5616 "GFX90A/GFX10";
5617 return false;
5618 }
5619 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5620 ErrInfo = "Invalid dpp_ctrl value: "
5621 "row_share and row_xmask are not supported before GFX10";
5622 return false;
5623 }
5624 }
5625
5626 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5627 !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
5628 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5629 ErrInfo = "Invalid dpp_ctrl value: "
5630 "DP ALU dpp only support row_newbcast";
5631 return false;
5632 }
5633 }
5634
5635 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5636 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5637 AMDGPU::OpName DataName =
5638 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5639 const MachineOperand *Data = getNamedOperand(MI, DataName);
5640 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5641 if (Data && !Data->isReg())
5642 Data = nullptr;
5643
5644 if (ST.hasGFX90AInsts()) {
5645 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5646 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5647 ErrInfo = "Invalid register class: "
5648 "vdata and vdst should be both VGPR or AGPR";
5649 return false;
5650 }
5651 if (Data && Data2 &&
5652 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5653 ErrInfo = "Invalid register class: "
5654 "both data operands should be VGPR or AGPR";
5655 return false;
5656 }
5657 } else {
5658 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5659 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5660 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5661 ErrInfo = "Invalid register class: "
5662 "agpr loads and stores not supported on this GPU";
5663 return false;
5664 }
5665 }
5666 }
5667
5668 if (ST.needsAlignedVGPRs()) {
5669 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5670 const MachineOperand *Op = getNamedOperand(MI, OpName);
5671 if (!Op)
5672 return true;
5673 Register Reg = Op->getReg();
5674 if (Reg.isPhysical())
5675 return !(RI.getHWRegIndex(Reg) & 1);
5676 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5677 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5678 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5679 };
5680
5681 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5682 Opcode == AMDGPU::DS_GWS_BARRIER) {
5683
5684 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5685 ErrInfo = "Subtarget requires even aligned vector registers "
5686 "for DS_GWS instructions";
5687 return false;
5688 }
5689 }
5690
5691 if (isMIMG(MI)) {
5692 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5693 ErrInfo = "Subtarget requires even aligned vector registers "
5694 "for vaddr operand of image instructions";
5695 return false;
5696 }
5697 }
5698 }
5699
5700 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5701 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5702 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5703 ErrInfo = "Invalid register class: "
5704 "v_accvgpr_write with an SGPR is not supported on this GPU";
5705 return false;
5706 }
5707 }
5708
5709 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5710 const MachineOperand &SrcOp = MI.getOperand(1);
5711 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5712 ErrInfo = "pseudo expects only physical SGPRs";
5713 return false;
5714 }
5715 }
5716
5717 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5718 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5719 if (!ST.hasScaleOffset()) {
5720 ErrInfo = "Subtarget does not support offset scaling";
5721 return false;
5722 }
5723 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5724 ErrInfo = "Instruction does not support offset scaling";
5725 return false;
5726 }
5727 }
5728 }
5729
5730 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5731 // information.
5732 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5733 for (unsigned I = 0; I < 3; ++I) {
5734 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5735 return false;
5736 }
5737 }
5738
5739 return true;
5740}
5741
5742// It is more readable to list mapped opcodes on the same line.
5743// clang-format off
5744
5745 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5746 switch (MI.getOpcode()) {
5747 default: return AMDGPU::INSTRUCTION_LIST_END;
5748 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5749 case AMDGPU::COPY: return AMDGPU::COPY;
5750 case AMDGPU::PHI: return AMDGPU::PHI;
5751 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5752 case AMDGPU::WQM: return AMDGPU::WQM;
5753 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5754 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5755 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5756 case AMDGPU::S_MOV_B32: {
5757 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5758 return MI.getOperand(1).isReg() ||
5759 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5760 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5761 }
5762 case AMDGPU::S_ADD_I32:
5763 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5764 case AMDGPU::S_ADDC_U32:
5765 return AMDGPU::V_ADDC_U32_e32;
5766 case AMDGPU::S_SUB_I32:
5767 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5768 // FIXME: These are not consistently handled, and selected when the carry is
5769 // used.
5770 case AMDGPU::S_ADD_U32:
5771 return AMDGPU::V_ADD_CO_U32_e32;
5772 case AMDGPU::S_SUB_U32:
5773 return AMDGPU::V_SUB_CO_U32_e32;
5774 case AMDGPU::S_ADD_U64_PSEUDO:
5775 return AMDGPU::V_ADD_U64_PSEUDO;
5776 case AMDGPU::S_SUB_U64_PSEUDO:
5777 return AMDGPU::V_SUB_U64_PSEUDO;
5778 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5779 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5780 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5781 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5782 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5783 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5784 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5785 case AMDGPU::S_XNOR_B32:
5786 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5787 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5788 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5789 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5790 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5791 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5792 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5793 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5794 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5795 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5796 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5797 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5798 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5799 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5800 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5801 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5802 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5803 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5804 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5805 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5806 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5807 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5808 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5809 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5810 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5811 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5812 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5813 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5814 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5815 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5816 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5817 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5818 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5819 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5820 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5821 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5822 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5823 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5824 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5825 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5826 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5827 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5828 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5829 case AMDGPU::S_CVT_F32_F16:
5830 case AMDGPU::S_CVT_HI_F32_F16:
5831 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5832 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5833 case AMDGPU::S_CVT_F16_F32:
5834 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5835 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5836 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5837 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5838 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5839 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5840 case AMDGPU::S_CEIL_F16:
5841 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5842 : AMDGPU::V_CEIL_F16_fake16_e64;
5843 case AMDGPU::S_FLOOR_F16:
5844 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5845 : AMDGPU::V_FLOOR_F16_fake16_e64;
5846 case AMDGPU::S_TRUNC_F16:
5847 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5848 : AMDGPU::V_TRUNC_F16_fake16_e64;
5849 case AMDGPU::S_RNDNE_F16:
5850 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5851 : AMDGPU::V_RNDNE_F16_fake16_e64;
5852 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5853 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5854 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5855 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5856 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5857 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5858 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5859 case AMDGPU::S_ADD_F16:
5860 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5861 : AMDGPU::V_ADD_F16_fake16_e64;
5862 case AMDGPU::S_SUB_F16:
5863 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5864 : AMDGPU::V_SUB_F16_fake16_e64;
5865 case AMDGPU::S_MIN_F16:
5866 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5867 : AMDGPU::V_MIN_F16_fake16_e64;
5868 case AMDGPU::S_MAX_F16:
5869 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5870 : AMDGPU::V_MAX_F16_fake16_e64;
5871 case AMDGPU::S_MINIMUM_F16:
5872 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5873 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5874 case AMDGPU::S_MAXIMUM_F16:
5875 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5876 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5877 case AMDGPU::S_MUL_F16:
5878 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5879 : AMDGPU::V_MUL_F16_fake16_e64;
5880 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5881 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5882 case AMDGPU::S_FMAC_F16:
5883 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5884 : AMDGPU::V_FMAC_F16_fake16_e64;
5885 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5886 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5887 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5888 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5889 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5890 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5891 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5892 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5893 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5894 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5895 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5896 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5897 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5898 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5899 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5900 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5901 case AMDGPU::S_CMP_LT_F16:
5902 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5903 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5904 case AMDGPU::S_CMP_EQ_F16:
5905 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5906 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5907 case AMDGPU::S_CMP_LE_F16:
5908 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5909 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5910 case AMDGPU::S_CMP_GT_F16:
5911 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5912 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5913 case AMDGPU::S_CMP_LG_F16:
5914 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5915 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5916 case AMDGPU::S_CMP_GE_F16:
5917 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5918 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5919 case AMDGPU::S_CMP_O_F16:
5920 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5921 : AMDGPU::V_CMP_O_F16_fake16_e64;
5922 case AMDGPU::S_CMP_U_F16:
5923 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5924 : AMDGPU::V_CMP_U_F16_fake16_e64;
5925 case AMDGPU::S_CMP_NGE_F16:
5926 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5927 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5928 case AMDGPU::S_CMP_NLG_F16:
5929 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5930 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5931 case AMDGPU::S_CMP_NGT_F16:
5932 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5933 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5934 case AMDGPU::S_CMP_NLE_F16:
5935 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5936 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5937 case AMDGPU::S_CMP_NEQ_F16:
5938 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5939 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5940 case AMDGPU::S_CMP_NLT_F16:
5941 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5942 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5943 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5944 case AMDGPU::V_S_EXP_F16_e64:
5945 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5946 : AMDGPU::V_EXP_F16_fake16_e64;
5947 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5948 case AMDGPU::V_S_LOG_F16_e64:
5949 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5950 : AMDGPU::V_LOG_F16_fake16_e64;
5951 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5952 case AMDGPU::V_S_RCP_F16_e64:
5953 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5954 : AMDGPU::V_RCP_F16_fake16_e64;
5955 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5956 case AMDGPU::V_S_RSQ_F16_e64:
5957 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5958 : AMDGPU::V_RSQ_F16_fake16_e64;
5959 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5960 case AMDGPU::V_S_SQRT_F16_e64:
5961 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5962 : AMDGPU::V_SQRT_F16_fake16_e64;
5963 }
5965 "Unexpected scalar opcode without corresponding vector one!");
5966}
5967
5968// clang-format on
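// For illustration only (not part of the original source): a typical use of
// the mapping above when moving a scalar instruction to the VALU might look
// like
//   unsigned NewOpc = TII->getVALUOp(MI);
//   if (NewOpc == AMDGPU::INSTRUCTION_LIST_END) {
//     // No direct vector equivalent; the caller must handle this case.
//   }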
5969
5970 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5971 MachineBasicBlock &MBB,
5972 MachineBasicBlock::iterator MBBI,
5973 const DebugLoc &DL, Register Reg,
5974 bool IsSCCLive,
5975 SlotIndexes *Indexes) const {
5976 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5977 const SIInstrInfo *TII = ST.getInstrInfo();
5978 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
5979 if (IsSCCLive) {
5980 // Insert two move instructions, one to save the original value of EXEC and
5981 // the other to turn on all bits in EXEC. This is required as we can't use
5982 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5983 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
5984 .addReg(LMC.ExecReg);
5985 auto FlipExecMI =
5986 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
5987 if (Indexes) {
5988 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5989 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5990 }
5991 } else {
5992 auto SaveExec =
5993 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
5994 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5995 if (Indexes)
5996 Indexes->insertMachineInstrInMaps(*SaveExec);
5997 }
5998}
5999
6000 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6001 MachineBasicBlock::iterator MBBI,
6002 const DebugLoc &DL, Register Reg,
6003 SlotIndexes *Indexes) const {
6004 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6005 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6006 .addReg(Reg, RegState::Kill);
6007 if (Indexes)
6008 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6009}
6010
6014 "Not a whole wave func");
6015 MachineBasicBlock &MBB = *MF.begin();
6016 for (MachineInstr &MI : MBB)
6017 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6018 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6019 return &MI;
6020
6021 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6022}
6023
6024// FIXME: This should not be an overridable function. All subtarget dependent
6025// operand modifications should go through isLookupRegClassByHwMode in the
6026// generic handling.
6027const TargetRegisterClass *
6028SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
6029 const TargetRegisterInfo *TRI) const {
6030 if (OpNum >= TID.getNumOperands())
6031 return nullptr;
6032 const MCOperandInfo &OpInfo = TID.operands()[OpNum];
6033 int16_t RegClass = getOpRegClassID(OpInfo);
6034 return RI.getRegClass(RegClass);
6035}
6036
6037 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6038 unsigned OpNo) const {
6039 const MCInstrDesc &Desc = get(MI.getOpcode());
6040 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6041 Desc.operands()[OpNo].RegClass == -1) {
6042 Register Reg = MI.getOperand(OpNo).getReg();
6043
6044 if (Reg.isVirtual()) {
6045 const MachineRegisterInfo &MRI =
6046 MI.getParent()->getParent()->getRegInfo();
6047 return MRI.getRegClass(Reg);
6048 }
6049 return RI.getPhysRegBaseClass(Reg);
6050 }
6051
6052 return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo]));
6053}
6054
6055 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6056 MachineBasicBlock::iterator I = MI;
6057 MachineBasicBlock *MBB = MI.getParent();
6058 MachineOperand &MO = MI.getOperand(OpIdx);
6059 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6060 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6061 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6062 unsigned Size = RI.getRegSizeInBits(*RC);
6063 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6064 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6065 : AMDGPU::V_MOV_B32_e32;
6066 if (MO.isReg())
6067 Opcode = AMDGPU::COPY;
6068 else if (RI.isSGPRClass(RC))
6069 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6070
6071 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6072 Register Reg = MRI.createVirtualRegister(VRC);
6073 DebugLoc DL = MBB->findDebugLoc(I);
6074 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6075 MO.ChangeToRegister(Reg, false);
6076}
6077
6078 Register SIInstrInfo::buildExtractSubReg(
6079 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6080 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6081 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6082 if (!SuperReg.getReg().isVirtual())
6083 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6084
6085 MachineBasicBlock *MBB = MI->getParent();
6086 const DebugLoc &DL = MI->getDebugLoc();
6087 Register SubReg = MRI.createVirtualRegister(SubRC);
6088
6089 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6090 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6091 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6092 return SubReg;
6093}
6094
6095 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6096 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6097 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6098 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6099 if (Op.isImm()) {
6100 if (SubIdx == AMDGPU::sub0)
6101 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6102 if (SubIdx == AMDGPU::sub1)
6103 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6104
6105 llvm_unreachable("Unhandled register index for immediate");
6106 }
6107
6108 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6109 SubIdx, SubRC);
6110 return MachineOperand::CreateReg(SubReg, false);
6111}
6112
6113// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6114void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6115 assert(Inst.getNumExplicitOperands() == 3);
6116 MachineOperand Op1 = Inst.getOperand(1);
6117 Inst.removeOperand(1);
6118 Inst.addOperand(Op1);
6119}
6120
6121 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6122 const MCOperandInfo &OpInfo,
6123 const MachineOperand &MO) const {
6124 if (!MO.isReg())
6125 return false;
6126
6127 Register Reg = MO.getReg();
6128
6129 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6130 if (Reg.isPhysical())
6131 return DRC->contains(Reg);
6132
6133 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6134
6135 if (MO.getSubReg()) {
6136 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
6137 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6138 if (!SuperRC)
6139 return false;
6140 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6141 }
6142
6143 return RI.getCommonSubClass(DRC, RC) != nullptr;
6144}
6145
6146 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6147 const MachineOperand &MO) const {
6148 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6149 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6150 unsigned Opc = MI.getOpcode();
6151
6152 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6153 // information.
6154 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6155 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6156 constexpr const AMDGPU::OpName OpNames[] = {
6157 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6158
6159 for (auto [I, OpName] : enumerate(OpNames)) {
6160 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6161 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6162 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6163 return false;
6164 }
6165 }
6166
6167 if (!isLegalRegOperand(MRI, OpInfo, MO))
6168 return false;
6169
6170 // check Accumulate GPR operand
6171 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6172 if (IsAGPR && !ST.hasMAIInsts())
6173 return false;
6174 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6175 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6176 return false;
6177 // Atomics should have both vdst and vdata either vgpr or agpr.
6178 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6179 const int DataIdx = AMDGPU::getNamedOperandIdx(
6180 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6181 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6182 MI.getOperand(DataIdx).isReg() &&
6183 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6184 return false;
6185 if ((int)OpIdx == DataIdx) {
6186 if (VDstIdx != -1 &&
6187 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6188 return false;
6189 // DS instructions with 2 src operands also must have tied RC.
6190 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6191 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6192 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6193 return false;
6194 }
6195
6196 // Check V_ACCVGPR_WRITE_B32_e64
6197 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6198 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6199 RI.isSGPRReg(MRI, MO.getReg()))
6200 return false;
6201 return true;
6202}
6203
6204 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6205 const MCOperandInfo &OpInfo,
6206 const MachineOperand &MO) const {
6207 if (MO.isReg())
6208 return isLegalRegOperand(MRI, OpInfo, MO);
6209
6210 // Handle non-register types that are treated like immediates.
6211 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6212 return true;
6213}
6214
6215 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6216 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6217 const MachineOperand *MO) const {
6218 constexpr const unsigned NumOps = 3;
6219 constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
6220 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6221 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6222 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6223
6224 assert(SrcN < NumOps);
6225
6226 if (!MO) {
6227 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6228 if (SrcIdx == -1)
6229 return true;
6230 MO = &MI.getOperand(SrcIdx);
6231 }
6232
6233 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6234 return true;
6235
6236 int ModsIdx =
6237 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6238 if (ModsIdx == -1)
6239 return true;
6240
6241 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6242 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6243 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6244
6245 return !OpSel && !OpSelHi;
6246}
6247
6248 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6249 const MachineOperand *MO) const {
6250 const MachineFunction &MF = *MI.getParent()->getParent();
6251 const MachineRegisterInfo &MRI = MF.getRegInfo();
6252 const MCInstrDesc &InstDesc = MI.getDesc();
6253 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6254 int64_t RegClass = getOpRegClassID(OpInfo);
6255 const TargetRegisterClass *DefinedRC =
6256 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6257 if (!MO)
6258 MO = &MI.getOperand(OpIdx);
6259
6260 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6261
6262 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6263 const MachineOperand *UsedLiteral = nullptr;
6264
6265 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6266 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6267
6268 // TODO: Be more permissive with frame indexes.
6269 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6270 if (!LiteralLimit--)
6271 return false;
6272
6273 UsedLiteral = MO;
6274 }
6275
6276 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6277 if (MO->isReg())
6278 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6279
6280 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6281 if (i == OpIdx)
6282 continue;
6283 const MachineOperand &Op = MI.getOperand(i);
6284 if (Op.isReg()) {
6285 if (Op.isUse()) {
6286 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6287 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6288 if (--ConstantBusLimit <= 0)
6289 return false;
6290 }
6291 }
6292 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6293 !isInlineConstant(Op, InstDesc.operands()[i])) {
6294 // The same literal may be used multiple times.
6295 if (!UsedLiteral)
6296 UsedLiteral = &Op;
6297 else if (UsedLiteral->isIdenticalTo(Op))
6298 continue;
6299
6300 if (!LiteralLimit--)
6301 return false;
6302 if (--ConstantBusLimit <= 0)
6303 return false;
6304 }
6305 }
6306 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6307 // There can be at most one literal operand, but it can be repeated.
6308 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6309 if (i == OpIdx)
6310 continue;
6311 const MachineOperand &Op = MI.getOperand(i);
6312 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6313 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6314 !Op.isIdenticalTo(*MO))
6315 return false;
6316
6317 // Do not fold a non-inlineable and non-register operand into an
6318 // instruction that already has a frame index. The frame index handling
6319 // code could not handle well when a frame index co-exists with another
6320 // non-register operand, unless that operand is an inlineable immediate.
6321 if (Op.isFI())
6322 return false;
6323 }
6324 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6325 isF16PseudoScalarTrans(MI.getOpcode())) {
6326 return false;
6327 }
6328
6329 if (MO->isReg()) {
6330 if (!DefinedRC)
6331 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6332 return isLegalRegOperand(MI, OpIdx, *MO);
6333 }
6334
6335 if (MO->isImm()) {
6336 uint64_t Imm = MO->getImm();
6337 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6338 bool Is64BitOp = Is64BitFPOp ||
6339 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6340 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6341 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6342 if (Is64BitOp &&
6343 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6344 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6345 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6346 return false;
6347
6348 // FIXME: We can use sign extended 64-bit literals, but only for signed
6349 // operands. At the moment we do not know if an operand is signed.
6350 // Such operand will be encoded as its low 32 bits and then either
6351 // correctly sign extended or incorrectly zero extended by HW.
6352 // If 64-bit literals are supported and the literal will be encoded
6353 // as full 64 bit we still can use it.
6354 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6355 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6356 return false;
6357 }
6358 }
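// For illustration only (not part of the original source): an immediate such
// as 0xFFFFFFFF80000000 has a negative low half; encoded as a 32-bit literal
// it would be zero extended on an unsigned operand to 0x0000000080000000,
// which is why the check above rejects it unless the subtarget can encode a
// full 64-bit literal.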
6359
6360 // Handle non-register types that are treated like immediates.
6361 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6362
6363 if (!DefinedRC) {
6364 // This operand expects an immediate.
6365 return true;
6366 }
6367
6368 return isImmOperandLegal(MI, OpIdx, *MO);
6369}
6370
6371 bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
6372 bool IsGFX950Only = ST.hasGFX950Insts();
6373 bool IsGFX940Only = ST.hasGFX940Insts();
6374
6375 if (!IsGFX950Only && !IsGFX940Only)
6376 return false;
6377
6378 if (!isVALU(MI))
6379 return false;
6380
6381 // V_COS, V_EXP, V_RCP, etc.
6382 if (isTRANS(MI))
6383 return true;
6384
6385 // DOT2, DOT2C, DOT4, etc.
6386 if (isDOT(MI))
6387 return true;
6388
6389 // MFMA, SMFMA
6390 if (isMFMA(MI))
6391 return true;
6392
6393 unsigned Opcode = MI.getOpcode();
6394 switch (Opcode) {
6395 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6396 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6397 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6398 case AMDGPU::V_MQSAD_U32_U8_e64:
6399 case AMDGPU::V_PK_ADD_F16:
6400 case AMDGPU::V_PK_ADD_F32:
6401 case AMDGPU::V_PK_ADD_I16:
6402 case AMDGPU::V_PK_ADD_U16:
6403 case AMDGPU::V_PK_ASHRREV_I16:
6404 case AMDGPU::V_PK_FMA_F16:
6405 case AMDGPU::V_PK_FMA_F32:
6406 case AMDGPU::V_PK_FMAC_F16_e32:
6407 case AMDGPU::V_PK_FMAC_F16_e64:
6408 case AMDGPU::V_PK_LSHLREV_B16:
6409 case AMDGPU::V_PK_LSHRREV_B16:
6410 case AMDGPU::V_PK_MAD_I16:
6411 case AMDGPU::V_PK_MAD_U16:
6412 case AMDGPU::V_PK_MAX_F16:
6413 case AMDGPU::V_PK_MAX_I16:
6414 case AMDGPU::V_PK_MAX_U16:
6415 case AMDGPU::V_PK_MIN_F16:
6416 case AMDGPU::V_PK_MIN_I16:
6417 case AMDGPU::V_PK_MIN_U16:
6418 case AMDGPU::V_PK_MOV_B32:
6419 case AMDGPU::V_PK_MUL_F16:
6420 case AMDGPU::V_PK_MUL_F32:
6421 case AMDGPU::V_PK_MUL_LO_U16:
6422 case AMDGPU::V_PK_SUB_I16:
6423 case AMDGPU::V_PK_SUB_U16:
6424 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6425 return true;
6426 default:
6427 return false;
6428 }
6429}
6430
6431 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6432 MachineInstr &MI) const {
6433 unsigned Opc = MI.getOpcode();
6434 const MCInstrDesc &InstrDesc = get(Opc);
6435
6436 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6437 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6438
6439 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6440 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6441
6442 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6443 // we need to only have one constant bus use before GFX10.
6444 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6445 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6446 RI.isSGPRReg(MRI, Src0.getReg()))
6447 legalizeOpWithMove(MI, Src0Idx);
6448
6449 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6450 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6451 // src0/src1 with V_READFIRSTLANE.
6452 if (Opc == AMDGPU::V_WRITELANE_B32) {
6453 const DebugLoc &DL = MI.getDebugLoc();
6454 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6455 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6456 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6457 .add(Src0);
6458 Src0.ChangeToRegister(Reg, false);
6459 }
6460 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6461 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6462 const DebugLoc &DL = MI.getDebugLoc();
6463 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6464 .add(Src1);
6465 Src1.ChangeToRegister(Reg, false);
6466 }
6467 return;
6468 }
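// For illustration only (not part of the original source): schematic MIR for
// the fixup above, assuming both inputs start out in VGPRs (register names
// are placeholders):
//   %out = V_WRITELANE_B32 %val(vgpr), %lane(vgpr), %out.old
// becomes
//   %s0  = V_READFIRSTLANE_B32 %val
//   %s1  = V_READFIRSTLANE_B32 %lane
//   %out = V_WRITELANE_B32 %s0, %s1, %out.old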
6469
6470 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6471 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6472 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6473 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6474 legalizeOpWithMove(MI, Src2Idx);
6475 }
6476
6477 // VOP2 src0 instructions support all operand types, so we don't need to check
6478 // their legality. If src1 is already legal, we don't need to do anything.
6479 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6480 return;
6481
6482 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6483 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6484 // select is uniform.
6485 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6486 RI.isVGPR(MRI, Src1.getReg())) {
6487 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6488 const DebugLoc &DL = MI.getDebugLoc();
6489 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6490 .add(Src1);
6491 Src1.ChangeToRegister(Reg, false);
6492 return;
6493 }
6494
6495 // We do not use commuteInstruction here because it is too aggressive and will
6496 // commute if it is possible. We only want to commute here if it improves
6497 // legality. This can be called a fairly large number of times so don't waste
6498 // compile time pointlessly swapping and checking legality again.
6499 if (HasImplicitSGPR || !MI.isCommutable()) {
6500 legalizeOpWithMove(MI, Src1Idx);
6501 return;
6502 }
6503
6504 // If src0 can be used as src1, commuting will make the operands legal.
6505 // Otherwise we have to give up and insert a move.
6506 //
6507 // TODO: Other immediate-like operand kinds could be commuted if there was a
6508 // MachineOperand::ChangeTo* for them.
6509 if ((!Src1.isImm() && !Src1.isReg()) ||
6510 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6511 legalizeOpWithMove(MI, Src1Idx);
6512 return;
6513 }
6514
6515 int CommutedOpc = commuteOpcode(MI);
6516 if (CommutedOpc == -1) {
6517 legalizeOpWithMove(MI, Src1Idx);
6518 return;
6519 }
6520
6521 MI.setDesc(get(CommutedOpc));
6522
6523 Register Src0Reg = Src0.getReg();
6524 unsigned Src0SubReg = Src0.getSubReg();
6525 bool Src0Kill = Src0.isKill();
6526
6527 if (Src1.isImm())
6528 Src0.ChangeToImmediate(Src1.getImm());
6529 else if (Src1.isReg()) {
6530 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6531 Src0.setSubReg(Src1.getSubReg());
6532 } else
6533 llvm_unreachable("Should only have register or immediate operands");
6534
6535 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6536 Src1.setSubReg(Src0SubReg);
6538}
6539
6540// Legalize VOP3 operands. All operand types are supported for any operand
6541// but only one literal constant and only starting from GFX10.
6542 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6543 MachineInstr &MI) const {
6544 unsigned Opc = MI.getOpcode();
6545
6546 int VOP3Idx[3] = {
6547 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6548 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6549 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6550 };
6551
6552 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6553 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6554 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6555 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6556 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6557 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6558 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6559 // src1 and src2 must be scalar
6560 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6561 const DebugLoc &DL = MI.getDebugLoc();
6562 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6563 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6564 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6565 .add(Src1);
6566 Src1.ChangeToRegister(Reg, false);
6567 }
6568 if (VOP3Idx[2] != -1) {
6569 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6570 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6571 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6572 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6573 .add(Src2);
6574 Src2.ChangeToRegister(Reg, false);
6575 }
6576 }
6577 }
6578
6579 // Find the one SGPR operand we are allowed to use.
6580 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6581 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6582 SmallDenseSet<unsigned> SGPRsUsed;
6583 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6584 if (SGPRReg) {
6585 SGPRsUsed.insert(SGPRReg);
6586 --ConstantBusLimit;
6587 }
6588
6589 for (int Idx : VOP3Idx) {
6590 if (Idx == -1)
6591 break;
6592 MachineOperand &MO = MI.getOperand(Idx);
6593
6594 if (!MO.isReg()) {
6595 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6596 continue;
6597
6598 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6599 --LiteralLimit;
6600 --ConstantBusLimit;
6601 continue;
6602 }
6603
6604 --LiteralLimit;
6605 --ConstantBusLimit;
6606 legalizeOpWithMove(MI, Idx);
6607 continue;
6608 }
6609
6610 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6611 continue; // VGPRs are legal
6612
6613 // We can use one SGPR in each VOP3 instruction prior to GFX10
6614 // and two starting from GFX10.
6615 if (SGPRsUsed.count(MO.getReg()))
6616 continue;
6617 if (ConstantBusLimit > 0) {
6618 SGPRsUsed.insert(MO.getReg());
6619 --ConstantBusLimit;
6620 continue;
6621 }
6622
6623 // If we make it this far, then the operand is not legal and we must
6624 // legalize it.
6625 legalizeOpWithMove(MI, Idx);
6626 }
6627
6628 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6629 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6630 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6631 legalizeOpWithMove(MI, VOP3Idx[2]);
6632
6633 // Fix the register class of packed FP32 instructions on gfx12+. See
6634 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6635 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6636 for (unsigned I = 0; I < 3; ++I) {
6637 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6638 legalizeOpWithMove(MI, VOP3Idx[I]);
6639 }
6640 }
6641}
6642
6643 Register SIInstrInfo::readlaneVGPRToSGPR(
6644 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6645 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6646 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6647 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6648 if (DstRC)
6649 SRC = RI.getCommonSubClass(SRC, DstRC);
6650
6651 Register DstReg = MRI.createVirtualRegister(SRC);
6652 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6653
6654 if (RI.hasAGPRs(VRC)) {
6655 VRC = RI.getEquivalentVGPRClass(VRC);
6656 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6657 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6658 get(TargetOpcode::COPY), NewSrcReg)
6659 .addReg(SrcReg);
6660 SrcReg = NewSrcReg;
6661 }
6662
6663 if (SubRegs == 1) {
6664 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6665 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6666 .addReg(SrcReg);
6667 return DstReg;
6668 }
6669
6670 SmallVector<Register, 8> SRegs;
6671 for (unsigned i = 0; i < SubRegs; ++i) {
6672 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6673 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6674 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6675 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6676 SRegs.push_back(SGPR);
6677 }
6678
6679 MachineInstrBuilder MIB =
6680 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6681 get(AMDGPU::REG_SEQUENCE), DstReg);
6682 for (unsigned i = 0; i < SubRegs; ++i) {
6683 MIB.addReg(SRegs[i]);
6684 MIB.addImm(RI.getSubRegFromChannel(i));
6685 }
6686 return DstReg;
6687}
6688
6689 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6690 MachineInstr &MI) const {
6691
6692 // If the pointer is stored in VGPRs, then we need to move it to
6693 // SGPRs using v_readfirstlane. This is safe because we only select
6694 // loads with uniform pointers to SMRD instruction so we know the
6695 // pointer value is uniform.
6696 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6697 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6698 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6699 SBase->setReg(SGPR);
6700 }
6701 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6702 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6703 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6704 SOff->setReg(SGPR);
6705 }
6706}
6707
6708 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6709 unsigned Opc = Inst.getOpcode();
6710 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6711 if (OldSAddrIdx < 0)
6712 return false;
6713
6714 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6715
6716 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6717 if (NewOpc < 0)
6718 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6719 if (NewOpc < 0)
6720 return false;
6721
6723 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6724 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6725 return false;
6726
6727 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6728 if (NewVAddrIdx < 0)
6729 return false;
6730
6731 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6732
6733 // Check vaddr; it must be zero or absent.
6734 MachineInstr *VAddrDef = nullptr;
6735 if (OldVAddrIdx >= 0) {
6736 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6737 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6738 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6739 !VAddrDef->getOperand(1).isImm() ||
6740 VAddrDef->getOperand(1).getImm() != 0)
6741 return false;
6742 }
6743
6744 const MCInstrDesc &NewDesc = get(NewOpc);
6745 Inst.setDesc(NewDesc);
6746
6747 // Callers expect iterator to be valid after this call, so modify the
6748 // instruction in place.
6749 if (OldVAddrIdx == NewVAddrIdx) {
6750 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6751 // Clear use list from the old vaddr holding a zero register.
6752 MRI.removeRegOperandFromUseList(&NewVAddr);
6753 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6754 Inst.removeOperand(OldSAddrIdx);
6755 // Update the use list with the pointer we have just moved from vaddr to
6756 // saddr position. Otherwise new vaddr will be missing from the use list.
6757 MRI.removeRegOperandFromUseList(&NewVAddr);
6758 MRI.addRegOperandToUseList(&NewVAddr);
6759 } else {
6760 assert(OldSAddrIdx == NewVAddrIdx);
6761
6762 if (OldVAddrIdx >= 0) {
6763 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6764 AMDGPU::OpName::vdst_in);
6765
6766 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6767 // it asserts. Untie the operands for now and retie them afterwards.
6768 if (NewVDstIn != -1) {
6769 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6770 Inst.untieRegOperand(OldVDstIn);
6771 }
6772
6773 Inst.removeOperand(OldVAddrIdx);
6774
6775 if (NewVDstIn != -1) {
6776 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6777 Inst.tieOperands(NewVDst, NewVDstIn);
6778 }
6779 }
6780 }
6781
6782 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6783 VAddrDef->eraseFromParent();
6784
6785 return true;
6786}
6787
6788// FIXME: Remove this when SelectionDAG is obsoleted.
6789 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6790 MachineInstr &MI) const {
6791 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6792 return;
6793
6794 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6795 // thinks they are uniform, so a readfirstlane should be valid.
6796 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6797 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6798 return;
6799
6800 if (moveFlatAddrToVGPR(MI))
6801 return;
6802
6803 const TargetRegisterClass *DeclaredRC =
6804 getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);
6805
6806 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6807 SAddr->setReg(ToSGPR);
6808}
6809
6810 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6811 MachineBasicBlock::iterator I,
6812 const TargetRegisterClass *DstRC,
6813 MachineOperand &Op,
6814 MachineRegisterInfo &MRI,
6815 const DebugLoc &DL) const {
6816 Register OpReg = Op.getReg();
6817 unsigned OpSubReg = Op.getSubReg();
6818
6819 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6820 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6821
6822 // Check if operand is already the correct register class.
6823 if (DstRC == OpRC)
6824 return;
6825
6826 Register DstReg = MRI.createVirtualRegister(DstRC);
6827 auto Copy =
6828 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6829 Op.setReg(DstReg);
6830
6831 MachineInstr *Def = MRI.getVRegDef(OpReg);
6832 if (!Def)
6833 return;
6834
6835 // Try to eliminate the copy if it is copying an immediate value.
6836 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6837 foldImmediate(*Copy, *Def, OpReg, &MRI);
6838
6839 bool ImpDef = Def->isImplicitDef();
6840 while (!ImpDef && Def && Def->isCopy()) {
6841 if (Def->getOperand(1).getReg().isPhysical())
6842 break;
6843 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6844 ImpDef = Def && Def->isImplicitDef();
6845 }
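// A copy into a vector register class implicitly reads EXEC; record that
// here unless the copied value is only an IMPLICIT_DEF.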
6846 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6847 !ImpDef)
6848 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6849}
6850
6851// Emit the actual waterfall loop, executing the wrapped instruction for each
6852// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6853// iteration, in the worst case we execute 64 (once per lane).
6854static void
6855emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
6856 MachineRegisterInfo &MRI,
6857 MachineBasicBlock &LoopBB,
6858 MachineBasicBlock &BodyBB,
6859 const DebugLoc &DL,
6860 ArrayRef<MachineOperand *> ScalarOps) {
6861 MachineFunction &MF = *LoopBB.getParent();
6862 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6863 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6865 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6866
6867 MachineBasicBlock::iterator I = LoopBB.begin();
6868 Register CondReg;
6869
6870 for (MachineOperand *ScalarOp : ScalarOps) {
6871 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6872 unsigned NumSubRegs = RegSize / 32;
6873 Register VScalarOp = ScalarOp->getReg();
6874
6875 if (NumSubRegs == 1) {
6876 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6877
6878 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6879 .addReg(VScalarOp);
6880
6881 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6882
6883 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6884 .addReg(CurReg)
6885 .addReg(VScalarOp);
6886
6887 // Combine the comparison results with AND.
6888 if (!CondReg) // First.
6889 CondReg = NewCondReg;
6890 else { // If not the first, we create an AND.
6891 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6892 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6893 .addReg(CondReg)
6894 .addReg(NewCondReg);
6895 CondReg = AndReg;
6896 }
6897
6898 // Update ScalarOp operand to use the SGPR ScalarOp.
6899 ScalarOp->setReg(CurReg);
6900 ScalarOp->setIsKill();
6901 } else {
6902 SmallVector<Register, 8> ReadlanePieces;
6903 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6904 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6905 "Unhandled register size");
6906
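// Wider operands are processed 64 bits at a time: read two 32-bit halves
// with V_READFIRSTLANE, recombine them into an SGPR pair, and compare the
// pair against the matching 64-bit slice of the VGPR operand.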
6907 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6908 Register CurRegLo =
6909 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6910 Register CurRegHi =
6911 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6912
6913 // Read the next variant <- also loop target.
6914 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6915 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6916
6917 // Read the next variant <- also loop target.
6918 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6919 .addReg(VScalarOp, VScalarOpUndef,
6920 TRI->getSubRegFromChannel(Idx + 1));
6921
6922 ReadlanePieces.push_back(CurRegLo);
6923 ReadlanePieces.push_back(CurRegHi);
6924
6925 // Comparison is to be done as 64-bit.
6926 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6927 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6928 .addReg(CurRegLo)
6929 .addImm(AMDGPU::sub0)
6930 .addReg(CurRegHi)
6931 .addImm(AMDGPU::sub1);
6932
6933 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6934 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6935 NewCondReg)
6936 .addReg(CurReg);
6937 if (NumSubRegs <= 2)
6938 Cmp.addReg(VScalarOp);
6939 else
6940 Cmp.addReg(VScalarOp, VScalarOpUndef,
6941 TRI->getSubRegFromChannel(Idx, 2));
6942
6943 // Combine the comparison results with AND.
6944 if (!CondReg) // First.
6945 CondReg = NewCondReg;
6946 else { // If not the first, we create an AND.
6947 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6948 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6949 .addReg(CondReg)
6950 .addReg(NewCondReg);
6951 CondReg = AndReg;
6952 }
6953 } // End for loop.
6954
6955 const auto *SScalarOpRC =
6956 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6957 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6958
6959 // Build scalar ScalarOp.
6960 auto Merge =
6961 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6962 unsigned Channel = 0;
6963 for (Register Piece : ReadlanePieces) {
6964 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6965 }
6966
6967 // Update ScalarOp operand to use the SGPR ScalarOp.
6968 ScalarOp->setReg(SScalarOp);
6969 ScalarOp->setIsKill();
6970 }
6971 }
6972
6973 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6974 MRI.setSimpleHint(SaveExec, CondReg);
6975
6976 // Update EXEC to matching lanes, saving original to SaveExec.
6977 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
6978 .addReg(CondReg, RegState::Kill);
6979
6980 // The original instruction is here; we insert the terminators after it.
6981 I = BodyBB.end();
6982
6983 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6984 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
6985 .addReg(LMC.ExecReg)
6986 .addReg(SaveExec);
6987
6988 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6989}
6990
6991// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6992// with SGPRs by iterating over all unique values across all lanes.
6993// Returns the loop basic block that now contains \p MI.
6994static MachineBasicBlock *
6995loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6996 ArrayRef<MachineOperand *> ScalarOps,
6997 MachineDominatorTree *MDT,
6998 MachineBasicBlock::iterator Begin = nullptr,
6999 MachineBasicBlock::iterator End = nullptr) {
7000 MachineBasicBlock &MBB = *MI.getParent();
7001 MachineFunction &MF = *MBB.getParent();
7002 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7003 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7004 MachineRegisterInfo &MRI = MF.getRegInfo();
7005 if (!Begin.isValid())
7006 Begin = &MI;
7007 if (!End.isValid()) {
7008 End = &MI;
7009 ++End;
7010 }
7011 const DebugLoc &DL = MI.getDebugLoc();
7013 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7014
7015 // Save SCC. Waterfall Loop may overwrite SCC.
7016 Register SaveSCCReg;
7017
7018 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7019 // rather than doing an unlimited scan everywhere.
7020 bool SCCNotDead =
7021 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7022 std::numeric_limits<unsigned>::max()) !=
7023 MachineBasicBlock::LQR_Dead;
7024 if (SCCNotDead) {
7025 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7026 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7027 .addImm(1)
7028 .addImm(0);
7029 }
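// SaveSCCReg now holds 1 if SCC was set and 0 otherwise; SCC itself is
// re-created in RemainderBB below with an S_CMP_LG_U32 against 0.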
7030
7031 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7032
7033 // Save the EXEC mask
7034 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7035
7036 // Killed uses in the instruction we are waterfalling around will be
7037 // incorrect due to the added control-flow.
7038 MachineBasicBlock::iterator AfterMI = MI;
7039 ++AfterMI;
7040 for (auto I = Begin; I != AfterMI; I++) {
7041 for (auto &MO : I->all_uses())
7042 MRI.clearKillFlags(MO.getReg());
7043 }
7044
7045 // To insert the loop we need to split the block. Move everything after this
7046 // point to a new block, and insert a new empty block between the two.
7047 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7048 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7049 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7050 MachineFunction::iterator MBBI(MBB);
7051 ++MBBI;
7052
7053 MF.insert(MBBI, LoopBB);
7054 MF.insert(MBBI, BodyBB);
7055 MF.insert(MBBI, RemainderBB);
7056
7057 LoopBB->addSuccessor(BodyBB);
7058 BodyBB->addSuccessor(LoopBB);
7059 BodyBB->addSuccessor(RemainderBB);
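// Resulting CFG: MBB falls through to LoopBB, LoopBB falls through to BodyBB,
// and BodyBB either branches back to LoopBB for the next unique value or
// continues to RemainderBB once every lane has been handled.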
7060
7061 // Move the range [Begin, End) into BodyBB, and the remainder of the block
7062 // to RemainderBB.
7063 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7064 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7065 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7066
7067 MBB.addSuccessor(LoopBB);
7068
7069 // Update dominators. We know that MBB immediately dominates LoopBB, that
7070 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7071 // RemainderBB. RemainderBB immediately dominates all of the successors
7072 // transferred to it from MBB that MBB used to properly dominate.
7073 if (MDT) {
7074 MDT->addNewBlock(LoopBB, &MBB);
7075 MDT->addNewBlock(BodyBB, LoopBB);
7076 MDT->addNewBlock(RemainderBB, BodyBB);
7077 for (auto &Succ : RemainderBB->successors()) {
7078 if (MDT->properlyDominates(&MBB, Succ)) {
7079 MDT->changeImmediateDominator(Succ, RemainderBB);
7080 }
7081 }
7082 }
7083
7084 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7085
7086 MachineBasicBlock::iterator First = RemainderBB->begin();
7087 // Restore SCC
7088 if (SCCNotDead) {
7089 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7090 .addReg(SaveSCCReg, RegState::Kill)
7091 .addImm(0);
7092 }
7093
7094 // Restore the EXEC mask
7095 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7096 .addReg(SaveExec);
7097 return BodyBB;
7098}
7099
7100// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7101static std::tuple<unsigned, unsigned>
7102extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7103 MachineBasicBlock &MBB = *MI.getParent();
7104 MachineFunction &MF = *MBB.getParent();
7105 MachineRegisterInfo &MRI = MF.getRegInfo();
7106
7107 // Extract the ptr from the resource descriptor.
7108 unsigned RsrcPtr =
7109 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7110 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7111
7112 // Create an empty resource descriptor
7113 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7114 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7115 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7116 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7117 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7118
7119 // Zero64 = 0
7120 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7121 .addImm(0);
7122
7123 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7124 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7125 .addImm(Lo_32(RsrcDataFormat));
7126
7127 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7128 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7129 .addImm(Hi_32(RsrcDataFormat));
7130
7131 // NewSRsrc = {Zero64, SRsrcFormat}
7132 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7133 .addReg(Zero64)
7134 .addImm(AMDGPU::sub0_sub1)
7135 .addReg(SRsrcFormatLo)
7136 .addImm(AMDGPU::sub2)
7137 .addReg(SRsrcFormatHi)
7138 .addImm(AMDGPU::sub3);
7139
7140 return std::tuple(RsrcPtr, NewSRsrc);
7141}
7142
7143MachineBasicBlock *
7144SIInstrInfo::legalizeOperands(MachineInstr &MI,
7145 MachineDominatorTree *MDT) const {
7146 MachineFunction &MF = *MI.getParent()->getParent();
7147 MachineRegisterInfo &MRI = MF.getRegInfo();
7148 MachineBasicBlock *CreatedBB = nullptr;
7149
7150 // Legalize VOP2
7151 if (isVOP2(MI) || isVOPC(MI)) {
7152 legalizeOperandsVOP2(MRI, MI);
7153 return CreatedBB;
7154 }
7155
7156 // Legalize VOP3
7157 if (isVOP3(MI)) {
7158 legalizeOperandsVOP3(MRI, MI);
7159 return CreatedBB;
7160 }
7161
7162 // Legalize SMRD
7163 if (isSMRD(MI)) {
7164 legalizeOperandsSMRD(MRI, MI);
7165 return CreatedBB;
7166 }
7167
7168 // Legalize FLAT
7169 if (isFLAT(MI)) {
7170 legalizeOperandsFLAT(MRI, MI);
7171 return CreatedBB;
7172 }
7173
7174 // Legalize REG_SEQUENCE and PHI
7175 // The register class of the operands must be the same type as the register
7176 // class of the output.
7177 if (MI.getOpcode() == AMDGPU::PHI) {
7178 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7179 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7180 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7181 continue;
7182 const TargetRegisterClass *OpRC =
7183 MRI.getRegClass(MI.getOperand(i).getReg());
7184 if (RI.hasVectorRegisters(OpRC)) {
7185 VRC = OpRC;
7186 } else {
7187 SRC = OpRC;
7188 }
7189 }
7190
7191 // If any of the operands are VGPR registers, then they all must be VGPRs;
7192 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7193 // them.
7194 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7195 if (!VRC) {
7196 assert(SRC);
7197 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7198 VRC = &AMDGPU::VReg_1RegClass;
7199 } else
7200 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7201 ? RI.getEquivalentAGPRClass(SRC)
7202 : RI.getEquivalentVGPRClass(SRC);
7203 } else {
7204 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7205 ? RI.getEquivalentAGPRClass(VRC)
7206 : RI.getEquivalentVGPRClass(VRC);
7207 }
7208 RC = VRC;
7209 } else {
7210 RC = SRC;
7211 }
7212
7213 // Update all the operands so they have the same type.
7214 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7215 MachineOperand &Op = MI.getOperand(I);
7216 if (!Op.isReg() || !Op.getReg().isVirtual())
7217 continue;
7218
7219 // MI is a PHI instruction.
7220 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7221 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7222
7223 // Avoid creating no-op copies with the same src and dst reg class. These
7224 // confuse some of the machine passes.
7225 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7226 }
7227 }
7228
7229 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7230 // VGPR dest type and SGPR sources, insert copies so all operands are
7231 // VGPRs. This seems to help operand folding / the register coalescer.
7232 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7233 MachineBasicBlock *MBB = MI.getParent();
7234 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7235 if (RI.hasVGPRs(DstRC)) {
7236 // Update all the operands so they are VGPR register classes. These may
7237 // not be the same register class because REG_SEQUENCE supports mixing
7238 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7239 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7240 MachineOperand &Op = MI.getOperand(I);
7241 if (!Op.isReg() || !Op.getReg().isVirtual())
7242 continue;
7243
7244 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7245 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7246 if (VRC == OpRC)
7247 continue;
7248
7249 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7250 Op.setIsKill();
7251 }
7252 }
7253
7254 return CreatedBB;
7255 }
7256
7257 // Legalize INSERT_SUBREG
7258 // src0 must have the same register class as dst
7259 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7260 Register Dst = MI.getOperand(0).getReg();
7261 Register Src0 = MI.getOperand(1).getReg();
7262 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7263 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7264 if (DstRC != Src0RC) {
7265 MachineBasicBlock *MBB = MI.getParent();
7266 MachineOperand &Op = MI.getOperand(1);
7267 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7268 }
7269 return CreatedBB;
7270 }
7271
7272 // Legalize SI_INIT_M0
7273 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7274 MachineOperand &Src = MI.getOperand(0);
7275 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7276 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7277 return CreatedBB;
7278 }
7279
7280 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7281 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7282 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7283 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7284 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7285 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7286 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7287 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7288 MachineOperand &Src = MI.getOperand(1);
7289 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7290 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7291 return CreatedBB;
7292 }
7293
7294 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7295 //
7296 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7297 // scratch memory access. In both cases, the legalization never involves
7298 // conversion to the addr64 form.
7299 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7300 (isMUBUF(MI) || isMTBUF(MI)))) {
7301 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7302 ? AMDGPU::OpName::rsrc
7303 : AMDGPU::OpName::srsrc;
7304 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7305 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7306 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7307
7308 AMDGPU::OpName SampOpName =
7309 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7310 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7311 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7312 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7313
7314 return CreatedBB;
7315 }
7316
7317 // Legalize SI_CALL
7318 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7319 MachineOperand *Dest = &MI.getOperand(0);
7320 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7321 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
7322 // following copies, we also need to move copies from and to physical
7323 // registers into the loop block.
7324 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7325 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7326
7327 // Also move the copies to physical registers into the loop block
7328 MachineBasicBlock &MBB = *MI.getParent();
7329 MachineBasicBlock::iterator Start(&MI);
7330 while (Start->getOpcode() != FrameSetupOpcode)
7331 --Start;
7332 MachineBasicBlock::iterator End(&MI);
7333 while (End->getOpcode() != FrameDestroyOpcode)
7334 ++End;
7335 // Also include following copies of the return value
7336 ++End;
7337 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7338 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7339 ++End;
7340 CreatedBB =
7341 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7342 }
7343 }
7344
7345 // Legalize s_sleep_var.
7346 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7347 const DebugLoc &DL = MI.getDebugLoc();
7348 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7349 int Src0Idx =
7350 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7351 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7352 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7353 .add(Src0);
7354 Src0.ChangeToRegister(Reg, false);
7355 return nullptr;
7356 }
7357
7358 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7359 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7360 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7361 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7362 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7363 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7364 for (MachineOperand &Src : MI.explicit_operands()) {
7365 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7366 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7367 }
7368 return CreatedBB;
7369 }
7370
7371 // Legalize MUBUF instructions.
7372 bool isSoffsetLegal = true;
7373 int SoffsetIdx =
7374 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7375 if (SoffsetIdx != -1) {
7376 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7377 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7378 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7379 isSoffsetLegal = false;
7380 }
7381 }
7382
7383 bool isRsrcLegal = true;
7384 int RsrcIdx =
7385 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7386 if (RsrcIdx != -1) {
7387 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7388 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7389 isRsrcLegal = false;
7390 }
7391
7392 // The operands are legal.
7393 if (isRsrcLegal && isSoffsetLegal)
7394 return CreatedBB;
7395
7396 if (!isRsrcLegal) {
7397 // Legalize a VGPR Rsrc
7398 //
7399 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7400 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7401 // a zero-value SRsrc.
7402 //
7403 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7404 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7405 // above.
7406 //
7407 // Otherwise we are on non-ADDR64 hardware, and/or we have
7408 // idxen/offen/bothen and we fall back to a waterfall loop.
7409
7410 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7411 MachineBasicBlock &MBB = *MI.getParent();
7412
7413 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7414 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7415 // This is already an ADDR64 instruction so we need to add the pointer
7416 // extracted from the resource descriptor to the current value of VAddr.
7417 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7418 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7419 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7420
7421 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7422 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7423 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7424
7425 unsigned RsrcPtr, NewSRsrc;
7426 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7427
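// Add the 64-bit pointer from the resource descriptor to VAddr in two
// 32-bit halves; the carry out of the low add (CondReg0) feeds the
// add-with-carry that produces the high half.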
7428 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7429 const DebugLoc &DL = MI.getDebugLoc();
7430 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7431 .addDef(CondReg0)
7432 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7433 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7434 .addImm(0);
7435
7436 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7437 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7438 .addDef(CondReg1, RegState::Dead)
7439 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7440 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7441 .addReg(CondReg0, RegState::Kill)
7442 .addImm(0);
7443
7444 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7445 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7446 .addReg(NewVAddrLo)
7447 .addImm(AMDGPU::sub0)
7448 .addReg(NewVAddrHi)
7449 .addImm(AMDGPU::sub1);
7450
7451 VAddr->setReg(NewVAddr);
7452 Rsrc->setReg(NewSRsrc);
7453 } else if (!VAddr && ST.hasAddr64()) {
7454 // This instruction is the _OFFSET variant, so we need to convert it to
7455 // ADDR64.
7456 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7457 "FIXME: Need to emit flat atomics here");
7458
7459 unsigned RsrcPtr, NewSRsrc;
7460 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7461
7462 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7463 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7464 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7465 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7466 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7467
7468 // Atomics with return have an additional tied operand and are
7469 // missing some of the special bits.
7470 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7471 MachineInstr *Addr64;
7472
7473 if (!VDataIn) {
7474 // Regular buffer load / store.
7475 MachineInstrBuilder MIB =
7476 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7477 .add(*VData)
7478 .addReg(NewVAddr)
7479 .addReg(NewSRsrc)
7480 .add(*SOffset)
7481 .add(*Offset);
7482
7483 if (const MachineOperand *CPol =
7484 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7485 MIB.addImm(CPol->getImm());
7486 }
7487
7488 if (const MachineOperand *TFE =
7489 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7490 MIB.addImm(TFE->getImm());
7491 }
7492
7493 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7494
7495 MIB.cloneMemRefs(MI);
7496 Addr64 = MIB;
7497 } else {
7498 // Atomics with return.
7499 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7500 .add(*VData)
7501 .add(*VDataIn)
7502 .addReg(NewVAddr)
7503 .addReg(NewSRsrc)
7504 .add(*SOffset)
7505 .add(*Offset)
7506 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7507 .cloneMemRefs(MI);
7508 }
7509
7510 MI.removeFromParent();
7511
7512 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7513 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7514 NewVAddr)
7515 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7516 .addImm(AMDGPU::sub0)
7517 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7518 .addImm(AMDGPU::sub1);
7519 } else {
7520 // Legalize a VGPR Rsrc and soffset together.
7521 if (!isSoffsetLegal) {
7522 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7523 CreatedBB =
7524 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7525 return CreatedBB;
7526 }
7527 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7528 return CreatedBB;
7529 }
7530 }
7531
7532 // Legalize a VGPR soffset.
7533 if (!isSoffsetLegal) {
7534 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7535 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7536 return CreatedBB;
7537 }
7538 return CreatedBB;
7539}
7540
7541void SIInstrWorklist::insert(MachineInstr *MI) {
7542 InstrList.insert(MI);
7543 // Add MBUF instructions to the deferred list.
7544 int RsrcIdx =
7545 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7546 if (RsrcIdx != -1) {
7547 DeferredList.insert(MI);
7548 }
7549}
7550
7551bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7552 return DeferredList.contains(MI);
7553}
7554
7555// Legalize size mismatches between 16bit and 32bit registers in v2s copy
7556// lowering (change sgpr to vgpr).
7557// This is mainly caused by 16bit SALU and 16bit VALU using regs of different
7558// sizes. We need to legalize the operand sizes during the vgpr lowering
7559// chain. This can be removed once sgpr16 is in place.
7560void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7561 MachineRegisterInfo &MRI) const {
7562 if (!ST.useRealTrue16Insts())
7563 return;
7564
7565 unsigned Opcode = MI.getOpcode();
7566 MachineBasicBlock *MBB = MI.getParent();
7567 // Legalize operands and check for size mismatch
7568 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7569 OpIdx >= get(Opcode).getNumOperands() ||
7570 get(Opcode).operands()[OpIdx].RegClass == -1)
7571 return;
7572
7573 MachineOperand &Op = MI.getOperand(OpIdx);
7574 if (!Op.isReg() || !Op.getReg().isVirtual())
7575 return;
7576
7577 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7578 if (!RI.isVGPRClass(CurrRC))
7579 return;
7580
7581 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7582 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7583 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7584 Op.setSubReg(AMDGPU::lo16);
7585 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
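// The 16-bit value is narrower than the expected 32-bit class: widen it by
// placing it in lo16 of a new 32-bit register with an undef hi16.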
7586 const DebugLoc &DL = MI.getDebugLoc();
7587 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7588 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7589 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7590 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7591 .addReg(Op.getReg())
7592 .addImm(AMDGPU::lo16)
7593 .addReg(Undef)
7594 .addImm(AMDGPU::hi16);
7595 Op.setReg(NewDstReg);
7596 }
7597}
7598void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7599 MachineRegisterInfo &MRI) const {
7600 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7601 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7602}
7603
7604void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7605 MachineDominatorTree *MDT) const {
7606
7607 while (!Worklist.empty()) {
7608 MachineInstr &Inst = *Worklist.top();
7609 Worklist.erase_top();
7610 // Skip MachineInstr in the deferred list.
7611 if (Worklist.isDeferred(&Inst))
7612 continue;
7613 moveToVALUImpl(Worklist, MDT, Inst);
7614 }
7615
7616 // Deferred list of instructions will be processed once
7617 // all the MachineInstr in the worklist are done.
7618 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7619 moveToVALUImpl(Worklist, MDT, *Inst);
7620 assert(Worklist.empty() &&
7621 "Deferred MachineInstr are not supposed to re-populate worklist");
7622 }
7623}
7624
7625void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7626 MachineDominatorTree *MDT,
7627 MachineInstr &Inst) const {
7628
7629 MachineBasicBlock *MBB = Inst.getParent();
7630 if (!MBB)
7631 return;
7632 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7633 unsigned Opcode = Inst.getOpcode();
7634 unsigned NewOpcode = getVALUOp(Inst);
7635 // Handle some special cases
7636 switch (Opcode) {
7637 default:
7638 break;
7639 case AMDGPU::S_ADD_I32:
7640 case AMDGPU::S_SUB_I32: {
7641 // FIXME: The u32 versions currently selected use the carry.
7642 bool Changed;
7643 MachineBasicBlock *CreatedBBTmp = nullptr;
7644 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7645 if (Changed)
7646 return;
7647
7648 // Default handling
7649 break;
7650 }
7651
7652 case AMDGPU::S_MUL_U64:
7653 if (ST.hasVectorMulU64()) {
7654 NewOpcode = AMDGPU::V_MUL_U64_e64;
7655 break;
7656 }
7657 // Split s_mul_u64 in 32-bit vector multiplications.
7658 splitScalarSMulU64(Worklist, Inst, MDT);
7659 Inst.eraseFromParent();
7660 return;
7661
7662 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7663 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7664 // This is a special case of s_mul_u64 where all the operands are either
7665 // zero extended or sign extended.
7666 splitScalarSMulPseudo(Worklist, Inst, MDT);
7667 Inst.eraseFromParent();
7668 return;
7669
7670 case AMDGPU::S_AND_B64:
7671 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7672 Inst.eraseFromParent();
7673 return;
7674
7675 case AMDGPU::S_OR_B64:
7676 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7677 Inst.eraseFromParent();
7678 return;
7679
7680 case AMDGPU::S_XOR_B64:
7681 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7682 Inst.eraseFromParent();
7683 return;
7684
7685 case AMDGPU::S_NAND_B64:
7686 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7687 Inst.eraseFromParent();
7688 return;
7689
7690 case AMDGPU::S_NOR_B64:
7691 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7692 Inst.eraseFromParent();
7693 return;
7694
7695 case AMDGPU::S_XNOR_B64:
7696 if (ST.hasDLInsts())
7697 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7698 else
7699 splitScalar64BitXnor(Worklist, Inst, MDT);
7700 Inst.eraseFromParent();
7701 return;
7702
7703 case AMDGPU::S_ANDN2_B64:
7704 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7705 Inst.eraseFromParent();
7706 return;
7707
7708 case AMDGPU::S_ORN2_B64:
7709 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7710 Inst.eraseFromParent();
7711 return;
7712
7713 case AMDGPU::S_BREV_B64:
7714 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7715 Inst.eraseFromParent();
7716 return;
7717
7718 case AMDGPU::S_NOT_B64:
7719 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7720 Inst.eraseFromParent();
7721 return;
7722
7723 case AMDGPU::S_BCNT1_I32_B64:
7724 splitScalar64BitBCNT(Worklist, Inst);
7725 Inst.eraseFromParent();
7726 return;
7727
7728 case AMDGPU::S_BFE_I64:
7729 splitScalar64BitBFE(Worklist, Inst);
7730 Inst.eraseFromParent();
7731 return;
7732
7733 case AMDGPU::S_FLBIT_I32_B64:
7734 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7735 Inst.eraseFromParent();
7736 return;
7737 case AMDGPU::S_FF1_I32_B64:
7738 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7739 Inst.eraseFromParent();
7740 return;
7741
7742 case AMDGPU::S_LSHL_B32:
7743 if (ST.hasOnlyRevVALUShifts()) {
7744 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7745 swapOperands(Inst);
7746 }
7747 break;
7748 case AMDGPU::S_ASHR_I32:
7749 if (ST.hasOnlyRevVALUShifts()) {
7750 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7751 swapOperands(Inst);
7752 }
7753 break;
7754 case AMDGPU::S_LSHR_B32:
7755 if (ST.hasOnlyRevVALUShifts()) {
7756 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7757 swapOperands(Inst);
7758 }
7759 break;
7760 case AMDGPU::S_LSHL_B64:
7761 if (ST.hasOnlyRevVALUShifts()) {
7762 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7763 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7764 : AMDGPU::V_LSHLREV_B64_e64;
7765 swapOperands(Inst);
7766 }
7767 break;
7768 case AMDGPU::S_ASHR_I64:
7769 if (ST.hasOnlyRevVALUShifts()) {
7770 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7771 swapOperands(Inst);
7772 }
7773 break;
7774 case AMDGPU::S_LSHR_B64:
7775 if (ST.hasOnlyRevVALUShifts()) {
7776 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7777 swapOperands(Inst);
7778 }
7779 break;
7780
7781 case AMDGPU::S_ABS_I32:
7782 lowerScalarAbs(Worklist, Inst);
7783 Inst.eraseFromParent();
7784 return;
7785
7786 case AMDGPU::S_CBRANCH_SCC0:
7787 case AMDGPU::S_CBRANCH_SCC1: {
7788 // Clear unused bits of vcc: AND the condition with EXEC so inactive lanes cannot set VCC bits.
7789 Register CondReg = Inst.getOperand(1).getReg();
7790 bool IsSCC = CondReg == AMDGPU::SCC;
7792 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7793 .addReg(LMC.ExecReg)
7794 .addReg(IsSCC ? LMC.VccReg : CondReg);
7795 Inst.removeOperand(1);
7796 } break;
7797
7798 case AMDGPU::S_BFE_U64:
7799 case AMDGPU::S_BFM_B64:
7800 llvm_unreachable("Moving this op to VALU not implemented");
7801
7802 case AMDGPU::S_PACK_LL_B32_B16:
7803 case AMDGPU::S_PACK_LH_B32_B16:
7804 case AMDGPU::S_PACK_HL_B32_B16:
7805 case AMDGPU::S_PACK_HH_B32_B16:
7806 movePackToVALU(Worklist, MRI, Inst);
7807 Inst.eraseFromParent();
7808 return;
7809
7810 case AMDGPU::S_XNOR_B32:
7811 lowerScalarXnor(Worklist, Inst);
7812 Inst.eraseFromParent();
7813 return;
7814
7815 case AMDGPU::S_NAND_B32:
7816 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7817 Inst.eraseFromParent();
7818 return;
7819
7820 case AMDGPU::S_NOR_B32:
7821 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7822 Inst.eraseFromParent();
7823 return;
7824
7825 case AMDGPU::S_ANDN2_B32:
7826 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7827 Inst.eraseFromParent();
7828 return;
7829
7830 case AMDGPU::S_ORN2_B32:
7831 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7832 Inst.eraseFromParent();
7833 return;
7834
7835 // TODO: remove as soon as everything is ready
7836 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7837 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7838 // can only be selected from the uniform SDNode.
7839 case AMDGPU::S_ADD_CO_PSEUDO:
7840 case AMDGPU::S_SUB_CO_PSEUDO: {
7841 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7842 ? AMDGPU::V_ADDC_U32_e64
7843 : AMDGPU::V_SUBB_U32_e64;
7844 const auto *CarryRC = RI.getWaveMaskRegClass();
7845
7846 Register CarryInReg = Inst.getOperand(4).getReg();
7847 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7848 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7849 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7850 .addReg(CarryInReg);
7851 }
7852
7853 Register CarryOutReg = Inst.getOperand(1).getReg();
7854
7855 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7856 MRI.getRegClass(Inst.getOperand(0).getReg())));
7857 MachineInstr *CarryOp =
7858 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7859 .addReg(CarryOutReg, RegState::Define)
7860 .add(Inst.getOperand(2))
7861 .add(Inst.getOperand(3))
7862 .addReg(CarryInReg)
7863 .addImm(0);
7864 legalizeOperands(*CarryOp);
7865 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7866 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7867 Inst.eraseFromParent();
7868 }
7869 return;
7870 case AMDGPU::S_UADDO_PSEUDO:
7871 case AMDGPU::S_USUBO_PSEUDO: {
7872 const DebugLoc &DL = Inst.getDebugLoc();
7873 MachineOperand &Dest0 = Inst.getOperand(0);
7874 MachineOperand &Dest1 = Inst.getOperand(1);
7875 MachineOperand &Src0 = Inst.getOperand(2);
7876 MachineOperand &Src1 = Inst.getOperand(3);
7877
7878 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7879 ? AMDGPU::V_ADD_CO_U32_e64
7880 : AMDGPU::V_SUB_CO_U32_e64;
7881 const TargetRegisterClass *NewRC =
7882 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7883 Register DestReg = MRI.createVirtualRegister(NewRC);
7884 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7885 .addReg(Dest1.getReg(), RegState::Define)
7886 .add(Src0)
7887 .add(Src1)
7888 .addImm(0); // clamp bit
7889
7890 legalizeOperands(*NewInstr, MDT);
7891 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7892 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7893 Worklist);
7894 Inst.eraseFromParent();
7895 }
7896 return;
7897
7898 case AMDGPU::S_CSELECT_B32:
7899 case AMDGPU::S_CSELECT_B64:
7900 lowerSelect(Worklist, Inst, MDT);
7901 Inst.eraseFromParent();
7902 return;
7903 case AMDGPU::S_CMP_EQ_I32:
7904 case AMDGPU::S_CMP_LG_I32:
7905 case AMDGPU::S_CMP_GT_I32:
7906 case AMDGPU::S_CMP_GE_I32:
7907 case AMDGPU::S_CMP_LT_I32:
7908 case AMDGPU::S_CMP_LE_I32:
7909 case AMDGPU::S_CMP_EQ_U32:
7910 case AMDGPU::S_CMP_LG_U32:
7911 case AMDGPU::S_CMP_GT_U32:
7912 case AMDGPU::S_CMP_GE_U32:
7913 case AMDGPU::S_CMP_LT_U32:
7914 case AMDGPU::S_CMP_LE_U32:
7915 case AMDGPU::S_CMP_EQ_U64:
7916 case AMDGPU::S_CMP_LG_U64:
7917 case AMDGPU::S_CMP_LT_F32:
7918 case AMDGPU::S_CMP_EQ_F32:
7919 case AMDGPU::S_CMP_LE_F32:
7920 case AMDGPU::S_CMP_GT_F32:
7921 case AMDGPU::S_CMP_LG_F32:
7922 case AMDGPU::S_CMP_GE_F32:
7923 case AMDGPU::S_CMP_O_F32:
7924 case AMDGPU::S_CMP_U_F32:
7925 case AMDGPU::S_CMP_NGE_F32:
7926 case AMDGPU::S_CMP_NLG_F32:
7927 case AMDGPU::S_CMP_NGT_F32:
7928 case AMDGPU::S_CMP_NLE_F32:
7929 case AMDGPU::S_CMP_NEQ_F32:
7930 case AMDGPU::S_CMP_NLT_F32: {
7931 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7932 auto NewInstr =
7933 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7934 .setMIFlags(Inst.getFlags());
7935 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7936 0) {
7937 NewInstr
7938 .addImm(0) // src0_modifiers
7939 .add(Inst.getOperand(0)) // src0
7940 .addImm(0) // src1_modifiers
7941 .add(Inst.getOperand(1)) // src1
7942 .addImm(0); // clamp
7943 } else {
7944 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7945 }
7946 legalizeOperands(*NewInstr, MDT);
7947 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7948 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7949 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7950 Inst.eraseFromParent();
7951 return;
7952 }
7953 case AMDGPU::S_CMP_LT_F16:
7954 case AMDGPU::S_CMP_EQ_F16:
7955 case AMDGPU::S_CMP_LE_F16:
7956 case AMDGPU::S_CMP_GT_F16:
7957 case AMDGPU::S_CMP_LG_F16:
7958 case AMDGPU::S_CMP_GE_F16:
7959 case AMDGPU::S_CMP_O_F16:
7960 case AMDGPU::S_CMP_U_F16:
7961 case AMDGPU::S_CMP_NGE_F16:
7962 case AMDGPU::S_CMP_NLG_F16:
7963 case AMDGPU::S_CMP_NGT_F16:
7964 case AMDGPU::S_CMP_NLE_F16:
7965 case AMDGPU::S_CMP_NEQ_F16:
7966 case AMDGPU::S_CMP_NLT_F16: {
7967 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7968 auto NewInstr =
7969 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7970 .setMIFlags(Inst.getFlags());
7971 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7972 NewInstr
7973 .addImm(0) // src0_modifiers
7974 .add(Inst.getOperand(0)) // src0
7975 .addImm(0) // src1_modifiers
7976 .add(Inst.getOperand(1)) // src1
7977 .addImm(0); // clamp
7978 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7979 NewInstr.addImm(0); // op_sel0
7980 } else {
7981 NewInstr
7982 .add(Inst.getOperand(0))
7983 .add(Inst.getOperand(1));
7984 }
7985 legalizeOperandsVALUt16(*NewInstr, MRI);
7986 legalizeOperands(*NewInstr, MDT);
7987 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7988 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7989 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7990 Inst.eraseFromParent();
7991 return;
7992 }
7993 case AMDGPU::S_CVT_HI_F32_F16: {
7994 const DebugLoc &DL = Inst.getDebugLoc();
7995 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7996 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7997 if (ST.useRealTrue16Insts()) {
7998 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7999 .add(Inst.getOperand(1));
8000 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8001 .addImm(0) // src0_modifiers
8002 .addReg(TmpReg, 0, AMDGPU::hi16)
8003 .addImm(0) // clamp
8004 .addImm(0) // omod
8005 .addImm(0); // op_sel0
8006 } else {
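// Without true16, extract the high half by shifting the packed value right
// by 16 and convert that 32-bit register instead.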
8007 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8008 .addImm(16)
8009 .add(Inst.getOperand(1));
8010 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8011 .addImm(0) // src0_modifiers
8012 .addReg(TmpReg)
8013 .addImm(0) // clamp
8014 .addImm(0); // omod
8015 }
8016
8017 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8018 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8019 Inst.eraseFromParent();
8020 return;
8021 }
8022 case AMDGPU::S_MINIMUM_F32:
8023 case AMDGPU::S_MAXIMUM_F32: {
8024 const DebugLoc &DL = Inst.getDebugLoc();
8025 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8026 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8027 .addImm(0) // src0_modifiers
8028 .add(Inst.getOperand(1))
8029 .addImm(0) // src1_modifiers
8030 .add(Inst.getOperand(2))
8031 .addImm(0) // clamp
8032 .addImm(0); // omod
8033 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8034
8035 legalizeOperands(*NewInstr, MDT);
8036 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8037 Inst.eraseFromParent();
8038 return;
8039 }
8040 case AMDGPU::S_MINIMUM_F16:
8041 case AMDGPU::S_MAXIMUM_F16: {
8042 const DebugLoc &DL = Inst.getDebugLoc();
8043 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8044 ? &AMDGPU::VGPR_16RegClass
8045 : &AMDGPU::VGPR_32RegClass);
8046 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8047 .addImm(0) // src0_modifiers
8048 .add(Inst.getOperand(1))
8049 .addImm(0) // src1_modifiers
8050 .add(Inst.getOperand(2))
8051 .addImm(0) // clamp
8052 .addImm(0) // omod
8053 .addImm(0); // opsel0
8054 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8055 legalizeOperandsVALUt16(*NewInstr, MRI);
8056 legalizeOperands(*NewInstr, MDT);
8057 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8058 Inst.eraseFromParent();
8059 return;
8060 }
8061 case AMDGPU::V_S_EXP_F16_e64:
8062 case AMDGPU::V_S_LOG_F16_e64:
8063 case AMDGPU::V_S_RCP_F16_e64:
8064 case AMDGPU::V_S_RSQ_F16_e64:
8065 case AMDGPU::V_S_SQRT_F16_e64: {
8066 const DebugLoc &DL = Inst.getDebugLoc();
8067 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8068 ? &AMDGPU::VGPR_16RegClass
8069 : &AMDGPU::VGPR_32RegClass);
8070 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8071 .add(Inst.getOperand(1)) // src0_modifiers
8072 .add(Inst.getOperand(2))
8073 .add(Inst.getOperand(3)) // clamp
8074 .add(Inst.getOperand(4)) // omod
8075 .setMIFlags(Inst.getFlags());
8076 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8077 NewInstr.addImm(0); // opsel0
8078 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8079 legalizeOperandsVALUt16(*NewInstr, MRI);
8080 legalizeOperands(*NewInstr, MDT);
8081 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8082 Inst.eraseFromParent();
8083 return;
8084 }
8085 }
8086
8087 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8088 // We cannot move this instruction to the VALU, so we should try to
8089 // legalize its operands instead.
8090 legalizeOperands(Inst, MDT);
8091 return;
8092 }
8093 // Handle converting generic instructions like COPY-to-SGPR into
8094 // COPY-to-VGPR.
8095 if (NewOpcode == Opcode) {
8096 Register DstReg = Inst.getOperand(0).getReg();
8097 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8098
8099 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8100 // hope for the best.
8101 if (Inst.isCopy() && DstReg.isPhysical() &&
8102 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8103 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8104 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8105 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8106 .add(Inst.getOperand(1));
8107 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8108 DstReg)
8109 .addReg(NewDst);
8110
8111 Inst.eraseFromParent();
8112 return;
8113 }
8114
8115 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
8116 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
8117 // Instead of creating a copy where src and dst are the same register
8118 // class, we just replace all uses of dst with src. These kinds of
8119 // copies interfere with the heuristics MachineSink uses to decide
8120 // whether or not to split a critical edge, since the pass assumes
8121 // that copies will end up as machine instructions and not be
8122 // eliminated.
8123 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8124 Register NewDstReg = Inst.getOperand(1).getReg();
8125 MRI.replaceRegWith(DstReg, NewDstReg);
8126 MRI.clearKillFlags(NewDstReg);
8127 Inst.getOperand(0).setReg(DstReg);
8128 Inst.eraseFromParent();
8129 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8130 for (MachineOperand &MO :
8131 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8132 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8133 }
8134 return;
8135 }
8136
8137 // If this is a v2s copy between a 16bit and a 32bit reg, replace the vgpr
8138 // copy with a reg_sequence/extract_subreg.
8139 // This can be removed once sgpr16 is in place.
8140 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8141 Inst.getOperand(1).getReg().isVirtual() &&
8142 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8143 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8144 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8145 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8146 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8147 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8148 get(AMDGPU::IMPLICIT_DEF), Undef);
8149 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8150 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8151 .addReg(Inst.getOperand(1).getReg())
8152 .addImm(AMDGPU::lo16)
8153 .addReg(Undef)
8154 .addImm(AMDGPU::hi16);
8155 Inst.eraseFromParent();
8156 MRI.replaceRegWith(DstReg, NewDstReg);
8157 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8158 return;
8159 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8160 AMDGPU::lo16)) {
8161 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8162 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8163 MRI.replaceRegWith(DstReg, NewDstReg);
8164 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8165 return;
8166 }
8167 }
8168
8169 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8170 MRI.replaceRegWith(DstReg, NewDstReg);
8171 legalizeOperands(Inst, MDT);
8172 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8173 return;
8174 }
8175
8176 // Use the new VALU Opcode.
8177 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8178 .setMIFlags(Inst.getFlags());
8179 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8180 // Intersperse VOP3 modifiers among the SALU operands.
8181 NewInstr->addOperand(Inst.getOperand(0));
8182 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8183 AMDGPU::OpName::src0_modifiers) >= 0)
8184 NewInstr.addImm(0);
8185 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8186 MachineOperand Src = Inst.getOperand(1);
8187 NewInstr->addOperand(Src);
8188 }
8189
8190 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8191 // We are converting these to a BFE, so we need to add the missing
8192 // operands for the size and offset.
8193 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8194 NewInstr.addImm(0);
8195 NewInstr.addImm(Size);
8196 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8197 // The VALU version adds the second operand to the result, so insert an
8198 // extra 0 operand.
8199 NewInstr.addImm(0);
8200 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8201 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8202 // If we need to move this to VGPRs, we need to unpack the second
8203 // operand back into the 2 separate ones for bit offset and width.
8204 assert(OffsetWidthOp.isImm() &&
8205 "Scalar BFE is only implemented for constant width and offset");
8206 uint32_t Imm = OffsetWidthOp.getImm();
8207
8208 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8209 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8210 NewInstr.addImm(Offset);
8211 NewInstr.addImm(BitWidth);
8212 } else {
8213 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8214 AMDGPU::OpName::src1_modifiers) >= 0)
8215 NewInstr.addImm(0);
8216 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8217 NewInstr->addOperand(Inst.getOperand(2));
8218 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8219 AMDGPU::OpName::src2_modifiers) >= 0)
8220 NewInstr.addImm(0);
8221 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8222 NewInstr->addOperand(Inst.getOperand(3));
8223 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8224 NewInstr.addImm(0);
8225 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8226 NewInstr.addImm(0);
8227 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8228 NewInstr.addImm(0);
8229 }
8230 } else {
8231 // Just copy the SALU operands.
8232 for (const MachineOperand &Op : Inst.explicit_operands())
8233 NewInstr->addOperand(Op);
8234 }
8235
8236 // Remove any references to SCC. Vector instructions can't read from it, and
8237 // we're just about to add the implicit use / defs of VCC, and we don't want
8238 // both.
8239 for (MachineOperand &Op : Inst.implicit_operands()) {
8240 if (Op.getReg() == AMDGPU::SCC) {
8241 // Only propagate through live-def of SCC.
8242 if (Op.isDef() && !Op.isDead())
8243 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8244 if (Op.isUse())
8245 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8246 }
8247 }
8248 Inst.eraseFromParent();
8249 Register NewDstReg;
8250 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8251 Register DstReg = NewInstr->getOperand(0).getReg();
8252 assert(DstReg.isVirtual());
8253 // Update the destination register class.
8254 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8255 assert(NewDstRC);
8256 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8257 MRI.replaceRegWith(DstReg, NewDstReg);
8258 }
8259 fixImplicitOperands(*NewInstr);
8260
8261 legalizeOperandsVALUt16(*NewInstr, MRI);
8262
8263 // Legalize the operands
8264 legalizeOperands(*NewInstr, MDT);
8265 if (NewDstReg)
8266 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8267}
8268
8269// Add/sub require special handling to deal with carry outs.
8270std::pair<bool, MachineBasicBlock *>
8271SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8272 MachineDominatorTree *MDT) const {
8273 if (ST.hasAddNoCarry()) {
8274 // Assume there is no user of scc since we don't select this in that case.
8275 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8276 // is used.
8277
8278 MachineBasicBlock &MBB = *Inst.getParent();
8279 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8280
8281 Register OldDstReg = Inst.getOperand(0).getReg();
8282 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8283
8284 unsigned Opc = Inst.getOpcode();
8285 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8286
8287 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8288 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8289
8290 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8291 Inst.removeOperand(3);
8292
8293 Inst.setDesc(get(NewOpc));
8294 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8295 Inst.addImplicitDefUseOperands(*MBB.getParent());
8296 MRI.replaceRegWith(OldDstReg, ResultReg);
8297 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8298
8299 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8300 return std::pair(true, NewBB);
8301 }
8302
8303 return std::pair(false, nullptr);
8304}
8305
8306void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8307 MachineDominatorTree *MDT) const {
8308
8309 MachineBasicBlock &MBB = *Inst.getParent();
8310 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8311 MachineBasicBlock::iterator MII = Inst;
8312 DebugLoc DL = Inst.getDebugLoc();
8313
8314 MachineOperand &Dest = Inst.getOperand(0);
8315 MachineOperand &Src0 = Inst.getOperand(1);
8316 MachineOperand &Src1 = Inst.getOperand(2);
8317 MachineOperand &Cond = Inst.getOperand(3);
8318
8319 Register CondReg = Cond.getReg();
8320 bool IsSCC = (CondReg == AMDGPU::SCC);
8321
8322 // If this is a trivial select where the condition is effectively not SCC
8323 // (CondReg is a source of copy to SCC), then the select is semantically
8324 // equivalent to copying CondReg. Hence, there is no need to create
8325 // V_CNDMASK, we can just use that and bail out.
8326 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8327 (Src1.getImm() == 0)) {
8328 MRI.replaceRegWith(Dest.getReg(), CondReg);
8329 return;
8330 }
8331
8332 Register NewCondReg = CondReg;
8333 if (IsSCC) {
8334 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8335 NewCondReg = MRI.createVirtualRegister(TC);
8336
8337 // Now look for the closest SCC def if it is a copy
8338 // replacing the CondReg with the COPY source register
8339 bool CopyFound = false;
8340 for (MachineInstr &CandI :
8341 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8342 Inst.getParent()->rend())) {
8343 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8344 -1) {
8345 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8346 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8347 .addReg(CandI.getOperand(1).getReg());
8348 CopyFound = true;
8349 }
8350 break;
8351 }
8352 }
8353 if (!CopyFound) {
8354 // SCC def is not a copy
8355 // Insert a trivial select instead of creating a copy, because a copy from
8356 // SCC would semantically mean just copying a single bit, but we may need
8357 // the result to be a vector condition mask that needs preserving.
8358 unsigned Opcode =
8359 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8360 auto NewSelect =
8361 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8362 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8363 }
8364 }
8365
8366 Register NewDestReg = MRI.createVirtualRegister(
8367 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8368 MachineInstr *NewInst;
8369 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8370 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8371 .addImm(0)
8372 .add(Src1) // False
8373 .addImm(0)
8374 .add(Src0) // True
8375 .addReg(NewCondReg);
8376 } else {
8377 NewInst =
8378 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8379 .add(Src1) // False
8380 .add(Src0) // True
8381 .addReg(NewCondReg);
8382 }
8383 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8384 legalizeOperands(*NewInst, MDT);
8385 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8386}
8387
8388void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8389 MachineInstr &Inst) const {
8390 MachineBasicBlock &MBB = *Inst.getParent();
8391 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8392 MachineBasicBlock::iterator MII = Inst;
8393 DebugLoc DL = Inst.getDebugLoc();
8394
8395 MachineOperand &Dest = Inst.getOperand(0);
8396 MachineOperand &Src = Inst.getOperand(1);
8397 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8398 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8399
8400 unsigned SubOp = ST.hasAddNoCarry() ?
8401 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8402
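// abs(x) = max(x, 0 - x): negate with a subtract from zero, then take the
// signed maximum of the original value and its negation.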
8403 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8404 .addImm(0)
8405 .addReg(Src.getReg());
8406
8407 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8408 .addReg(Src.getReg())
8409 .addReg(TmpReg);
8410
8411 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8412 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8413}
8414
8415void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8416 MachineInstr &Inst) const {
8417 MachineBasicBlock &MBB = *Inst.getParent();
8418 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8419 MachineBasicBlock::iterator MII = Inst;
8420 const DebugLoc &DL = Inst.getDebugLoc();
8421
8422 MachineOperand &Dest = Inst.getOperand(0);
8423 MachineOperand &Src0 = Inst.getOperand(1);
8424 MachineOperand &Src1 = Inst.getOperand(2);
8425
8426 if (ST.hasDLInsts()) {
8427 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8428 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8429 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8430
8431 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8432 .add(Src0)
8433 .add(Src1);
8434
8435 MRI.replaceRegWith(Dest.getReg(), NewDest);
8436 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8437 } else {
8438 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8439 // invert either source and then perform the XOR. If either source is a
8440 // scalar register, then we can leave the inversion on the scalar unit to
8441 // achieve a better distribution of scalar and vector instructions.
8442 bool Src0IsSGPR = Src0.isReg() &&
8443 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8444 bool Src1IsSGPR = Src1.isReg() &&
8445 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8446 MachineInstr *Xor;
8447 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8448 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8449
8450 // Build a pair of scalar instructions and add them to the work list.
8451 // The next iteration over the work list will lower these to the vector
8452 // unit as necessary.
8453 if (Src0IsSGPR) {
8454 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8455 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8456 .addReg(Temp)
8457 .add(Src1);
8458 } else if (Src1IsSGPR) {
8459 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8460 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8461 .add(Src0)
8462 .addReg(Temp);
8463 } else {
8464 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8465 .add(Src0)
8466 .add(Src1);
8467 MachineInstr *Not =
8468 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8469 Worklist.insert(Not);
8470 }
8471
8472 MRI.replaceRegWith(Dest.getReg(), NewDest);
8473
8474 Worklist.insert(Xor);
8475
8476 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8477 }
8478}
8479
8480void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8481 MachineInstr &Inst,
8482 unsigned Opcode) const {
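// Lowered as NewDest = ~Opcode(Src0, Src1); both pieces are scalar
// instructions and are queued so a later worklist pass can move them to the
// VALU if needed.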
8483 MachineBasicBlock &MBB = *Inst.getParent();
8484 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8485 MachineBasicBlock::iterator MII = Inst;
8486 const DebugLoc &DL = Inst.getDebugLoc();
8487
8488 MachineOperand &Dest = Inst.getOperand(0);
8489 MachineOperand &Src0 = Inst.getOperand(1);
8490 MachineOperand &Src1 = Inst.getOperand(2);
8491
8492 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8493 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8494
8495 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8496 .add(Src0)
8497 .add(Src1);
8498
8499 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8500 .addReg(Interm);
8501
8502 Worklist.insert(&Op);
8503 Worklist.insert(&Not);
8504
8505 MRI.replaceRegWith(Dest.getReg(), NewDest);
8506 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8507}
8508
8509void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8510 MachineInstr &Inst,
8511 unsigned Opcode) const {
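// Lowered as NewDest = Opcode(Src0, ~Src1), i.e. the "N2" variants invert
// their second source first; both pieces stay scalar and are re-queued for
// further lowering if required.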
8512 MachineBasicBlock &MBB = *Inst.getParent();
8513 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8514 MachineBasicBlock::iterator MII = Inst;
8515 const DebugLoc &DL = Inst.getDebugLoc();
8516
8517 MachineOperand &Dest = Inst.getOperand(0);
8518 MachineOperand &Src0 = Inst.getOperand(1);
8519 MachineOperand &Src1 = Inst.getOperand(2);
8520
8521 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8522 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8523
8524 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8525 .add(Src1);
8526
8527 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8528 .add(Src0)
8529 .addReg(Interm);
8530
8531 Worklist.insert(&Not);
8532 Worklist.insert(&Op);
8533
8534 MRI.replaceRegWith(Dest.getReg(), NewDest);
8535 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8536}
8537
8538void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8539 MachineInstr &Inst, unsigned Opcode,
8540 bool Swap) const {
8541 MachineBasicBlock &MBB = *Inst.getParent();
8542 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8543
8544 MachineOperand &Dest = Inst.getOperand(0);
8545 MachineOperand &Src0 = Inst.getOperand(1);
8546 DebugLoc DL = Inst.getDebugLoc();
8547
8548 MachineBasicBlock::iterator MII = Inst;
8549
8550 const MCInstrDesc &InstDesc = get(Opcode);
8551 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8552 MRI.getRegClass(Src0.getReg()) :
8553 &AMDGPU::SGPR_32RegClass;
8554
8555 const TargetRegisterClass *Src0SubRC =
8556 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8557
8558 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8559 AMDGPU::sub0, Src0SubRC);
8560
8561 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8562 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8563 const TargetRegisterClass *NewDestSubRC =
8564 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8565
8566 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8567 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8568
8569 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8570 AMDGPU::sub1, Src0SubRC);
8571
8572 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8573 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8574
8575 if (Swap)
8576 std::swap(DestSub0, DestSub1);
8577
8578 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8579 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8580 .addReg(DestSub0)
8581 .addImm(AMDGPU::sub0)
8582 .addReg(DestSub1)
8583 .addImm(AMDGPU::sub1);
8584
8585 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8586
8587 Worklist.insert(&LoHalf);
8588 Worklist.insert(&HiHalf);
8589
8590 // We don't need to legalizeOperands here because for a single operand, src0
8591 // will support any kind of input.
8592
8593 // Move all users of this moved value.
8594 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8595}
8596
8597 // There is no vector equivalent of s_mul_u64. For this reason, we need to
8598 // split the s_mul_u64 into 32-bit vector multiplications.
8599void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8600 MachineInstr &Inst,
8601 MachineDominatorTree *MDT) const {
8602 MachineBasicBlock &MBB = *Inst.getParent();
8603 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8604
8605 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8606 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8607 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8608
8609 MachineOperand &Dest = Inst.getOperand(0);
8610 MachineOperand &Src0 = Inst.getOperand(1);
8611 MachineOperand &Src1 = Inst.getOperand(2);
8612 const DebugLoc &DL = Inst.getDebugLoc();
8613 MachineBasicBlock::iterator MII = Inst;
8614
8615 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8616 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8617 const TargetRegisterClass *Src0SubRC =
8618 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8619 if (RI.isSGPRClass(Src0SubRC))
8620 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8621 const TargetRegisterClass *Src1SubRC =
8622 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8623 if (RI.isSGPRClass(Src1SubRC))
8624 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8625
8626 // First, we extract the low 32-bit and high 32-bit values from each of the
8627 // operands.
8628 MachineOperand Op0L =
8629 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8630 MachineOperand Op1L =
8631 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8632 MachineOperand Op0H =
8633 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8634 MachineOperand Op1H =
8635 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8636
8637 // The multiplication is done as follows:
8638 //
8639 // Op1H Op1L
8640 // * Op0H Op0L
8641 // --------------------
8642 // Op1H*Op0L Op1L*Op0L
8643 // + Op1H*Op0H Op1L*Op0H
8644 // -----------------------------------------
8645 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8646 //
8647 // We drop Op1H*Op0H because its contribution starts at bit 64 and so cannot
8648 // affect the low 64 bits of the result, which is all we keep.
8649 // The low 32-bit value is Op1L*Op0L.
8650 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
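// A small worked example (values are illustrative only): for
// Op0 = 0x0000'0002'0000'0003 and Op1 = 0x0000'0000'0000'0004 we get
// Op0L = 3, Op0H = 2, Op1L = 4, Op1H = 0, so the low half is 4*3 = 12 and
// the high half is 0*3 + 4*2 + mulhi(4, 3) = 8, giving 0x0000'0008'0000'000C,
// which matches 0x2'0000'0003 * 4.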
8651
8652 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8653 MachineInstr *Op1L_Op0H =
8654 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8655 .add(Op1L)
8656 .add(Op0H);
8657
8658 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8659 MachineInstr *Op1H_Op0L =
8660 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8661 .add(Op1H)
8662 .add(Op0L);
8663
8664 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8665 MachineInstr *Carry =
8666 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8667 .add(Op1L)
8668 .add(Op0L);
8669
8670 MachineInstr *LoHalf =
8671 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8672 .add(Op1L)
8673 .add(Op0L);
8674
8675 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8676 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8677 .addReg(Op1L_Op0H_Reg)
8678 .addReg(Op1H_Op0L_Reg);
8679
8680 MachineInstr *HiHalf =
8681 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8682 .addReg(AddReg)
8683 .addReg(CarryReg);
8684
8685 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8686 .addReg(DestSub0)
8687 .addImm(AMDGPU::sub0)
8688 .addReg(DestSub1)
8689 .addImm(AMDGPU::sub1);
8690
8691 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8692
8693 // Try to legalize the operands in case we need to swap the order to keep it
8694 // valid.
8695 legalizeOperands(*Op1L_Op0H, MDT);
8696 legalizeOperands(*Op1H_Op0L, MDT);
8697 legalizeOperands(*Carry, MDT);
8698 legalizeOperands(*LoHalf, MDT);
8699 legalizeOperands(*Add, MDT);
8700 legalizeOperands(*HiHalf, MDT);
8701
8702 // Move all users of this moved value.
8703 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8704}
8705
8706 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8707// multiplications.
8708void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8709 MachineInstr &Inst,
8710 MachineDominatorTree *MDT) const {
8711 MachineBasicBlock &MBB = *Inst.getParent();
8712 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8713
8714 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8715 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8716 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8717
8718 MachineOperand &Dest = Inst.getOperand(0);
8719 MachineOperand &Src0 = Inst.getOperand(1);
8720 MachineOperand &Src1 = Inst.getOperand(2);
8721 const DebugLoc &DL = Inst.getDebugLoc();
8722 MachineBasicBlock::iterator MII = Inst;
8723
8724 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8725 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8726 const TargetRegisterClass *Src0SubRC =
8727 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8728 if (RI.isSGPRClass(Src0SubRC))
8729 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8730 const TargetRegisterClass *Src1SubRC =
8731 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8732 if (RI.isSGPRClass(Src1SubRC))
8733 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8734
8735 // First, we extract the low 32-bit and high 32-bit values from each of the
8736 // operands.
8737 MachineOperand Op0L =
8738 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8739 MachineOperand Op1L =
8740 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8741
8742 unsigned Opc = Inst.getOpcode();
8743 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8744 ? AMDGPU::V_MUL_HI_U32_e64
8745 : AMDGPU::V_MUL_HI_I32_e64;
8746 MachineInstr *HiHalf =
8747 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8748
8749 MachineInstr *LoHalf =
8750 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8751 .add(Op1L)
8752 .add(Op0L);
8753
8754 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8755 .addReg(DestSub0)
8756 .addImm(AMDGPU::sub0)
8757 .addReg(DestSub1)
8758 .addImm(AMDGPU::sub1);
8759
8760 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8761
8762 // Try to legalize the operands in case we need to swap the order to keep it
8763 // valid.
8764 legalizeOperands(*HiHalf, MDT);
8765 legalizeOperands(*LoHalf, MDT);
8766
8767 // Move all users of this moved value.
8768 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8769}
8770
8771void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8772 MachineInstr &Inst, unsigned Opcode,
8773 MachineDominatorTree *MDT) const {
8774 MachineBasicBlock &MBB = *Inst.getParent();
8775 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8776
8777 MachineOperand &Dest = Inst.getOperand(0);
8778 MachineOperand &Src0 = Inst.getOperand(1);
8779 MachineOperand &Src1 = Inst.getOperand(2);
8780 DebugLoc DL = Inst.getDebugLoc();
8781
8782 MachineBasicBlock::iterator MII = Inst;
8783
8784 const MCInstrDesc &InstDesc = get(Opcode);
8785 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8786 MRI.getRegClass(Src0.getReg()) :
8787 &AMDGPU::SGPR_32RegClass;
8788
8789 const TargetRegisterClass *Src0SubRC =
8790 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8791 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8792 MRI.getRegClass(Src1.getReg()) :
8793 &AMDGPU::SGPR_32RegClass;
8794
8795 const TargetRegisterClass *Src1SubRC =
8796 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8797
8798 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8799 AMDGPU::sub0, Src0SubRC);
8800 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8801 AMDGPU::sub0, Src1SubRC);
8802 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8803 AMDGPU::sub1, Src0SubRC);
8804 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8805 AMDGPU::sub1, Src1SubRC);
8806
8807 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8808 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8809 const TargetRegisterClass *NewDestSubRC =
8810 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8811
8812 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8813 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8814 .add(SrcReg0Sub0)
8815 .add(SrcReg1Sub0);
8816
8817 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8818 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8819 .add(SrcReg0Sub1)
8820 .add(SrcReg1Sub1);
8821
8822 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8823 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8824 .addReg(DestSub0)
8825 .addImm(AMDGPU::sub0)
8826 .addReg(DestSub1)
8827 .addImm(AMDGPU::sub1);
8828
8829 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8830
8831 Worklist.insert(&LoHalf);
8832 Worklist.insert(&HiHalf);
8833
8834 // Move all users of this moved value.
8835 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8836}
8837
8838void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8839 MachineInstr &Inst,
8840 MachineDominatorTree *MDT) const {
8841 MachineBasicBlock &MBB = *Inst.getParent();
8842 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8843
8844 MachineOperand &Dest = Inst.getOperand(0);
8845 MachineOperand &Src0 = Inst.getOperand(1);
8846 MachineOperand &Src1 = Inst.getOperand(2);
8847 const DebugLoc &DL = Inst.getDebugLoc();
8848
8849 MachineBasicBlock::iterator MII = Inst;
8850
8851 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8852
8853 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8854
8855 MachineOperand* Op0;
8856 MachineOperand* Op1;
8857
8858 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8859 Op0 = &Src0;
8860 Op1 = &Src1;
8861 } else {
8862 Op0 = &Src1;
8863 Op1 = &Src0;
8864 }
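// As in the 32-bit case, xnor(a, b) == xor(~a, b); inverting the operand that
// is known to be an SGPR (when there is one) keeps the S_NOT_B64 on the
// scalar unit.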
8865
8866 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8867 .add(*Op0);
8868
8869 Register NewDest = MRI.createVirtualRegister(DestRC);
8870
8871 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8872 .addReg(Interm)
8873 .add(*Op1);
8874
8875 MRI.replaceRegWith(Dest.getReg(), NewDest);
8876
8877 Worklist.insert(&Xor);
8878}
8879
8880void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8881 MachineInstr &Inst) const {
8882 MachineBasicBlock &MBB = *Inst.getParent();
8883 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8884
8885 MachineBasicBlock::iterator MII = Inst;
8886 const DebugLoc &DL = Inst.getDebugLoc();
8887
8888 MachineOperand &Dest = Inst.getOperand(0);
8889 MachineOperand &Src = Inst.getOperand(1);
8890
8891 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8892 const TargetRegisterClass *SrcRC = Src.isReg() ?
8893 MRI.getRegClass(Src.getReg()) :
8894 &AMDGPU::SGPR_32RegClass;
8895
8896 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8897 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8898
8899 const TargetRegisterClass *SrcSubRC =
8900 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8901
8902 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8903 AMDGPU::sub0, SrcSubRC);
8904 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8905 AMDGPU::sub1, SrcSubRC);
8906
8907 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8908
8909 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8910
8911 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8912
8913 // We don't need to legalize operands here. src0 for either instruction can be
8914 // an SGPR, and the second input is unused or determined here.
8915 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8916}
8917
8918void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8919 MachineInstr &Inst) const {
8920 MachineBasicBlock &MBB = *Inst.getParent();
8921 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8922 MachineBasicBlock::iterator MII = Inst;
8923 const DebugLoc &DL = Inst.getDebugLoc();
8924
8925 MachineOperand &Dest = Inst.getOperand(0);
8926 uint32_t Imm = Inst.getOperand(2).getImm();
8927 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8928 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
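// For example (an illustrative encoding): Imm = 0x100000 decodes to
// Offset = 0 and BitWidth = 16, i.e. sign-extend the low 16 bits to 64 bits.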
8929
8930 (void) Offset;
8931
8932 // Only sext_inreg cases handled.
8933 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8934 Offset == 0 && "Not implemented");
8935
8936 if (BitWidth < 32) {
8937 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8938 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8939 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8940
8941 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8942 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8943 .addImm(0)
8944 .addImm(BitWidth);
8945
8946 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8947 .addImm(31)
8948 .addReg(MidRegLo);
8949
8950 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8951 .addReg(MidRegLo)
8952 .addImm(AMDGPU::sub0)
8953 .addReg(MidRegHi)
8954 .addImm(AMDGPU::sub1);
8955
8956 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8957 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8958 return;
8959 }
8960
8961 MachineOperand &Src = Inst.getOperand(1);
8962 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8963 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8964
8965 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8966 .addImm(31)
8967 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8968
8969 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8970 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8971 .addImm(AMDGPU::sub0)
8972 .addReg(TmpReg)
8973 .addImm(AMDGPU::sub1);
8974
8975 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8976 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8977}
8978
8979void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8980 MachineInstr &Inst, unsigned Opcode,
8981 MachineDominatorTree *MDT) const {
8982 // (S_FLBIT_I32_B64 hi:lo) ->
8983 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8984 // (S_FF1_I32_B64 hi:lo) ->
8985 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
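// For example (illustrative values): counting leading zeros of
// 0x0000'0000'0000'0010 gives V_FFBH_U32(hi = 0) = 0xffffffff (no bit found)
// and uaddsat(V_FFBH_U32(lo = 0x10), 32) = 27 + 32 = 59; the final umin
// selects 59, the correct 64-bit result.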
8986
8987 MachineBasicBlock &MBB = *Inst.getParent();
8988 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8989 MachineBasicBlock::iterator MII = Inst;
8990 const DebugLoc &DL = Inst.getDebugLoc();
8991
8992 MachineOperand &Dest = Inst.getOperand(0);
8993 MachineOperand &Src = Inst.getOperand(1);
8994
8995 const MCInstrDesc &InstDesc = get(Opcode);
8996
8997 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8998 unsigned OpcodeAdd =
8999 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
9000
9001 const TargetRegisterClass *SrcRC =
9002 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9003 const TargetRegisterClass *SrcSubRC =
9004 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9005
9006 MachineOperand SrcRegSub0 =
9007 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9008 MachineOperand SrcRegSub1 =
9009 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9010
9011 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9012 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9013 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9014 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9015
9016 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9017
9018 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9019
9020 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9021 .addReg(IsCtlz ? MidReg1 : MidReg2)
9022 .addImm(32)
9023 .addImm(1); // enable clamp
9024
9025 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9026 .addReg(MidReg3)
9027 .addReg(IsCtlz ? MidReg2 : MidReg1);
9028
9029 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9030
9031 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9032}
9033
9034void SIInstrInfo::addUsersToMoveToVALUWorklist(
9035 Register DstReg, MachineRegisterInfo &MRI,
9036 SIInstrWorklist &Worklist) const {
9037 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9038 MachineInstr &UseMI = *MO.getParent();
9039
9040 unsigned OpNo = 0;
9041
9042 switch (UseMI.getOpcode()) {
9043 case AMDGPU::COPY:
9044 case AMDGPU::WQM:
9045 case AMDGPU::SOFT_WQM:
9046 case AMDGPU::STRICT_WWM:
9047 case AMDGPU::STRICT_WQM:
9048 case AMDGPU::REG_SEQUENCE:
9049 case AMDGPU::PHI:
9050 case AMDGPU::INSERT_SUBREG:
9051 break;
9052 default:
9053 OpNo = MO.getOperandNo();
9054 break;
9055 }
9056
9057 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9058 MRI.constrainRegClass(DstReg, OpRC);
9059
9060 if (!RI.hasVectorRegisters(OpRC))
9061 Worklist.insert(&UseMI);
9062 else
9063 // Legalization could change user list.
9065 }
9066}
9067
9068void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9069 MachineRegisterInfo &MRI,
9070 MachineInstr &Inst) const {
9071 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9072 MachineBasicBlock *MBB = Inst.getParent();
9073 MachineOperand &Src0 = Inst.getOperand(1);
9074 MachineOperand &Src1 = Inst.getOperand(2);
9075 const DebugLoc &DL = Inst.getDebugLoc();
9076
9077 if (ST.useRealTrue16Insts()) {
9078 Register SrcReg0, SrcReg1;
9079 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9080 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9081 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
9082 } else {
9083 SrcReg0 = Src0.getReg();
9084 }
9085
9086 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9087 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9088 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
9089 } else {
9090 SrcReg1 = Src1.getReg();
9091 }
9092
9093 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9094 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9095
9096 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9097 switch (Inst.getOpcode()) {
9098 case AMDGPU::S_PACK_LL_B32_B16:
9099 NewMI
9100 .addReg(SrcReg0, 0,
9101 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9102 .addImm(AMDGPU::lo16)
9103 .addReg(SrcReg1, 0,
9104 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9105 .addImm(AMDGPU::hi16);
9106 break;
9107 case AMDGPU::S_PACK_LH_B32_B16:
9108 NewMI
9109 .addReg(SrcReg0, 0,
9110 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9111 .addImm(AMDGPU::lo16)
9112 .addReg(SrcReg1, 0, AMDGPU::hi16)
9113 .addImm(AMDGPU::hi16);
9114 break;
9115 case AMDGPU::S_PACK_HL_B32_B16:
9116 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9117 .addImm(AMDGPU::lo16)
9118 .addReg(SrcReg1, 0,
9119 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9120 .addImm(AMDGPU::hi16);
9121 break;
9122 case AMDGPU::S_PACK_HH_B32_B16:
9123 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9124 .addImm(AMDGPU::lo16)
9125 .addReg(SrcReg1, 0, AMDGPU::hi16)
9126 .addImm(AMDGPU::hi16);
9127 break;
9128 default:
9129 llvm_unreachable("unhandled s_pack_* instruction");
9130 }
9131
9132 MachineOperand &Dest = Inst.getOperand(0);
9133 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9134 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9135 return;
9136 }
9137
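// Without true16 instructions the packs are expanded with 32-bit VALU ops.
// In all four cases the first letter selects the half of src0 written to
// dest[15:0] and the second letter selects the half of src1 written to
// dest[31:16] (L = bits [15:0], H = bits [31:16]).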
9138 switch (Inst.getOpcode()) {
9139 case AMDGPU::S_PACK_LL_B32_B16: {
9140 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9141 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9142
9143 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9144 // 0.
9145 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9146 .addImm(0xffff);
9147
9148 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9149 .addReg(ImmReg, RegState::Kill)
9150 .add(Src0);
9151
9152 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9153 .add(Src1)
9154 .addImm(16)
9155 .addReg(TmpReg, RegState::Kill);
9156 break;
9157 }
9158 case AMDGPU::S_PACK_LH_B32_B16: {
9159 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9160 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9161 .addImm(0xffff);
9162 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9163 .addReg(ImmReg, RegState::Kill)
9164 .add(Src0)
9165 .add(Src1);
9166 break;
9167 }
9168 case AMDGPU::S_PACK_HL_B32_B16: {
9169 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9170 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9171 .addImm(16)
9172 .add(Src0);
9173 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9174 .add(Src1)
9175 .addImm(16)
9176 .addReg(TmpReg, RegState::Kill);
9177 break;
9178 }
9179 case AMDGPU::S_PACK_HH_B32_B16: {
9180 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9181 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9182 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9183 .addImm(16)
9184 .add(Src0);
9185 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9186 .addImm(0xffff0000);
9187 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9188 .add(Src1)
9189 .addReg(ImmReg, RegState::Kill)
9190 .addReg(TmpReg, RegState::Kill);
9191 break;
9192 }
9193 default:
9194 llvm_unreachable("unhandled s_pack_* instruction");
9195 }
9196
9197 MachineOperand &Dest = Inst.getOperand(0);
9198 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9199 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9200}
9201
9202void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
9203 MachineInstr &SCCDefInst,
9204 SIInstrWorklist &Worklist,
9205 Register NewCond) const {
9206
9207 // Ensure that def inst defines SCC, which is still live.
9208 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9209 !Op.isDead() && Op.getParent() == &SCCDefInst);
9210 SmallVector<MachineInstr *, 4> CopyToDelete;
9211 // This assumes that all the users of SCC are in the same block
9212 // as the SCC def.
9213 for (MachineInstr &MI : // Skip the def inst itself.
9214 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9215 SCCDefInst.getParent()->end())) {
9216 // Check if SCC is used first.
9217 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9218 if (SCCIdx != -1) {
9219 if (MI.isCopy()) {
9220 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9221 Register DestReg = MI.getOperand(0).getReg();
9222
9223 MRI.replaceRegWith(DestReg, NewCond);
9224 CopyToDelete.push_back(&MI);
9225 } else {
9226
9227 if (NewCond.isValid())
9228 MI.getOperand(SCCIdx).setReg(NewCond);
9229
9230 Worklist.insert(&MI);
9231 }
9232 }
9233 // Exit if we find another SCC def.
9234 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9235 break;
9236 }
9237 for (auto &Copy : CopyToDelete)
9238 Copy->eraseFromParent();
9239}
9240
9241// Instructions that use SCC may be converted to VALU instructions. When that
9242// happens, the SCC register is changed to VCC_LO. The instruction that defines
9243// SCC must be changed to an instruction that defines VCC. This function makes
9244// sure that the instruction that defines SCC is added to the moveToVALU
9245// worklist.
9246void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9247 SIInstrWorklist &Worklist) const {
9248 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9249 // then there is nothing to do because the defining instruction has been
9250 // converted to a VALU already. If SCC then that instruction needs to be
9251 // converted to a VALU.
9252 for (MachineInstr &MI :
9253 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9254 SCCUseInst->getParent()->rend())) {
9255 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9256 break;
9257 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9258 Worklist.insert(&MI);
9259 break;
9260 }
9261 }
9262}
9263
9264const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9265 const MachineInstr &Inst) const {
9266 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9267
9268 switch (Inst.getOpcode()) {
9269 // For target instructions, getOpRegClass just returns the virtual register
9270 // class associated with the operand, so we need to find an equivalent VGPR
9271 // register class in order to move the instruction to the VALU.
9272 case AMDGPU::COPY:
9273 case AMDGPU::PHI:
9274 case AMDGPU::REG_SEQUENCE:
9275 case AMDGPU::INSERT_SUBREG:
9276 case AMDGPU::WQM:
9277 case AMDGPU::SOFT_WQM:
9278 case AMDGPU::STRICT_WWM:
9279 case AMDGPU::STRICT_WQM: {
9280 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9281 if (RI.isAGPRClass(SrcRC)) {
9282 if (RI.isAGPRClass(NewDstRC))
9283 return nullptr;
9284
9285 switch (Inst.getOpcode()) {
9286 case AMDGPU::PHI:
9287 case AMDGPU::REG_SEQUENCE:
9288 case AMDGPU::INSERT_SUBREG:
9289 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9290 break;
9291 default:
9292 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9293 }
9294
9295 if (!NewDstRC)
9296 return nullptr;
9297 } else {
9298 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9299 return nullptr;
9300
9301 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9302 if (!NewDstRC)
9303 return nullptr;
9304 }
9305
9306 return NewDstRC;
9307 }
9308 default:
9309 return NewDstRC;
9310 }
9311}
9312
9313// Find the one SGPR operand we are allowed to use.
9314Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9315 int OpIndices[3]) const {
9316 const MCInstrDesc &Desc = MI.getDesc();
9317
9318 // Find the one SGPR operand we are allowed to use.
9319 //
9320 // First we need to consider the instruction's operand requirements before
9321 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9322 // of VCC, but we are still bound by the constant bus requirement to only use
9323 // one.
9324 //
9325 // If the operand's class is an SGPR, we can never move it.
9326
9327 Register SGPRReg = findImplicitSGPRRead(MI);
9328 if (SGPRReg)
9329 return SGPRReg;
9330
9331 Register UsedSGPRs[3] = {Register()};
9332 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9333
9334 for (unsigned i = 0; i < 3; ++i) {
9335 int Idx = OpIndices[i];
9336 if (Idx == -1)
9337 break;
9338
9339 const MachineOperand &MO = MI.getOperand(Idx);
9340 if (!MO.isReg())
9341 continue;
9342
9343 // Is this operand statically required to be an SGPR based on the operand
9344 // constraints?
9345 const TargetRegisterClass *OpRC =
9346 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9347 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9348 if (IsRequiredSGPR)
9349 return MO.getReg();
9350
9351 // If this could be a VGPR or an SGPR, check the dynamic register class.
9352 Register Reg = MO.getReg();
9353 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9354 if (RI.isSGPRClass(RegRC))
9355 UsedSGPRs[i] = Reg;
9356 }
9357
9358 // We don't have a required SGPR operand, so we have a bit more freedom in
9359 // selecting operands to move.
9360
9361 // Try to select the most used SGPR. If an SGPR is equal to one of the
9362 // others, we choose that.
9363 //
9364 // e.g.
9365 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9366 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9367
9368 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9369 // prefer those.
9370
9371 if (UsedSGPRs[0]) {
9372 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9373 SGPRReg = UsedSGPRs[0];
9374 }
9375
9376 if (!SGPRReg && UsedSGPRs[1]) {
9377 if (UsedSGPRs[1] == UsedSGPRs[2])
9378 SGPRReg = UsedSGPRs[1];
9379 }
9380
9381 return SGPRReg;
9382}
9383
9385 AMDGPU::OpName OperandName) const {
9386 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9387 return nullptr;
9388
9389 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9390 if (Idx == -1)
9391 return nullptr;
9392
9393 return &MI.getOperand(Idx);
9394}
9395
9396 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9397 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9398 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9399 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9400 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9401 return (Format << 44) |
9402 (1ULL << 56) | // RESOURCE_LEVEL = 1
9403 (3ULL << 60); // OOB_SELECT = 3
9404 }
9405
9406 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9407 if (ST.isAmdHsaOS()) {
9408 // Set ATC = 1. GFX9 doesn't have this bit.
9409 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9410 RsrcDataFormat |= (1ULL << 56);
9411
9412 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9413 // BTW, it disables TC L2 and therefore decreases performance.
9414 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9415 RsrcDataFormat |= (2ULL << 59);
9416 }
9417
9418 return RsrcDataFormat;
9419}
9420
9421 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9422 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9423 AMDGPU::RSRC_TID_ENABLE |
9424 0xffffffff; // Size;
9425
9426 // GFX9 doesn't have ELEMENT_SIZE.
9427 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9428 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9429 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9430 }
9431
9432 // IndexStride = 64 for wave64, 32 for wave32.
9433 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9434 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9435
9436 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9437 // Clear them unless we want a huge stride.
9438 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9439 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9440 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9441
9442 return Rsrc23;
9443}
9444
9445 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9446 unsigned Opc = MI.getOpcode();
9447
9448 return isSMRD(Opc);
9449}
9450
9451 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9452 return get(Opc).mayLoad() &&
9453 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9454}
9455
9456 Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9457 int &FrameIndex) const {
9458 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9459 if (!Addr || !Addr->isFI())
9460 return Register();
9461
9462 assert(!MI.memoperands_empty() &&
9463 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9464
9465 FrameIndex = Addr->getIndex();
9466 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9467}
9468
9469 Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9470 int &FrameIndex) const {
9471 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9472 assert(Addr && Addr->isFI());
9473 FrameIndex = Addr->getIndex();
9474 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9475}
9476
9477 Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9478 int &FrameIndex) const {
9479 if (!MI.mayLoad())
9480 return Register();
9481
9482 if (isMUBUF(MI) || isVGPRSpill(MI))
9483 return isStackAccess(MI, FrameIndex);
9484
9485 if (isSGPRSpill(MI))
9486 return isSGPRStackAccess(MI, FrameIndex);
9487
9488 return Register();
9489}
9490
9491 Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9492 int &FrameIndex) const {
9493 if (!MI.mayStore())
9494 return Register();
9495
9496 if (isMUBUF(MI) || isVGPRSpill(MI))
9497 return isStackAccess(MI, FrameIndex);
9498
9499 if (isSGPRSpill(MI))
9500 return isSGPRStackAccess(MI, FrameIndex);
9501
9502 return Register();
9503}
9504
9505 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9506 unsigned Size = 0;
9507 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9508 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9509 while (++I != E && I->isInsideBundle()) {
9510 assert(!I->isBundle() && "No nested bundle!");
9511 Size += getInstSizeInBytes(*I);
9512 }
9513
9514 return Size;
9515}
9516
9517 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9518 unsigned Opc = MI.getOpcode();
9519 const MCInstrDesc &Desc = get(Opc);
9520 unsigned DescSize = Desc.getSize();
9521
9522 // If we have a definitive size, we can use it. Otherwise we need to inspect
9523 // the operands to know the size.
9524 if (isFixedSize(MI)) {
9525 unsigned Size = DescSize;
9526
9527 // If we hit the buggy offset, an extra nop will be inserted in MC so
9528 // estimate the worst case.
9529 if (MI.isBranch() && ST.hasOffset3fBug())
9530 Size += 4;
9531
9532 return Size;
9533 }
9534
9535 // Instructions may have a 32-bit literal encoded after them. Check
9536 // operands that could ever be literals.
9537 if (isVALU(MI) || isSALU(MI)) {
9538 if (isDPP(MI))
9539 return DescSize;
9540 bool HasLiteral = false;
9541 unsigned LiteralSize = 4;
9542 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9543 const MachineOperand &Op = MI.getOperand(I);
9544 const MCOperandInfo &OpInfo = Desc.operands()[I];
9545 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9546 HasLiteral = true;
9547 if (ST.has64BitLiterals()) {
9548 switch (OpInfo.OperandType) {
9549 default:
9550 break;
9552 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9553 LiteralSize = 8;
9554 break;
9556 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9557 LiteralSize = 8;
9558 break;
9559 }
9560 }
9561 break;
9562 }
9563 }
9564 return HasLiteral ? DescSize + LiteralSize : DescSize;
9565 }
9566
9567 // Check whether we have extra NSA words.
9568 if (isMIMG(MI)) {
9569 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9570 if (VAddr0Idx < 0)
9571 return 8;
9572
9573 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9574 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9575 }
9576
9577 switch (Opc) {
9578 case TargetOpcode::BUNDLE:
9579 return getInstBundleSize(MI);
9580 case TargetOpcode::INLINEASM:
9581 case TargetOpcode::INLINEASM_BR: {
9582 const MachineFunction *MF = MI.getParent()->getParent();
9583 const char *AsmStr = MI.getOperand(0).getSymbolName();
9584 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9585 }
9586 default:
9587 if (MI.isMetaInstruction())
9588 return 0;
9589
9590 // If D16 Pseudo inst, get correct MC code size
9591 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9592 if (D16Info) {
9593 // Assume the d16_lo and d16_hi variants are always the same size.
9594 unsigned LoInstOpcode = D16Info->LoOp;
9595 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9596 DescSize = Desc.getSize();
9597 }
9598
9599 // If FMA Pseudo inst, get correct MC code size
9600 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9601 // All potential lowerings are the same size; arbitrarily pick one.
9602 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9603 DescSize = Desc.getSize();
9604 }
9605
9606 return DescSize;
9607 }
9608}
9609
9611 if (!isFLAT(MI))
9612 return false;
9613
9614 if (MI.memoperands_empty())
9615 return true;
9616
9617 for (const MachineMemOperand *MMO : MI.memoperands()) {
9618 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9619 return true;
9620 }
9621 return false;
9622}
9623
9624 ArrayRef<std::pair<int, const char *>>
9625 SIInstrInfo::getSerializableTargetIndices() const {
9626 static const std::pair<int, const char *> TargetIndices[] = {
9627 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9628 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9629 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9630 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9631 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9632 return ArrayRef(TargetIndices);
9633}
9634
9635/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9636/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9637 ScheduleHazardRecognizer *
9638 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9639 const ScheduleDAG *DAG) const {
9640 return new GCNHazardRecognizer(DAG->MF);
9641 }
9642 
9643/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9644/// pass.
9645 ScheduleHazardRecognizer *SIInstrInfo::CreateTargetPostRAHazardRecognizer(
9646 const MachineFunction &MF) const {
9647 return new GCNHazardRecognizer(MF);
9648 }
9649 
9650// Called during:
9651// - pre-RA scheduling and post-RA scheduling
9652 ScheduleHazardRecognizer *
9653 SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9654 const ScheduleDAGMI *DAG) const {
9655 // Borrowed from Arm Target
9656 // We would like to restrict this hazard recognizer to only
9657 // post-RA scheduling; we can tell that we're post-RA because we don't
9658 // track VRegLiveness.
9659 if (!DAG->hasVRegLiveness())
9660 return new GCNHazardRecognizer(DAG->MF);
9662}
9663
9664std::pair<unsigned, unsigned>
9665 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9666 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9667}
9668
9669 ArrayRef<std::pair<unsigned, const char *>>
9670 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9671 static const std::pair<unsigned, const char *> TargetFlags[] = {
9672 {MO_GOTPCREL, "amdgpu-gotprel"},
9673 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9674 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9675 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9676 {MO_REL32_LO, "amdgpu-rel32-lo"},
9677 {MO_REL32_HI, "amdgpu-rel32-hi"},
9678 {MO_REL64, "amdgpu-rel64"},
9679 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9680 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9681 {MO_ABS64, "amdgpu-abs64"},
9682 };
9683
9684 return ArrayRef(TargetFlags);
9685}
9686
9687 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9688 SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9689 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9690 {
9691 {MONoClobber, "amdgpu-noclobber"},
9692 {MOLastUse, "amdgpu-last-use"},
9693 {MOCooperative, "amdgpu-cooperative"},
9694 };
9695
9696 return ArrayRef(TargetFlags);
9697}
9698
9699 unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9700 const MachineFunction &MF) const {
9701 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9702 assert(SrcReg.isVirtual());
9703 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9704 return AMDGPU::WWM_COPY;
9705
9706 return AMDGPU::COPY;
9707}
9708
9709 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9710 Register Reg) const {
9711 // We need to handle instructions which may be inserted during register
9712 // allocation to handle the prolog. The initial prolog instruction may have
9713 // been separated from the start of the block by spills and copies inserted
9714 // needed by the prolog. However, the insertions for scalar registers can
9715 // always be placed at the BB top as they are independent of the exec mask
9716 // value.
9717 const MachineFunction *MF = MI.getParent()->getParent();
9718 bool IsNullOrVectorRegister = true;
9719 if (Reg) {
9720 const MachineRegisterInfo &MRI = MF->getRegInfo();
9721 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9722 }
9723
9724 uint16_t Opcode = MI.getOpcode();
9725 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9726 return IsNullOrVectorRegister &&
9727 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9728 (Opcode == AMDGPU::IMPLICIT_DEF &&
9729 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9730 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9731 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9732}
9733
9734 MachineInstrBuilder
9735 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9736 MachineBasicBlock::iterator I,
9737 const DebugLoc &DL,
9738 Register DestReg) const {
9739 if (ST.hasAddNoCarry())
9740 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9741
9742 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9743 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9744 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9745
9746 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9747 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9748}
9749
9750 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9751 MachineBasicBlock::iterator I,
9752 const DebugLoc &DL,
9753 Register DestReg,
9754 RegScavenger &RS) const {
9755 if (ST.hasAddNoCarry())
9756 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9757
9758 // If available, prefer to use vcc.
9759 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9760 ? Register(RI.getVCC())
9761 : RS.scavengeRegisterBackwards(
9762 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9763 0, /* AllowSpill */ false);
9764
9765 // TODO: Users need to deal with this.
9766 if (!UnusedCarry.isValid())
9767 return MachineInstrBuilder();
9768
9769 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9770 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9771}
9772
9773bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9774 switch (Opcode) {
9775 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9776 case AMDGPU::SI_KILL_I1_TERMINATOR:
9777 return true;
9778 default:
9779 return false;
9780 }
9781}
9782
9783 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
9784 switch (Opcode) {
9785 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9786 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9787 case AMDGPU::SI_KILL_I1_PSEUDO:
9788 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9789 default:
9790 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9791 }
9792}
9793
9794bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9795 return Imm <= getMaxMUBUFImmOffset(ST);
9796}
9797
9798 unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9799 // GFX12 field is non-negative 24-bit signed byte offset.
9800 const unsigned OffsetBits =
9801 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9802 return (1 << OffsetBits) - 1;
9803}
9804
9805 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9806 if (!ST.isWave32())
9807 return;
9808
9809 if (MI.isInlineAsm())
9810 return;
9811
9812 for (auto &Op : MI.implicit_operands()) {
9813 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9814 Op.setReg(AMDGPU::VCC_LO);
9815 }
9816}
9817
9818 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9819 if (!isSMRD(MI))
9820 return false;
9821
9822 // Check that it is using a buffer resource.
9823 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9824 if (Idx == -1) // e.g. s_memtime
9825 return false;
9826
9827 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
9828 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9829}
9830
9831// Given Imm, split it into the values to put into the SOffset and ImmOffset
9832// fields in an MUBUF instruction. Return false if it is not possible (due to a
9833// hardware bug needing a workaround).
9834//
9835// The required alignment ensures that individual address components remain
9836// aligned if they are aligned to begin with. It also ensures that additional
9837// offsets within the given alignment can be added to the resulting ImmOffset.
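//
// A worked example (assuming the pre-GFX12 4095-byte immediate limit and a
// 4-byte alignment, so MaxImm = 4092): splitting Imm = 5000 yields
// Low = ((5000 + 4) & 4095) = 908 and High = ((5000 + 4) & ~4095) = 4096, so
// ImmOffset = 908 and SOffset = 4096 - 4 = 4092; the two parts still sum to
// 5000 and each remains 4-byte aligned.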
9838 bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9839 uint32_t &ImmOffset, Align Alignment) const {
9840 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9841 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9842 uint32_t Overflow = 0;
9843
9844 if (Imm > MaxImm) {
9845 if (Imm <= MaxImm + 64) {
9846 // Use an SOffset inline constant for 4..64
9847 Overflow = Imm - MaxImm;
9848 Imm = MaxImm;
9849 } else {
9850 // Try to keep the same value in SOffset for adjacent loads, so that
9851 // the corresponding register contents can be re-used.
9852 //
9853 // Load values with all low-bits (except for alignment bits) set into
9854 // SOffset, so that a larger range of values can be covered using
9855 // s_movk_i32.
9856 //
9857 // Atomic operations fail to work correctly when individual address
9858 // components are unaligned, even if their sum is aligned.
9859 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9860 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9861 Imm = Low;
9862 Overflow = High - Alignment.value();
9863 }
9864 }
9865
9866 if (Overflow > 0) {
9867 // There is a hardware bug in SI and CI which prevents address clamping in
9868 // MUBUF instructions from working correctly with SOffsets. The immediate
9869 // offset is unaffected.
9870 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9871 return false;
9872
9873 // It is not possible to set immediate in SOffset field on some targets.
9874 if (ST.hasRestrictedSOffset())
9875 return false;
9876 }
9877
9878 ImmOffset = Imm;
9879 SOffset = Overflow;
9880 return true;
9881}
9882
9883// Depending on the used address space and instructions, some immediate offsets
9884// are allowed and some are not.
9885// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9886// scratch instruction offsets can also be negative. On GFX12, offsets can be
9887// negative for all variants.
9888//
9889// There are several bugs related to these offsets:
9890// On gfx10.1, flat instructions that go into the global address space cannot
9891// use an offset.
9892//
9893// For scratch instructions, the address can be either an SGPR or a VGPR.
9894// The following offsets can be used, depending on the architecture (x means
9895// cannot be used):
9896// +----------------------------+------+------+
9897// | Address-Mode | SGPR | VGPR |
9898// +----------------------------+------+------+
9899// | gfx9 | | |
9900// | negative, 4-aligned offset | x | ok |
9901// | negative, unaligned offset | x | ok |
9902// +----------------------------+------+------+
9903// | gfx10 | | |
9904// | negative, 4-aligned offset | ok | ok |
9905// | negative, unaligned offset | ok | x |
9906// +----------------------------+------+------+
9907// | gfx10.3 | | |
9908// | negative, 4-aligned offset | ok | ok |
9909// | negative, unaligned offset | ok | ok |
9910// +----------------------------+------+------+
9911//
9912// This function ignores the addressing mode, so if an offset cannot be used in
9913// one addressing mode, it is considered illegal.
9914bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9915 uint64_t FlatVariant) const {
9916 // TODO: Should 0 be special cased?
9917 if (!ST.hasFlatInstOffsets())
9918 return false;
9919
9920 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9921 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9922 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9923 return false;
9924
9925 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9926 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9927 (Offset % 4) != 0) {
9928 return false;
9929 }
9930
9931 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9932 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9933 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9934}
9935
9936// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
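// A small worked example (field widths are illustrative): with a 13-bit
// signed offset field, NumBits below is 12 and D = 4096. COffsetVal = 9000
// splits into RemainderOffset = 8192 and ImmField = 808; COffsetVal = -5000
// splits into RemainderOffset = -4096 and ImmField = -904. In both cases
// ImmField + RemainderOffset == COffsetVal.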
9937std::pair<int64_t, int64_t>
9938SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9939 uint64_t FlatVariant) const {
9940 int64_t RemainderOffset = COffsetVal;
9941 int64_t ImmField = 0;
9942
9943 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9944 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9945
9946 if (AllowNegative) {
9947 // Use signed division by a power of two to truncate towards 0.
9948 int64_t D = 1LL << NumBits;
9949 RemainderOffset = (COffsetVal / D) * D;
9950 ImmField = COffsetVal - RemainderOffset;
9951
9952 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9953 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9954 (ImmField % 4) != 0) {
9955 // Make ImmField a multiple of 4
9956 RemainderOffset += ImmField % 4;
9957 ImmField -= ImmField % 4;
9958 }
9959 } else if (COffsetVal >= 0) {
9960 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9961 RemainderOffset = COffsetVal - ImmField;
9962 }
9963
9964 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9965 assert(RemainderOffset + ImmField == COffsetVal);
9966 return {ImmField, RemainderOffset};
9967}
9968
9969 bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
9970 if (ST.hasNegativeScratchOffsetBug() &&
9971 FlatVariant == SIInstrFlags::FlatScratch)
9972 return false;
9973
9974 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9975}
9976
9977static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9978 switch (ST.getGeneration()) {
9979 default:
9980 break;
9981 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
9982 case AMDGPUSubtarget::SEA_ISLANDS:
9983 return SIEncodingFamily::SI;
9984 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
9985 case AMDGPUSubtarget::GFX9:
9986 return SIEncodingFamily::VI;
9987 case AMDGPUSubtarget::GFX10:
9988 return SIEncodingFamily::GFX10;
9989 case AMDGPUSubtarget::GFX11:
9990 return SIEncodingFamily::GFX11;
9991 case AMDGPUSubtarget::GFX12:
9992 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
9993 : SIEncodingFamily::GFX12;
9994 }
9995 llvm_unreachable("Unknown subtarget generation!");
9996}
9997
9998bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9999 switch(MCOp) {
10000 // These opcodes use indirect register addressing so
10001 // they need special handling by codegen (currently missing).
10002 // Therefore it is too risky to allow these opcodes
10003 // to be selected by dpp combiner or sdwa peepholer.
10004 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10005 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10006 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10007 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10008 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10009 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10010 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10011 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10012 return true;
10013 default:
10014 return false;
10015 }
10016}
10017
10018#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10019 case OPCODE##_dpp: \
10020 case OPCODE##_e32: \
10021 case OPCODE##_e64: \
10022 case OPCODE##_e64_dpp: \
10023 case OPCODE##_sdwa:
10024
10025static bool isRenamedInGFX9(int Opcode) {
10026 switch (Opcode) {
10027 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10028 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10029 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10030 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10031 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10032 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10033 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10034 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10035 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10036 //
10037 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10038 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10039 case AMDGPU::V_FMA_F16_gfx9_e64:
10040 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10041 case AMDGPU::V_INTERP_P2_F16:
10042 case AMDGPU::V_MAD_F16_e64:
10043 case AMDGPU::V_MAD_U16_e64:
10044 case AMDGPU::V_MAD_I16_e64:
10045 return true;
10046 default:
10047 return false;
10048 }
10049}
10050
10051int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10052 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
10053
10054 unsigned Gen = subtargetEncodingFamily(ST);
10055
10056 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10057 Gen = SIEncodingFamily::GFX9;
10058 
10059 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10060 // subtarget has UnpackedD16VMem feature.
10061 // TODO: remove this when we discard GFX80 encoding.
10062 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10063 Gen = SIEncodingFamily::GFX80;
10064 
10065 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10066 switch (ST.getGeneration()) {
10067 default:
10068 Gen = SIEncodingFamily::SDWA;
10069 break;
10070 case AMDGPUSubtarget::GFX9:
10071 Gen = SIEncodingFamily::SDWA9;
10072 break;
10073 case AMDGPUSubtarget::GFX10:
10074 Gen = SIEncodingFamily::SDWA10;
10075 break;
10076 }
10077 }
10078
10079 if (isMAI(Opcode)) {
10080 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10081 if (MFMAOp != -1)
10082 Opcode = MFMAOp;
10083 }
10084
10085 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10086
10087 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10089
10090 // -1 means that Opcode is already a native instruction.
10091 if (MCOp == -1)
10092 return Opcode;
10093
10094 if (ST.hasGFX90AInsts()) {
10095 uint16_t NMCOp = (uint16_t)-1;
10096 if (ST.hasGFX940Insts())
10097 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
10098 if (NMCOp == (uint16_t)-1)
10099 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
10100 if (NMCOp == (uint16_t)-1)
10101 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
10102 if (NMCOp != (uint16_t)-1)
10103 MCOp = NMCOp;
10104 }
10105
10106 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10107 // no encoding in the given subtarget generation.
10108 if (MCOp == (uint16_t)-1)
10109 return -1;
10110
10111 if (isAsmOnlyOpcode(MCOp))
10112 return -1;
10113
10114 return MCOp;
10115}
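// A minimal standalone sketch of the encoding-family fallback performed above,
// using plain integers in place of the TableGen-generated getMCOpcode() tables.
// The EncodingTable type and the lookupMCOpcode()/resolveOpcode() helpers are
// illustrative, not LLVM API; the point is the (uint16_t)-1 "no encoding"
// sentinel and the GFX940 -> GFX90A -> GFX9 preference order.
#include <cstdint>
#include <map>
#include <utility>

namespace sketch {

constexpr uint16_t NoEncoding = static_cast<uint16_t>(-1);

// Maps (pseudo opcode, encoding family) to a real MC opcode, if one exists.
using EncodingTable = std::map<std::pair<int, int>, uint16_t>;

uint16_t lookupMCOpcode(const EncodingTable &Table, int Pseudo, int Family) {
  auto It = Table.find({Pseudo, Family});
  return It == Table.end() ? NoEncoding : It->second;
}

// Mirrors the hasGFX90AInsts() block: try the most specific family first and
// fall back to progressively more generic ones; -1 means "no encoding here".
int resolveOpcode(const EncodingTable &Table, int Pseudo, bool HasGFX940) {
  uint16_t MCOp = NoEncoding;
  if (HasGFX940)
    MCOp = lookupMCOpcode(Table, Pseudo, /*GFX940=*/3);
  if (MCOp == NoEncoding)
    MCOp = lookupMCOpcode(Table, Pseudo, /*GFX90A=*/2);
  if (MCOp == NoEncoding)
    MCOp = lookupMCOpcode(Table, Pseudo, /*GFX9=*/1);
  return MCOp == NoEncoding ? -1 : MCOp;
}

} // namespace sketch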
10116
10117static
10118TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10119 assert(RegOpnd.isReg());
10120 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10121 getRegSubRegPair(RegOpnd);
10122}
10123
10124TargetInstrInfo::RegSubRegPair llvm::getRegSequenceSubReg(MachineInstr &MI,
10125 unsigned SubReg) {
10126 assert(MI.isRegSequence());
10127 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10128 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10129 auto &RegOp = MI.getOperand(1 + 2 * I);
10130 return getRegOrUndef(RegOp);
10131 }
10132 return TargetInstrInfo::RegSubRegPair();
10133}
10134
10135// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10136// Following a subreg of reg:subreg isn't supported
10137static bool followSubRegDef(MachineInstr &MI,
10138 TargetInstrInfo::RegSubRegPair &RSR) {
10139 if (!RSR.SubReg)
10140 return false;
10141 switch (MI.getOpcode()) {
10142 default: break;
10143 case AMDGPU::REG_SEQUENCE:
10144 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10145 return true;
10146 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10147 case AMDGPU::INSERT_SUBREG:
10148 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10149 // inserted the subreg we're looking for
10150 RSR = getRegOrUndef(MI.getOperand(2));
10151 else { // the subreg in the rest of the reg
10152 auto R1 = getRegOrUndef(MI.getOperand(1));
10153 if (R1.SubReg) // subreg of subreg isn't supported
10154 return false;
10155 RSR.Reg = R1.Reg;
10156 }
10157 return true;
10158 }
10159 return false;
10160}
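// A standalone sketch of the subregister lookup that getRegSequenceSubReg()
// and followSubRegDef() perform above: a REG_SEQUENCE is modeled as a flat
// list of (source register, subregister index) pairs, and the query returns
// the register placed at the requested index. The RegSeqEntry type is
// illustrative, not LLVM API.
#include <optional>
#include <vector>

namespace sketch {

struct RegSeqEntry {
  unsigned Reg;    // source register of this REG_SEQUENCE operand
  unsigned SubIdx; // subregister index it is inserted at
};

std::optional<unsigned> findSubRegSource(const std::vector<RegSeqEntry> &Seq,
                                         unsigned WantedSubIdx) {
  for (const RegSeqEntry &E : Seq)
    if (E.SubIdx == WantedSubIdx)
      return E.Reg;    // this operand defines the requested subregister
  return std::nullopt; // the sequence does not cover the requested subregister
}

} // namespace sketch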
10161
10162MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10163 MachineRegisterInfo &MRI) {
10164 assert(MRI.isSSA());
10165 if (!P.Reg.isVirtual())
10166 return nullptr;
10167
10168 auto RSR = P;
10169 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10170 while (auto *MI = DefInst) {
10171 DefInst = nullptr;
10172 switch (MI->getOpcode()) {
10173 case AMDGPU::COPY:
10174 case AMDGPU::V_MOV_B32_e32: {
10175 auto &Op1 = MI->getOperand(1);
10176 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10177 if (Op1.isUndef())
10178 return nullptr;
10179 RSR = getRegSubRegPair(Op1);
10180 DefInst = MRI.getVRegDef(RSR.Reg);
10181 }
10182 break;
10183 }
10184 default:
10185 if (followSubRegDef(*MI, RSR)) {
10186 if (!RSR.Reg)
10187 return nullptr;
10188 DefInst = MRI.getVRegDef(RSR.Reg);
10189 }
10190 }
10191 if (!DefInst)
10192 return MI;
10193 }
10194 return nullptr;
10195}
10196
10197bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10198 Register VReg,
10199 const MachineInstr &DefMI,
10200 const MachineInstr &UseMI) {
10201 assert(MRI.isSSA() && "Must be run on SSA");
10202
10203 auto *TRI = MRI.getTargetRegisterInfo();
10204 auto *DefBB = DefMI.getParent();
10205
10206 // Don't bother searching between blocks, although it is possible this block
10207 // doesn't modify exec.
10208 if (UseMI.getParent() != DefBB)
10209 return true;
10210
10211 const int MaxInstScan = 20;
10212 int NumInst = 0;
10213
10214 // Stop scan at the use.
10215 auto E = UseMI.getIterator();
10216 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10217 if (I->isDebugInstr())
10218 continue;
10219
10220 if (++NumInst > MaxInstScan)
10221 return true;
10222
10223 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10224 return true;
10225 }
10226
10227 return false;
10228}
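// A standalone sketch of the bounded scan above: walk the instructions
// strictly between the def and the use, skip debug instructions, answer
// conservatively ("EXEC may change") once a small budget is exhausted, and
// otherwise report whether any scanned instruction writes EXEC. The Inst
// record is illustrative, not LLVM API.
#include <cstddef>
#include <vector>

namespace sketch {

struct Inst {
  bool IsDebug = false;
  bool WritesExec = false;
};

bool execMayChangeBetween(const std::vector<Inst> &Block, size_t DefIdx,
                          size_t UseIdx, int MaxInstScan = 20) {
  int NumInst = 0;
  for (size_t I = DefIdx + 1; I < UseIdx; ++I) {
    if (Block[I].IsDebug)
      continue;
    if (++NumInst > MaxInstScan)
      return true; // scan budget exhausted: assume the worst
    if (Block[I].WritesExec)
      return true;
  }
  return false;
}

} // namespace sketch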
10229
10230bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10231 Register VReg,
10232 const MachineInstr &DefMI) {
10233 assert(MRI.isSSA() && "Must be run on SSA");
10234
10235 auto *TRI = MRI.getTargetRegisterInfo();
10236 auto *DefBB = DefMI.getParent();
10237
10238 const int MaxUseScan = 10;
10239 int NumUse = 0;
10240
10241 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10242 auto &UseInst = *Use.getParent();
10243 // Don't bother searching between blocks, although it is possible this block
10244 // doesn't modify exec.
10245 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10246 return true;
10247
10248 if (++NumUse > MaxUseScan)
10249 return true;
10250 }
10251
10252 if (NumUse == 0)
10253 return false;
10254
10255 const int MaxInstScan = 20;
10256 int NumInst = 0;
10257
10258 // Stop scan when we have seen all the uses.
10259 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10260 assert(I != DefBB->end());
10261
10262 if (I->isDebugInstr())
10263 continue;
10264
10265 if (++NumInst > MaxInstScan)
10266 return true;
10267
10268 for (const MachineOperand &Op : I->operands()) {
10269 // We don't check reg masks here as they're used only on calls:
10270 // 1. EXEC is only considered const within one BB
10271 // 2. Call should be a terminator instruction if present in a BB
10272
10273 if (!Op.isReg())
10274 continue;
10275
10276 Register Reg = Op.getReg();
10277 if (Op.isUse()) {
10278 if (Reg == VReg && --NumUse == 0)
10279 return false;
10280 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10281 return true;
10282 }
10283 }
10284}
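// A standalone sketch of the "scan until every use is seen" variant above:
// the in-block uses are counted first, then the block is walked forward from
// the def, decrementing the count at each use and failing conservatively if
// EXEC is written (or the budget runs out) before the count reaches zero.
// The Inst record is illustrative, not LLVM API.
#include <cstddef>
#include <vector>

namespace sketch {

struct Inst {
  bool IsDebug = false;
  bool WritesExec = false;
  int UsesOfVReg = 0; // how many operands of this instruction read the vreg
};

bool execMayChangeBeforeAnyUse(const std::vector<Inst> &Block, size_t DefIdx,
                               int NumUse, int MaxInstScan = 20) {
  if (NumUse == 0)
    return false; // no uses to protect
  int NumInst = 0;
  for (size_t I = DefIdx + 1; I < Block.size(); ++I) {
    if (Block[I].IsDebug)
      continue;
    if (++NumInst > MaxInstScan)
      return true; // scan budget exhausted: assume the worst
    NumUse -= Block[I].UsesOfVReg;
    if (NumUse <= 0)
      return false; // all uses seen before EXEC was modified
    if (Block[I].WritesExec)
      return true;
  }
  return true; // uses not all found in this block: be conservative
}

} // namespace sketch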
10285
10286MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10287 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10288 const DebugLoc &DL, Register Src, Register Dst) const {
10289 auto Cur = MBB.begin();
10290 if (Cur != MBB.end())
10291 do {
10292 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10293 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10294 ++Cur;
10295 } while (Cur != MBB.end() && Cur != LastPHIIt);
10296
10297 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10298 Dst);
10299}
10300
10301MachineInstr *SIInstrInfo::createPHISourceCopy(
10302 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10303 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10304 if (InsPt != MBB.end() &&
10305 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10306 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10307 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10308 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10309 InsPt++;
10310 return BuildMI(MBB, InsPt, DL,
10311 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10312 .addReg(Src, 0, SrcSubReg)
10313 .addReg(AMDGPU::EXEC, RegState::Implicit);
10314 }
10315 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10316 Dst);
10317}
10318
10319bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10320
10321MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10322 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10323 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10324 VirtRegMap *VRM) const {
10325 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10326 //
10327 // %0:sreg_32 = COPY $m0
10328 //
10329 // We explicitly chose SReg_32 for the virtual register so such a copy might
10330 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10331 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10332 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10333 // TargetInstrInfo::foldMemoryOperand() is going to try.
10334 // A similar issue also exists with spilling and reloading $exec registers.
10335 //
10336 // To prevent that, constrain the %0 register class here.
10337 if (isFullCopyInstr(MI)) {
10338 Register DstReg = MI.getOperand(0).getReg();
10339 Register SrcReg = MI.getOperand(1).getReg();
10340 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10341 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10342 MachineRegisterInfo &MRI = MF.getRegInfo();
10343 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10344 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10345 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10346 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10347 return nullptr;
10348 }
10349 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10350 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10351 return nullptr;
10352 }
10353 }
10354 }
10355
10356 return nullptr;
10357}
10358
10359unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10360 const MachineInstr &MI,
10361 unsigned *PredCost) const {
10362 if (MI.isBundle()) {
10363 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10364 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10365 unsigned Lat = 0, Count = 0;
10366 for (++I; I != E && I->isBundledWithPred(); ++I) {
10367 ++Count;
10368 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10369 }
10370 return Lat + Count - 1;
10371 }
10372
10373 return SchedModel.computeInstrLatency(&MI);
10374}
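// A standalone sketch of the bundle-latency formula above: a bundle's latency
// is approximated as the longest member latency plus one cycle per additional
// bundled instruction, i.e. Lat + Count - 1. The latency values are purely
// illustrative.
#include <algorithm>
#include <cassert>
#include <vector>

namespace sketch {

unsigned bundleLatency(const std::vector<unsigned> &MemberLatencies) {
  assert(!MemberLatencies.empty() && "a bundle has at least one member");
  unsigned Lat = 0, Count = 0;
  for (unsigned L : MemberLatencies) {
    ++Count;
    Lat = std::max(Lat, L);
  }
  return Lat + Count - 1; // e.g. {4, 1, 1} -> 4 + 3 - 1 = 6
}

} // namespace sketch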
10375
10376InstructionUniformity
10377SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10378 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10379 unsigned Opcode = MI.getOpcode();
10380
10381 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10382 Register Dst = MI.getOperand(0).getReg();
10383 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10384 : MI.getOperand(1).getReg();
10385 LLT DstTy = MRI.getType(Dst);
10386 LLT SrcTy = MRI.getType(Src);
10387 unsigned DstAS = DstTy.getAddressSpace();
10388 unsigned SrcAS = SrcTy.getAddressSpace();
10389 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10390 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10391 ST.hasGloballyAddressableScratch()
10392 ? InstructionUniformity::NeverUniform
10393 : InstructionUniformity::Default;
10394 };
10395
10396 // If the target supports globally addressable scratch, the mapping from
10397 // scratch memory to the flat aperture changes; therefore, an address space
10398 // cast is no longer uniform.
10399 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10400 return HandleAddrSpaceCast(MI);
10401
10402 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10403 auto IID = GI->getIntrinsicID();
10404 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10405 return InstructionUniformity::NeverUniform;
10406 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10407 return InstructionUniformity::AlwaysUniform;
10408
10409 switch (IID) {
10410 case Intrinsic::amdgcn_addrspacecast_nonnull:
10411 return HandleAddrSpaceCast(MI);
10412 case Intrinsic::amdgcn_if:
10413 case Intrinsic::amdgcn_else:
10414 // FIXME: Uniform if second result
10415 break;
10416 }
10417
10418 return InstructionUniformity::Default;
10419 }
10420
10421 // Loads from the private and flat address spaces are divergent, because
10422 // threads can execute the load instruction with the same inputs and get
10423 // different results.
10424 //
10425 // All other loads are not divergent, because if threads issue loads with the
10426 // same arguments, they will always get the same result.
10427 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10428 Opcode == AMDGPU::G_SEXTLOAD) {
10429 if (MI.memoperands_empty())
10430 return InstructionUniformity::NeverUniform; // conservative assumption
10431
10432 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10433 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10434 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10435 })) {
10436 // At least one MMO in a non-global address space.
10437 return InstructionUniformity::NeverUniform;
10438 }
10439 return InstructionUniformity::Default;
10440 }
10441
10442 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10443 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10444 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10445 AMDGPU::isGenericAtomic(Opcode)) {
10446 return InstructionUniformity::NeverUniform;
10447 }
10448 return InstructionUniformity::Default;
10449}
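// A standalone sketch of the load-divergence rule used above: a generic load
// is treated as never uniform if it has no memory operands (conservative) or
// if any memory operand may touch the private or flat address spaces. The
// enums are illustrative stand-ins for AMDGPUAS and InstructionUniformity.
#include <vector>

namespace sketch {

enum class AddrSpace { Global, Constant, Local, Private, Flat };
enum class Uniformity { Default, AlwaysUniform, NeverUniform };

Uniformity classifyLoad(const std::vector<AddrSpace> &MemOperandSpaces) {
  if (MemOperandSpaces.empty())
    return Uniformity::NeverUniform; // unknown address space: be conservative
  for (AddrSpace AS : MemOperandSpaces)
    if (AS == AddrSpace::Private || AS == AddrSpace::Flat)
      return Uniformity::NeverUniform; // may read per-lane stack or flat data
  return Uniformity::Default; // uniformity is decided by the address operands
}

} // namespace sketch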
10450
10451InstructionUniformity
10452SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10453
10454 if (isNeverUniform(MI))
10455 return InstructionUniformity::NeverUniform;
10456
10457 unsigned opcode = MI.getOpcode();
10458 if (opcode == AMDGPU::V_READLANE_B32 ||
10459 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10460 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10461 return InstructionUniformity::AlwaysUniform;
10462
10463 if (isCopyInstr(MI)) {
10464 const MachineOperand &srcOp = MI.getOperand(1);
10465 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10466 const TargetRegisterClass *regClass =
10467 RI.getPhysRegBaseClass(srcOp.getReg());
10468 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10469 : InstructionUniformity::NeverUniform;
10470 }
10471 return InstructionUniformity::Default;
10472 }
10473
10474 // GMIR handling
10475 if (MI.isPreISelOpcode())
10476 return getGenericInstructionUniformity(MI);
10477
10478 // Atomics are divergent because they are executed sequentially: when an
10479 // atomic operation refers to the same address in each thread, then each
10480 // thread after the first sees the value written by the previous thread as
10481 // original value.
10482
10483 if (isAtomic(MI))
10484 return InstructionUniformity::NeverUniform;
10485
10486 // Loads from the private and flat address spaces are divergent, because
10487 // threads can execute the load instruction with the same inputs and get
10488 // different results.
10489 if (isFLAT(MI) && MI.mayLoad()) {
10490 if (MI.memoperands_empty())
10491 return InstructionUniformity::NeverUniform; // conservative assumption
10492
10493 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10494 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10495 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10496 })) {
10497 // At least one MMO in a non-global address space.
10498 return InstructionUniformity::NeverUniform;
10499 }
10500
10501 return InstructionUniformity::Default;
10502 }
10503
10504 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10505 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10506
10507 // FIXME: It's conceptually broken to report this for an instruction, and not
10508 // a specific def operand. For inline asm in particular, there could be mixed
10509 // uniform and divergent results.
10510 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10511 const MachineOperand &SrcOp = MI.getOperand(I);
10512 if (!SrcOp.isReg())
10513 continue;
10514
10515 Register Reg = SrcOp.getReg();
10516 if (!Reg || !SrcOp.readsReg())
10517 continue;
10518
10519 // If RegBank is null, this is unassigned or an unallocatable special
10520 // register, which are all scalars.
10521 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10522 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10523 return InstructionUniformity::NeverUniform;
10524 }
10525
10526 // TODO: Uniformity check conditions above can be rearranged for more
10527 // readability
10528
10529 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10530 // currently turned into no-op COPYs by SelectionDAG ISel and are
10531 // therefore no longer recognizable.
10532
10533 return InstructionUniformity::Default;
10534}
10535
10536unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10537 switch (MF.getFunction().getCallingConv()) {
10538 case CallingConv::AMDGPU_PS:
10539 return 1;
10540 case CallingConv::AMDGPU_VS:
10541 return 2;
10542 case CallingConv::AMDGPU_GS:
10543 return 3;
10544 case CallingConv::AMDGPU_HS:
10545 case CallingConv::AMDGPU_LS:
10546 case CallingConv::AMDGPU_ES: {
10547 const Function &F = MF.getFunction();
10548 F.getContext().diagnose(DiagnosticInfoUnsupported(
10549 F, "ds_ordered_count unsupported for this calling conv"));
10550 [[fallthrough]];
10551 }
10552 case CallingConv::AMDGPU_CS:
10553 case CallingConv::AMDGPU_KERNEL:
10554 case CallingConv::C:
10555 case CallingConv::Fast:
10556 default:
10557 // Assume other calling conventions are various compute callable functions
10558 return 0;
10559 }
10560}
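// A standalone sketch of the mapping above, assuming the shader-type values
// reconstructed in the switch (PS = 1, VS = 2, GS = 3, everything compute-like
// = 0). The ShaderCC enum is an illustrative stand-in for llvm::CallingConv.
namespace sketch {

enum class ShaderCC { PS, VS, GS, CS, Kernel, C, Other };

unsigned dsShaderTypeValue(ShaderCC CC) {
  switch (CC) {
  case ShaderCC::PS:
    return 1;
  case ShaderCC::VS:
    return 2;
  case ShaderCC::GS:
    return 3;
  default:
    return 0; // compute-like calling conventions all report 0
  }
}

} // namespace sketch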
10561
10562bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10563 Register &SrcReg2, int64_t &CmpMask,
10564 int64_t &CmpValue) const {
10565 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10566 return false;
10567
10568 switch (MI.getOpcode()) {
10569 default:
10570 break;
10571 case AMDGPU::S_CMP_EQ_U32:
10572 case AMDGPU::S_CMP_EQ_I32:
10573 case AMDGPU::S_CMP_LG_U32:
10574 case AMDGPU::S_CMP_LG_I32:
10575 case AMDGPU::S_CMP_LT_U32:
10576 case AMDGPU::S_CMP_LT_I32:
10577 case AMDGPU::S_CMP_GT_U32:
10578 case AMDGPU::S_CMP_GT_I32:
10579 case AMDGPU::S_CMP_LE_U32:
10580 case AMDGPU::S_CMP_LE_I32:
10581 case AMDGPU::S_CMP_GE_U32:
10582 case AMDGPU::S_CMP_GE_I32:
10583 case AMDGPU::S_CMP_EQ_U64:
10584 case AMDGPU::S_CMP_LG_U64:
10585 SrcReg = MI.getOperand(0).getReg();
10586 if (MI.getOperand(1).isReg()) {
10587 if (MI.getOperand(1).getSubReg())
10588 return false;
10589 SrcReg2 = MI.getOperand(1).getReg();
10590 CmpValue = 0;
10591 } else if (MI.getOperand(1).isImm()) {
10592 SrcReg2 = Register();
10593 CmpValue = MI.getOperand(1).getImm();
10594 } else {
10595 return false;
10596 }
10597 CmpMask = ~0;
10598 return true;
10599 case AMDGPU::S_CMPK_EQ_U32:
10600 case AMDGPU::S_CMPK_EQ_I32:
10601 case AMDGPU::S_CMPK_LG_U32:
10602 case AMDGPU::S_CMPK_LG_I32:
10603 case AMDGPU::S_CMPK_LT_U32:
10604 case AMDGPU::S_CMPK_LT_I32:
10605 case AMDGPU::S_CMPK_GT_U32:
10606 case AMDGPU::S_CMPK_GT_I32:
10607 case AMDGPU::S_CMPK_LE_U32:
10608 case AMDGPU::S_CMPK_LE_I32:
10609 case AMDGPU::S_CMPK_GE_U32:
10610 case AMDGPU::S_CMPK_GE_I32:
10611 SrcReg = MI.getOperand(0).getReg();
10612 SrcReg2 = Register();
10613 CmpValue = MI.getOperand(1).getImm();
10614 CmpMask = ~0;
10615 return true;
10616 }
10617
10618 return false;
10619}
10620
10621bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10622 Register SrcReg2, int64_t CmpMask,
10623 int64_t CmpValue,
10624 const MachineRegisterInfo *MRI) const {
10625 if (!SrcReg || SrcReg.isPhysical())
10626 return false;
10627
10628 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10629 return false;
10630
10631 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10632 this](int64_t ExpectedValue, unsigned SrcSize,
10633 bool IsReversible, bool IsSigned) -> bool {
10634 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10635 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10636 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10637 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10638 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10639 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10640 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10641 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10642 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10643 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10644 //
10645 // Signed ge/gt are not used for the sign bit.
10646 //
10647 // If result of the AND is unused except in the compare:
10648 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10649 //
10650 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10651 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10652 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10653 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10654 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10655 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
10656
10657 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10658 if (!Def || Def->getParent() != CmpInstr.getParent())
10659 return false;
10660
10661 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10662 Def->getOpcode() != AMDGPU::S_AND_B64)
10663 return false;
10664
10665 int64_t Mask;
10666 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10667 if (MO->isImm())
10668 Mask = MO->getImm();
10669 else if (!getFoldableImm(MO, Mask))
10670 return false;
10671 Mask &= maxUIntN(SrcSize);
10672 return isPowerOf2_64(Mask);
10673 };
10674
10675 MachineOperand *SrcOp = &Def->getOperand(1);
10676 if (isMask(SrcOp))
10677 SrcOp = &Def->getOperand(2);
10678 else if (isMask(&Def->getOperand(2)))
10679 SrcOp = &Def->getOperand(1);
10680 else
10681 return false;
10682
10683 // A valid Mask is required to have a single bit set, hence a non-zero and
10684 // power-of-two value. This verifies that we will not do 64-bit shift below.
10685 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10686 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10687 if (IsSigned && BitNo == SrcSize - 1)
10688 return false;
10689
10690 ExpectedValue <<= BitNo;
10691
10692 bool IsReversedCC = false;
10693 if (CmpValue != ExpectedValue) {
10694 if (!IsReversible)
10695 return false;
10696 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10697 if (!IsReversedCC)
10698 return false;
10699 }
10700
10701 Register DefReg = Def->getOperand(0).getReg();
10702 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10703 return false;
10704
10705 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10706 I != E; ++I) {
10707 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10708 I->killsRegister(AMDGPU::SCC, &RI))
10709 return false;
10710 }
10711
10712 MachineOperand *SccDef =
10713 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10714 SccDef->setIsDead(false);
10715 CmpInstr.eraseFromParent();
10716
10717 if (!MRI->use_nodbg_empty(DefReg)) {
10718 assert(!IsReversedCC);
10719 return true;
10720 }
10721
10722 // Replace AND with unused result with a S_BITCMP.
10723 MachineBasicBlock *MBB = Def->getParent();
10724
10725 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10726 : AMDGPU::S_BITCMP1_B32
10727 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10728 : AMDGPU::S_BITCMP1_B64;
10729
10730 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10731 .add(*SrcOp)
10732 .addImm(BitNo);
10733 Def->eraseFromParent();
10734
10735 return true;
10736 };
10737
10738 switch (CmpInstr.getOpcode()) {
10739 default:
10740 break;
10741 case AMDGPU::S_CMP_EQ_U32:
10742 case AMDGPU::S_CMP_EQ_I32:
10743 case AMDGPU::S_CMPK_EQ_U32:
10744 case AMDGPU::S_CMPK_EQ_I32:
10745 return optimizeCmpAnd(1, 32, true, false);
10746 case AMDGPU::S_CMP_GE_U32:
10747 case AMDGPU::S_CMPK_GE_U32:
10748 return optimizeCmpAnd(1, 32, false, false);
10749 case AMDGPU::S_CMP_GE_I32:
10750 case AMDGPU::S_CMPK_GE_I32:
10751 return optimizeCmpAnd(1, 32, false, true);
10752 case AMDGPU::S_CMP_EQ_U64:
10753 return optimizeCmpAnd(1, 64, true, false);
10754 case AMDGPU::S_CMP_LG_U32:
10755 case AMDGPU::S_CMP_LG_I32:
10756 case AMDGPU::S_CMPK_LG_U32:
10757 case AMDGPU::S_CMPK_LG_I32:
10758 return optimizeCmpAnd(0, 32, true, false);
10759 case AMDGPU::S_CMP_GT_U32:
10760 case AMDGPU::S_CMPK_GT_U32:
10761 return optimizeCmpAnd(0, 32, false, false);
10762 case AMDGPU::S_CMP_GT_I32:
10763 case AMDGPU::S_CMPK_GT_I32:
10764 return optimizeCmpAnd(0, 32, false, true);
10765 case AMDGPU::S_CMP_LG_U64:
10766 return optimizeCmpAnd(0, 64, true, false);
10767 }
10768
10769 return false;
10770}
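// A standalone sketch of the mask bookkeeping optimizeCmpAnd() relies on
// above: the AND mask must isolate a single bit, the expected value is shifted
// to that bit position, and a mismatched compare value is still foldable when
// it matches the reversed condition (expected value XOR mask). Plain integers
// stand in for MachineOperands; the names are illustrative, not LLVM API.
#include <bit>
#include <cstdint>
#include <optional>

namespace sketch {

struct FoldInfo {
  unsigned BitNo;  // the bit the AND isolates
  bool ReversedCC; // folding requires inverting the compare's sense
};

std::optional<FoldInfo> classifyCmpAnd(uint64_t Mask, int64_t ExpectedValue,
                                       int64_t CmpValue, unsigned SrcSize,
                                       bool IsReversible, bool IsSigned) {
  if (SrcSize < 64)
    Mask &= (uint64_t(1) << SrcSize) - 1; // keep only in-width bits
  if (!std::has_single_bit(Mask))
    return std::nullopt; // not a single-bit test
  unsigned BitNo = std::countr_zero(Mask);
  if (IsSigned && BitNo == SrcSize - 1)
    return std::nullopt; // signed ge/gt are not used for the sign bit
  ExpectedValue <<= BitNo;

  bool ReversedCC = false;
  if (CmpValue != ExpectedValue) {
    if (!IsReversible)
      return std::nullopt;
    ReversedCC = (CmpValue == (ExpectedValue ^ int64_t(Mask)));
    if (!ReversedCC)
      return std::nullopt;
  }
  return FoldInfo{BitNo, ReversedCC};
}

} // namespace sketch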
10771
10772void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
10773 AMDGPU::OpName OpName) const {
10774 if (!ST.needsAlignedVGPRs())
10775 return;
10776
10777 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10778 if (OpNo < 0)
10779 return;
10780 MachineOperand &Op = MI.getOperand(OpNo);
10781 if (getOpSize(MI, OpNo) > 4)
10782 return;
10783
10784 // Add implicit aligned super-reg to force alignment on the data operand.
10785 const DebugLoc &DL = MI.getDebugLoc();
10786 MachineBasicBlock *BB = MI.getParent();
10787 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10788 Register DataReg = Op.getReg();
10789 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10790 Register Undef = MRI.createVirtualRegister(
10791 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10792 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10793 Register NewVR =
10794 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10795 : &AMDGPU::VReg_64_Align2RegClass);
10796 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10797 .addReg(DataReg, 0, Op.getSubReg())
10798 .addImm(AMDGPU::sub0)
10799 .addReg(Undef)
10800 .addImm(AMDGPU::sub1);
10801 Op.setReg(NewVR);
10802 Op.setSubReg(AMDGPU::sub0);
10803 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10804}
10805
10806bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
10807 if (isIGLP(*MI))
10808 return false;
10809
10810 return TargetInstrInfo::isGlobalMemoryObject(MI);
10811}
10812
10813bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
10814 if (!isWMMA(MI) && !isSWMMAC(MI))
10815 return false;
10816
10817 if (AMDGPU::isGFX1250(ST))
10818 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
10819
10820 return true;
10821}
10822
10823bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
10824 unsigned Opcode = MI.getOpcode();
10825
10826 if (AMDGPU::isGFX12Plus(ST))
10827 return isDOT(MI) || isXDLWMMA(MI);
10828
10829 if (!isMAI(MI) || isDGEMM(Opcode) ||
10830 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10831 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10832 return false;
10833
10834 if (!ST.hasGFX940Insts())
10835 return true;
10836
10837 return AMDGPU::getMAIIsGFX940XDL(Opcode);
10838}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:221
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasAddNoCarry() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:87
uint8_t OperandType
Information about the type of the operand.
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:96
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:102
constexpr bool isValid() const
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI) const override
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is a instruction that moves/copies value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst for 16-bit SALU-to-VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by the assembler.
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns true if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get the required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
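A minimal usage sketch of the named-operand accessor above (not taken from this file); the helper name getSrc0Imm and the surrounding setup are illustrative assumptions, assuming the usual AMDGPU headers are available.
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include <optional>
// Hypothetical helper, for illustration only: return the immediate value of
// MI's src0 operand, if it has one, using SIInstrInfo::getNamedOperand.
static std::optional<int64_t> getSrc0Imm(const llvm::SIInstrInfo &TII,
                                         llvm::MachineInstr &MI) {
  if (llvm::MachineOperand *Src0 =
          TII.getNamedOperand(MI, llvm::AMDGPU::OpName::src0))
    if (Src0->isImm())
      return Src0->getImm();
  return std::nullopt;
}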
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions, the offset must be positive; the MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
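A hedged illustration of the check above (declared in the AMDGPU base-info utilities); the specific values are examples of the usual inline-constant behavior, not a specification.
// Sketch: 64 falls in the small-integer inline range, while an arbitrary
// 32-bit value such as 1234567 is expected to require a literal constant.
bool Inlinable = llvm::AMDGPU::isInlinableLiteral32(64, /*HasInv2Pi=*/true);
bool NeedsLiteral = !llvm::AMDGPU::isInlinableLiteral32(1234567, /*HasInv2Pi=*/true);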
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values intended for floating-point types.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:574
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:576
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:573
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:575
@ TI_CONSTDATA_START
Definition AMDGPU.h:572
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:72
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:70
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:71
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:62
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:73
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:207
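A one-line illustration of the 2^N - 1 formula implied by the description above; the header path follows the definition reference.
#include "llvm/Support/MathExtras.h"
// The largest 16-bit unsigned value is 2^16 - 1 = 65535.
static_assert(llvm::maxUIntN(16) == 65535, "maxUIntN(16)");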
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
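An illustrative sketch (not from this file) of the builder interface named above; the helper emitSMovImm and the opcode choice are assumptions for the example.
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
// Hypothetical helper, for illustration only: materialize a 32-bit scalar
// immediate with s_mov_b32 at insertion point I.
static void emitSMovImm(llvm::MachineBasicBlock &MBB,
                        llvm::MachineBasicBlock::iterator I,
                        const llvm::DebugLoc &DL, const llvm::SIInstrInfo &TII,
                        llvm::Register DstReg, int64_t Imm) {
  llvm::BuildMI(MBB, I, DL, TII.get(llvm::AMDGPU::S_MOV_B32), DstReg)
      .addImm(Imm);
}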
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
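A brief check of the documented behavior, assuming the usual compile-time template form isInt<N>(x).
#include "llvm/Support/MathExtras.h"
// A signed 8-bit field holds [-128, 127].
static_assert(llvm::isInt<8>(127), "127 fits in 8 signed bits");
static_assert(!llvm::isInt<8>(128), "128 does not fit in 8 signed bits");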
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
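A worked instance of the rounding described above, with Skew left at its default of 0.
#include "llvm/Support/MathExtras.h"
// The largest multiple of 16 that is <= 37 is 32.
static_assert(llvm::alignDown(37u, 16u) == 32u, "alignDown(37, 16)");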
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
int countr_zero(T Val)
Count the number of zero bits from the least significant bit toward the most significant, stopping at the first 1.
Definition bit.h:202
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
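A small example of the floor-log2 behavior documented above.
#include "llvm/Support/MathExtras.h"
// floor(log2(40)) is 5, since 2^5 = 32 <= 40 < 64.
unsigned FloorLog2Of40 = llvm::Log2_32(40); // expected to be 5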
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64-bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair, skipping copy-like instructions and subre...
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64-bit value.
Definition MathExtras.h:155
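A short worked example covering both Lo_32 here and Hi_32 listed above.
#include "llvm/Support/MathExtras.h"
// Split a 64-bit value into its two 32-bit halves.
static_assert(llvm::Hi_32(0x123456789ABCDEF0ULL) == 0x12345678u, "high half");
static_assert(llvm::Lo_32(0x123456789ABCDEF0ULL) == 0x9ABCDEF0u, "low half");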
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
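A worked instance of the ceiling division documented above.
#include "llvm/Support/MathExtras.h"
// ceil(10 / 3) == 4.
static_assert(llvm::divideCeil(10, 3) == 4, "divideCeil(10, 3)");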
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
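An example of the documented sign extension, assuming the template form SignExtend64<B>(x).
#include "llvm/Support/MathExtras.h"
// Treating 0xFF as an 8-bit two's-complement value yields -1 when widened.
static_assert(llvm::SignExtend64<8>(0xFF) == -1, "sign-extend 0xFF from 8 bits");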
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
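A one-line example matching the description above.
#include "llvm/Support/MathExtras.h"
// The four right-most bits set: 0b1111.
static_assert(llvm::maskTrailingOnes<uint32_t>(4) == 0xFu, "low 4 bits");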
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility for storing a worklist of machine instructions.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.