1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
55static cl::opt<unsigned>
56 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66 : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
67 RI(ST), ST(ST) {
68 SchedModel.init(&ST);
69}
70
71//===----------------------------------------------------------------------===//
72// TargetInstrInfo callbacks
73//===----------------------------------------------------------------------===//
74
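/// Count the operands of \p Node, excluding any trailing glue operands.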
75static unsigned getNumOperandsNoGlue(SDNode *Node) {
76 unsigned N = Node->getNumOperands();
77 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
78 --N;
79 return N;
80}
81
82/// Returns true if both nodes have the same value for the given
83/// operand \p Op, or if both nodes do not have this operand.
84static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
85 AMDGPU::OpName OpName) {
86 unsigned Opc0 = N0->getMachineOpcode();
87 unsigned Opc1 = N1->getMachineOpcode();
88
89 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91
92 if (Op0Idx == -1 && Op1Idx == -1)
93 return true;
94
95
96 if ((Op0Idx == -1 && Op1Idx != -1) ||
97 (Op1Idx == -1 && Op0Idx != -1))
98 return false;
99
100 // getNamedOperandIdx returns the index for the MachineInstr's operands,
101 // which includes the result as the first operand. We are indexing into the
102 // MachineSDNode's operands, so we need to skip the result operand to get
103 // the real index.
104 --Op0Idx;
105 --Op1Idx;
106
107 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108}
109
110static bool canRemat(const MachineInstr &MI) {
111
112 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
113 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
114 SIInstrInfo::isSALU(MI))
115 return true;
116
117 if (SIInstrInfo::isSMRD(MI)) {
118 return !MI.memoperands_empty() &&
119 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
120 return MMO->isLoad() && MMO->isInvariant();
121 });
122 }
123
124 return false;
125}
126
127bool SIInstrInfo::isReallyTriviallyReMaterializable(
128 const MachineInstr &MI) const {
129
130 if (canRemat(MI)) {
131 // Normally a VALU use of exec would block rematerialization, but an
132 // implicit exec read is OK in this case since all VALU instructions have one.
133 // We really want all of the generic logic for this except for this exec check.
134
135 // Another potential implicit use is mode register. The core logic of
136 // the RA will not attempt rematerialization if mode is set anywhere
137 // in the function, otherwise it is safe since mode is not changed.
138
139 // Unlike the generic method, which does not allow rematerialization if
140 // there are virtual register uses, we do allow it; therefore this method
141 // includes SOP instructions as well.
142 if (!MI.hasImplicitDef() &&
143 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
144 !MI.mayRaiseFPException())
145 return true;
146 }
147
148 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
149}
150
151// Returns true if the scalar result of a VALU instruction depends on exec.
152bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
153 // Ignore comparisons which are only used masked with exec.
154 // This allows some hoisting/sinking of VALU comparisons.
155 if (MI.isCompare()) {
156 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
157 if (!Dst)
158 return true;
159
160 Register DstReg = Dst->getReg();
161 if (!DstReg.isVirtual())
162 return true;
163
164 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
165 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
166 switch (Use.getOpcode()) {
167 case AMDGPU::S_AND_SAVEEXEC_B32:
168 case AMDGPU::S_AND_SAVEEXEC_B64:
169 break;
170 case AMDGPU::S_AND_B32:
171 case AMDGPU::S_AND_B64:
172 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
173 return true;
174 break;
175 default:
176 return true;
177 }
178 }
179 return false;
180 }
181
182 switch (MI.getOpcode()) {
183 default:
184 break;
185 case AMDGPU::V_READFIRSTLANE_B32:
186 return true;
187 }
188
189 return false;
190}
191
192bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
193 // Any implicit use of exec by VALU is not a real register read.
194 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
195 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
196}
197
198bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
199 MachineBasicBlock *SuccToSinkTo,
200 MachineCycleInfo *CI) const {
201 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
202 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
203 return true;
204
205 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
206 // Check if sinking MI would create a temporal divergent use.
207 for (auto Op : MI.uses()) {
208 if (Op.isReg() && Op.getReg().isVirtual() &&
209 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
210 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
211
212 // SgprDef defined inside cycle
213 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
214 if (FromCycle == nullptr)
215 continue;
216
217 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
218 // Check if there is a FromCycle that contains SgprDef's basic block but
219 // does not contain SuccToSinkTo and also has a divergent exit condition.
220 while (FromCycle && !FromCycle->contains(ToCycle)) {
221 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
222 FromCycle->getExitingBlocks(ExitingBlocks);
223
224 // FromCycle has divergent exit condition.
225 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
226 if (hasDivergentBranch(ExitingBlock))
227 return false;
228 }
229
230 FromCycle = FromCycle->getParentCycle();
231 }
232 }
233 }
234
235 return true;
236}
237
238bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
239 int64_t &Offset0,
240 int64_t &Offset1) const {
241 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
242 return false;
243
244 unsigned Opc0 = Load0->getMachineOpcode();
245 unsigned Opc1 = Load1->getMachineOpcode();
246
247 // Make sure both are actually loads.
248 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
249 return false;
250
251 // A mayLoad instruction without a def is not a load. Likely a prefetch.
252 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
253 return false;
254
255 if (isDS(Opc0) && isDS(Opc1)) {
256
257 // FIXME: Handle this case:
258 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
259 return false;
260
261 // Check base reg.
262 if (Load0->getOperand(0) != Load1->getOperand(0))
263 return false;
264
265 // Skip read2 / write2 variants for simplicity.
266 // TODO: We should report true if the used offsets are adjacent (excluding
267 // the st64 versions).
268 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
269 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
270 if (Offset0Idx == -1 || Offset1Idx == -1)
271 return false;
272
273 // XXX - be careful of dataless loads
274 // getNamedOperandIdx returns the index for MachineInstrs. Since they
275 // include the output in the operand list, but SDNodes don't, we need to
276 // subtract the index by one.
277 Offset0Idx -= get(Opc0).NumDefs;
278 Offset1Idx -= get(Opc1).NumDefs;
279 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
280 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
281 return true;
282 }
283
284 if (isSMRD(Opc0) && isSMRD(Opc1)) {
285 // Skip time and cache invalidation instructions.
286 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
287 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
288 return false;
289
290 unsigned NumOps = getNumOperandsNoGlue(Load0);
291 if (NumOps != getNumOperandsNoGlue(Load1))
292 return false;
293
294 // Check base reg.
295 if (Load0->getOperand(0) != Load1->getOperand(0))
296 return false;
297
298 // Match register offsets, if both register and immediate offsets present.
299 assert(NumOps == 4 || NumOps == 5);
300 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
301 return false;
302
303 const ConstantSDNode *Load0Offset =
304 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
305 const ConstantSDNode *Load1Offset =
306 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
307
308 if (!Load0Offset || !Load1Offset)
309 return false;
310
311 Offset0 = Load0Offset->getZExtValue();
312 Offset1 = Load1Offset->getZExtValue();
313 return true;
314 }
315
316 // MUBUF and MTBUF can access the same addresses.
317 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
318
319 // MUBUF and MTBUF have vaddr at different indices.
320 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
321 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
323 return false;
324
325 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
326 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
327
328 if (OffIdx0 == -1 || OffIdx1 == -1)
329 return false;
330
331 // getNamedOperandIdx returns the index for MachineInstrs. Since they
332 // include the output in the operand list, but SDNodes don't, we need to
333 // subtract the index by one.
334 OffIdx0 -= get(Opc0).NumDefs;
335 OffIdx1 -= get(Opc1).NumDefs;
336
337 SDValue Off0 = Load0->getOperand(OffIdx0);
338 SDValue Off1 = Load1->getOperand(OffIdx1);
339
340 // The offset might be a FrameIndexSDNode.
341 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
342 return false;
343
344 Offset0 = Off0->getAsZExtVal();
345 Offset1 = Off1->getAsZExtVal();
346 return true;
347 }
348
349 return false;
350}
351
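/// Returns true for the DS read2st64 / write2st64 opcodes, whose two offsets
/// are scaled by a 64-element stride rather than a single element.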
352static bool isStride64(unsigned Opc) {
353 switch (Opc) {
354 case AMDGPU::DS_READ2ST64_B32:
355 case AMDGPU::DS_READ2ST64_B64:
356 case AMDGPU::DS_WRITE2ST64_B32:
357 case AMDGPU::DS_WRITE2ST64_B64:
358 return true;
359 default:
360 return false;
361 }
362}
363
364bool SIInstrInfo::getMemOperandsWithOffsetWidth(
365 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
366 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
367 const TargetRegisterInfo *TRI) const {
368 if (!LdSt.mayLoadOrStore())
369 return false;
370
371 unsigned Opc = LdSt.getOpcode();
372 OffsetIsScalable = false;
373 const MachineOperand *BaseOp, *OffsetOp;
374 int DataOpIdx;
375
376 if (isDS(LdSt)) {
377 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
378 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
379 if (OffsetOp) {
380 // Normal, single offset LDS instruction.
381 if (!BaseOp) {
382 // DS_CONSUME/DS_APPEND use M0 for the base address.
383 // TODO: find the implicit use operand for M0 and use that as BaseOp?
384 return false;
385 }
386 BaseOps.push_back(BaseOp);
387 Offset = OffsetOp->getImm();
388 // Get appropriate operand, and compute width accordingly.
389 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
390 if (DataOpIdx == -1)
391 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
392 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
393 Width = LocationSize::precise(64);
394 else
395 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
396 } else {
397 // The 2 offset instructions use offset0 and offset1 instead. We can treat
398 // these as a load with a single offset if the 2 offsets are consecutive.
399 // We will use this for some partially aligned loads.
400 const MachineOperand *Offset0Op =
401 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
402 const MachineOperand *Offset1Op =
403 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
404
405 unsigned Offset0 = Offset0Op->getImm() & 0xff;
406 unsigned Offset1 = Offset1Op->getImm() & 0xff;
407 if (Offset0 + 1 != Offset1)
408 return false;
409
410 // Each of these offsets is in element sized units, so we need to convert
411 // to bytes of the individual reads.
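 // For example, a 64-bit read2 with offset0 = 2 and offset1 = 3 has
 // EltSize = 4, so the pair is reported as a single access at byte offset 8.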
412
413 unsigned EltSize;
414 if (LdSt.mayLoad())
415 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
416 else {
417 assert(LdSt.mayStore());
418 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
419 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
420 }
421
422 if (isStride64(Opc))
423 EltSize *= 64;
424
425 BaseOps.push_back(BaseOp);
426 Offset = EltSize * Offset0;
427 // Get appropriate operand(s), and compute width accordingly.
428 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
429 if (DataOpIdx == -1) {
430 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
431 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
432 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
433 Width = LocationSize::precise(
434 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
435 } else {
436 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
437 }
438 }
439 return true;
440 }
441
442 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
443 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
444 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
445 return false;
446 BaseOps.push_back(RSrc);
447 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
448 if (BaseOp && !BaseOp->isFI())
449 BaseOps.push_back(BaseOp);
450 const MachineOperand *OffsetImm =
451 getNamedOperand(LdSt, AMDGPU::OpName::offset);
452 Offset = OffsetImm->getImm();
453 const MachineOperand *SOffset =
454 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
455 if (SOffset) {
456 if (SOffset->isReg())
457 BaseOps.push_back(SOffset);
458 else
459 Offset += SOffset->getImm();
460 }
461 // Get appropriate operand, and compute width accordingly.
462 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
463 if (DataOpIdx == -1)
464 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
465 if (DataOpIdx == -1) // LDS DMA
466 return false;
467 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
468 return true;
469 }
470
471 if (isImage(LdSt)) {
472 auto RsrcOpName =
473 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
474 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
475 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
476 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
477 if (VAddr0Idx >= 0) {
478 // GFX10 possible NSA encoding.
479 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
480 BaseOps.push_back(&LdSt.getOperand(I));
481 } else {
482 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
483 }
484 Offset = 0;
485 // Get appropriate operand, and compute width accordingly.
486 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
487 if (DataOpIdx == -1)
488 return false; // no return sampler
489 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
490 return true;
491 }
492
493 if (isSMRD(LdSt)) {
494 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
495 if (!BaseOp) // e.g. S_MEMTIME
496 return false;
497 BaseOps.push_back(BaseOp);
498 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
499 Offset = OffsetOp ? OffsetOp->getImm() : 0;
500 // Get appropriate operand, and compute width accordingly.
501 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
502 if (DataOpIdx == -1)
503 return false;
504 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
505 return true;
506 }
507
508 if (isFLAT(LdSt)) {
509 // Instructions have either vaddr or saddr or both or none.
510 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
511 if (BaseOp)
512 BaseOps.push_back(BaseOp);
513 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
514 if (BaseOp)
515 BaseOps.push_back(BaseOp);
516 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
517 // Get appropriate operand, and compute width accordingly.
518 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
519 if (DataOpIdx == -1)
520 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
521 if (DataOpIdx == -1) // LDS DMA
522 return false;
523 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
524 return true;
525 }
526
527 return false;
528}
529
530static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
531 ArrayRef<const MachineOperand *> BaseOps1,
532 const MachineInstr &MI2,
533 ArrayRef<const MachineOperand *> BaseOps2) {
534 // Only examine the first "base" operand of each instruction, on the
535 // assumption that it represents the real base address of the memory access.
536 // Other operands are typically offsets or indices from this base address.
537 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
538 return true;
539
540 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
541 return false;
542
543 auto *MO1 = *MI1.memoperands_begin();
544 auto *MO2 = *MI2.memoperands_begin();
545 if (MO1->getAddrSpace() != MO2->getAddrSpace())
546 return false;
547
548 const auto *Base1 = MO1->getValue();
549 const auto *Base2 = MO2->getValue();
550 if (!Base1 || !Base2)
551 return false;
552 Base1 = getUnderlyingObject(Base1);
553 Base2 = getUnderlyingObject(Base2);
554
555 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
556 return false;
557
558 return Base1 == Base2;
559}
560
561bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
562 int64_t Offset1, bool OffsetIsScalable1,
563 ArrayRef<const MachineOperand *> BaseOps2,
564 int64_t Offset2, bool OffsetIsScalable2,
565 unsigned ClusterSize,
566 unsigned NumBytes) const {
567 // If the mem ops (to be clustered) do not have the same base ptr, then they
568 // should not be clustered
569 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
570 if (!BaseOps1.empty() && !BaseOps2.empty()) {
571 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
572 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
573 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
574 return false;
575
576 const SIMachineFunctionInfo *MFI =
577 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
578 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
579 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
580 // If only one base op is empty, they do not have the same base ptr
581 return false;
582 }
583
584 // In order to avoid register pressure, on average, the number of DWORDS
585 // loaded together by all clustered mem ops should not exceed
586 // MaxMemoryClusterDWords. This is an empirical value based on certain
587 // observations and performance related experiments.
588 // The good thing about this heuristic is that it avoids clustering too many
589 // sub-word loads and also avoids clustering wide loads. Below is a
590 // brief summary of how the heuristic behaves for various `LoadSize` when
591 // MaxMemoryClusterDWords is 8.
592 //
593 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
594 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
595 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
596 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
597 // (5) LoadSize >= 17: do not cluster
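 //
 // For example, clustering four 12-byte loads gives NumBytes = 48 and
 // ClusterSize = 4, so LoadSize = 12 and NumDWords = ((12 + 3) / 4) * 4 = 12,
 // which exceeds the default limit of 8, and the cluster is rejected.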
598 const unsigned LoadSize = NumBytes / ClusterSize;
599 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
600 return NumDWords <= MaxMemoryClusterDWords;
601}
602
603// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
604// the first 16 loads will be interleaved with the stores, and the next 16 will
605// be clustered as expected. It should really split into two batches of 16 stores.
606//
607// Loads are clustered until this returns false, rather than trying to schedule
608// groups of stores. This also means we have to deal with saying different
609// address space loads should be clustered, and ones which might cause bank
610// conflicts.
611//
612// This might be deprecated so it might not be worth that much effort to fix.
613bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
614 int64_t Offset0, int64_t Offset1,
615 unsigned NumLoads) const {
616 assert(Offset1 > Offset0 &&
617 "Second offset should be larger than first offset!");
618 // If we have fewer than 16 loads in a row, and the offsets are within 64
619 // bytes, then schedule together.
620
621 // A cacheline is 64 bytes (for global memory).
622 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
623}
624
625static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
626 MachineBasicBlock::iterator MI,
627 const DebugLoc &DL, MCRegister DestReg,
628 MCRegister SrcReg, bool KillSrc,
629 const char *Msg = "illegal VGPR to SGPR copy") {
630 MachineFunction *MF = MBB.getParent();
631
632 LLVMContext &C = MF->getFunction().getContext();
633 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
634
635 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
636 .addReg(SrcReg, getKillRegState(KillSrc));
637}
638
639/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
640/// possible to have a direct copy in these cases on GFX908, so an intermediate
641/// VGPR copy is required.
642static void indirectCopyToAGPR(const SIInstrInfo &TII,
643 MachineBasicBlock &MBB,
644 MachineBasicBlock::iterator MI,
645 const DebugLoc &DL, MCRegister DestReg,
646 MCRegister SrcReg, bool KillSrc,
647 RegScavenger &RS, bool RegsOverlap,
648 Register ImpDefSuperReg = Register(),
649 Register ImpUseSuperReg = Register()) {
650 assert((TII.getSubtarget().hasMAIInsts() &&
651 !TII.getSubtarget().hasGFX90AInsts()) &&
652 "Expected GFX908 subtarget.");
653
654 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
655 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
656 "Source register of the copy should be either an SGPR or an AGPR.");
657
658 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
659 "Destination register of the copy should be an AGPR.");
660
661 const SIRegisterInfo &RI = TII.getRegisterInfo();
662
663 // First try to find defining accvgpr_write to avoid temporary registers.
664 // In the case of copies of overlapping AGPRs, we conservatively do not
665 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
666 // an accvgpr_write used for this same copy due to implicit-defs
667 if (!RegsOverlap) {
668 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
669 --Def;
670
671 if (!Def->modifiesRegister(SrcReg, &RI))
672 continue;
673
674 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
675 Def->getOperand(0).getReg() != SrcReg)
676 break;
677
678 MachineOperand &DefOp = Def->getOperand(1);
679 assert(DefOp.isReg() || DefOp.isImm());
680
681 if (DefOp.isReg()) {
682 bool SafeToPropagate = true;
683 // Check that register source operand is not clobbered before MI.
684 // Immediate operands are always safe to propagate.
685 for (auto I = Def; I != MI && SafeToPropagate; ++I)
686 if (I->modifiesRegister(DefOp.getReg(), &RI))
687 SafeToPropagate = false;
688
689 if (!SafeToPropagate)
690 break;
691
692 for (auto I = Def; I != MI; ++I)
693 I->clearRegisterKills(DefOp.getReg(), &RI);
694 }
695
696 MachineInstrBuilder Builder =
697 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
698 .add(DefOp);
699 if (ImpDefSuperReg)
700 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
701
702 if (ImpUseSuperReg) {
703 Builder.addReg(ImpUseSuperReg,
704 getKillRegState(KillSrc) | RegState::Implicit);
705 }
706
707 return;
708 }
709 }
710
711 RS.enterBasicBlockEnd(MBB);
712 RS.backward(std::next(MI));
713
714 // Ideally we want to have three registers for a long reg_sequence copy
715 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
716 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
717 *MBB.getParent());
718
719 // Registers in the sequence are allocated contiguously so we can just
720 // use register number to pick one of three round-robin temps.
721 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
722 Register Tmp =
723 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
724 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
725 "VGPR used for an intermediate copy should have been reserved.");
726
727 // Only loop through if there are any free registers left. We don't want to
728 // spill.
729 while (RegNo--) {
730 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
731 /* RestoreAfter */ false, 0,
732 /* AllowSpill */ false);
733 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
734 break;
735 Tmp = Tmp2;
736 RS.setRegUsed(Tmp);
737 }
738
739 // Insert copy to temporary VGPR.
740 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
741 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
742 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
743 } else {
744 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
745 }
746
747 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
748 .addReg(SrcReg, getKillRegState(KillSrc));
749 if (ImpUseSuperReg) {
750 UseBuilder.addReg(ImpUseSuperReg,
751 getKillRegState(KillSrc) | RegState::Implicit);
752 }
753
754 MachineInstrBuilder DefBuilder
755 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
756 .addReg(Tmp, RegState::Kill);
757
758 if (ImpDefSuperReg)
759 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
760}
761
762static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
763 MachineBasicBlock::iterator MI, const DebugLoc &DL,
764 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
765 const TargetRegisterClass *RC, bool Forward) {
766 const SIRegisterInfo &RI = TII.getRegisterInfo();
767 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
768 MachineBasicBlock::iterator I = MI;
769 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
770
771 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
772 int16_t SubIdx = BaseIndices[Idx];
773 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
774 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
775 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
776 unsigned Opcode = AMDGPU::S_MOV_B32;
777
778 // Is SGPR aligned? If so try to combine with next.
779 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
780 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
781 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
782 // Can use SGPR64 copy
783 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
784 SubIdx = RI.getSubRegFromChannel(Channel, 2);
785 DestSubReg = RI.getSubReg(DestReg, SubIdx);
786 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
787 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
788 Opcode = AMDGPU::S_MOV_B64;
789 Idx++;
790 }
791
792 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
793 .addReg(SrcSubReg)
794 .addReg(SrcReg, RegState::Implicit);
795
796 if (!FirstMI)
797 FirstMI = LastMI;
798
799 if (!Forward)
800 I--;
801 }
802
803 assert(FirstMI && LastMI);
804 if (!Forward)
805 std::swap(FirstMI, LastMI);
806
807 FirstMI->addOperand(
808 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
809
810 if (KillSrc)
811 LastMI->addRegisterKilled(SrcReg, &RI);
812}
813
814void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
815 MachineBasicBlock::iterator MI,
816 const DebugLoc &DL, Register DestReg,
817 Register SrcReg, bool KillSrc, bool RenamableDest,
818 bool RenamableSrc) const {
819 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
820 unsigned Size = RI.getRegSizeInBits(*RC);
821 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
822 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
823
824 // The rest of copyPhysReg assumes Src and Dst are the same size.
825 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
826 // we remove Fix16BitCopies and this code block?
827 if (Fix16BitCopies) {
828 if (((Size == 16) != (SrcSize == 16))) {
829 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
830 assert(ST.useRealTrue16Insts());
831 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
832 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
833 RegToFix = SubReg;
834
835 if (DestReg == SrcReg) {
836 // Identity copy. Insert empty bundle since ExpandPostRA expects an
837 // instruction here.
838 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
839 return;
840 }
841 RC = RI.getPhysRegBaseClass(DestReg);
842 Size = RI.getRegSizeInBits(*RC);
843 SrcRC = RI.getPhysRegBaseClass(SrcReg);
844 SrcSize = RI.getRegSizeInBits(*SrcRC);
845 }
846 }
847
848 if (RC == &AMDGPU::VGPR_32RegClass) {
849 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
850 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
851 AMDGPU::AGPR_32RegClass.contains(SrcReg));
852 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
853 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
854 BuildMI(MBB, MI, DL, get(Opc), DestReg)
855 .addReg(SrcReg, getKillRegState(KillSrc));
856 return;
857 }
858
859 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
860 RC == &AMDGPU::SReg_32RegClass) {
861 if (SrcReg == AMDGPU::SCC) {
862 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
863 .addImm(1)
864 .addImm(0);
865 return;
866 }
867
868 if (DestReg == AMDGPU::VCC_LO) {
869 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
870 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
871 .addReg(SrcReg, getKillRegState(KillSrc));
872 } else {
873 // FIXME: Hack until VReg_1 removed.
874 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
875 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
876 .addImm(0)
877 .addReg(SrcReg, getKillRegState(KillSrc));
878 }
879
880 return;
881 }
882
883 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
884 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
885 return;
886 }
887
888 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
889 .addReg(SrcReg, getKillRegState(KillSrc));
890 return;
891 }
892
893 if (RC == &AMDGPU::SReg_64RegClass) {
894 if (SrcReg == AMDGPU::SCC) {
895 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
896 .addImm(1)
897 .addImm(0);
898 return;
899 }
900
901 if (DestReg == AMDGPU::VCC) {
902 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
903 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
904 .addReg(SrcReg, getKillRegState(KillSrc));
905 } else {
906 // FIXME: Hack until VReg_1 removed.
907 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
908 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
909 .addImm(0)
910 .addReg(SrcReg, getKillRegState(KillSrc));
911 }
912
913 return;
914 }
915
916 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
917 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
918 return;
919 }
920
921 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
922 .addReg(SrcReg, getKillRegState(KillSrc));
923 return;
924 }
925
926 if (DestReg == AMDGPU::SCC) {
927 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
928 // but SelectionDAG emits such copies for i1 sources.
929 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
930 // This copy can only be produced by patterns
931 // with explicit SCC, which are known to be enabled
932 // only for subtargets with S_CMP_LG_U64 present.
933 assert(ST.hasScalarCompareEq64());
934 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
935 .addReg(SrcReg, getKillRegState(KillSrc))
936 .addImm(0);
937 } else {
938 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
939 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
940 .addReg(SrcReg, getKillRegState(KillSrc))
941 .addImm(0);
942 }
943
944 return;
945 }
946
947 if (RC == &AMDGPU::AGPR_32RegClass) {
948 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
949 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
950 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
951 .addReg(SrcReg, getKillRegState(KillSrc));
952 return;
953 }
954
955 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
956 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
957 .addReg(SrcReg, getKillRegState(KillSrc));
958 return;
959 }
960
961 // FIXME: Pass should maintain scavenger to avoid scan through the block on
962 // every AGPR spill.
963 RegScavenger RS;
964 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
965 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
966 return;
967 }
968
969 if (Size == 16) {
970 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
971 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
972 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
973
974 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
975 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
976 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
977 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
978 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
979 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
980 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
981 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
982
983 if (IsSGPRDst) {
984 if (!IsSGPRSrc) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
986 return;
987 }
988
989 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
990 .addReg(NewSrcReg, getKillRegState(KillSrc));
991 return;
992 }
993
994 if (IsAGPRDst || IsAGPRSrc) {
995 if (!DstLow || !SrcLow) {
996 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
997 "Cannot use hi16 subreg with an AGPR!");
998 }
999
1000 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
1001 return;
1002 }
1003
1004 if (ST.useRealTrue16Insts()) {
1005 if (IsSGPRSrc) {
1006 assert(SrcLow);
1007 SrcReg = NewSrcReg;
1008 }
1009 // Use the smaller instruction encoding if possible.
1010 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1011 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1012 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1013 .addReg(SrcReg);
1014 } else {
1015 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1016 .addImm(0) // src0_modifiers
1017 .addReg(SrcReg)
1018 .addImm(0); // op_sel
1019 }
1020 return;
1021 }
1022
1023 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1024 if (!DstLow || !SrcLow) {
1025 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1026 "Cannot use hi16 subreg on VI!");
1027 }
1028
1029 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1030 .addReg(NewSrcReg, getKillRegState(KillSrc));
1031 return;
1032 }
1033
1034 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1035 .addImm(0) // src0_modifiers
1036 .addReg(NewSrcReg)
1037 .addImm(0) // clamp
1038 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1039 : AMDGPU::SDWA::SdwaSel::WORD_1) // dst_sel
1040 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) // dst_unused
1041 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1042 : AMDGPU::SDWA::SdwaSel::WORD_1) // src0_sel
1043 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1044 // First implicit operand is $exec.
1045 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1046 return;
1047 }
1048
1049 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1050 if (ST.hasMovB64()) {
1051 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1052 .addReg(SrcReg, getKillRegState(KillSrc));
1053 return;
1054 }
1055 if (ST.hasPkMovB32()) {
1056 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1057 .addImm(SISrcMods::OP_SEL_1) // src0_modifiers
1058 .addReg(SrcReg)
1059 .addImm(SISrcMods::OP_SEL_1) // src1_modifiers
1060 .addReg(SrcReg)
1061 .addImm(0) // op_sel_lo
1062 .addImm(0) // op_sel_hi
1063 .addImm(0) // neg_lo
1064 .addImm(0) // neg_hi
1065 .addImm(0) // clamp
1066 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1067 return;
1068 }
1069 }
1070
1071 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1072 if (RI.isSGPRClass(RC)) {
1073 if (!RI.isSGPRClass(SrcRC)) {
1074 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1075 return;
1076 }
1077 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1078 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1079 Forward);
1080 return;
1081 }
1082
1083 unsigned EltSize = 4;
1084 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1085 if (RI.isAGPRClass(RC)) {
1086 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1087 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1088 else if (RI.hasVGPRs(SrcRC) ||
1089 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1090 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1091 else
1092 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1093 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1094 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1095 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1096 (RI.isProperlyAlignedRC(*RC) &&
1097 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1098 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1099 if (ST.hasMovB64()) {
1100 Opcode = AMDGPU::V_MOV_B64_e32;
1101 EltSize = 8;
1102 } else if (ST.hasPkMovB32()) {
1103 Opcode = AMDGPU::V_PK_MOV_B32;
1104 EltSize = 8;
1105 }
1106 }
1107
1108 // For the cases where we need an intermediate instruction/temporary register
1109 // (destination is an AGPR), we need a scavenger.
1110 //
1111 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1112 // whole block for every handled copy.
1113 std::unique_ptr<RegScavenger> RS;
1114 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1115 RS = std::make_unique<RegScavenger>();
1116
1117 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1118
1119 // If there is an overlap, we can't kill the super-register on the last
1120 // instruction, since it will also kill the components made live by this def.
1121 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1122 const bool CanKillSuperReg = KillSrc && !Overlap;
1123
1124 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1125 unsigned SubIdx;
1126 if (Forward)
1127 SubIdx = SubIndices[Idx];
1128 else
1129 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1130 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1131 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1132 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1133
1134 bool IsFirstSubreg = Idx == 0;
1135 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1136
1137 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1138 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1139 Register ImpUseSuper = SrcReg;
1140 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1141 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1142 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1143 MachineInstrBuilder MIB =
1144 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1145 .addImm(SISrcMods::OP_SEL_1) // src0_modifiers
1146 .addReg(SrcSubReg)
1147 .addImm(SISrcMods::OP_SEL_1) // src1_modifiers
1148 .addReg(SrcSubReg)
1149 .addImm(0) // op_sel_lo
1150 .addImm(0) // op_sel_hi
1151 .addImm(0) // neg_lo
1152 .addImm(0) // neg_hi
1153 .addImm(0) // clamp
1154 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1155 if (IsFirstSubreg)
1156 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1157 } else {
1158 MachineInstrBuilder Builder =
1159 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1160 if (IsFirstSubreg)
1161 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1162
1163 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1164 }
1165 }
1166}
1167
1168int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1169 int NewOpc;
1170
1171 // Try to map original to commuted opcode
1172 NewOpc = AMDGPU::getCommuteRev(Opcode);
1173 if (NewOpc != -1)
1174 // Check if the commuted (REV) opcode exists on the target.
1175 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1176
1177 // Try to map commuted to original opcode
1178 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1179 if (NewOpc != -1)
1180 // Check if the original (non-REV) opcode exists on the target.
1181 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1182
1183 return Opcode;
1184}
1185
1186const TargetRegisterClass *
1187SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1188 return &AMDGPU::VGPR_32RegClass;
1189}
1190
1191void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1192 MachineBasicBlock::iterator I,
1193 const DebugLoc &DL, Register DstReg,
1194 ArrayRef<MachineOperand> Cond,
1195 Register TrueReg,
1196 Register FalseReg) const {
1197 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1198 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1200 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1201 "Not a VGPR32 reg");
1202
1203 if (Cond.size() == 1) {
1204 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1205 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1206 .add(Cond[0]);
1207 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1208 .addImm(0)
1209 .addReg(FalseReg)
1210 .addImm(0)
1211 .addReg(TrueReg)
1212 .addReg(SReg);
1213 } else if (Cond.size() == 2) {
1214 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1215 switch (Cond[0].getImm()) {
1216 case SIInstrInfo::SCC_TRUE: {
1217 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1218 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1219 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1220 .addImm(0)
1221 .addReg(FalseReg)
1222 .addImm(0)
1223 .addReg(TrueReg)
1224 .addReg(SReg);
1225 break;
1226 }
1227 case SIInstrInfo::SCC_FALSE: {
1228 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1229 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1230 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1231 .addImm(0)
1232 .addReg(FalseReg)
1233 .addImm(0)
1234 .addReg(TrueReg)
1235 .addReg(SReg);
1236 break;
1237 }
1238 case SIInstrInfo::VCCNZ: {
1239 MachineOperand RegOp = Cond[1];
1240 RegOp.setImplicit(false);
1241 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1242 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1243 .add(RegOp);
1244 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1245 .addImm(0)
1246 .addReg(FalseReg)
1247 .addImm(0)
1248 .addReg(TrueReg)
1249 .addReg(SReg);
1250 break;
1251 }
1252 case SIInstrInfo::VCCZ: {
1253 MachineOperand RegOp = Cond[1];
1254 RegOp.setImplicit(false);
1255 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1256 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1257 .add(RegOp);
1258 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1259 .addImm(0)
1260 .addReg(TrueReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addReg(SReg);
1264 break;
1265 }
1266 case SIInstrInfo::EXECNZ: {
1267 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1268 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1269 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1270 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1271 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1272 .addImm(0)
1273 .addReg(FalseReg)
1274 .addImm(0)
1275 .addReg(TrueReg)
1276 .addReg(SReg);
1277 break;
1278 }
1279 case SIInstrInfo::EXECZ: {
1280 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1281 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1282 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1283 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1284 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1285 .addImm(0)
1286 .addReg(FalseReg)
1287 .addImm(0)
1288 .addReg(TrueReg)
1289 .addReg(SReg);
1290 llvm_unreachable("Unhandled branch predicate EXECZ");
1291 break;
1292 }
1293 default:
1294 llvm_unreachable("invalid branch predicate");
1295 }
1296 } else {
1297 llvm_unreachable("Can only handle Cond size 1 or 2");
1298 }
1299}
1300
1301Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1302 MachineBasicBlock::iterator I,
1303 const DebugLoc &DL,
1304 Register SrcReg, int Value) const {
1305 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1306 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1307 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1308 .addImm(Value)
1309 .addReg(SrcReg);
1310
1311 return Reg;
1312}
1313
1314Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1315 MachineBasicBlock::iterator I,
1316 const DebugLoc &DL,
1317 Register SrcReg, int Value) const {
1318 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1319 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1320 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1321 .addImm(Value)
1322 .addReg(SrcReg);
1323
1324 return Reg;
1325}
1326
1327bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1328 const Register Reg,
1329 int64_t &ImmVal) const {
1330 switch (MI.getOpcode()) {
1331 case AMDGPU::V_MOV_B32_e32:
1332 case AMDGPU::S_MOV_B32:
1333 case AMDGPU::S_MOVK_I32:
1334 case AMDGPU::S_MOV_B64:
1335 case AMDGPU::V_MOV_B64_e32:
1336 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1337 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1338 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1339 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1340 case AMDGPU::V_MOV_B64_PSEUDO: {
1341 const MachineOperand &Src0 = MI.getOperand(1);
1342 if (Src0.isImm()) {
1343 ImmVal = Src0.getImm();
1344 return MI.getOperand(0).getReg() == Reg;
1345 }
1346
1347 return false;
1348 }
1349 case AMDGPU::S_BREV_B32:
1350 case AMDGPU::V_BFREV_B32_e32:
1351 case AMDGPU::V_BFREV_B32_e64: {
1352 const MachineOperand &Src0 = MI.getOperand(1);
1353 if (Src0.isImm()) {
1354 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1355 return MI.getOperand(0).getReg() == Reg;
1356 }
1357
1358 return false;
1359 }
1360 case AMDGPU::S_NOT_B32:
1361 case AMDGPU::V_NOT_B32_e32:
1362 case AMDGPU::V_NOT_B32_e64: {
1363 const MachineOperand &Src0 = MI.getOperand(1);
1364 if (Src0.isImm()) {
1365 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1366 return MI.getOperand(0).getReg() == Reg;
1367 }
1368
1369 return false;
1370 }
1371 default:
1372 return false;
1373 }
1374}
1375
1376unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1377
1378 if (RI.isAGPRClass(DstRC))
1379 return AMDGPU::COPY;
1380 if (RI.getRegSizeInBits(*DstRC) == 16) {
1381 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1382 // before RA.
1383 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1384 }
1385 if (RI.getRegSizeInBits(*DstRC) == 32)
1386 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1387 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1388 return AMDGPU::S_MOV_B64;
1389 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1390 return AMDGPU::V_MOV_B64_PSEUDO;
1391 return AMDGPU::COPY;
1392}
1393
1394const MCInstrDesc &
1395SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1396 bool IsIndirectSrc) const {
1397 if (IsIndirectSrc) {
1398 if (VecSize <= 32) // 4 bytes
1399 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1400 if (VecSize <= 64) // 8 bytes
1401 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1402 if (VecSize <= 96) // 12 bytes
1403 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1404 if (VecSize <= 128) // 16 bytes
1405 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1406 if (VecSize <= 160) // 20 bytes
1407 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1408 if (VecSize <= 256) // 32 bytes
1409 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1410 if (VecSize <= 288) // 36 bytes
1411 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1412 if (VecSize <= 320) // 40 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1414 if (VecSize <= 352) // 44 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1416 if (VecSize <= 384) // 48 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1418 if (VecSize <= 512) // 64 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1420 if (VecSize <= 1024) // 128 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1422
1423 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1424 }
1425
1426 if (VecSize <= 32) // 4 bytes
1427 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1428 if (VecSize <= 64) // 8 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1430 if (VecSize <= 96) // 12 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1432 if (VecSize <= 128) // 16 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1434 if (VecSize <= 160) // 20 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1436 if (VecSize <= 256) // 32 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1438 if (VecSize <= 288) // 36 bytes
1439 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1440 if (VecSize <= 320) // 40 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1442 if (VecSize <= 352) // 44 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1444 if (VecSize <= 384) // 48 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1446 if (VecSize <= 512) // 64 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1448 if (VecSize <= 1024) // 128 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1450
1451 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1452}
1453
1454static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1455 if (VecSize <= 32) // 4 bytes
1456 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1457 if (VecSize <= 64) // 8 bytes
1458 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1459 if (VecSize <= 96) // 12 bytes
1460 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1461 if (VecSize <= 128) // 16 bytes
1462 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1463 if (VecSize <= 160) // 20 bytes
1464 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1465 if (VecSize <= 256) // 32 bytes
1466 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1467 if (VecSize <= 288) // 36 bytes
1468 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1469 if (VecSize <= 320) // 40 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1471 if (VecSize <= 352) // 44 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1473 if (VecSize <= 384) // 48 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1475 if (VecSize <= 512) // 64 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1477 if (VecSize <= 1024) // 128 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1479
1480 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1481}
1482
1483static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1484 if (VecSize <= 32) // 4 bytes
1485 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1486 if (VecSize <= 64) // 8 bytes
1487 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1488 if (VecSize <= 96) // 12 bytes
1489 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1490 if (VecSize <= 128) // 16 bytes
1491 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1492 if (VecSize <= 160) // 20 bytes
1493 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1494 if (VecSize <= 256) // 32 bytes
1495 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1496 if (VecSize <= 288) // 36 bytes
1497 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1498 if (VecSize <= 320) // 40 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1500 if (VecSize <= 352) // 44 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1502 if (VecSize <= 384) // 48 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1504 if (VecSize <= 512) // 64 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1506 if (VecSize <= 1024) // 128 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1508
1509 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1510}
1511
1512static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1513 if (VecSize <= 64) // 8 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1515 if (VecSize <= 128) // 16 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1517 if (VecSize <= 256) // 32 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1519 if (VecSize <= 512) // 64 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1521 if (VecSize <= 1024) // 128 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1523
1524 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1525}
1526
1527const MCInstrDesc &
1528SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1529 bool IsSGPR) const {
1530 if (IsSGPR) {
1531 switch (EltSize) {
1532 case 32:
1533 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1534 case 64:
1535 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1536 default:
1537 llvm_unreachable("invalid reg indexing elt size");
1538 }
1539 }
1540
1541 assert(EltSize == 32 && "invalid reg indexing elt size");
1542 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1543}
1544
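// The helpers below map a spill size in bytes to the matching spill-save
// pseudo opcode for the register kind being spilled (SGPR, VGPR, AV, or WWM).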
1545static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1546 switch (Size) {
1547 case 4:
1548 return AMDGPU::SI_SPILL_S32_SAVE;
1549 case 8:
1550 return AMDGPU::SI_SPILL_S64_SAVE;
1551 case 12:
1552 return AMDGPU::SI_SPILL_S96_SAVE;
1553 case 16:
1554 return AMDGPU::SI_SPILL_S128_SAVE;
1555 case 20:
1556 return AMDGPU::SI_SPILL_S160_SAVE;
1557 case 24:
1558 return AMDGPU::SI_SPILL_S192_SAVE;
1559 case 28:
1560 return AMDGPU::SI_SPILL_S224_SAVE;
1561 case 32:
1562 return AMDGPU::SI_SPILL_S256_SAVE;
1563 case 36:
1564 return AMDGPU::SI_SPILL_S288_SAVE;
1565 case 40:
1566 return AMDGPU::SI_SPILL_S320_SAVE;
1567 case 44:
1568 return AMDGPU::SI_SPILL_S352_SAVE;
1569 case 48:
1570 return AMDGPU::SI_SPILL_S384_SAVE;
1571 case 64:
1572 return AMDGPU::SI_SPILL_S512_SAVE;
1573 case 128:
1574 return AMDGPU::SI_SPILL_S1024_SAVE;
1575 default:
1576 llvm_unreachable("unknown register size");
1577 }
1578}
1579
1580static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1581 switch (Size) {
1582 case 2:
1583 return AMDGPU::SI_SPILL_V16_SAVE;
1584 case 4:
1585 return AMDGPU::SI_SPILL_V32_SAVE;
1586 case 8:
1587 return AMDGPU::SI_SPILL_V64_SAVE;
1588 case 12:
1589 return AMDGPU::SI_SPILL_V96_SAVE;
1590 case 16:
1591 return AMDGPU::SI_SPILL_V128_SAVE;
1592 case 20:
1593 return AMDGPU::SI_SPILL_V160_SAVE;
1594 case 24:
1595 return AMDGPU::SI_SPILL_V192_SAVE;
1596 case 28:
1597 return AMDGPU::SI_SPILL_V224_SAVE;
1598 case 32:
1599 return AMDGPU::SI_SPILL_V256_SAVE;
1600 case 36:
1601 return AMDGPU::SI_SPILL_V288_SAVE;
1602 case 40:
1603 return AMDGPU::SI_SPILL_V320_SAVE;
1604 case 44:
1605 return AMDGPU::SI_SPILL_V352_SAVE;
1606 case 48:
1607 return AMDGPU::SI_SPILL_V384_SAVE;
1608 case 64:
1609 return AMDGPU::SI_SPILL_V512_SAVE;
1610 case 128:
1611 return AMDGPU::SI_SPILL_V1024_SAVE;
1612 default:
1613 llvm_unreachable("unknown register size");
1614 }
1615}
1616
1617static unsigned getAVSpillSaveOpcode(unsigned Size) {
1618 switch (Size) {
1619 case 4:
1620 return AMDGPU::SI_SPILL_AV32_SAVE;
1621 case 8:
1622 return AMDGPU::SI_SPILL_AV64_SAVE;
1623 case 12:
1624 return AMDGPU::SI_SPILL_AV96_SAVE;
1625 case 16:
1626 return AMDGPU::SI_SPILL_AV128_SAVE;
1627 case 20:
1628 return AMDGPU::SI_SPILL_AV160_SAVE;
1629 case 24:
1630 return AMDGPU::SI_SPILL_AV192_SAVE;
1631 case 28:
1632 return AMDGPU::SI_SPILL_AV224_SAVE;
1633 case 32:
1634 return AMDGPU::SI_SPILL_AV256_SAVE;
1635 case 36:
1636 return AMDGPU::SI_SPILL_AV288_SAVE;
1637 case 40:
1638 return AMDGPU::SI_SPILL_AV320_SAVE;
1639 case 44:
1640 return AMDGPU::SI_SPILL_AV352_SAVE;
1641 case 48:
1642 return AMDGPU::SI_SPILL_AV384_SAVE;
1643 case 64:
1644 return AMDGPU::SI_SPILL_AV512_SAVE;
1645 case 128:
1646 return AMDGPU::SI_SPILL_AV1024_SAVE;
1647 default:
1648 llvm_unreachable("unknown register size");
1649 }
1650}
1651
1652static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1653 bool IsVectorSuperClass) {
1654 // Currently, only 32-bit WWM register spills are needed.
1655 if (Size != 4)
1656 llvm_unreachable("unknown wwm register spill size");
1657
1658 if (IsVectorSuperClass)
1659 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1660
1661 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1662}
1663
1664unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1665 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1666 const SIMachineFunctionInfo &MFI) const {
1667 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1668
1669 // Choose the right opcode if spilling a WWM register.
1670 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1671 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1672
1673 // TODO: Check if AGPRs are available
1674 if (ST.hasMAIInsts())
1675 return getAVSpillSaveOpcode(Size);
1676
1677 return getVGPRSpillSaveOpcode(Size);
1678}
1679
1680void SIInstrInfo::storeRegToStackSlot(
1681 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1682 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1683 const TargetRegisterInfo *TRI, Register VReg,
1684 MachineInstr::MIFlag Flags) const {
1685 MachineFunction *MF = MBB.getParent();
1686 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1687 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1688 const DebugLoc &DL = MBB.findDebugLoc(MI);
1689
1690 MachinePointerInfo PtrInfo
1691 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1692 MachineMemOperand *MMO = MF->getMachineMemOperand(
1693 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1694 FrameInfo.getObjectAlign(FrameIndex));
1695 unsigned SpillSize = TRI->getSpillSize(*RC);
1696
1697 MachineRegisterInfo &MRI = MF->getRegInfo();
1698 if (RI.isSGPRClass(RC)) {
1699 MFI->setHasSpilledSGPRs();
1700 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1701 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1702 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1703
1704 // We are only allowed to create one new instruction when spilling
1705 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1706 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1707
1708 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1709 // to make sure we are using the correct register class.
1710 if (SrcReg.isVirtual() && SpillSize == 4) {
1711 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1712 }
1713
1714 BuildMI(MBB, MI, DL, OpDesc)
1715 .addReg(SrcReg, getKillRegState(isKill)) // data
1716 .addFrameIndex(FrameIndex) // addr
1717 .addMemOperand(MMO)
1719
1720 if (RI.spillSGPRToVGPR())
1721 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1722 return;
1723 }
1724
1725 unsigned Opcode =
1726 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1727 MFI->setHasSpilledVGPRs();
1728
1729 BuildMI(MBB, MI, DL, get(Opcode))
1730 .addReg(SrcReg, getKillRegState(isKill)) // data
1731 .addFrameIndex(FrameIndex) // addr
1732 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1733 .addImm(0) // offset
1734 .addMemOperand(MMO);
1735}
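// Rough MIR sketch of the VGPR path above (register and stack-slot numbers
// are arbitrary):
//   SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0
//       :: (store (s32) into %stack.0, addrspace 5)
// The pseudo is later expanded into real scratch/buffer stores when its frame
// index is eliminated.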
1736
1737static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1738 switch (Size) {
1739 case 4:
1740 return AMDGPU::SI_SPILL_S32_RESTORE;
1741 case 8:
1742 return AMDGPU::SI_SPILL_S64_RESTORE;
1743 case 12:
1744 return AMDGPU::SI_SPILL_S96_RESTORE;
1745 case 16:
1746 return AMDGPU::SI_SPILL_S128_RESTORE;
1747 case 20:
1748 return AMDGPU::SI_SPILL_S160_RESTORE;
1749 case 24:
1750 return AMDGPU::SI_SPILL_S192_RESTORE;
1751 case 28:
1752 return AMDGPU::SI_SPILL_S224_RESTORE;
1753 case 32:
1754 return AMDGPU::SI_SPILL_S256_RESTORE;
1755 case 36:
1756 return AMDGPU::SI_SPILL_S288_RESTORE;
1757 case 40:
1758 return AMDGPU::SI_SPILL_S320_RESTORE;
1759 case 44:
1760 return AMDGPU::SI_SPILL_S352_RESTORE;
1761 case 48:
1762 return AMDGPU::SI_SPILL_S384_RESTORE;
1763 case 64:
1764 return AMDGPU::SI_SPILL_S512_RESTORE;
1765 case 128:
1766 return AMDGPU::SI_SPILL_S1024_RESTORE;
1767 default:
1768 llvm_unreachable("unknown register size");
1769 }
1770}
1771
1772static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1773 switch (Size) {
1774 case 2:
1775 return AMDGPU::SI_SPILL_V16_RESTORE;
1776 case 4:
1777 return AMDGPU::SI_SPILL_V32_RESTORE;
1778 case 8:
1779 return AMDGPU::SI_SPILL_V64_RESTORE;
1780 case 12:
1781 return AMDGPU::SI_SPILL_V96_RESTORE;
1782 case 16:
1783 return AMDGPU::SI_SPILL_V128_RESTORE;
1784 case 20:
1785 return AMDGPU::SI_SPILL_V160_RESTORE;
1786 case 24:
1787 return AMDGPU::SI_SPILL_V192_RESTORE;
1788 case 28:
1789 return AMDGPU::SI_SPILL_V224_RESTORE;
1790 case 32:
1791 return AMDGPU::SI_SPILL_V256_RESTORE;
1792 case 36:
1793 return AMDGPU::SI_SPILL_V288_RESTORE;
1794 case 40:
1795 return AMDGPU::SI_SPILL_V320_RESTORE;
1796 case 44:
1797 return AMDGPU::SI_SPILL_V352_RESTORE;
1798 case 48:
1799 return AMDGPU::SI_SPILL_V384_RESTORE;
1800 case 64:
1801 return AMDGPU::SI_SPILL_V512_RESTORE;
1802 case 128:
1803 return AMDGPU::SI_SPILL_V1024_RESTORE;
1804 default:
1805 llvm_unreachable("unknown register size");
1806 }
1807}
1808
1809static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1810 switch (Size) {
1811 case 4:
1812 return AMDGPU::SI_SPILL_AV32_RESTORE;
1813 case 8:
1814 return AMDGPU::SI_SPILL_AV64_RESTORE;
1815 case 12:
1816 return AMDGPU::SI_SPILL_AV96_RESTORE;
1817 case 16:
1818 return AMDGPU::SI_SPILL_AV128_RESTORE;
1819 case 20:
1820 return AMDGPU::SI_SPILL_AV160_RESTORE;
1821 case 24:
1822 return AMDGPU::SI_SPILL_AV192_RESTORE;
1823 case 28:
1824 return AMDGPU::SI_SPILL_AV224_RESTORE;
1825 case 32:
1826 return AMDGPU::SI_SPILL_AV256_RESTORE;
1827 case 36:
1828 return AMDGPU::SI_SPILL_AV288_RESTORE;
1829 case 40:
1830 return AMDGPU::SI_SPILL_AV320_RESTORE;
1831 case 44:
1832 return AMDGPU::SI_SPILL_AV352_RESTORE;
1833 case 48:
1834 return AMDGPU::SI_SPILL_AV384_RESTORE;
1835 case 64:
1836 return AMDGPU::SI_SPILL_AV512_RESTORE;
1837 case 128:
1838 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1839 default:
1840 llvm_unreachable("unknown register size");
1841 }
1842}
1843
1844static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1845 bool IsVectorSuperClass) {
1846 // Currently, only 32-bit WWM register spills are needed.
1847 if (Size != 4)
1848 llvm_unreachable("unknown wwm register spill size");
1849
1850 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1851 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1852
1853 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1854}
1855
1857 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1858 const SIMachineFunctionInfo &MFI) const {
1859 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1860
1861 // Choose the right opcode if restoring a WWM register.
1862 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1863 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1864
1865 // TODO: Check if AGPRs are available
1866 if (ST.hasMAIInsts())
1867 return getAVSpillRestoreOpcode(Size);
1868
1869 assert(!RI.isAGPRClass(RC));
1870 return getVGPRSpillRestoreOpcode(Size);
1871}
1872
1875 Register DestReg, int FrameIndex,
1876 const TargetRegisterClass *RC,
1877 const TargetRegisterInfo *TRI,
1878 Register VReg,
1879 MachineInstr::MIFlag Flags) const {
1880 MachineFunction *MF = MBB.getParent();
1882 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1883 const DebugLoc &DL = MBB.findDebugLoc(MI);
1884 unsigned SpillSize = TRI->getSpillSize(*RC);
1885
1886 MachinePointerInfo PtrInfo
1887 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1888
1890 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1891 FrameInfo.getObjectAlign(FrameIndex));
1892
1893 if (RI.isSGPRClass(RC)) {
1894 MFI->setHasSpilledSGPRs();
1895 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1896 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1897 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1898
1899 // FIXME: Maybe this should not include a memoperand because it will be
1900 // lowered to non-memory instructions.
1901 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1902 if (DestReg.isVirtual() && SpillSize == 4) {
1904 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1905 }
1906
1907 if (RI.spillSGPRToVGPR())
1908 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1909 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1910 .addFrameIndex(FrameIndex) // addr
1911 .addMemOperand(MMO)
1913
1914 return;
1915 }
1916
1917 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1918 SpillSize, *MFI);
1919 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1920 .addFrameIndex(FrameIndex) // vaddr
1921 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1922 .addImm(0) // offset
1923 .addMemOperand(MMO);
1924}
1925
1930
1933 unsigned Quantity) const {
1934 DebugLoc DL = MBB.findDebugLoc(MI);
1935 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1936 while (Quantity > 0) {
1937 unsigned Arg = std::min(Quantity, MaxSNopCount);
1938 Quantity -= Arg;
1939 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1940 }
1941}
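// Example: with a 3-bit s_nop field (MaxSNopCount == 8), insertNoops(..., 10)
// emits "s_nop 7" followed by "s_nop 1", covering 8 + 2 wait states. The
// exact split depends on ST.getSNopBits().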
1942
1944 auto *MF = MBB.getParent();
1945 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1946
1947 assert(Info->isEntryFunction());
1948
1949 if (MBB.succ_empty()) {
1950 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1951 if (HasNoTerminator) {
1952 if (Info->returnsVoid()) {
1953 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1954 } else {
1955 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1956 }
1957 }
1958 }
1959}
1960
1964 const DebugLoc &DL) const {
1965 MachineFunction *MF = MBB.getParent();
1966 constexpr unsigned DoorbellIDMask = 0x3ff;
1967 constexpr unsigned ECQueueWaveAbort = 0x400;
1968
1969 MachineBasicBlock *TrapBB = &MBB;
1970 MachineBasicBlock *ContBB = &MBB;
1971 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1972
1973 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1974 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1975 TrapBB = MF->CreateMachineBasicBlock();
1976 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1977 MF->push_back(TrapBB);
1978 MBB.addSuccessor(TrapBB);
1979 }
1980
1981 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1982 // this will be a nop.
1983 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1984 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1985 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1986 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1987 DoorbellReg)
1989 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
1990 .addUse(AMDGPU::M0);
1991 Register DoorbellRegMasked =
1992 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1993 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
1994 .addUse(DoorbellReg)
1995 .addImm(DoorbellIDMask);
1996 Register SetWaveAbortBit =
1997 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1998 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
1999 .addUse(DoorbellRegMasked)
2000 .addImm(ECQueueWaveAbort);
2001 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2002 .addUse(SetWaveAbortBit);
2003 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2005 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2006 .addUse(AMDGPU::TTMP2);
2007 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2008 TrapBB->addSuccessor(HaltLoopBB);
2009
2010 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2011 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2012 .addMBB(HaltLoopBB);
2013 MF->push_back(HaltLoopBB);
2014 HaltLoopBB->addSuccessor(HaltLoopBB);
2015
2016 return ContBB;
2017}
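// The control flow produced above is roughly (a sketch, block names are
// arbitrary):
//   <MBB>:      s_cbranch_execnz %trap_bb   (only if MBB had successors or
//                                            instructions after MI)
//   trap_bb:    s_trap 2; read the doorbell ID via s_sendmsg_rtn; stash m0 in
//               ttmp2; mask and set the queue-wave-abort bit; s_sendmsg;
//               restore m0; s_branch %halt_loop
//   halt_loop:  s_sethalt 5; s_branch %halt_loop   (spins forever)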
2018
2020 switch (MI.getOpcode()) {
2021 default:
2022 if (MI.isMetaInstruction())
2023 return 0;
2024 return 1; // FIXME: Do wait states equal cycles?
2025
2026 case AMDGPU::S_NOP:
2027 return MI.getOperand(0).getImm() + 1;
2028 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2029 // hazard, even if one exists, won't really be visible. Should we handle it?
2030 }
2031}
2032
2034 MachineBasicBlock &MBB = *MI.getParent();
2035 DebugLoc DL = MBB.findDebugLoc(MI);
2037 switch (MI.getOpcode()) {
2038 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2039 case AMDGPU::S_MOV_B64_term:
2040 // This is only a terminator to get the correct spill code placement during
2041 // register allocation.
2042 MI.setDesc(get(AMDGPU::S_MOV_B64));
2043 break;
2044
2045 case AMDGPU::S_MOV_B32_term:
2046 // This is only a terminator to get the correct spill code placement during
2047 // register allocation.
2048 MI.setDesc(get(AMDGPU::S_MOV_B32));
2049 break;
2050
2051 case AMDGPU::S_XOR_B64_term:
2052 // This is only a terminator to get the correct spill code placement during
2053 // register allocation.
2054 MI.setDesc(get(AMDGPU::S_XOR_B64));
2055 break;
2056
2057 case AMDGPU::S_XOR_B32_term:
2058 // This is only a terminator to get the correct spill code placement during
2059 // register allocation.
2060 MI.setDesc(get(AMDGPU::S_XOR_B32));
2061 break;
2062 case AMDGPU::S_OR_B64_term:
2063 // This is only a terminator to get the correct spill code placement during
2064 // register allocation.
2065 MI.setDesc(get(AMDGPU::S_OR_B64));
2066 break;
2067 case AMDGPU::S_OR_B32_term:
2068 // This is only a terminator to get the correct spill code placement during
2069 // register allocation.
2070 MI.setDesc(get(AMDGPU::S_OR_B32));
2071 break;
2072
2073 case AMDGPU::S_ANDN2_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2077 break;
2078
2079 case AMDGPU::S_ANDN2_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2083 break;
2084
2085 case AMDGPU::S_AND_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(AMDGPU::S_AND_B64));
2089 break;
2090
2091 case AMDGPU::S_AND_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_AND_B32));
2095 break;
2096
2097 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2098 // This is only a terminator to get the correct spill code placement during
2099 // register allocation.
2100 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2101 break;
2102
2103 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2104 // This is only a terminator to get the correct spill code placement during
2105 // register allocation.
2106 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2107 break;
2108
2109 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2110 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2111 break;
2112
2113 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2114 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2115 MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(),
2116 &AMDGPU::SReg_32_XM0RegClass);
2117 break;
2118 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2119 Register Dst = MI.getOperand(0).getReg();
2120 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2121 MI.setDesc(
2122 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2123 break;
2124 }
2125 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2126 Register Dst = MI.getOperand(0).getReg();
2127 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2128 int64_t Imm = MI.getOperand(1).getImm();
2129
2130 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2131 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2132 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2135 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2136 .addImm(SignExtend64<32>(Imm >> 32))
2138 MI.eraseFromParent();
2139 break;
2140 }
2141
2142 [[fallthrough]];
2143 }
2144 case AMDGPU::V_MOV_B64_PSEUDO: {
2145 Register Dst = MI.getOperand(0).getReg();
2146 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2147 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2148
2149 const MachineOperand &SrcOp = MI.getOperand(1);
2150 // FIXME: Will this work for 64-bit floating point immediates?
2151 assert(!SrcOp.isFPImm());
2152 if (ST.hasMovB64()) {
2153 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2154 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2155 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2156 break;
2157 }
2158 if (SrcOp.isImm()) {
2159 APInt Imm(64, SrcOp.getImm());
2160 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2161 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2162 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2163 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2165 .addImm(Lo.getSExtValue())
2167 .addImm(Lo.getSExtValue())
2168 .addImm(0) // op_sel_lo
2169 .addImm(0) // op_sel_hi
2170 .addImm(0) // neg_lo
2171 .addImm(0) // neg_hi
2172 .addImm(0); // clamp
2173 } else {
2174 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2175 .addImm(Lo.getSExtValue())
2177 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2178 .addImm(Hi.getSExtValue())
2180 }
2181 } else {
2182 assert(SrcOp.isReg());
2183 if (ST.hasPkMovB32() &&
2184 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2185 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2186 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2187 .addReg(SrcOp.getReg())
2189 .addReg(SrcOp.getReg())
2190 .addImm(0) // op_sel_lo
2191 .addImm(0) // op_sel_hi
2192 .addImm(0) // neg_lo
2193 .addImm(0) // neg_hi
2194 .addImm(0); // clamp
2195 } else {
2196 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2197 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2199 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2200 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2202 }
2203 }
2204 MI.eraseFromParent();
2205 break;
2206 }
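// Sketch of the fallback path (no v_mov_b64 or v_pk_mov_b32; register numbers
// are arbitrary): the post-RA pseudo
//   $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 4607182418800017408   ; double 1.0
// becomes, roughly,
//   $vgpr0 = V_MOV_B32_e32 0,          implicit $exec, ...
//   $vgpr1 = V_MOV_B32_e32 1072693248, implicit $exec, ...  ; 0x3ff00000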
2207 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2209 break;
2210 }
2211 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2212 const MachineOperand &SrcOp = MI.getOperand(1);
2213 assert(!SrcOp.isFPImm());
2214
2215 if (ST.has64BitLiterals()) {
2216 MI.setDesc(get(AMDGPU::S_MOV_B64));
2217 break;
2218 }
2219
2220 APInt Imm(64, SrcOp.getImm());
2221 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2222 MI.setDesc(get(AMDGPU::S_MOV_B64));
2223 break;
2224 }
2225
2226 Register Dst = MI.getOperand(0).getReg();
2227 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2228 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2229
2230 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2231 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2232 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2233 .addImm(Lo.getSExtValue())
2235 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2236 .addImm(Hi.getSExtValue())
2238 MI.eraseFromParent();
2239 break;
2240 }
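// Sketch: without 64-bit literal support, a value such as 0x123456789abcdef0
// (neither 32-bit-representable nor an inline constant) is split into
//   <dst.sub0> = s_mov_b32 0x9abcdef0
//   <dst.sub1> = s_mov_b32 0x12345678
// while 32-bit-representable or inline-constant values keep a single
// s_mov_b64.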
2241 case AMDGPU::V_SET_INACTIVE_B32: {
2242 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2243 Register DstReg = MI.getOperand(0).getReg();
2244 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2245 .add(MI.getOperand(3))
2246 .add(MI.getOperand(4))
2247 .add(MI.getOperand(1))
2248 .add(MI.getOperand(2))
2249 .add(MI.getOperand(5));
2250 MI.eraseFromParent();
2251 break;
2252 }
2253 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2254 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2255 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2256 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2257 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2258 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2259 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2260 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2261 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2262 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2263 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2264 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2265 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2266 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2267 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2268 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2269 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2270 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2271 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2272 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2273 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2274 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2275 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2276 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2280 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2281 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2282 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2283
2284 unsigned Opc;
2285 if (RI.hasVGPRs(EltRC)) {
2286 Opc = AMDGPU::V_MOVRELD_B32_e32;
2287 } else {
2288 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2289 : AMDGPU::S_MOVRELD_B32;
2290 }
2291
2292 const MCInstrDesc &OpDesc = get(Opc);
2293 Register VecReg = MI.getOperand(0).getReg();
2294 bool IsUndef = MI.getOperand(1).isUndef();
2295 unsigned SubReg = MI.getOperand(3).getImm();
2296 assert(VecReg == MI.getOperand(1).getReg());
2297
2299 BuildMI(MBB, MI, DL, OpDesc)
2300 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2301 .add(MI.getOperand(2))
2303 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2304
2305 const int ImpDefIdx =
2306 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2307 const int ImpUseIdx = ImpDefIdx + 1;
2308 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2309 MI.eraseFromParent();
2310 break;
2311 }
2312 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2318 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2319 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2320 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2321 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2322 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2323 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2324 assert(ST.useVGPRIndexMode());
2325 Register VecReg = MI.getOperand(0).getReg();
2326 bool IsUndef = MI.getOperand(1).isUndef();
2327 MachineOperand &Idx = MI.getOperand(3);
2328 Register SubReg = MI.getOperand(4).getImm();
2329
2330 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2331 .add(Idx)
2333 SetOn->getOperand(3).setIsUndef();
2334
2335 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2337 BuildMI(MBB, MI, DL, OpDesc)
2338 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2339 .add(MI.getOperand(2))
2341 .addReg(VecReg,
2342 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2343
2344 const int ImpDefIdx =
2345 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2346 const int ImpUseIdx = ImpDefIdx + 1;
2347 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2348
2349 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2350
2351 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2352
2353 MI.eraseFromParent();
2354 break;
2355 }
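// The resulting bundle looks roughly like (index-mode immediates elided):
//   S_SET_GPR_IDX_ON <idx>, <dst mode>
//   V_MOV_B32_indirect_write <subreg>, <val>, implicit-def <vec>, implicit <vec>
//   S_SET_GPR_IDX_OFF
// Bundling keeps later passes from scheduling anything between the index-mode
// toggles.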
2356 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2357 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2358 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2359 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2360 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2361 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2362 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2363 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2364 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2365 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2366 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2367 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2368 assert(ST.useVGPRIndexMode());
2369 Register Dst = MI.getOperand(0).getReg();
2370 Register VecReg = MI.getOperand(1).getReg();
2371 bool IsUndef = MI.getOperand(1).isUndef();
2372 Register Idx = MI.getOperand(2).getReg();
2373 Register SubReg = MI.getOperand(3).getImm();
2374
2375 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2376 .addReg(Idx)
2378 SetOn->getOperand(3).setIsUndef();
2379
2380 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2381 .addDef(Dst)
2382 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2383 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2384
2385 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2386
2387 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2388
2389 MI.eraseFromParent();
2390 break;
2391 }
2392 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2393 MachineFunction &MF = *MBB.getParent();
2394 Register Reg = MI.getOperand(0).getReg();
2395 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2396 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2397 MachineOperand OpLo = MI.getOperand(1);
2398 MachineOperand OpHi = MI.getOperand(2);
2399
2400 // Create a bundle so these instructions won't be re-ordered by the
2401 // post-RA scheduler.
2402 MIBundleBuilder Bundler(MBB, MI);
2403 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2404
2405 // What we want here is an offset from the value returned by s_getpc (which
2406 // is the address of the s_add_u32 instruction) to the global variable, but
2407 // since the encoding of $symbol starts 4 bytes after the start of the
2408 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2409 // small. This requires us to add 4 to the global variable offset in order
2410 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2411 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2412 // instruction.
2413
2414 int64_t Adjust = 0;
2415 if (ST.hasGetPCZeroExtension()) {
2416 // Fix up hardware that does not sign-extend the 48-bit PC value by
2417 // inserting: s_sext_i32_i16 reghi, reghi
2418 Bundler.append(
2419 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2420 Adjust += 4;
2421 }
2422
2423 if (OpLo.isGlobal())
2424 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2425 Bundler.append(
2426 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2427
2428 if (OpHi.isGlobal())
2429 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2430 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2431 .addReg(RegHi)
2432 .add(OpHi));
2433
2434 finalizeBundle(MBB, Bundler.begin());
2435
2436 MI.eraseFromParent();
2437 break;
2438 }
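// The emitted bundle is the usual PC-relative address pattern, roughly:
//   s_getpc_b64 s[N:N+1]
//   s_add_u32   sN,   sN,   sym@rel32@lo+4
//   s_addc_u32  sN+1, sN+1, sym@rel32@hi+12
// On targets needing the s_sext_i32_i16 PC fixup, that instruction is inserted
// after s_getpc_b64 and both offsets grow by 4.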
2439 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2440 MachineFunction &MF = *MBB.getParent();
2441 Register Reg = MI.getOperand(0).getReg();
2442 MachineOperand Op = MI.getOperand(1);
2443
2444 // Create a bundle so these instructions won't be re-ordered by the
2445 // post-RA scheduler.
2446 MIBundleBuilder Bundler(MBB, MI);
2447 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2448 if (Op.isGlobal())
2449 Op.setOffset(Op.getOffset() + 4);
2450 Bundler.append(
2451 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2452
2453 finalizeBundle(MBB, Bundler.begin());
2454
2455 MI.eraseFromParent();
2456 break;
2457 }
2458 case AMDGPU::ENTER_STRICT_WWM: {
2459 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2460 // Whole Wave Mode is entered.
2461 MI.setDesc(get(LMC.OrSaveExecOpc));
2462 break;
2463 }
2464 case AMDGPU::ENTER_STRICT_WQM: {
2465 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2466 // STRICT_WQM is entered.
2467 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2468 .addReg(LMC.ExecReg);
2469 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2470
2471 MI.eraseFromParent();
2472 break;
2473 }
2474 case AMDGPU::EXIT_STRICT_WWM:
2475 case AMDGPU::EXIT_STRICT_WQM: {
2476 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2477 // WWM/STRICT_WQM is exited.
2478 MI.setDesc(get(LMC.MovOpc));
2479 break;
2480 }
2481 case AMDGPU::SI_RETURN: {
2482 const MachineFunction *MF = MBB.getParent();
2483 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2484 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2485 // Hiding the return address use with SI_RETURN may lead to extra kills in
2486 // the function and missing live-ins. We are fine in practice because callee
2487 // saved register handling ensures the register value is restored before
2488 // RET, but we need the undef flag here to appease the MachineVerifier
2489 // liveness checks.
2491 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2492 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2493
2494 MIB.copyImplicitOps(MI);
2495 MI.eraseFromParent();
2496 break;
2497 }
2498
2499 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2500 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2501 MI.setDesc(get(AMDGPU::S_MUL_U64));
2502 break;
2503
2504 case AMDGPU::S_GETPC_B64_pseudo:
2505 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2506 if (ST.hasGetPCZeroExtension()) {
2507 Register Dst = MI.getOperand(0).getReg();
2508 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2509 // Fix up hardware that does not sign-extend the 48-bit PC value by
2510 // inserting: s_sext_i32_i16 dsthi, dsthi
2511 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2512 DstHi)
2513 .addReg(DstHi);
2514 }
2515 break;
2516
2517 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2518 assert(ST.hasBF16PackedInsts());
2519 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2520 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2521 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2522 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2523 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2524 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2525 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2526 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2527 break;
2528 }
2529
2530 return true;
2531}
2532
2535 unsigned SubIdx, const MachineInstr &Orig,
2536 const TargetRegisterInfo &RI) const {
2537
2538 // Try shrinking the instruction to remat only the part needed for the
2539 // current context.
2540 // TODO: Handle more cases.
2541 unsigned Opcode = Orig.getOpcode();
2542 switch (Opcode) {
2543 case AMDGPU::S_LOAD_DWORDX16_IMM:
2544 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2545 if (SubIdx != 0)
2546 break;
2547
2548 if (I == MBB.end())
2549 break;
2550
2551 if (I->isBundled())
2552 break;
2553
2554 // Look for a single use of the register that is also a subreg.
2555 Register RegToFind = Orig.getOperand(0).getReg();
2556 MachineOperand *UseMO = nullptr;
2557 for (auto &CandMO : I->operands()) {
2558 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2559 continue;
2560 if (UseMO) {
2561 UseMO = nullptr;
2562 break;
2563 }
2564 UseMO = &CandMO;
2565 }
2566 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2567 break;
2568
2569 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2570 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2571
2572 MachineFunction *MF = MBB.getParent();
2574 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2575
2576 unsigned NewOpcode = -1;
2577 if (SubregSize == 256)
2578 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2579 else if (SubregSize == 128)
2580 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2581 else
2582 break;
2583
2584 const MCInstrDesc &TID = get(NewOpcode);
2585 const TargetRegisterClass *NewRC =
2586 RI.getAllocatableClass(getRegClass(TID, 0, &RI));
2587 MRI.setRegClass(DestReg, NewRC);
2588
2589 UseMO->setReg(DestReg);
2590 UseMO->setSubReg(AMDGPU::NoSubRegister);
2591
2592 // Use a smaller load with the desired size, possibly with updated offset.
2593 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2594 MI->setDesc(TID);
2595 MI->getOperand(0).setReg(DestReg);
2596 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2597 if (Offset) {
2598 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2599 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2600 OffsetMO->setImm(FinalOffset);
2601 }
2603 for (const MachineMemOperand *MemOp : Orig.memoperands())
2604 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2605 SubregSize / 8));
2606 MI->setMemRefs(*MF, NewMMOs);
2607
2608 MBB.insert(I, MI);
2609 return;
2610 }
2611
2612 default:
2613 break;
2614 }
2615
2616 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2617}
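// Illustrative example of the shrinking path: if the only use at the remat
// point is %big.sub4_sub5_sub6_sub7 of an S_LOAD_DWORDX16_IMM, the cloned
// instruction becomes an S_LOAD_DWORDX4_IMM with the immediate offset bumped
// by 16 bytes, and the use is rewritten to the new, narrower register.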
2618
2619std::pair<MachineInstr*, MachineInstr*>
2621 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2622
2623 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2625 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2626 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2627 return std::pair(&MI, nullptr);
2628 }
2629
2630 MachineBasicBlock &MBB = *MI.getParent();
2631 DebugLoc DL = MBB.findDebugLoc(MI);
2632 MachineFunction *MF = MBB.getParent();
2634 Register Dst = MI.getOperand(0).getReg();
2635 unsigned Part = 0;
2636 MachineInstr *Split[2];
2637
2638 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2639 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2640 if (Dst.isPhysical()) {
2641 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2642 } else {
2643 assert(MRI.isSSA());
2644 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2645 MovDPP.addDef(Tmp);
2646 }
2647
2648 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2649 const MachineOperand &SrcOp = MI.getOperand(I);
2650 assert(!SrcOp.isFPImm());
2651 if (SrcOp.isImm()) {
2652 APInt Imm(64, SrcOp.getImm());
2653 Imm.ashrInPlace(Part * 32);
2654 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2655 } else {
2656 assert(SrcOp.isReg());
2657 Register Src = SrcOp.getReg();
2658 if (Src.isPhysical())
2659 MovDPP.addReg(RI.getSubReg(Src, Sub));
2660 else
2661 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2662 }
2663 }
2664
2665 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2666 MovDPP.addImm(MO.getImm());
2667
2668 Split[Part] = MovDPP;
2669 ++Part;
2670 }
2671
2672 if (Dst.isVirtual())
2673 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2674 .addReg(Split[0]->getOperand(0).getReg())
2675 .addImm(AMDGPU::sub0)
2676 .addReg(Split[1]->getOperand(0).getReg())
2677 .addImm(AMDGPU::sub1);
2678
2679 MI.eraseFromParent();
2680 return std::pair(Split[0], Split[1]);
2681}
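// Sketch: when a single 64-bit DPP mov is not available, the pseudo becomes
// two V_MOV_B32_dpp instructions on sub0 and sub1 (each repeating the
// dpp_ctrl/row_mask/bank_mask operands), followed by a REG_SEQUENCE that
// reassembles the 64-bit result for virtual destinations.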
2682
2683std::optional<DestSourcePair>
2685 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2686 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2687
2688 return std::nullopt;
2689}
2690
2692 AMDGPU::OpName Src0OpName,
2693 MachineOperand &Src1,
2694 AMDGPU::OpName Src1OpName) const {
2695 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2696 if (!Src0Mods)
2697 return false;
2698
2699 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2700 assert(Src1Mods &&
2701 "All commutable instructions have both src0 and src1 modifiers");
2702
2703 int Src0ModsVal = Src0Mods->getImm();
2704 int Src1ModsVal = Src1Mods->getImm();
2705
2706 Src1Mods->setImm(Src0ModsVal);
2707 Src0Mods->setImm(Src1ModsVal);
2708 return true;
2709}
2710
2712 MachineOperand &RegOp,
2713 MachineOperand &NonRegOp) {
2714 Register Reg = RegOp.getReg();
2715 unsigned SubReg = RegOp.getSubReg();
2716 bool IsKill = RegOp.isKill();
2717 bool IsDead = RegOp.isDead();
2718 bool IsUndef = RegOp.isUndef();
2719 bool IsDebug = RegOp.isDebug();
2720
2721 if (NonRegOp.isImm())
2722 RegOp.ChangeToImmediate(NonRegOp.getImm());
2723 else if (NonRegOp.isFI())
2724 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2725 else if (NonRegOp.isGlobal()) {
2726 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2727 NonRegOp.getTargetFlags());
2728 } else
2729 return nullptr;
2730
2731 // Make sure we don't reinterpret a subreg index in the target flags.
2732 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2733
2734 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2735 NonRegOp.setSubReg(SubReg);
2736
2737 return &MI;
2738}
2739
2741 MachineOperand &NonRegOp1,
2742 MachineOperand &NonRegOp2) {
2743 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2744 int64_t NonRegVal = NonRegOp1.getImm();
2745
2746 NonRegOp1.setImm(NonRegOp2.getImm());
2747 NonRegOp2.setImm(NonRegVal);
2748 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2749 NonRegOp2.setTargetFlags(TargetFlags);
2750 return &MI;
2751}
2752
2753bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2754 unsigned OpIdx1) const {
2755 const MCInstrDesc &InstDesc = MI.getDesc();
2756 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2757 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2758
2759 unsigned Opc = MI.getOpcode();
2760 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2761
2762 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2763 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2764
2765 // Make sure the swap doesn't breach the constant bus or literal limits.
2766 // It may move a literal to a position other than src0, which is not allowed
2767 // pre-gfx10. However, most test cases need literals in src0 for VOP.
2768 // FIXME: After gfx9, a literal can be placed somewhere other than src0.
2769 if (isVALU(MI)) {
2770 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2771 !isInlineConstant(MO0, OpInfo1))
2772 return false;
2773 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2774 !isInlineConstant(MO1, OpInfo0))
2775 return false;
2776 }
2777
2778 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2779 if (OpInfo1.RegClass == -1)
2780 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2781 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2782 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2783 }
2784 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2785 if (OpInfo0.RegClass == -1)
2786 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2787 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2788 isLegalRegOperand(MI, OpIdx0, MO1);
2789 }
2790
2791 // No need to check 64-bit literals since swapping does not bring new
2792 // 64-bit literals into the current instruction to fold to 32-bit.
2793
2794 return isImmOperandLegal(MI, OpIdx1, MO0);
2795}
2796
2798 unsigned Src0Idx,
2799 unsigned Src1Idx) const {
2800 assert(!NewMI && "this should never be used");
2801
2802 unsigned Opc = MI.getOpcode();
2803 int CommutedOpcode = commuteOpcode(Opc);
2804 if (CommutedOpcode == -1)
2805 return nullptr;
2806
2807 if (Src0Idx > Src1Idx)
2808 std::swap(Src0Idx, Src1Idx);
2809
2810 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2811 static_cast<int>(Src0Idx) &&
2812 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2813 static_cast<int>(Src1Idx) &&
2814 "inconsistency with findCommutedOpIndices");
2815
2816 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2817 return nullptr;
2818
2819 MachineInstr *CommutedMI = nullptr;
2820 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2821 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2822 if (Src0.isReg() && Src1.isReg()) {
2823 // Be sure to copy the source modifiers to the right place.
2824 CommutedMI =
2825 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2826 } else if (Src0.isReg() && !Src1.isReg()) {
2827 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2828 } else if (!Src0.isReg() && Src1.isReg()) {
2829 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2830 } else if (Src0.isImm() && Src1.isImm()) {
2831 CommutedMI = swapImmOperands(MI, Src0, Src1);
2832 } else {
2833 // FIXME: Found two non-register operands to commute. This does happen.
2834 return nullptr;
2835 }
2836
2837 if (CommutedMI) {
2838 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2839 Src1, AMDGPU::OpName::src1_modifiers);
2840
2841 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2842 AMDGPU::OpName::src1_sel);
2843
2844 CommutedMI->setDesc(get(CommutedOpcode));
2845 }
2846
2847 return CommutedMI;
2848}
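// Illustrative example: commuting "%d = V_SUB_F32_e64 %a, 2.0" swaps the two
// source operands (via swapRegAndNonRegOperand, since one is an immediate),
// swaps their src0/src1 modifiers, and rewrites the opcode to the commuted
// form V_SUBREV_F32_e64.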
2849
2850// This needs to be implemented because the source modifiers may be inserted
2851// between the true commutable operands, and the base
2852// TargetInstrInfo::commuteInstruction uses it.
2854 unsigned &SrcOpIdx0,
2855 unsigned &SrcOpIdx1) const {
2856 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2857}
2858
2860 unsigned &SrcOpIdx0,
2861 unsigned &SrcOpIdx1) const {
2862 if (!Desc.isCommutable())
2863 return false;
2864
2865 unsigned Opc = Desc.getOpcode();
2866 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2867 if (Src0Idx == -1)
2868 return false;
2869
2870 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2871 if (Src1Idx == -1)
2872 return false;
2873
2874 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2875}
2876
2878 int64_t BrOffset) const {
2879 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2880 // because its dest block is unanalyzable.
2881 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2882
2883 // Convert to dwords.
2884 BrOffset /= 4;
2885
2886 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2887 // from the next instruction.
2888 BrOffset -= 1;
2889
2890 return isIntN(BranchOffsetBits, BrOffset);
2891}
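// Example: with a 16-bit branch immediate this accepts adjusted dword offsets
// in [-32768, 32767] relative to the instruction after the branch, i.e.
// roughly +/-128 KiB of code.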
2892
2895 return MI.getOperand(0).getMBB();
2896}
2897
2899 for (const MachineInstr &MI : MBB->terminators()) {
2900 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2901 MI.getOpcode() == AMDGPU::SI_LOOP)
2902 return true;
2903 }
2904 return false;
2905}
2906
2908 MachineBasicBlock &DestBB,
2909 MachineBasicBlock &RestoreBB,
2910 const DebugLoc &DL, int64_t BrOffset,
2911 RegScavenger *RS) const {
2912 assert(MBB.empty() &&
2913 "new block should be inserted for expanding unconditional branch");
2914 assert(MBB.pred_size() == 1);
2915 assert(RestoreBB.empty() &&
2916 "restore block should be inserted for restoring clobbered registers");
2917
2918 MachineFunction *MF = MBB.getParent();
2921 auto I = MBB.end();
2922 auto &MCCtx = MF->getContext();
2923
2924 if (ST.hasAddPC64Inst()) {
2925 MCSymbol *Offset =
2926 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2927 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2929 MCSymbol *PostAddPCLabel =
2930 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2931 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2932 auto *OffsetExpr = MCBinaryExpr::createSub(
2933 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2934 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2935 Offset->setVariableValue(OffsetExpr);
2936 return;
2937 }
2938
2939 assert(RS && "RegScavenger required for long branching");
2940
2941 // FIXME: Virtual register workaround for RegScavenger not working with empty
2942 // blocks.
2943 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2944
2945 // Note: as this is used after hazard recognizer we need to apply some hazard
2946 // workarounds directly.
2947 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2948 ST.hasVALUReadSGPRHazard();
2949 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2950 if (FlushSGPRWrites)
2951 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2953 };
2954
2955 // We need to compute the offset relative to the instruction immediately after
2956 // s_getpc_b64. Insert the PC arithmetic code before the last terminator.
2957 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2958 ApplyHazardWorkarounds();
2959
2960 MCSymbol *PostGetPCLabel =
2961 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2962 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2963
2964 MCSymbol *OffsetLo =
2965 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2966 MCSymbol *OffsetHi =
2967 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2968 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2969 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2970 .addReg(PCReg, 0, AMDGPU::sub0)
2971 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2972 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2973 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2974 .addReg(PCReg, 0, AMDGPU::sub1)
2975 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2976 ApplyHazardWorkarounds();
2977
2978 // Insert the indirect branch after the other terminator.
2979 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2980 .addReg(PCReg);
2981
2982 // If a spill is needed for the pc register pair, we need to insert a spill
2983 // restore block right before the destination block, and insert a short branch
2984 // into the old destination block's fallthrough predecessor.
2985 // e.g.:
2986 //
2987 // s_cbranch_scc0 skip_long_branch:
2988 //
2989 // long_branch_bb:
2990 // spill s[8:9]
2991 // s_getpc_b64 s[8:9]
2992 // s_add_u32 s8, s8, restore_bb
2993 // s_addc_u32 s9, s9, 0
2994 // s_setpc_b64 s[8:9]
2995 //
2996 // skip_long_branch:
2997 // foo;
2998 //
2999 // .....
3000 //
3001 // dest_bb_fallthrough_predecessor:
3002 // bar;
3003 // s_branch dest_bb
3004 //
3005 // restore_bb:
3006 // restore s[8:9]
3007 // fallthrough dest_bb
3008 //
3009 // dest_bb:
3010 // buzz;
3011
3012 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3013 Register Scav;
3014
3015 // If we've previously reserved a register for long branches,
3016 // avoid running the scavenger and just use that register.
3017 if (LongBranchReservedReg) {
3018 RS->enterBasicBlock(MBB);
3019 Scav = LongBranchReservedReg;
3020 } else {
3022 Scav = RS->scavengeRegisterBackwards(
3023 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3024 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3025 }
3026 if (Scav) {
3027 RS->setRegUsed(Scav);
3028 MRI.replaceRegWith(PCReg, Scav);
3029 MRI.clearVirtRegs();
3030 } else {
3031 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3032 // SGPR spill.
3033 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3034 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3035 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3036 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3037 MRI.clearVirtRegs();
3038 }
3039
3040 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3041 // Now the distance can be defined.
3043 MCSymbolRefExpr::create(DestLabel, MCCtx),
3044 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3045 // Add offset assignments.
3046 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3047 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3048 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3049 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3050}
3051
3052unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3053 switch (Cond) {
3054 case SIInstrInfo::SCC_TRUE:
3055 return AMDGPU::S_CBRANCH_SCC1;
3056 case SIInstrInfo::SCC_FALSE:
3057 return AMDGPU::S_CBRANCH_SCC0;
3058 case SIInstrInfo::VCCNZ:
3059 return AMDGPU::S_CBRANCH_VCCNZ;
3060 case SIInstrInfo::VCCZ:
3061 return AMDGPU::S_CBRANCH_VCCZ;
3062 case SIInstrInfo::EXECNZ:
3063 return AMDGPU::S_CBRANCH_EXECNZ;
3064 case SIInstrInfo::EXECZ:
3065 return AMDGPU::S_CBRANCH_EXECZ;
3066 default:
3067 llvm_unreachable("invalid branch predicate");
3068 }
3069}
3070
3071SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3072 switch (Opcode) {
3073 case AMDGPU::S_CBRANCH_SCC0:
3074 return SCC_FALSE;
3075 case AMDGPU::S_CBRANCH_SCC1:
3076 return SCC_TRUE;
3077 case AMDGPU::S_CBRANCH_VCCNZ:
3078 return VCCNZ;
3079 case AMDGPU::S_CBRANCH_VCCZ:
3080 return VCCZ;
3081 case AMDGPU::S_CBRANCH_EXECNZ:
3082 return EXECNZ;
3083 case AMDGPU::S_CBRANCH_EXECZ:
3084 return EXECZ;
3085 default:
3086 return INVALID_BR;
3087 }
3088}
3089
3093 MachineBasicBlock *&FBB,
3095 bool AllowModify) const {
3096 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3097 // Unconditional Branch
3098 TBB = I->getOperand(0).getMBB();
3099 return false;
3100 }
3101
3102 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3103 if (Pred == INVALID_BR)
3104 return true;
3105
3106 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3107 Cond.push_back(MachineOperand::CreateImm(Pred));
3108 Cond.push_back(I->getOperand(1)); // Save the branch register.
3109
3110 ++I;
3111
3112 if (I == MBB.end()) {
3113 // Conditional branch followed by fall-through.
3114 TBB = CondBB;
3115 return false;
3116 }
3117
3118 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3119 TBB = CondBB;
3120 FBB = I->getOperand(0).getMBB();
3121 return false;
3122 }
3123
3124 return true;
3125}
3126
3128 MachineBasicBlock *&FBB,
3130 bool AllowModify) const {
3131 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3132 auto E = MBB.end();
3133 if (I == E)
3134 return false;
3135
3136 // Skip over the instructions that are artificially terminators for special
3137 // exec management.
3138 while (I != E && !I->isBranch() && !I->isReturn()) {
3139 switch (I->getOpcode()) {
3140 case AMDGPU::S_MOV_B64_term:
3141 case AMDGPU::S_XOR_B64_term:
3142 case AMDGPU::S_OR_B64_term:
3143 case AMDGPU::S_ANDN2_B64_term:
3144 case AMDGPU::S_AND_B64_term:
3145 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3146 case AMDGPU::S_MOV_B32_term:
3147 case AMDGPU::S_XOR_B32_term:
3148 case AMDGPU::S_OR_B32_term:
3149 case AMDGPU::S_ANDN2_B32_term:
3150 case AMDGPU::S_AND_B32_term:
3151 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3152 break;
3153 case AMDGPU::SI_IF:
3154 case AMDGPU::SI_ELSE:
3155 case AMDGPU::SI_KILL_I1_TERMINATOR:
3156 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3157 // FIXME: It's messy that these need to be considered here at all.
3158 return true;
3159 default:
3160 llvm_unreachable("unexpected non-branch terminator inst");
3161 }
3162
3163 ++I;
3164 }
3165
3166 if (I == E)
3167 return false;
3168
3169 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3170}
3171
3173 int *BytesRemoved) const {
3174 unsigned Count = 0;
3175 unsigned RemovedSize = 0;
3176 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3177 // Skip over artificial terminators when removing instructions.
3178 if (MI.isBranch() || MI.isReturn()) {
3179 RemovedSize += getInstSizeInBytes(MI);
3180 MI.eraseFromParent();
3181 ++Count;
3182 }
3183 }
3184
3185 if (BytesRemoved)
3186 *BytesRemoved = RemovedSize;
3187
3188 return Count;
3189}
3190
3191// Copy the flags onto the implicit condition register operand.
3193 const MachineOperand &OrigCond) {
3194 CondReg.setIsUndef(OrigCond.isUndef());
3195 CondReg.setIsKill(OrigCond.isKill());
3196}
3197
3200 MachineBasicBlock *FBB,
3202 const DebugLoc &DL,
3203 int *BytesAdded) const {
3204 if (!FBB && Cond.empty()) {
3205 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3206 .addMBB(TBB);
3207 if (BytesAdded)
3208 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3209 return 1;
3210 }
3211
3212 assert(TBB && Cond[0].isImm());
3213
3214 unsigned Opcode
3215 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3216
3217 if (!FBB) {
3218 MachineInstr *CondBr =
3219 BuildMI(&MBB, DL, get(Opcode))
3220 .addMBB(TBB);
3221
3222 // Copy the flags onto the implicit condition register operand.
3223 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3224 fixImplicitOperands(*CondBr);
3225
3226 if (BytesAdded)
3227 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3228 return 1;
3229 }
3230
3231 assert(TBB && FBB);
3232
3233 MachineInstr *CondBr =
3234 BuildMI(&MBB, DL, get(Opcode))
3235 .addMBB(TBB);
3236 fixImplicitOperands(*CondBr);
3237 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3238 .addMBB(FBB);
3239
3240 MachineOperand &CondReg = CondBr->getOperand(1);
3241 CondReg.setIsUndef(Cond[1].isUndef());
3242 CondReg.setIsKill(Cond[1].isKill());
3243
3244 if (BytesAdded)
3245 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3246
3247 return 2;
3248}
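// Example: insertBranch with Cond == {SCC_TRUE, $scc} and both TBB and FBB
// set emits
//   s_cbranch_scc1 %bb.TBB
//   s_branch       %bb.FBB
// and reports 8 bytes added (16 when the offset-3f workaround applies).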
3249
3252 if (Cond.size() != 2) {
3253 return true;
3254 }
3255
3256 if (Cond[0].isImm()) {
3257 Cond[0].setImm(-Cond[0].getImm());
3258 return false;
3259 }
3260
3261 return true;
3262}
3263
3266 Register DstReg, Register TrueReg,
3267 Register FalseReg, int &CondCycles,
3268 int &TrueCycles, int &FalseCycles) const {
3269 switch (Cond[0].getImm()) {
3270 case VCCNZ:
3271 case VCCZ: {
3272 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3273 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3274 if (MRI.getRegClass(FalseReg) != RC)
3275 return false;
3276
3277 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3278 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3279
3280 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3281 return RI.hasVGPRs(RC) && NumInsts <= 6;
3282 }
3283 case SCC_TRUE:
3284 case SCC_FALSE: {
3285 // FIXME: We could insert for VGPRs if we could replace the original compare
3286 // with a vector one.
3287 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3288 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3289 if (MRI.getRegClass(FalseReg) != RC)
3290 return false;
3291
3292 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3293
3294 // Register sizes that are a multiple of 8 bytes (64 bits) can use s_cselect_b64.
3295 if (NumInsts % 2 == 0)
3296 NumInsts /= 2;
3297
3298 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3299 return RI.isSGPRClass(RC);
3300 }
3301 default:
3302 return false;
3303 }
3304}
3305
3309 Register TrueReg, Register FalseReg) const {
3310 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3311 if (Pred == VCCZ || Pred == SCC_FALSE) {
3312 Pred = static_cast<BranchPredicate>(-Pred);
3313 std::swap(TrueReg, FalseReg);
3314 }
3315
3316 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3317 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3318 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3319
3320 if (DstSize == 32) {
3322 if (Pred == SCC_TRUE) {
3323 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3324 .addReg(TrueReg)
3325 .addReg(FalseReg);
3326 } else {
3327 // Instruction's operands are backwards from what is expected.
3328 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3329 .addReg(FalseReg)
3330 .addReg(TrueReg);
3331 }
3332
3333 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3334 return;
3335 }
3336
3337 if (DstSize == 64 && Pred == SCC_TRUE) {
3339 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3340 .addReg(TrueReg)
3341 .addReg(FalseReg);
3342
3343 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3344 return;
3345 }
3346
3347 static const int16_t Sub0_15[] = {
3348 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3349 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3350 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3351 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3352 };
3353
3354 static const int16_t Sub0_15_64[] = {
3355 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3356 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3357 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3358 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3359 };
3360
3361 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3362 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3363 const int16_t *SubIndices = Sub0_15;
3364 int NElts = DstSize / 32;
3365
3366 // 64-bit select is only available for SALU.
3367 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3368 if (Pred == SCC_TRUE) {
3369 if (NElts % 2) {
3370 SelOp = AMDGPU::S_CSELECT_B32;
3371 EltRC = &AMDGPU::SGPR_32RegClass;
3372 } else {
3373 SelOp = AMDGPU::S_CSELECT_B64;
3374 EltRC = &AMDGPU::SGPR_64RegClass;
3375 SubIndices = Sub0_15_64;
3376 NElts /= 2;
3377 }
3378 }
3379
3381 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3382
3383 I = MIB->getIterator();
3384
3386 for (int Idx = 0; Idx != NElts; ++Idx) {
3387 Register DstElt = MRI.createVirtualRegister(EltRC);
3388 Regs.push_back(DstElt);
3389
3390 unsigned SubIdx = SubIndices[Idx];
3391
3393 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3394 Select =
3395 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3396 .addReg(FalseReg, 0, SubIdx)
3397 .addReg(TrueReg, 0, SubIdx);
3398 } else {
3399 Select =
3400 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3401 .addReg(TrueReg, 0, SubIdx)
3402 .addReg(FalseReg, 0, SubIdx);
3403 }
3404
3405 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3407
3408 MIB.addReg(DstElt)
3409 .addImm(SubIdx);
3410 }
3411}
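// Sketch: a 64-bit divergent (VCC) select becomes two V_CNDMASK_B32_e32 on
// the 32-bit halves feeding a REG_SEQUENCE, while a 64-bit uniform (SCC)
// select is a single S_CSELECT_B64, as handled earlier in this function.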
3412
3414 switch (MI.getOpcode()) {
3415 case AMDGPU::V_MOV_B16_t16_e32:
3416 case AMDGPU::V_MOV_B16_t16_e64:
3417 case AMDGPU::V_MOV_B32_e32:
3418 case AMDGPU::V_MOV_B32_e64:
3419 case AMDGPU::V_MOV_B64_PSEUDO:
3420 case AMDGPU::V_MOV_B64_e32:
3421 case AMDGPU::V_MOV_B64_e64:
3422 case AMDGPU::S_MOV_B32:
3423 case AMDGPU::S_MOV_B64:
3424 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3425 case AMDGPU::COPY:
3426 case AMDGPU::WWM_COPY:
3427 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3428 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3429 case AMDGPU::V_ACCVGPR_MOV_B32:
3430 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3431 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3432 return true;
3433 default:
3434 return false;
3435 }
3436}
3437
3438static constexpr AMDGPU::OpName ModifierOpNames[] = {
3439 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3440 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3441 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3442
3444 unsigned Opc = MI.getOpcode();
3445 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3446 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3447 if (Idx >= 0)
3448 MI.removeOperand(Idx);
3449 }
3450}
3451
3452std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3453 unsigned SubRegIndex) {
3454 switch (SubRegIndex) {
3455 case AMDGPU::NoSubRegister:
3456 return Imm;
3457 case AMDGPU::sub0:
3458 return SignExtend64<32>(Imm);
3459 case AMDGPU::sub1:
3460 return SignExtend64<32>(Imm >> 32);
3461 case AMDGPU::lo16:
3462 return SignExtend64<16>(Imm);
3463 case AMDGPU::hi16:
3464 return SignExtend64<16>(Imm >> 16);
3465 case AMDGPU::sub1_lo16:
3466 return SignExtend64<16>(Imm >> 32);
3467 case AMDGPU::sub1_hi16:
3468 return SignExtend64<16>(Imm >> 48);
3469 default:
3470 return std::nullopt;
3471 }
3472
3473 llvm_unreachable("covered subregister switch");
3474}
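// Example: for Imm == 0x123456789abcdef0, sub1 yields 0x12345678, while hi16
// yields the sign-extended high half of the low dword, 0xffffffffffff9abc.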
3475
3476static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3477 switch (Opc) {
3478 case AMDGPU::V_MAC_F16_e32:
3479 case AMDGPU::V_MAC_F16_e64:
3480 case AMDGPU::V_MAD_F16_e64:
3481 return AMDGPU::V_MADAK_F16;
3482 case AMDGPU::V_MAC_F32_e32:
3483 case AMDGPU::V_MAC_F32_e64:
3484 case AMDGPU::V_MAD_F32_e64:
3485 return AMDGPU::V_MADAK_F32;
3486 case AMDGPU::V_FMAC_F32_e32:
3487 case AMDGPU::V_FMAC_F32_e64:
3488 case AMDGPU::V_FMA_F32_e64:
3489 return AMDGPU::V_FMAAK_F32;
3490 case AMDGPU::V_FMAC_F16_e32:
3491 case AMDGPU::V_FMAC_F16_e64:
3492 case AMDGPU::V_FMAC_F16_t16_e64:
3493 case AMDGPU::V_FMAC_F16_fake16_e64:
3494 case AMDGPU::V_FMA_F16_e64:
3495 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3496 ? AMDGPU::V_FMAAK_F16_t16
3497 : AMDGPU::V_FMAAK_F16_fake16
3498 : AMDGPU::V_FMAAK_F16;
3499 case AMDGPU::V_FMAC_F64_e32:
3500 case AMDGPU::V_FMAC_F64_e64:
3501 case AMDGPU::V_FMA_F64_e64:
3502 return AMDGPU::V_FMAAK_F64;
3503 default:
3504 llvm_unreachable("invalid instruction");
3505 }
3506}
3507
3508static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3509 switch (Opc) {
3510 case AMDGPU::V_MAC_F16_e32:
3511 case AMDGPU::V_MAC_F16_e64:
3512 case AMDGPU::V_MAD_F16_e64:
3513 return AMDGPU::V_MADMK_F16;
3514 case AMDGPU::V_MAC_F32_e32:
3515 case AMDGPU::V_MAC_F32_e64:
3516 case AMDGPU::V_MAD_F32_e64:
3517 return AMDGPU::V_MADMK_F32;
3518 case AMDGPU::V_FMAC_F32_e32:
3519 case AMDGPU::V_FMAC_F32_e64:
3520 case AMDGPU::V_FMA_F32_e64:
3521 return AMDGPU::V_FMAMK_F32;
3522 case AMDGPU::V_FMAC_F16_e32:
3523 case AMDGPU::V_FMAC_F16_e64:
3524 case AMDGPU::V_FMAC_F16_t16_e64:
3525 case AMDGPU::V_FMAC_F16_fake16_e64:
3526 case AMDGPU::V_FMA_F16_e64:
3527 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3528 ? AMDGPU::V_FMAMK_F16_t16
3529 : AMDGPU::V_FMAMK_F16_fake16
3530 : AMDGPU::V_FMAMK_F16;
3531 case AMDGPU::V_FMAC_F64_e32:
3532 case AMDGPU::V_FMAC_F64_e64:
3533 case AMDGPU::V_FMA_F64_e64:
3534 return AMDGPU::V_FMAMK_F64;
3535 default:
3536 llvm_unreachable("invalid instruction");
3537 }
3538}
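// Informal summary of the two helpers above (the assembly syntax shown is an
// illustrative assumption): the *AK forms fold the constant as the addend,
// roughly "v_fmaak_f32 v0, v1, v2, 0x42280000" (v0 = v1 * v2 + K), while the
// *MK forms fold the constant as the multiplicand, roughly
// "v_fmamk_f32 v0, v1, 0x42280000, v2" (v0 = v1 * K + v2).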
3539
3540 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3541 Register Reg, MachineRegisterInfo *MRI) const {
3542 int64_t Imm;
3543 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3544 return false;
3545
3546 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3547
3548 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3549
3550 unsigned Opc = UseMI.getOpcode();
3551 if (Opc == AMDGPU::COPY) {
3552 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3553
3554 Register DstReg = UseMI.getOperand(0).getReg();
3555 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3556
3557 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3558
3559 if (HasMultipleUses) {
3560 // TODO: This should fold in more cases with multiple uses, but we need to
3561 // consider more carefully what those uses are.
3562 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3563
3564 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3565 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3566 return false;
3567
3568 // Most of the time folding a 32-bit inline constant is free (though this
3569 // might not be true if we can't later fold it into a real user).
3570 //
3571 // FIXME: This isInlineConstant check is imprecise if
3572 // getConstValDefinedInReg handled the tricky non-mov cases.
3573 if (ImmDefSize == 32 &&
3575 isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
3576 return false;
3576 }
3577
3578 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3579 RI.getSubRegIdxSize(UseSubReg) == 16;
3580
3581 if (Is16Bit) {
3582 if (RI.hasVGPRs(DstRC))
3583 return false; // Do not clobber vgpr_hi16
3584
3585 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3586 return false;
3587 }
3588
3589 MachineFunction *MF = UseMI.getMF();
3590
3591 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3592 MCRegister MovDstPhysReg =
3593 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3594
3595 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3596
3597 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3598 for (unsigned MovOp :
3599 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3600 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3601 const MCInstrDesc &MovDesc = get(MovOp);
3602
3603 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);
3604 if (Is16Bit) {
3605 // We just need to find a correctly sized register class, so the
3606 // subregister index compatibility doesn't matter since we're statically
3607 // extracting the immediate value.
3608 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3609 if (!MovDstRC)
3610 continue;
3611
3612 if (MovDstPhysReg) {
3613 // FIXME: We probably should not do this. If there is a live value in
3614 // the high half of the register, it will be corrupted.
3615 MovDstPhysReg =
3616 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3617 if (!MovDstPhysReg)
3618 continue;
3619 }
3620 }
3621
3622 // Result class isn't the right size, try the next instruction.
3623 if (MovDstPhysReg) {
3624 if (!MovDstRC->contains(MovDstPhysReg))
3625 return false;
3626 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3627 // TODO: This will be overly conservative in the case of 16-bit virtual
3628 // SGPRs. We could hack up the virtual register uses to use a compatible
3629 // 32-bit class.
3630 continue;
3631 }
3632
3633 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3634
3635 // Ensure the interpreted immediate value is a valid operand in the new
3636 // mov.
3637 //
3638 // FIXME: isImmOperandLegal should have a form that doesn't require an
3639 // existing MachineInstr or MachineOperand.
3640 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3641 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3642 break;
3643
3644 NewOpc = MovOp;
3645 break;
3646 }
3647
3648 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3649 return false;
3650
3651 if (Is16Bit) {
3652 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3653 if (MovDstPhysReg)
3654 UseMI.getOperand(0).setReg(MovDstPhysReg);
3655 assert(UseMI.getOperand(1).getReg().isVirtual());
3656 }
3657
3658 const MCInstrDesc &NewMCID = get(NewOpc);
3659 UseMI.setDesc(NewMCID);
3660 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3661 UseMI.addImplicitDefUseOperands(*MF);
3662 return true;
3663 }
3664
3665 if (HasMultipleUses)
3666 return false;
3667
3668 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3669 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3670 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3671 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3672 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3673 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3674 Opc == AMDGPU::V_FMAC_F64_e64) {
3675 // Don't fold if we are using source or output modifiers. The new VOP2
3676 // instructions don't have them.
3677 if (hasAnyModifiersSet(UseMI))
3678 return false;
3679
3680 // If this is a free constant, there's no reason to do this.
3681 // TODO: We could fold this here instead of letting SIFoldOperands do it
3682 // later.
3683 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3684
3685 // Any src operand can be used for the legality check.
3686 if (isInlineConstant(UseMI, Src0Idx, Imm))
3687 return false;
3688
3689 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3690
3691 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3692 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3693
3694 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3695 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3696 (Src1->isReg() && Src1->getReg() == Reg)) {
3697 MachineOperand *RegSrc =
3698 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3699 if (!RegSrc->isReg())
3700 return false;
3701 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3702 ST.getConstantBusLimit(Opc) < 2)
3703 return false;
3704
3705 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3706 return false;
3707
3708 // If src2 is also a literal constant then we have to choose which one to
3709 // fold. In general it is better to choose madak so that the other literal
3710 // can be materialized in an sgpr instead of a vgpr:
3711 // s_mov_b32 s0, literal
3712 // v_madak_f32 v0, s0, v0, literal
3713 // Instead of:
3714 // v_mov_b32 v1, literal
3715 // v_madmk_f32 v0, v0, literal, v1
3716 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3717 if (Def && Def->isMoveImmediate() &&
3718 !isInlineConstant(Def->getOperand(1)))
3719 return false;
3720
3721 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3722 if (pseudoToMCOpcode(NewOpc) == -1)
3723 return false;
3724
3725 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3726 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3727 // restricting their register classes. For now just bail out.
3728 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3729 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3730 return false;
3731
3732 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3733 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3734
3735 // FIXME: This would be a lot easier if we could return a new instruction
3736 // instead of having to modify in place.
3737
3738 Register SrcReg = RegSrc->getReg();
3739 unsigned SrcSubReg = RegSrc->getSubReg();
3740 Src0->setReg(SrcReg);
3741 Src0->setSubReg(SrcSubReg);
3742 Src0->setIsKill(RegSrc->isKill());
3743
3744 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3745 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3746 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3747 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3748 UseMI.untieRegOperand(
3749 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3750
3751 Src1->ChangeToImmediate(*SubRegImm);
3752
3753 removeModOperands(UseMI);
3754 UseMI.setDesc(get(NewOpc));
3755
3756 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3757 if (DeleteDef)
3758 DefMI.eraseFromParent();
3759
3760 return true;
3761 }
3762
3763 // Added part is the constant: Use v_madak_{f16, f32}.
3764 if (Src2->isReg() && Src2->getReg() == Reg) {
3765 if (ST.getConstantBusLimit(Opc) < 2) {
3766 // Not allowed to use constant bus for another operand.
3767 // We can however allow an inline immediate as src0.
3768 bool Src0Inlined = false;
3769 if (Src0->isReg()) {
3770 // Try to inline the constant if possible.
3771 // If the def is a move-immediate and this is its only use,
3772 // we save a VGPR here.
3773 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3774 if (Def && Def->isMoveImmediate() &&
3775 isInlineConstant(Def->getOperand(1)) &&
3776 MRI->hasOneNonDBGUse(Src0->getReg())) {
3777 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3778 Src0Inlined = true;
3779 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3780 RI.isSGPRReg(*MRI, Src0->getReg())) {
3781 return false;
3782 }
3783 // VGPR is okay as Src0 - fallthrough
3784 }
3785
3786 if (Src1->isReg() && !Src0Inlined) {
3787 // We have one slot for inlinable constant so far - try to fill it
3788 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3789 if (Def && Def->isMoveImmediate() &&
3790 isInlineConstant(Def->getOperand(1)) &&
3791 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3792 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3793 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3794 return false;
3795 // VGPR is okay as Src1 - fallthrough
3796 }
3797 }
3798
3799 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3800 if (pseudoToMCOpcode(NewOpc) == -1)
3801 return false;
3802
3803 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3804 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3805 // restricting their register classes. For now just bail out.
3806 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3807 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3808 return false;
3809
3810 // FIXME: This would be a lot easier if we could return a new instruction
3811 // instead of having to modify in place.
3812
3813 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3814 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3815 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3816 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3817 UseMI.untieRegOperand(
3818 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3819
3820 const std::optional<int64_t> SubRegImm =
3821 extractSubregFromImm(Imm, Src2->getSubReg());
3822
3823 // ChangingToImmediate adds Src2 back to the instruction.
3824 Src2->ChangeToImmediate(*SubRegImm);
3825
3826 // These come before src2.
3827 removeModOperands(UseMI);
3828 UseMI.setDesc(get(NewOpc));
3829 // It might happen that UseMI was commuted and we now have an SGPR as
3830 // src1. If so, two inline constants plus an SGPR would be illegal, so
3831 // legalize the operands.
3832 legalizeOperands(UseMI);
3833
3834 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3835 if (DeleteDef)
3836 DefMI.eraseFromParent();
3837
3838 return true;
3839 }
3840 }
3841
3842 return false;
3843}
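// Illustrative example of the MAD/FMA folds above (register names are
// hypothetical): if %k is defined by "S_MOV_B32 0x40400000" (3.0f, which is
// not an inline constant) and its only use is the addend (src2) of a
// V_FMAC_F32, the FMAC is rewritten to V_FMAAK_F32 with 0x40400000 as its
// trailing literal, and the defining S_MOV_B32 is erased once it has no
// remaining non-debug uses.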
3844
3845static bool
3846 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3847 ArrayRef<const MachineOperand *> BaseOps2) {
3848 if (BaseOps1.size() != BaseOps2.size())
3849 return false;
3850 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3851 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3852 return false;
3853 }
3854 return true;
3855}
3856
3857static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3858 LocationSize WidthB, int OffsetB) {
3859 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3860 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3861 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3862 return LowWidth.hasValue() &&
3863 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3864}
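// For example, a width-4 access at offset 0 and a width-4 access at offset 4
// do not overlap (0 + 4 <= 4), while the same widths at offsets 0 and 2 do
// (0 + 4 > 2).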
3865
3866bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3867 const MachineInstr &MIb) const {
3868 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3869 int64_t Offset0, Offset1;
3870 LocationSize Dummy0 = LocationSize::precise(0);
3871 LocationSize Dummy1 = LocationSize::precise(0);
3872 bool Offset0IsScalable, Offset1IsScalable;
3873 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3874 Dummy0, &RI) ||
3875 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3876 Dummy1, &RI))
3877 return false;
3878
3879 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3880 return false;
3881
3882 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3883 // FIXME: Handle ds_read2 / ds_write2.
3884 return false;
3885 }
3886 LocationSize Width0 = MIa.memoperands().front()->getSize();
3887 LocationSize Width1 = MIb.memoperands().front()->getSize();
3888 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3889}
3890
3891 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3892 const MachineInstr &MIb) const {
3893 assert(MIa.mayLoadOrStore() &&
3894 "MIa must load from or modify a memory location");
3895 assert(MIb.mayLoadOrStore() &&
3896 "MIb must load from or modify a memory location");
3897
3898 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3899 return false;
3900
3901 // XXX - Can we relax this between address spaces?
3902 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3903 return false;
3904
3905 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3906 return false;
3907
3908 // TODO: Should we check the address space from the MachineMemOperand? That
3909 // would allow us to distinguish objects we know don't alias based on the
3910 // underlying address space, even if it was lowered to a different one,
3911 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3912 // buffer.
3913 if (isDS(MIa)) {
3914 if (isDS(MIb))
3915 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3916
3917 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3918 }
3919
3920 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3921 if (isMUBUF(MIb) || isMTBUF(MIb))
3922 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3923
3924 if (isFLAT(MIb))
3925 return isFLATScratch(MIb);
3926
3927 return !isSMRD(MIb);
3928 }
3929
3930 if (isSMRD(MIa)) {
3931 if (isSMRD(MIb))
3932 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3933
3934 if (isFLAT(MIb))
3935 return isFLATScratch(MIb);
3936
3937 return !isMUBUF(MIb) && !isMTBUF(MIb);
3938 }
3939
3940 if (isFLAT(MIa)) {
3941 if (isFLAT(MIb)) {
3942 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3943 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3944 return true;
3945
3946 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3947 }
3948
3949 return false;
3950 }
3951
3952 return false;
3953}
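// For example, per the DS case above, a DS (LDS) access and a GLOBAL access
// are reported as trivially disjoint, while a DS access paired with a
// generic FLAT access is not, since the flat address might point into LDS.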
3954
3955 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3956 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3957 if (Reg.isPhysical())
3958 return false;
3959 auto *Def = MRI.getUniqueVRegDef(Reg);
3960 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3961 Imm = Def->getOperand(1).getImm();
3962 if (DefMI)
3963 *DefMI = Def;
3964 return true;
3965 }
3966 return false;
3967}
3968
3969static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3970 MachineInstr **DefMI = nullptr) {
3971 if (!MO->isReg())
3972 return false;
3973 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3974 const MachineRegisterInfo &MRI = MF->getRegInfo();
3975 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3976}
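// Usage sketch for the helpers above (values are hypothetical): if a virtual
// register operand is defined by "S_MOV_B32 42", getFoldableImm returns true,
// sets Imm to 42 and, when requested, points DefMI at that S_MOV_B32;
// physical registers and defs that are not foldable move-immediates yield
// false.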
3977
3978 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3979 MachineInstr &NewMI) {
3980 if (LV) {
3981 unsigned NumOps = MI.getNumOperands();
3982 for (unsigned I = 1; I < NumOps; ++I) {
3983 MachineOperand &Op = MI.getOperand(I);
3984 if (Op.isReg() && Op.isKill())
3985 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3986 }
3987 }
3988}
3989
3990static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
3991 switch (Opc) {
3992 case AMDGPU::V_MAC_F16_e32:
3993 case AMDGPU::V_MAC_F16_e64:
3994 return AMDGPU::V_MAD_F16_e64;
3995 case AMDGPU::V_MAC_F32_e32:
3996 case AMDGPU::V_MAC_F32_e64:
3997 return AMDGPU::V_MAD_F32_e64;
3998 case AMDGPU::V_MAC_LEGACY_F32_e32:
3999 case AMDGPU::V_MAC_LEGACY_F32_e64:
4000 return AMDGPU::V_MAD_LEGACY_F32_e64;
4001 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4002 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4003 return AMDGPU::V_FMA_LEGACY_F32_e64;
4004 case AMDGPU::V_FMAC_F16_e32:
4005 case AMDGPU::V_FMAC_F16_e64:
4006 case AMDGPU::V_FMAC_F16_t16_e64:
4007 case AMDGPU::V_FMAC_F16_fake16_e64:
4008 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4009 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4010 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4011 : AMDGPU::V_FMA_F16_gfx9_e64;
4012 case AMDGPU::V_FMAC_F32_e32:
4013 case AMDGPU::V_FMAC_F32_e64:
4014 return AMDGPU::V_FMA_F32_e64;
4015 case AMDGPU::V_FMAC_F64_e32:
4016 case AMDGPU::V_FMAC_F64_e64:
4017 return AMDGPU::V_FMA_F64_e64;
4018 default:
4019 llvm_unreachable("invalid instruction");
4020 }
4021}
4022
4023 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4024 LiveVariables *LV,
4025 LiveIntervals *LIS) const {
4026 MachineBasicBlock &MBB = *MI.getParent();
4027 unsigned Opc = MI.getOpcode();
4028
4029 // Handle MFMA.
4030 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4031 if (NewMFMAOpc != -1) {
4032 MachineInstrBuilder MIB =
4033 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4034 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4035 MIB.add(MI.getOperand(I));
4036 updateLiveVariables(LV, MI, *MIB);
4037 if (LIS) {
4038 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4039 // SlotIndex of defs needs to be updated when converting to early-clobber
4040 MachineOperand &Def = MIB->getOperand(0);
4041 if (Def.isEarlyClobber() && Def.isReg() &&
4042 LIS->hasInterval(Def.getReg())) {
4043 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
4044 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
4045 auto &LI = LIS->getInterval(Def.getReg());
4046 auto UpdateDefIndex = [&](LiveRange &LR) {
4047 auto *S = LR.find(OldIndex);
4048 if (S != LR.end() && S->start == OldIndex) {
4049 assert(S->valno && S->valno->def == OldIndex);
4050 S->start = NewIndex;
4051 S->valno->def = NewIndex;
4052 }
4053 };
4054 UpdateDefIndex(LI);
4055 for (auto &SR : LI.subranges())
4056 UpdateDefIndex(SR);
4057 }
4058 }
4059 return MIB;
4060 }
4061
4062 if (SIInstrInfo::isWMMA(MI)) {
4063 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4064 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4065 .setMIFlags(MI.getFlags());
4066 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4067 MIB->addOperand(MI.getOperand(I));
4068
4069 updateLiveVariables(LV, MI, *MIB);
4070 if (LIS)
4071 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4072
4073 return MIB;
4074 }
4075
4076 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4077 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4078 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4079 "present pre-RA");
4080
4081 // Handle MAC/FMAC.
4082 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4083 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4084 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4085 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4086 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4087 bool Src0Literal = false;
4088
4089 switch (Opc) {
4090 default:
4091 return nullptr;
4092 case AMDGPU::V_MAC_F16_e64:
4093 case AMDGPU::V_FMAC_F16_e64:
4094 case AMDGPU::V_FMAC_F16_t16_e64:
4095 case AMDGPU::V_FMAC_F16_fake16_e64:
4096 case AMDGPU::V_MAC_F32_e64:
4097 case AMDGPU::V_MAC_LEGACY_F32_e64:
4098 case AMDGPU::V_FMAC_F32_e64:
4099 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4100 case AMDGPU::V_FMAC_F64_e64:
4101 break;
4102 case AMDGPU::V_MAC_F16_e32:
4103 case AMDGPU::V_FMAC_F16_e32:
4104 case AMDGPU::V_MAC_F32_e32:
4105 case AMDGPU::V_MAC_LEGACY_F32_e32:
4106 case AMDGPU::V_FMAC_F32_e32:
4107 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4108 case AMDGPU::V_FMAC_F64_e32: {
4109 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4110 AMDGPU::OpName::src0);
4111 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4112 if (!Src0->isReg() && !Src0->isImm())
4113 return nullptr;
4114
4115 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4116 Src0Literal = true;
4117
4118 break;
4119 }
4120 }
4121
4122 MachineInstrBuilder MIB;
4123 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4124 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4125 const MachineOperand *Src0Mods =
4126 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4127 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4128 const MachineOperand *Src1Mods =
4129 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4130 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4131 const MachineOperand *Src2Mods =
4132 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4133 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4134 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4135 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4136
4137 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4138 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4139 // If we have an SGPR input, we will violate the constant bus restriction.
4140 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4141 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4142 MachineInstr *DefMI;
4143 const auto killDef = [&]() -> void {
4144 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4145 // The only user is the instruction which will be killed.
4146 Register DefReg = DefMI->getOperand(0).getReg();
4147
4148 if (MRI.hasOneNonDBGUse(DefReg)) {
4149 // We cannot just remove the DefMI here; the calling pass would crash.
4150 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
4151 DefMI->getOperand(0).setIsDead(true);
4152 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
4153 DefMI->removeOperand(I);
4154 if (LV)
4155 LV->getVarInfo(DefReg).AliveBlocks.clear();
4156 }
4157
4158 if (LIS) {
4159 LiveInterval &DefLI = LIS->getInterval(DefReg);
4160
4161 // We cannot delete the original instruction here, so hack out the use
4162 // in the original instruction with a dummy register so we can use
4163 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4164 // not have the complexity of deleting a use to consider here.
4165 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4166 for (MachineOperand &MIOp : MI.uses()) {
4167 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4168 MIOp.setIsUndef(true);
4169 MIOp.setReg(DummyReg);
4170 }
4171 }
4172
4173 LIS->shrinkToUses(&DefLI);
4174 }
4175 };
4176
4177 int64_t Imm;
4178 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4179 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4180 if (pseudoToMCOpcode(NewOpc) != -1) {
4181 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4182 .add(*Dst)
4183 .add(*Src0)
4184 .add(*Src1)
4185 .addImm(Imm)
4186 .setMIFlags(MI.getFlags());
4187 updateLiveVariables(LV, MI, *MIB);
4188 if (LIS)
4189 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4190 killDef();
4191 return MIB;
4192 }
4193 }
4194 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4195 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4196 if (pseudoToMCOpcode(NewOpc) != -1) {
4197 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4198 .add(*Dst)
4199 .add(*Src0)
4200 .addImm(Imm)
4201 .add(*Src2)
4202 .setMIFlags(MI.getFlags());
4203 updateLiveVariables(LV, MI, *MIB);
4204
4205 if (LIS)
4206 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4207 killDef();
4208 return MIB;
4209 }
4210 }
4211 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4212 if (Src0Literal) {
4213 Imm = Src0->getImm();
4214 DefMI = nullptr;
4215 }
4216 if (pseudoToMCOpcode(NewOpc) != -1 &&
4217 isOperandLegal(
4218 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4219 Src1)) {
4220 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4221 .add(*Dst)
4222 .add(*Src1)
4223 .addImm(Imm)
4224 .add(*Src2)
4225 .setMIFlags(MI.getFlags());
4226 updateLiveVariables(LV, MI, *MIB);
4227
4228 if (LIS)
4229 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4230 if (DefMI)
4231 killDef();
4232 return MIB;
4233 }
4234 }
4235 }
4236
4237 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4238 // if VOP3 does not allow a literal operand.
4239 if (Src0Literal && !ST.hasVOP3Literal())
4240 return nullptr;
4241
4242 unsigned NewOpc = getNewFMAInst(ST, Opc);
4243
4244 if (pseudoToMCOpcode(NewOpc) == -1)
4245 return nullptr;
4246
4247 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4248 .add(*Dst)
4249 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4250 .add(*Src0)
4251 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4252 .add(*Src1)
4253 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4254 .add(*Src2)
4255 .addImm(Clamp ? Clamp->getImm() : 0)
4256 .addImm(Omod ? Omod->getImm() : 0)
4257 .setMIFlags(MI.getFlags());
4258 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4259 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4260 updateLiveVariables(LV, MI, *MIB);
4261 if (LIS)
4262 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4263 return MIB;
4264}
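// Illustrative shape of the generic conversion above (operands are
// hypothetical): a tied two-address "V_FMAC_F32_e32 %acc, %a, %b" becomes
// roughly "V_FMA_F32_e64 %dst, 0, %a, 0, %b, 0, %acc, 0, 0", i.e. explicit
// zero source modifiers, the old accumulator as src2, and zero clamp/omod,
// with no tied-operand constraint on the result.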
4265
4266// It's not generally safe to move VALU instructions across these since it will
4267// start using the register as a base index rather than directly.
4268// XXX - Why isn't hasSideEffects sufficient for these?
4269 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4270 switch (MI.getOpcode()) {
4271 case AMDGPU::S_SET_GPR_IDX_ON:
4272 case AMDGPU::S_SET_GPR_IDX_MODE:
4273 case AMDGPU::S_SET_GPR_IDX_OFF:
4274 return true;
4275 default:
4276 return false;
4277 }
4278}
4279
4280 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4281 const MachineBasicBlock *MBB,
4282 const MachineFunction &MF) const {
4283 // Skipping the check for SP writes in the base implementation. It was
4284 // apparently added due to compile-time concerns.
4285 //
4286 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4287 // but is probably avoidable.
4288
4289 // Copied from base implementation.
4290 // Terminators and labels can't be scheduled around.
4291 if (MI.isTerminator() || MI.isPosition())
4292 return true;
4293
4294 // INLINEASM_BR can jump to another block
4295 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4296 return true;
4297
4298 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4299 return true;
4300
4301 // Target-independent instructions do not have an implicit-use of EXEC, even
4302 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4303 // boundaries prevents incorrect movements of such instructions.
4304 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4305 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4306 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4307 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4308 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4309 changesVGPRIndexingMode(MI);
4310}
4311
4312 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4313 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4314 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4315 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4316}
4317
4318 bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4319 if (!isFLAT(MI) || isFLATGlobal(MI))
4320 return false;
4321
4322 // If scratch is not initialized, we can never access it.
4323 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4324 return false;
4325
4326 // SCRATCH instructions always access scratch.
4327 if (isFLATScratch(MI))
4328 return true;
4329
4330 // If there are no memory operands then conservatively assume the flat
4331 // operation may access scratch.
4332 if (MI.memoperands_empty())
4333 return true;
4334
4335 // See if any memory operand specifies an address space that involves scratch.
4336 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4337 unsigned AS = Memop->getAddrSpace();
4338 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4339 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4340 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4341 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4342 }
4343 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4344 });
4345}
4346
4347 bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4348 assert(isFLAT(MI));
4349
4350 // All flat instructions use the VMEM counter except prefetch.
4351 if (!usesVM_CNT(MI))
4352 return false;
4353
4354 // If there are no memory operands then conservatively assume the flat
4355 // operation may access VMEM.
4356 if (MI.memoperands_empty())
4357 return true;
4358
4359 // See if any memory operand specifies an address space that involves VMEM.
4360 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4361 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4362 // (GDS) address space is not supported by flat operations. Therefore, simply
4363 // return true unless only the LDS address space is found.
4364 for (const MachineMemOperand *Memop : MI.memoperands()) {
4365 unsigned AS = Memop->getAddrSpace();
4367 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4368 return true;
4369 }
4370
4371 return false;
4372}
4373
4374 bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4375 assert(isFLAT(MI));
4376
4377 // Flat instructions such as SCRATCH and GLOBAL do not use the LGKM counter.
4378 if (!usesLGKM_CNT(MI))
4379 return false;
4380
4381 // If in tgsplit mode then there can be no use of LDS.
4382 if (ST.isTgSplitEnabled())
4383 return false;
4384
4385 // If there are no memory operands then conservatively assume the flat
4386 // operation may access LDS.
4387 if (MI.memoperands_empty())
4388 return true;
4389
4390 // See if any memory operand specifies an address space that involves LDS.
4391 for (const MachineMemOperand *Memop : MI.memoperands()) {
4392 unsigned AS = Memop->getAddrSpace();
4393 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4394 return true;
4395 }
4396
4397 return false;
4398}
4399
4400 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4401 // Skip the full operand and register alias search modifiesRegister
4402 // does. There are only a handful of instructions that touch this, it's only
4403 // an implicit def, and it doesn't alias any other registers.
4404 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4405}
4406
4407 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4408 unsigned Opcode = MI.getOpcode();
4409
4410 if (MI.mayStore() && isSMRD(MI))
4411 return true; // scalar store or atomic
4412
4413 // This will terminate the function when other lanes may need to continue.
4414 if (MI.isReturn())
4415 return true;
4416
4417 // These instructions cause shader I/O that may cause hardware lockups
4418 // when executed with an empty EXEC mask.
4419 //
4420 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4421 // EXEC = 0, but checking for that case here seems not worth it
4422 // given the typical code patterns.
4423 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4424 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4425 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4426 return true;
4427
4428 if (MI.isCall() || MI.isInlineAsm())
4429 return true; // conservative assumption
4430
4431 // Assume that barrier interactions are only intended with active lanes.
4432 if (isBarrier(Opcode))
4433 return true;
4434
4435 // A mode change is a scalar operation that influences vector instructions.
4436 if (modifiesModeRegister(MI))
4437 return true;
4438
4439 // These are like SALU instructions in terms of effects, so it's questionable
4440 // whether we should return true for those.
4441 //
4442 // However, executing them with EXEC = 0 causes them to operate on undefined
4443 // data, which we avoid by returning true here.
4444 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4445 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4446 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4447 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4448 return true;
4449
4450 return false;
4451}
4452
4453 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4454 const MachineInstr &MI) const {
4455 if (MI.isMetaInstruction())
4456 return false;
4457
4458 // This won't read exec if this is an SGPR->SGPR copy.
4459 if (MI.isCopyLike()) {
4460 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4461 return true;
4462
4463 // Make sure this isn't copying exec as a normal operand
4464 return MI.readsRegister(AMDGPU::EXEC, &RI);
4465 }
4466
4467 // Make a conservative assumption about the callee.
4468 if (MI.isCall())
4469 return true;
4470
4471 // Be conservative with any unhandled generic opcodes.
4472 if (!isTargetSpecificOpcode(MI.getOpcode()))
4473 return true;
4474
4475 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4476}
4477
4478bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4479 switch (Imm.getBitWidth()) {
4480 case 1: // This likely will be a condition code mask.
4481 return true;
4482
4483 case 32:
4484 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4485 ST.hasInv2PiInlineImm());
4486 case 64:
4487 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4488 ST.hasInv2PiInlineImm());
4489 case 16:
4490 return ST.has16BitInsts() &&
4491 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4492 ST.hasInv2PiInlineImm());
4493 default:
4494 llvm_unreachable("invalid bitwidth");
4495 }
4496}
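// Background note (summary of the inline-constant encoding, not taken from
// this file): the 32-bit inline constants accepted by isInlinableLiteral32
// are the integers -16..64 plus a few FP bit patterns (+/-0.5, +/-1.0,
// +/-2.0, +/-4.0 and, when hasInv2PiInlineImm() is true, 1/(2*pi)).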
4497
4498 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4499 APInt IntImm = Imm.bitcastToAPInt();
4500 int64_t IntImmVal = IntImm.getSExtValue();
4501 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4502 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4503 default:
4504 llvm_unreachable("invalid fltSemantics");
4505 case APFloat::S_IEEEsingle:
4506 case APFloat::S_IEEEdouble:
4507 return isInlineConstant(IntImm);
4508 case APFloat::S_BFloat:
4509 return ST.has16BitInsts() &&
4510 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4511 case APFloat::S_IEEEhalf:
4512 return ST.has16BitInsts() &&
4513 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4514 }
4515}
4516
4517bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4518 // MachineOperand provides no way to tell the true operand size, since it only
4519 // records a 64-bit value. We need to know the size to determine if a 32-bit
4520 // floating point immediate bit pattern is legal for an integer immediate. It
4521 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4522 switch (OperandType) {
4532 int32_t Trunc = static_cast<int32_t>(Imm);
4533 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4534 }
4540 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4543 // We would expect inline immediates to not be concerned with an integer/fp
4544 // distinction. However, in the case of 16-bit integer operations, the
4545 // "floating point" values appear to not work. It seems read the low 16-bits
4546 // of 32-bit immediates, which happens to always work for the integer
4547 // values.
4548 //
4549 // See llvm bugzilla 46302.
4550 //
4551 // TODO: Theoretically we could use op-sel to use the high bits of the
4552 // 32-bit FP values.
4564 return false;
4567 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4568 // A few special case instructions have 16-bit operands on subtargets
4569 // where 16-bit instructions are not legal.
4570 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4571 // constants in these cases
4572 int16_t Trunc = static_cast<int16_t>(Imm);
4573 return ST.has16BitInsts() &&
4574 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4575 }
4576
4577 return false;
4578 }
4581 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4582 int16_t Trunc = static_cast<int16_t>(Imm);
4583 return ST.has16BitInsts() &&
4584 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4585 }
4586 return false;
4587 }
4591 return false;
4593 return isLegalAV64PseudoImm(Imm);
4596 // Always embedded in the instruction for free.
4597 return true;
4607 // Just ignore anything else.
4608 return true;
4609 default:
4610 llvm_unreachable("invalid operand type");
4611 }
4612}
4613
4614static bool compareMachineOp(const MachineOperand &Op0,
4615 const MachineOperand &Op1) {
4616 if (Op0.getType() != Op1.getType())
4617 return false;
4618
4619 switch (Op0.getType()) {
4620 case MachineOperand::MO_Register:
4621 return Op0.getReg() == Op1.getReg();
4622 case MachineOperand::MO_Immediate:
4623 return Op0.getImm() == Op1.getImm();
4624 default:
4625 llvm_unreachable("Didn't expect to be comparing these operand types");
4626 }
4627}
4628
4629 bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4630 const MCOperandInfo &OpInfo) const {
4631 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4632 return true;
4633
4634 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4635 return false;
4636
4637 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4638 return true;
4639
4640 return ST.hasVOP3Literal();
4641}
4642
4643bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4644 int64_t ImmVal) const {
4645 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4646 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4647 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4648 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4649 AMDGPU::OpName::src2))
4650 return false;
4651 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4652 }
4653
4654 return isLiteralOperandLegal(InstDesc, OpInfo);
4655}
4656
4657bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4658 const MachineOperand &MO) const {
4659 if (MO.isImm())
4660 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4661
4662 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4663 "unexpected imm-like operand kind");
4664 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4665 return isLiteralOperandLegal(InstDesc, OpInfo);
4666}
4667
4668 bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4669 // 2 32-bit inline constants packed into one.
4670 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4671 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4672}
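// For example (illustrative), 0x0000004000000001 is legal here because both
// 32-bit halves (64 and 1) are inline constants, while 0x0000008000000001 is
// not, because 128 is outside the inline-constant range.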
4673
4674bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4675 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4676 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4677 return false;
4678
4679 int Op32 = AMDGPU::getVOPe32(Opcode);
4680 if (Op32 == -1)
4681 return false;
4682
4683 return pseudoToMCOpcode(Op32) != -1;
4684}
4685
4686bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4687 // The src0_modifier operand is present on all instructions
4688 // that have modifiers.
4689
4690 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4691}
4692
4693 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4694 AMDGPU::OpName OpName) const {
4695 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4696 return Mods && Mods->getImm();
4697}
4698
4699 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4700 return any_of(ModifierOpNames,
4701 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4702}
4703
4704 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4705 const MachineRegisterInfo &MRI) const {
4706 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4707 // Can't shrink instruction with three operands.
4708 if (Src2) {
4709 switch (MI.getOpcode()) {
4710 default: return false;
4711
4712 case AMDGPU::V_ADDC_U32_e64:
4713 case AMDGPU::V_SUBB_U32_e64:
4714 case AMDGPU::V_SUBBREV_U32_e64: {
4715 const MachineOperand *Src1
4716 = getNamedOperand(MI, AMDGPU::OpName::src1);
4717 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4718 return false;
4719 // Additional verification is needed for sdst/src2.
4720 return true;
4721 }
4722 case AMDGPU::V_MAC_F16_e64:
4723 case AMDGPU::V_MAC_F32_e64:
4724 case AMDGPU::V_MAC_LEGACY_F32_e64:
4725 case AMDGPU::V_FMAC_F16_e64:
4726 case AMDGPU::V_FMAC_F16_t16_e64:
4727 case AMDGPU::V_FMAC_F16_fake16_e64:
4728 case AMDGPU::V_FMAC_F32_e64:
4729 case AMDGPU::V_FMAC_F64_e64:
4730 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4731 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4732 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4733 return false;
4734 break;
4735
4736 case AMDGPU::V_CNDMASK_B32_e64:
4737 break;
4738 }
4739 }
4740
4741 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4742 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4743 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4744 return false;
4745
4746 // We don't need to check src0; all input types are legal, so just make sure
4747 // src0 isn't using any modifiers.
4748 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4749 return false;
4750
4751 // Can it be shrunk to a valid 32 bit opcode?
4752 if (!hasVALU32BitEncoding(MI.getOpcode()))
4753 return false;
4754
4755 // Check output modifiers
4756 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4757 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4758 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4759 // TODO: Can we avoid checking bound_ctrl/fi here?
4760 // They are only used by permlane*_swap special case.
4761 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4762 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4763}
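// Illustrative example (hypothetical operands): a "V_ADD_F32_e64 %d, %a, %b"
// with no source or output modifiers and %b in a VGPR can shrink to
// V_ADD_F32_e32, while the same instruction with clamp or omod set cannot.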
4764
4765// Set VCC operand with all flags from \p Orig, except for setting it as
4766// implicit.
4767 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4768 const MachineOperand &Orig) {
4769
4770 for (MachineOperand &Use : MI.implicit_operands()) {
4771 if (Use.isUse() &&
4772 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4773 Use.setIsUndef(Orig.isUndef());
4774 Use.setIsKill(Orig.isKill());
4775 return;
4776 }
4777 }
4778}
4779
4780 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4781 unsigned Op32) const {
4782 MachineBasicBlock *MBB = MI.getParent();
4783
4784 const MCInstrDesc &Op32Desc = get(Op32);
4785 MachineInstrBuilder Inst32 =
4786 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4787 .setMIFlags(MI.getFlags());
4788
4789 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4790 // For VOPC instructions, this is replaced by an implicit def of vcc.
4791
4792 // We assume the defs of the shrunk opcode are in the same order, and the
4793 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4794 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4795 Inst32.add(MI.getOperand(I));
4796
4797 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4798
4799 int Idx = MI.getNumExplicitDefs();
4800 for (const MachineOperand &Use : MI.explicit_uses()) {
4801 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4803 continue;
4804
4805 if (&Use == Src2) {
4806 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4807 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4808 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4809 // of vcc was already added during the initial BuildMI, but we
4810 // 1) may need to change vcc to vcc_lo to preserve the original register
4811 // 2) have to preserve the original flags.
4812 copyFlagsToImplicitVCC(*Inst32, *Src2);
4813 continue;
4814 }
4815 }
4816
4817 Inst32.add(Use);
4818 }
4819
4820 // FIXME: Losing implicit operands
4821 fixImplicitOperands(*Inst32);
4822 return Inst32;
4823}
4824
4825 bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4826 // Null is free
4827 Register Reg = RegOp.getReg();
4828 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4829 return false;
4830
4831 // SGPRs use the constant bus
4832
4833 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4834 // physical register operands should also count, except for exec.
4835 if (RegOp.isImplicit())
4836 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4837
4838 // SGPRs use the constant bus
4839 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4840 AMDGPU::SReg_64RegClass.contains(Reg);
4841}
4842
4844 const MachineRegisterInfo &MRI) const {
4845 Register Reg = RegOp.getReg();
4846 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4847 : physRegUsesConstantBus(RegOp);
4848}
4849
4850 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4851 const MachineOperand &MO,
4852 const MCOperandInfo &OpInfo) const {
4853 // Literal constants use the constant bus.
4854 if (!MO.isReg())
4855 return !isInlineConstant(MO, OpInfo);
4856
4857 Register Reg = MO.getReg();
4858 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4859 : physRegUsesConstantBus(MO);
4860 }
4861
4862 static Register findImplicitSGPRRead(const MachineInstr &MI) {
4863 for (const MachineOperand &MO : MI.implicit_operands()) {
4864 // We only care about reads.
4865 if (MO.isDef())
4866 continue;
4867
4868 switch (MO.getReg()) {
4869 case AMDGPU::VCC:
4870 case AMDGPU::VCC_LO:
4871 case AMDGPU::VCC_HI:
4872 case AMDGPU::M0:
4873 case AMDGPU::FLAT_SCR:
4874 return MO.getReg();
4875
4876 default:
4877 break;
4878 }
4879 }
4880
4881 return Register();
4882}
4883
4884static bool shouldReadExec(const MachineInstr &MI) {
4885 if (SIInstrInfo::isVALU(MI)) {
4886 switch (MI.getOpcode()) {
4887 case AMDGPU::V_READLANE_B32:
4888 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4889 case AMDGPU::V_WRITELANE_B32:
4890 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4891 return false;
4892 }
4893
4894 return true;
4895 }
4896
4897 if (MI.isPreISelOpcode() ||
4898 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4899 SIInstrInfo::isSALU(MI) ||
4900 SIInstrInfo::isSMRD(MI))
4901 return false;
4902
4903 return true;
4904}
4905
4906static bool isRegOrFI(const MachineOperand &MO) {
4907 return MO.isReg() || MO.isFI();
4908}
4909
4910static bool isSubRegOf(const SIRegisterInfo &TRI,
4911 const MachineOperand &SuperVec,
4912 const MachineOperand &SubReg) {
4913 if (SubReg.getReg().isPhysical())
4914 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4915
4916 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4917 SubReg.getReg() == SuperVec.getReg();
4918}
4919
4920 // Check for an illegal copy from a vector register to an SGPR for the generic opcode COPY
4921bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4922 const MachineRegisterInfo &MRI,
4923 StringRef &ErrInfo) const {
4924 Register DstReg = MI.getOperand(0).getReg();
4925 Register SrcReg = MI.getOperand(1).getReg();
4926 // This is a check for copy from vector register to SGPR
4927 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4928 ErrInfo = "illegal copy from vector register to SGPR";
4929 return false;
4930 }
4931 return true;
4932}
4933
4934 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4935 StringRef &ErrInfo) const {
4936 uint16_t Opcode = MI.getOpcode();
4937 const MachineFunction *MF = MI.getParent()->getParent();
4938 const MachineRegisterInfo &MRI = MF->getRegInfo();
4939
4940 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4941 // Find a better property to recognize the point where instruction selection
4942 // is just done.
4943 // We can only enforce this check after SIFixSGPRCopies pass so that the
4944 // illegal copies are legalized and thereafter we don't expect a pass
4945 // inserting similar copies.
4946 if (!MRI.isSSA() && MI.isCopy())
4947 return verifyCopy(MI, MRI, ErrInfo);
4948
4949 if (SIInstrInfo::isGenericOpcode(Opcode))
4950 return true;
4951
4952 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4953 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4954 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4955 int Src3Idx = -1;
4956 if (Src0Idx == -1) {
4957 // VOPD V_DUAL_* instructions use different operand names.
4958 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4959 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4960 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4961 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4962 }
4963
4964 // Make sure the number of operands is correct.
4965 const MCInstrDesc &Desc = get(Opcode);
4966 if (!Desc.isVariadic() &&
4967 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4968 ErrInfo = "Instruction has wrong number of operands.";
4969 return false;
4970 }
4971
4972 if (MI.isInlineAsm()) {
4973 // Verify register classes for inlineasm constraints.
4974 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4975 I != E; ++I) {
4976 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4977 if (!RC)
4978 continue;
4979
4980 const MachineOperand &Op = MI.getOperand(I);
4981 if (!Op.isReg())
4982 continue;
4983
4984 Register Reg = Op.getReg();
4985 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4986 ErrInfo = "inlineasm operand has incorrect register class.";
4987 return false;
4988 }
4989 }
4990
4991 return true;
4992 }
4993
4994 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4995 ErrInfo = "missing memory operand from image instruction.";
4996 return false;
4997 }
4998
4999 // Make sure the register classes are correct.
5000 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5001 const MachineOperand &MO = MI.getOperand(i);
5002 if (MO.isFPImm()) {
5003 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5004 "all fp values to integers.";
5005 return false;
5006 }
5007
5008 int RegClass = Desc.operands()[i].RegClass;
5009
5010 const MCOperandInfo &OpInfo = Desc.operands()[i];
5011 switch (OpInfo.OperandType) {
5013 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5014 ErrInfo = "Illegal immediate value for operand.";
5015 return false;
5016 }
5017 break;
5030 break;
5032 break;
5033 break;
5047 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5048 ErrInfo = "Illegal immediate value for operand.";
5049 return false;
5050 }
5051 break;
5052 }
5054 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5055 ErrInfo = "Expected inline constant for operand.";
5056 return false;
5057 }
5058 break;
5062 break;
5067 // Check if this operand is an immediate.
5068 // FrameIndex operands will be replaced by immediates, so they are
5069 // allowed.
5070 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5071 ErrInfo = "Expected immediate, but got non-immediate";
5072 return false;
5073 }
5074 break;
5078 break;
5079 default:
5080 if (OpInfo.isGenericType())
5081 continue;
5082 break;
5083 }
5084
5085 if (!MO.isReg())
5086 continue;
5087 Register Reg = MO.getReg();
5088 if (!Reg)
5089 continue;
5090
5091 // FIXME: Ideally we would have separate instruction definitions with the
5092 // aligned register constraint.
5093 // FIXME: We do not verify inline asm operands, but custom inline asm
5094 // verification is broken anyway
5095 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5096 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5097 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5098 if (const TargetRegisterClass *SubRC =
5099 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5100 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5101 if (RC)
5102 RC = SubRC;
5103 }
5104 }
5105
5106 // Check that this is the aligned version of the class.
5107 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5108 ErrInfo = "Subtarget requires even aligned vector registers";
5109 return false;
5110 }
5111 }
5112
5113 if (RegClass != -1) {
5114 if (Reg.isVirtual())
5115 continue;
5116
5117 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5118 if (!RC->contains(Reg)) {
5119 ErrInfo = "Operand has incorrect register class.";
5120 return false;
5121 }
5122 }
5123 }
5124
5125 // Verify SDWA
5126 if (isSDWA(MI)) {
5127 if (!ST.hasSDWA()) {
5128 ErrInfo = "SDWA is not supported on this target";
5129 return false;
5130 }
5131
5132 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5133 AMDGPU::OpName::dst_sel}) {
5134 const MachineOperand *MO = getNamedOperand(MI, Op);
5135 if (!MO)
5136 continue;
5137 int64_t Imm = MO->getImm();
5138 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5139 ErrInfo = "Invalid SDWA selection";
5140 return false;
5141 }
5142 }
5143
5144 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5145
5146 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5147 if (OpIdx == -1)
5148 continue;
5149 const MachineOperand &MO = MI.getOperand(OpIdx);
5150
5151 if (!ST.hasSDWAScalar()) {
5152 // Only VGPRS on VI
5153 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5154 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5155 return false;
5156 }
5157 } else {
5158 // No immediates on GFX9
5159 if (!MO.isReg()) {
5160 ErrInfo =
5161 "Only reg allowed as operands in SDWA instructions on GFX9+";
5162 return false;
5163 }
5164 }
5165 }
5166
5167 if (!ST.hasSDWAOmod()) {
5168 // No omod allowed on VI
5169 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5170 if (OMod != nullptr &&
5171 (!OMod->isImm() || OMod->getImm() != 0)) {
5172 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5173 return false;
5174 }
5175 }
5176
5177 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5178 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5179 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5180 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5181 const MachineOperand *Src0ModsMO =
5182 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5183 unsigned Mods = Src0ModsMO->getImm();
5184 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5185 Mods & SISrcMods::SEXT) {
5186 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5187 return false;
5188 }
5189 }
5190
5191 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5192 if (isVOPC(BasicOpcode)) {
5193 if (!ST.hasSDWASdst() && DstIdx != -1) {
5194 // Only vcc allowed as dst on VI for VOPC
5195 const MachineOperand &Dst = MI.getOperand(DstIdx);
5196 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5197 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5198 return false;
5199 }
5200 } else if (!ST.hasSDWAOutModsVOPC()) {
5201 // No clamp allowed on GFX9 for VOPC
5202 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5203 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5204 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5205 return false;
5206 }
5207
5208 // No omod allowed on GFX9 for VOPC
5209 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5210 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5211 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5212 return false;
5213 }
5214 }
5215 }
5216
5217 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5218 if (DstUnused && DstUnused->isImm() &&
5219 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5220 const MachineOperand &Dst = MI.getOperand(DstIdx);
5221 if (!Dst.isReg() || !Dst.isTied()) {
5222 ErrInfo = "Dst register should have tied register";
5223 return false;
5224 }
5225
5226 const MachineOperand &TiedMO =
5227 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5228 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5229 ErrInfo =
5230 "Dst register should be tied to implicit use of preserved register";
5231 return false;
5232 }
5233 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5234 ErrInfo = "Dst register should use same physical register as preserved";
5235 return false;
5236 }
5237 }
5238 }
5239
5240 // Verify MIMG / VIMAGE / VSAMPLE
5241 if (isImage(Opcode) && !MI.mayStore()) {
5242 // Ensure that the return type used is large enough for all the options
5243 // being used TFE/LWE require an extra result register.
5244 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5245 if (DMask) {
5246 uint64_t DMaskImm = DMask->getImm();
5247 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5248 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5249 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5250 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5251
5252 // Adjust for packed 16 bit values
5253 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5254 RegCount = divideCeil(RegCount, 2);
5255
5256 // Adjust if using LWE or TFE
5257 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5258 RegCount += 1;
5259
5260 const uint32_t DstIdx =
5261 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5262 const MachineOperand &Dst = MI.getOperand(DstIdx);
5263 if (Dst.isReg()) {
5264 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5265 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5266 if (RegCount > DstSize) {
5267 ErrInfo = "Image instruction returns too many registers for dst "
5268 "register class";
5269 return false;
5270 }
5271 }
5272 }
5273 }
5274
5275 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5276 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5277 unsigned ConstantBusCount = 0;
5278 bool UsesLiteral = false;
5279 const MachineOperand *LiteralVal = nullptr;
5280
5281 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5282 if (ImmIdx != -1) {
5283 ++ConstantBusCount;
5284 UsesLiteral = true;
5285 LiteralVal = &MI.getOperand(ImmIdx);
5286 }
5287
5288 SmallVector<Register, 2> SGPRsUsed;
5289 Register SGPRUsed;
5290
5291 // Only look at the true operands. Only a real operand can use the constant
5292 // bus, and we don't want to check pseudo-operands like the source modifier
5293 // flags.
5294 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5295 if (OpIdx == -1)
5296 continue;
5297 const MachineOperand &MO = MI.getOperand(OpIdx);
5298 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5299 if (MO.isReg()) {
5300 SGPRUsed = MO.getReg();
5301 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5302 ++ConstantBusCount;
5303 SGPRsUsed.push_back(SGPRUsed);
5304 }
5305 } else if (!MO.isFI()) { // Treat FI like a register.
5306 if (!UsesLiteral) {
5307 ++ConstantBusCount;
5308 UsesLiteral = true;
5309 LiteralVal = &MO;
5310 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5311 assert(isVOP2(MI) || isVOP3(MI));
5312 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5313 return false;
5314 }
5315 }
5316 }
5317 }
5318
5319 SGPRUsed = findImplicitSGPRRead(MI);
5320 if (SGPRUsed) {
5321 // Implicit uses may safely overlap true operands
5322 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5323 return !RI.regsOverlap(SGPRUsed, SGPR);
5324 })) {
5325 ++ConstantBusCount;
5326 SGPRsUsed.push_back(SGPRUsed);
5327 }
5328 }
5329
5330 // v_writelane_b32 is an exception to the constant bus restriction: vsrc0
5331 // can be an SGPR, a constant or m0, and the lane select an SGPR, m0 or an inline constant.
5332 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5333 Opcode != AMDGPU::V_WRITELANE_B32) {
5334 ErrInfo = "VOP* instruction violates constant bus restriction";
5335 return false;
5336 }
5337
5338 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5339 ErrInfo = "VOP3 instruction uses literal";
5340 return false;
5341 }
5342 }
5343
5344 // Special case for writelane - it may exceed the constant bus limit, but it
5345 // still cannot use more than one SGPR register.
5346 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5347 unsigned SGPRCount = 0;
5348 Register SGPRUsed;
5349
5350 for (int OpIdx : {Src0Idx, Src1Idx}) {
5351 if (OpIdx == -1)
5352 break;
5353
5354 const MachineOperand &MO = MI.getOperand(OpIdx);
5355
5356 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5357 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5358 if (MO.getReg() != SGPRUsed)
5359 ++SGPRCount;
5360 SGPRUsed = MO.getReg();
5361 }
5362 }
5363 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5364 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5365 return false;
5366 }
5367 }
5368 }
5369
5370 // Verify misc. restrictions on specific instructions.
5371 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5372 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5373 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5374 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5375 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5376 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5377 if (!compareMachineOp(Src0, Src1) &&
5378 !compareMachineOp(Src0, Src2)) {
5379 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5380 return false;
5381 }
5382 }
5383 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5384 SISrcMods::ABS) ||
5385 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5386 SISrcMods::ABS) ||
5387 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5388 SISrcMods::ABS)) {
5389 ErrInfo = "ABS not allowed in VOP3B instructions";
5390 return false;
5391 }
5392 }
5393
5394 if (isSOP2(MI) || isSOPC(MI)) {
5395 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5396 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5397
5398 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5399 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5400 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5401 !Src0.isIdenticalTo(Src1)) {
5402 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5403 return false;
5404 }
5405 }
5406
5407 if (isSOPK(MI)) {
5408 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5409 if (Desc.isBranch()) {
5410 if (!Op->isMBB()) {
5411 ErrInfo = "invalid branch target for SOPK instruction";
5412 return false;
5413 }
5414 } else {
5415 uint64_t Imm = Op->getImm();
5416 if (sopkIsZext(Opcode)) {
5417 if (!isUInt<16>(Imm)) {
5418 ErrInfo = "invalid immediate for SOPK instruction";
5419 return false;
5420 }
5421 } else {
5422 if (!isInt<16>(Imm)) {
5423 ErrInfo = "invalid immediate for SOPK instruction";
5424 return false;
5425 }
5426 }
5427 }
5428 }
5429
5430 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5431 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5432 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5433 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5434 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5435 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5436
5437 const unsigned StaticNumOps =
5438 Desc.getNumOperands() + Desc.implicit_uses().size();
5439 const unsigned NumImplicitOps = IsDst ? 2 : 1;
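 // V_MOVRELD writes the indexed vector as well as reading it, so it carries
 // one more implicit operand (the tied def of the vector) than V_MOVRELS.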
5440
5441 // Allow additional implicit operands. This allows a fixup done by the post
5442 // RA scheduler where the main implicit operand is killed and implicit-defs
5443 // are added for sub-registers that remain live after this instruction.
5444 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5445 ErrInfo = "missing implicit register operands";
5446 return false;
5447 }
5448
5449 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5450 if (IsDst) {
5451 if (!Dst->isUse()) {
5452 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5453 return false;
5454 }
5455
5456 unsigned UseOpIdx;
5457 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5458 UseOpIdx != StaticNumOps + 1) {
5459 ErrInfo = "movrel implicit operands should be tied";
5460 return false;
5461 }
5462 }
5463
5464 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5465 const MachineOperand &ImpUse
5466 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5467 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5468 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5469 ErrInfo = "src0 should be subreg of implicit vector use";
5470 return false;
5471 }
5472 }
5473
5474 // Make sure we aren't losing exec uses in the td files. This mostly requires
5475 // being careful when using "let Uses" to add other use registers.
5476 if (shouldReadExec(MI)) {
5477 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5478 ErrInfo = "VALU instruction does not implicitly read exec mask";
5479 return false;
5480 }
5481 }
5482
5483 if (isSMRD(MI)) {
5484 if (MI.mayStore() &&
5485 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5486 // The register offset form of scalar stores may only use m0 as the
5487 // soffset register.
5488 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5489 if (Soff && Soff->getReg() != AMDGPU::M0) {
5490 ErrInfo = "scalar stores must use m0 as offset register";
5491 return false;
5492 }
5493 }
5494 }
5495
5496 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5497 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5498 if (Offset->getImm() != 0) {
5499 ErrInfo = "subtarget does not support offsets in flat instructions";
5500 return false;
5501 }
5502 }
5503
5504 if (isDS(MI) && !ST.hasGDS()) {
5505 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5506 if (GDSOp && GDSOp->getImm() != 0) {
5507 ErrInfo = "GDS is not supported on this subtarget";
5508 return false;
5509 }
5510 }
5511
5512 if (isImage(MI)) {
5513 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5514 if (DimOp) {
5515 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5516 AMDGPU::OpName::vaddr0);
5517 AMDGPU::OpName RSrcOpName =
5518 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5519 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5520 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5521 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5522 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5523 const AMDGPU::MIMGDimInfo *Dim =
5524 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5525
5526 if (!Dim) {
5527 ErrInfo = "dim is out of range";
5528 return false;
5529 }
5530
5531 bool IsA16 = false;
5532 if (ST.hasR128A16()) {
5533 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5534 IsA16 = R128A16->getImm() != 0;
5535 } else if (ST.hasA16()) {
5536 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5537 IsA16 = A16->getImm() != 0;
5538 }
5539
5540 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
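 // In the NSA (non-sequential address) encoding each address component is a
 // separate vaddr operand, so more than one vaddr operand before the resource
 // descriptor implies NSA.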
5541
5542 unsigned AddrWords =
5543 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5544
5545 unsigned VAddrWords;
5546 if (IsNSA) {
5547 VAddrWords = RsrcIdx - VAddr0Idx;
5548 if (ST.hasPartialNSAEncoding() &&
5549 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5550 unsigned LastVAddrIdx = RsrcIdx - 1;
5551 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5552 }
5553 } else {
5554 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
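 // Non-NSA forms use a single contiguous vaddr tuple; address sizes above
 // 12 dwords are padded out to a full 16-dword register tuple.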
5555 if (AddrWords > 12)
5556 AddrWords = 16;
5557 }
5558
5559 if (VAddrWords != AddrWords) {
5560 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5561 << " but got " << VAddrWords << "\n");
5562 ErrInfo = "bad vaddr size";
5563 return false;
5564 }
5565 }
5566 }
5567
5568 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5569 if (DppCt) {
5570 using namespace AMDGPU::DPP;
5571
5572 unsigned DC = DppCt->getImm();
5573 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5574 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5575 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5576 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5577 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5578 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5579 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5580 ErrInfo = "Invalid dpp_ctrl value";
5581 return false;
5582 }
5583 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5584 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5585 ErrInfo = "Invalid dpp_ctrl value: "
5586 "wavefront shifts are not supported on GFX10+";
5587 return false;
5588 }
5589 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5590 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5591 ErrInfo = "Invalid dpp_ctrl value: "
5592 "broadcasts are not supported on GFX10+";
5593 return false;
5594 }
5595 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5596 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5597 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5598 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5599 !ST.hasGFX90AInsts()) {
5600 ErrInfo = "Invalid dpp_ctrl value: "
5601 "row_newbroadcast/row_share is not supported before "
5602 "GFX90A/GFX10";
5603 return false;
5604 }
5605 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5606 ErrInfo = "Invalid dpp_ctrl value: "
5607 "row_share and row_xmask are not supported before GFX10";
5608 return false;
5609 }
5610 }
5611
5612 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5615 ErrInfo = "Invalid dpp_ctrl value: "
5616 "DP ALU dpp only support row_newbcast";
5617 return false;
5618 }
5619 }
5620
5621 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5622 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5623 AMDGPU::OpName DataName =
5624 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5625 const MachineOperand *Data = getNamedOperand(MI, DataName);
5626 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5627 if (Data && !Data->isReg())
5628 Data = nullptr;
5629
5630 if (ST.hasGFX90AInsts()) {
5631 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5632 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5633 ErrInfo = "Invalid register class: "
5634 "vdata and vdst should be both VGPR or AGPR";
5635 return false;
5636 }
5637 if (Data && Data2 &&
5638 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5639 ErrInfo = "Invalid register class: "
5640 "both data operands should be VGPR or AGPR";
5641 return false;
5642 }
5643 } else {
5644 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5645 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5646 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5647 ErrInfo = "Invalid register class: "
5648 "agpr loads and stores not supported on this GPU";
5649 return false;
5650 }
5651 }
5652 }
5653
5654 if (ST.needsAlignedVGPRs()) {
5655 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5656 const MachineOperand *Op = getNamedOperand(MI, OpName);
5657 if (!Op)
5658 return true;
5659 Register Reg = Op->getReg();
5660 if (Reg.isPhysical())
5661 return !(RI.getHWRegIndex(Reg) & 1);
5662 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5663 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5664 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5665 };
5666
5667 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5668 Opcode == AMDGPU::DS_GWS_BARRIER) {
5669
5670 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5671 ErrInfo = "Subtarget requires even aligned vector registers "
5672 "for DS_GWS instructions";
5673 return false;
5674 }
5675 }
5676
5677 if (isMIMG(MI)) {
5678 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5679 ErrInfo = "Subtarget requires even aligned vector registers "
5680 "for vaddr operand of image instructions";
5681 return false;
5682 }
5683 }
5684 }
5685
5686 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5687 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5688 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5689 ErrInfo = "Invalid register class: "
5690 "v_accvgpr_write with an SGPR is not supported on this GPU";
5691 return false;
5692 }
5693 }
5694
5695 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5696 const MachineOperand &SrcOp = MI.getOperand(1);
5697 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5698 ErrInfo = "pseudo expects only physical SGPRs";
5699 return false;
5700 }
5701 }
5702
5703 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5704 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5705 if (!ST.hasScaleOffset()) {
5706 ErrInfo = "Subtarget does not support offset scaling";
5707 return false;
5708 }
5709 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5710 ErrInfo = "Instruction does not support offset scaling";
5711 return false;
5712 }
5713 }
5714 }
5715
5716 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5717 // information.
5718 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5719 for (unsigned I = 0; I < 3; ++I) {
5720 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5721 return false;
5722 }
5723 }
5724
5725 return true;
5726}
5727
5728// It is more readable to list mapped opcodes on the same line.
5729// clang-format off
5730
5731unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5732 switch (MI.getOpcode()) {
5733 default: return AMDGPU::INSTRUCTION_LIST_END;
5734 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5735 case AMDGPU::COPY: return AMDGPU::COPY;
5736 case AMDGPU::PHI: return AMDGPU::PHI;
5737 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5738 case AMDGPU::WQM: return AMDGPU::WQM;
5739 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5740 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5741 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5742 case AMDGPU::S_MOV_B32: {
5743 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5744 return MI.getOperand(1).isReg() ||
5745 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5746 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5747 }
5748 case AMDGPU::S_ADD_I32:
5749 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5750 case AMDGPU::S_ADDC_U32:
5751 return AMDGPU::V_ADDC_U32_e32;
5752 case AMDGPU::S_SUB_I32:
5753 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5754 // FIXME: These are not consistently handled, and selected when the carry is
5755 // used.
5756 case AMDGPU::S_ADD_U32:
5757 return AMDGPU::V_ADD_CO_U32_e32;
5758 case AMDGPU::S_SUB_U32:
5759 return AMDGPU::V_SUB_CO_U32_e32;
5760 case AMDGPU::S_ADD_U64_PSEUDO:
5761 return AMDGPU::V_ADD_U64_PSEUDO;
5762 case AMDGPU::S_SUB_U64_PSEUDO:
5763 return AMDGPU::V_SUB_U64_PSEUDO;
5764 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5765 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5766 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5767 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5768 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5769 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5770 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5771 case AMDGPU::S_XNOR_B32:
5772 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5773 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5774 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5775 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5776 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5777 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5778 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5779 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5780 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5781 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5782 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5783 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5784 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5785 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5786 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5787 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5788 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5789 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5790 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5791 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5792 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5793 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5794 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5795 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5796 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5797 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5798 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5799 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5800 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5801 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5802 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5803 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5804 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5805 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5806 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5807 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5808 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5809 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5810 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5811 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5812 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5813 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5814 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5815 case AMDGPU::S_CVT_F32_F16:
5816 case AMDGPU::S_CVT_HI_F32_F16:
5817 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5818 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5819 case AMDGPU::S_CVT_F16_F32:
5820 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5821 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5822 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5823 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5824 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5825 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5826 case AMDGPU::S_CEIL_F16:
5827 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5828 : AMDGPU::V_CEIL_F16_fake16_e64;
5829 case AMDGPU::S_FLOOR_F16:
5830 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5831 : AMDGPU::V_FLOOR_F16_fake16_e64;
5832 case AMDGPU::S_TRUNC_F16:
5833 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5834 : AMDGPU::V_TRUNC_F16_fake16_e64;
5835 case AMDGPU::S_RNDNE_F16:
5836 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5837 : AMDGPU::V_RNDNE_F16_fake16_e64;
5838 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5839 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5840 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5841 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5842 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5843 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5844 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5845 case AMDGPU::S_ADD_F16:
5846 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5847 : AMDGPU::V_ADD_F16_fake16_e64;
5848 case AMDGPU::S_SUB_F16:
5849 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5850 : AMDGPU::V_SUB_F16_fake16_e64;
5851 case AMDGPU::S_MIN_F16:
5852 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5853 : AMDGPU::V_MIN_F16_fake16_e64;
5854 case AMDGPU::S_MAX_F16:
5855 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5856 : AMDGPU::V_MAX_F16_fake16_e64;
5857 case AMDGPU::S_MINIMUM_F16:
5858 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5859 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5860 case AMDGPU::S_MAXIMUM_F16:
5861 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5862 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5863 case AMDGPU::S_MUL_F16:
5864 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5865 : AMDGPU::V_MUL_F16_fake16_e64;
5866 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5867 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5868 case AMDGPU::S_FMAC_F16:
5869 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5870 : AMDGPU::V_FMAC_F16_fake16_e64;
5871 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5872 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5873 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5874 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5875 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5876 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5877 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5878 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5879 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5880 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5881 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5882 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5883 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5884 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5885 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5886 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5887 case AMDGPU::S_CMP_LT_F16:
5888 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5889 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5890 case AMDGPU::S_CMP_EQ_F16:
5891 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5892 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5893 case AMDGPU::S_CMP_LE_F16:
5894 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5895 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5896 case AMDGPU::S_CMP_GT_F16:
5897 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5898 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5899 case AMDGPU::S_CMP_LG_F16:
5900 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5901 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5902 case AMDGPU::S_CMP_GE_F16:
5903 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5904 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5905 case AMDGPU::S_CMP_O_F16:
5906 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5907 : AMDGPU::V_CMP_O_F16_fake16_e64;
5908 case AMDGPU::S_CMP_U_F16:
5909 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5910 : AMDGPU::V_CMP_U_F16_fake16_e64;
5911 case AMDGPU::S_CMP_NGE_F16:
5912 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5913 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5914 case AMDGPU::S_CMP_NLG_F16:
5915 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5916 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5917 case AMDGPU::S_CMP_NGT_F16:
5918 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5919 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5920 case AMDGPU::S_CMP_NLE_F16:
5921 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5922 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5923 case AMDGPU::S_CMP_NEQ_F16:
5924 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5925 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5926 case AMDGPU::S_CMP_NLT_F16:
5927 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5928 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5929 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5930 case AMDGPU::V_S_EXP_F16_e64:
5931 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5932 : AMDGPU::V_EXP_F16_fake16_e64;
5933 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5934 case AMDGPU::V_S_LOG_F16_e64:
5935 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5936 : AMDGPU::V_LOG_F16_fake16_e64;
5937 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5938 case AMDGPU::V_S_RCP_F16_e64:
5939 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5940 : AMDGPU::V_RCP_F16_fake16_e64;
5941 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5942 case AMDGPU::V_S_RSQ_F16_e64:
5943 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5944 : AMDGPU::V_RSQ_F16_fake16_e64;
5945 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5946 case AMDGPU::V_S_SQRT_F16_e64:
5947 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5948 : AMDGPU::V_SQRT_F16_fake16_e64;
5949 }
5951 "Unexpected scalar opcode without corresponding vector one!");
5952}
5953
5954// clang-format on
5955
5956void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5957 MachineBasicBlock &MBB,
5958 MachineBasicBlock::iterator MBBI,
5959 const DebugLoc &DL, Register Reg,
5960 bool IsSCCLive,
5961 SlotIndexes *Indexes) const {
5962 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5963 const SIInstrInfo *TII = ST.getInstrInfo();
5965 if (IsSCCLive) {
5966 // Insert two move instructions: one to save the original value of EXEC and
5967 // the other to turn on all bits in EXEC. This is required because the single
5968 // instruction S_OR_SAVEEXEC cannot be used here, as it clobbers SCC.
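 // For example, on wave64 this expands to:
 //   s_mov_b64 <Reg>, exec
 //   s_mov_b64 exec, -1
 // instead of "s_or_saveexec_b64 <Reg>, -1", which would overwrite SCC.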
5969 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
5970 .addReg(LMC.ExecReg);
5971 auto FlipExecMI =
5972 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
5973 if (Indexes) {
5974 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5975 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5976 }
5977 } else {
5978 auto SaveExec =
5979 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
5980 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5981 if (Indexes)
5982 Indexes->insertMachineInstrInMaps(*SaveExec);
5983 }
5984}
5985
5986void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5987 MachineBasicBlock::iterator MBBI,
5988 const DebugLoc &DL, Register Reg,
5989 SlotIndexes *Indexes) const {
5991 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
5992 .addReg(Reg, RegState::Kill);
5993 if (Indexes)
5994 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5995}
5996
6000 "Not a whole wave func");
6001 MachineBasicBlock &MBB = *MF.begin();
6002 for (MachineInstr &MI : MBB)
6003 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6004 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6005 return &MI;
6006
6007 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6008}
6009
6010static const TargetRegisterClass *
6011adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
6012 const MCInstrDesc &TID, unsigned RCID) {
6013 if (!ST.hasGFX90AInsts() && (TID.mayLoad() || TID.mayStore())) {
6014 switch (RCID) {
6015 case AMDGPU::AV_32RegClassID:
6016 RCID = AMDGPU::VGPR_32RegClassID;
6017 break;
6018 case AMDGPU::AV_64RegClassID:
6019 RCID = AMDGPU::VReg_64RegClassID;
6020 break;
6021 case AMDGPU::AV_96RegClassID:
6022 RCID = AMDGPU::VReg_96RegClassID;
6023 break;
6024 case AMDGPU::AV_128RegClassID:
6025 RCID = AMDGPU::VReg_128RegClassID;
6026 break;
6027 case AMDGPU::AV_160RegClassID:
6028 RCID = AMDGPU::VReg_160RegClassID;
6029 break;
6030 case AMDGPU::AV_512RegClassID:
6031 RCID = AMDGPU::VReg_512RegClassID;
6032 break;
6033 default:
6034 break;
6035 }
6036 }
6037
6038 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
6039}
6040
6041const TargetRegisterClass *
6042SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
6043 const TargetRegisterInfo *TRI) const {
6044 if (OpNum >= TID.getNumOperands())
6045 return nullptr;
6046 auto RegClass = TID.operands()[OpNum].RegClass;
6047 // Special pseudos have no alignment requirement.
6048 if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO || isSpill(TID))
6049 return RI.getRegClass(RegClass);
6050
6051 return adjustAllocatableRegClass(ST, RI, TID, RegClass);
6052}
6053
6054const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6055 unsigned OpNo) const {
6056 const MCInstrDesc &Desc = get(MI.getOpcode());
6057 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6058 Desc.operands()[OpNo].RegClass == -1) {
6059 Register Reg = MI.getOperand(OpNo).getReg();
6060
6061 if (Reg.isVirtual()) {
6062 const MachineRegisterInfo &MRI =
6063 MI.getParent()->getParent()->getRegInfo();
6064 return MRI.getRegClass(Reg);
6065 }
6066 return RI.getPhysRegBaseClass(Reg);
6067 }
6068
6069 unsigned RCID = Desc.operands()[OpNo].RegClass;
6070 return adjustAllocatableRegClass(ST, RI, Desc, RCID);
6071}
6072
6074void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6075 MachineBasicBlock *MBB = MI.getParent();
6076 MachineOperand &MO = MI.getOperand(OpIdx);
6077 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6078 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
6079 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6080 unsigned Size = RI.getRegSizeInBits(*RC);
6081 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6082 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6083 : AMDGPU::V_MOV_B32_e32;
6084 if (MO.isReg())
6085 Opcode = AMDGPU::COPY;
6086 else if (RI.isSGPRClass(RC))
6087 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6088
6089 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6090 Register Reg = MRI.createVirtualRegister(VRC);
6091 DebugLoc DL = MBB->findDebugLoc(I);
6092 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6093 MO.ChangeToRegister(Reg, false);
6094}
6095
6096Register SIInstrInfo::buildExtractSubReg(
6097 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6098 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6099 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6100 if (!SuperReg.getReg().isVirtual())
6101 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6102
6103 MachineBasicBlock *MBB = MI->getParent();
6104 const DebugLoc &DL = MI->getDebugLoc();
6105 Register SubReg = MRI.createVirtualRegister(SubRC);
6106
6107 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6108 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6109 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6110 return SubReg;
6111}
6112
6113MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6114 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6115 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6116 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
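 // For an immediate superoperand the requested half is returned directly,
 // e.g. splitting 0x1111222233334444 yields 0x33334444 for sub0 and
 // 0x11112222 for sub1.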
6117 if (Op.isImm()) {
6118 if (SubIdx == AMDGPU::sub0)
6119 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6120 if (SubIdx == AMDGPU::sub1)
6121 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6122
6123 llvm_unreachable("Unhandled register index for immediate");
6124 }
6125
6126 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6127 SubIdx, SubRC);
6128 return MachineOperand::CreateReg(SubReg, false);
6129}
6130
6131// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6132void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6133 assert(Inst.getNumExplicitOperands() == 3);
6134 MachineOperand Op1 = Inst.getOperand(1);
6135 Inst.removeOperand(1);
6136 Inst.addOperand(Op1);
6137}
6138
6139bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6140 const MCOperandInfo &OpInfo,
6141 const MachineOperand &MO) const {
6142 if (!MO.isReg())
6143 return false;
6144
6145 Register Reg = MO.getReg();
6146
6147 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
6148 if (Reg.isPhysical())
6149 return DRC->contains(Reg);
6150
6151 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6152
6153 if (MO.getSubReg()) {
6154 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
6155 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6156 if (!SuperRC)
6157 return false;
6158 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6159 }
6160
6161 return RI.getCommonSubClass(DRC, RC) != nullptr;
6162}
6163
6164bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6165 const MachineOperand &MO) const {
6166 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6167 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6168 unsigned Opc = MI.getOpcode();
6169
6170 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6171 // information.
6172 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6173 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6174 constexpr const AMDGPU::OpName OpNames[] = {
6175 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6176
6177 for (auto [I, OpName] : enumerate(OpNames)) {
6178 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6179 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6180 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6181 return false;
6182 }
6183 }
6184
6185 if (!isLegalRegOperand(MRI, OpInfo, MO))
6186 return false;
6187
6188 // Check accumulator GPR (AGPR) operands.
6189 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6190 if (IsAGPR && !ST.hasMAIInsts())
6191 return false;
6192 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6193 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6194 return false;
6195 // Atomics should have both vdst and vdata be either VGPR or AGPR.
6196 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6197 const int DataIdx = AMDGPU::getNamedOperandIdx(
6198 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6199 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6200 MI.getOperand(DataIdx).isReg() &&
6201 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6202 return false;
6203 if ((int)OpIdx == DataIdx) {
6204 if (VDstIdx != -1 &&
6205 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6206 return false;
6207 // DS instructions with two data operands must also keep both in the same (VGPR or AGPR) class.
6208 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6209 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6210 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6211 return false;
6212 }
6213
6214 // Check V_ACCVGPR_WRITE_B32_e64
6215 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6216 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6217 RI.isSGPRReg(MRI, MO.getReg()))
6218 return false;
6219 return true;
6220}
6221
6222bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6223 const MCOperandInfo &OpInfo,
6224 const MachineOperand &MO) const {
6225 if (MO.isReg())
6226 return isLegalRegOperand(MRI, OpInfo, MO);
6227
6228 // Handle non-register types that are treated like immediates.
6229 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6230 return true;
6231}
6232
6233bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6234 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6235 const MachineOperand *MO) const {
6236 constexpr const unsigned NumOps = 3;
6237 constexpr const AMDGPU::OpName OpNames[NumOps * 2] = {
6238 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6239 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6240 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6241
6242 assert(SrcN < NumOps);
6243
6244 if (!MO) {
6245 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6246 if (SrcIdx == -1)
6247 return true;
6248 MO = &MI.getOperand(SrcIdx);
6249 }
6250
6251 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6252 return true;
6253
6254 int ModsIdx =
6255 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6256 if (ModsIdx == -1)
6257 return true;
6258
6259 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6260 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6261 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6262
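 // An SGPR source is only accepted in its plain form here: any op_sel or
 // op_sel_hi selection on it makes the operand illegal.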
6263 return !OpSel && !OpSelHi;
6264}
6265
6266bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6267 const MachineOperand *MO) const {
6268 const MachineFunction &MF = *MI.getParent()->getParent();
6269 const MachineRegisterInfo &MRI = MF.getRegInfo();
6270 const MCInstrDesc &InstDesc = MI.getDesc();
6271 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6272 const TargetRegisterClass *DefinedRC =
6273 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
6274 if (!MO)
6275 MO = &MI.getOperand(OpIdx);
6276
6277 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6278
6279 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6280 const MachineOperand *UsedLiteral = nullptr;
6281
6282 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6283 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6284
6285 // TODO: Be more permissive with frame indexes.
6286 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6287 if (!LiteralLimit--)
6288 return false;
6289
6290 UsedLiteral = MO;
6291 }
6292
6293 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6294 if (MO->isReg())
6295 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6296
6297 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6298 if (i == OpIdx)
6299 continue;
6300 const MachineOperand &Op = MI.getOperand(i);
6301 if (Op.isReg()) {
6302 if (Op.isUse()) {
6303 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6304 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6305 if (--ConstantBusLimit <= 0)
6306 return false;
6307 }
6308 }
6309 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6310 !isInlineConstant(Op, InstDesc.operands()[i])) {
6311 // The same literal may be used multiple times.
6312 if (!UsedLiteral)
6313 UsedLiteral = &Op;
6314 else if (UsedLiteral->isIdenticalTo(Op))
6315 continue;
6316
6317 if (!LiteralLimit--)
6318 return false;
6319 if (--ConstantBusLimit <= 0)
6320 return false;
6321 }
6322 }
6323 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6324 // There can be at most one literal operand, but it can be repeated.
6325 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6326 if (i == OpIdx)
6327 continue;
6328 const MachineOperand &Op = MI.getOperand(i);
6329 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6330 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6331 !Op.isIdenticalTo(*MO))
6332 return false;
6333
6334 // Do not fold a non-inlinable, non-register operand into an
6335 // instruction that already has a frame index. The frame index handling
6336 // code does not cope well when a frame index co-exists with another
6337 // non-register operand, unless that operand is an inlinable immediate.
6338 if (Op.isFI())
6339 return false;
6340 }
6341 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6342 isF16PseudoScalarTrans(MI.getOpcode())) {
6343 return false;
6344 }
6345
6346 if (MO->isReg()) {
6347 if (!DefinedRC)
6348 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6349 return isLegalRegOperand(MI, OpIdx, *MO);
6350 }
6351
6352 if (MO->isImm()) {
6353 uint64_t Imm = MO->getImm();
6354 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6355 bool Is64BitOp = Is64BitFPOp ||
6356 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6357 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6358 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6359 if (Is64BitOp &&
6360 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6361 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6362 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6363 return false;
6364
6365 // FIXME: We can use sign-extended 64-bit literals, but only for signed
6366 // operands. At the moment we do not know whether an operand is signed.
6367 // Such an operand will be encoded as its low 32 bits and then either
6368 // correctly sign extended or incorrectly zero extended by HW.
6369 // If 64-bit literals are supported and the literal will be encoded
6370 // as a full 64 bits, we can still use it.
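 // For example, a literal of 0xFFFFFFFF80000000 would be emitted as
 // 0x80000000 and only yields the intended value if the hardware sign
 // extends it for this operand.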
6371 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6372 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6373 return false;
6374 }
6375 }
6376
6377 // Handle non-register types that are treated like immediates.
6378 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6379
6380 if (!DefinedRC) {
6381 // This operand expects an immediate.
6382 return true;
6383 }
6384
6385 return isImmOperandLegal(MI, OpIdx, *MO);
6386}
6387
6389 bool IsGFX950Only = ST.hasGFX950Insts();
6390 bool IsGFX940Only = ST.hasGFX940Insts();
6391
6392 if (!IsGFX950Only && !IsGFX940Only)
6393 return false;
6394
6395 if (!isVALU(MI))
6396 return false;
6397
6398 // V_COS, V_EXP, V_RCP, etc.
6399 if (isTRANS(MI))
6400 return true;
6401
6402 // DOT2, DOT2C, DOT4, etc.
6403 if (isDOT(MI))
6404 return true;
6405
6406 // MFMA, SMFMA
6407 if (isMFMA(MI))
6408 return true;
6409
6410 unsigned Opcode = MI.getOpcode();
6411 switch (Opcode) {
6412 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6413 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6414 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6415 case AMDGPU::V_MQSAD_U32_U8_e64:
6416 case AMDGPU::V_PK_ADD_F16:
6417 case AMDGPU::V_PK_ADD_F32:
6418 case AMDGPU::V_PK_ADD_I16:
6419 case AMDGPU::V_PK_ADD_U16:
6420 case AMDGPU::V_PK_ASHRREV_I16:
6421 case AMDGPU::V_PK_FMA_F16:
6422 case AMDGPU::V_PK_FMA_F32:
6423 case AMDGPU::V_PK_FMAC_F16_e32:
6424 case AMDGPU::V_PK_FMAC_F16_e64:
6425 case AMDGPU::V_PK_LSHLREV_B16:
6426 case AMDGPU::V_PK_LSHRREV_B16:
6427 case AMDGPU::V_PK_MAD_I16:
6428 case AMDGPU::V_PK_MAD_U16:
6429 case AMDGPU::V_PK_MAX_F16:
6430 case AMDGPU::V_PK_MAX_I16:
6431 case AMDGPU::V_PK_MAX_U16:
6432 case AMDGPU::V_PK_MIN_F16:
6433 case AMDGPU::V_PK_MIN_I16:
6434 case AMDGPU::V_PK_MIN_U16:
6435 case AMDGPU::V_PK_MOV_B32:
6436 case AMDGPU::V_PK_MUL_F16:
6437 case AMDGPU::V_PK_MUL_F32:
6438 case AMDGPU::V_PK_MUL_LO_U16:
6439 case AMDGPU::V_PK_SUB_I16:
6440 case AMDGPU::V_PK_SUB_U16:
6441 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6442 return true;
6443 default:
6444 return false;
6445 }
6446}
6447
6448void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6449 MachineInstr &MI) const {
6450 unsigned Opc = MI.getOpcode();
6451 const MCInstrDesc &InstrDesc = get(Opc);
6452
6453 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6454 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6455
6456 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6457 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6458
6459 // If there is an implicit SGPR use such as the VCC use for v_addc_u32/v_subb_u32,
6460 // we may only have one constant bus use in total before GFX10.
6461 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6462 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6463 RI.isSGPRReg(MRI, Src0.getReg()))
6464 legalizeOpWithMove(MI, Src0Idx);
6465
6466 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6467 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6468 // src0/src1 with V_READFIRSTLANE.
6469 if (Opc == AMDGPU::V_WRITELANE_B32) {
6470 const DebugLoc &DL = MI.getDebugLoc();
6471 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6472 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6473 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6474 .add(Src0);
6475 Src0.ChangeToRegister(Reg, false);
6476 }
6477 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6478 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6479 const DebugLoc &DL = MI.getDebugLoc();
6480 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6481 .add(Src1);
6482 Src1.ChangeToRegister(Reg, false);
6483 }
6484 return;
6485 }
6486
6487 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6488 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6489 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6490 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6491 legalizeOpWithMove(MI, Src2Idx);
6492 }
6493
6494 // VOP2 instructions accept all operand types in src0, so we don't need to
6495 // check its legality. If src1 is already legal, we don't need to do anything.
6496 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6497 return;
6498
6499 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6500 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6501 // select is uniform.
6502 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6503 RI.isVGPR(MRI, Src1.getReg())) {
6504 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6505 const DebugLoc &DL = MI.getDebugLoc();
6506 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6507 .add(Src1);
6508 Src1.ChangeToRegister(Reg, false);
6509 return;
6510 }
6511
6512 // We do not use commuteInstruction here because it is too aggressive and will
6513 // commute whenever possible. We only want to commute here if it improves
6514 // legality. This can be called a fairly large number of times, so don't waste
6515 // compile time pointlessly swapping and checking legality again.
6516 if (HasImplicitSGPR || !MI.isCommutable()) {
6517 legalizeOpWithMove(MI, Src1Idx);
6518 return;
6519 }
6520
6521 // If src0 can be used as src1, commuting will make the operands legal.
6522 // Otherwise we have to give up and insert a move.
6523 //
6524 // TODO: Other immediate-like operand kinds could be commuted if there was a
6525 // MachineOperand::ChangeTo* for them.
6526 if ((!Src1.isImm() && !Src1.isReg()) ||
6527 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6528 legalizeOpWithMove(MI, Src1Idx);
6529 return;
6530 }
6531
6532 int CommutedOpc = commuteOpcode(MI);
6533 if (CommutedOpc == -1) {
6534 legalizeOpWithMove(MI, Src1Idx);
6535 return;
6536 }
6537
6538 MI.setDesc(get(CommutedOpc));
6539
6540 Register Src0Reg = Src0.getReg();
6541 unsigned Src0SubReg = Src0.getSubReg();
6542 bool Src0Kill = Src0.isKill();
6543
6544 if (Src1.isImm())
6545 Src0.ChangeToImmediate(Src1.getImm());
6546 else if (Src1.isReg()) {
6547 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6548 Src0.setSubReg(Src1.getSubReg());
6549 } else
6550 llvm_unreachable("Should only have register or immediate operands");
6551
6552 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6553 Src1.setSubReg(Src0SubReg);
6554 fixImplicitOperands(MI);
6555}
6556
6557// Legalize VOP3 operands. All operand types are supported for any operand
6558// position, but only one literal constant is allowed, and only from GFX10 on.
6559void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6560 MachineInstr &MI) const {
6561 unsigned Opc = MI.getOpcode();
6562
6563 int VOP3Idx[3] = {
6564 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6565 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6566 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6567 };
6568
6569 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6570 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6571 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6572 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6573 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6574 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6575 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6576 // src1 and src2 must be scalar
6577 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6578 const DebugLoc &DL = MI.getDebugLoc();
6579 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6580 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6581 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6582 .add(Src1);
6583 Src1.ChangeToRegister(Reg, false);
6584 }
6585 if (VOP3Idx[2] != -1) {
6586 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6587 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6588 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6589 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6590 .add(Src2);
6591 Src2.ChangeToRegister(Reg, false);
6592 }
6593 }
6594 }
6595
6596 // Find the one SGPR operand we are allowed to use.
6597 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6598 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
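 // Before GFX10 the VOP3 encoding cannot carry a literal constant at all.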
6599 SmallDenseSet<unsigned> SGPRsUsed;
6600 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6601 if (SGPRReg) {
6602 SGPRsUsed.insert(SGPRReg);
6603 --ConstantBusLimit;
6604 }
6605
6606 for (int Idx : VOP3Idx) {
6607 if (Idx == -1)
6608 break;
6609 MachineOperand &MO = MI.getOperand(Idx);
6610
6611 if (!MO.isReg()) {
6612 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6613 continue;
6614
6615 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6616 --LiteralLimit;
6617 --ConstantBusLimit;
6618 continue;
6619 }
6620
6621 --LiteralLimit;
6622 --ConstantBusLimit;
6623 legalizeOpWithMove(MI, Idx);
6624 continue;
6625 }
6626
6627 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6628 continue; // VGPRs are legal
6629
6630 // We can use one SGPR in each VOP3 instruction prior to GFX10
6631 // and two starting from GFX10.
6632 if (SGPRsUsed.count(MO.getReg()))
6633 continue;
6634 if (ConstantBusLimit > 0) {
6635 SGPRsUsed.insert(MO.getReg());
6636 --ConstantBusLimit;
6637 continue;
6638 }
6639
6640 // If we make it this far, then the operand is not legal and we must
6641 // legalize it.
6642 legalizeOpWithMove(MI, Idx);
6643 }
6644
6645 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6646 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6647 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6648 legalizeOpWithMove(MI, VOP3Idx[2]);
6649
6650 // Fix the register class of packed FP32 instructions on gfx12+. See
6651 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6652 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6653 for (unsigned I = 0; I < 3; ++I) {
6654 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6655 legalizeOpWithMove(MI, VOP3Idx[I]);
6656 }
6657 }
6658}
6659
6660Register SIInstrInfo::readlaneVGPRToSGPR(
6661 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6662 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6663 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6664 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6665 if (DstRC)
6666 SRC = RI.getCommonSubClass(SRC, DstRC);
6667
6668 Register DstReg = MRI.createVirtualRegister(SRC);
6669 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
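 // Values wider than 32 bits are read back one 32-bit piece at a time and
 // reassembled into an SGPR tuple with REG_SEQUENCE.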
6670
6671 if (RI.hasAGPRs(VRC)) {
6672 VRC = RI.getEquivalentVGPRClass(VRC);
6673 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6674 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6675 get(TargetOpcode::COPY), NewSrcReg)
6676 .addReg(SrcReg);
6677 SrcReg = NewSrcReg;
6678 }
6679
6680 if (SubRegs == 1) {
6681 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6682 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6683 .addReg(SrcReg);
6684 return DstReg;
6685 }
6686
6687 SmallVector<Register, 8> SRegs;
6688 for (unsigned i = 0; i < SubRegs; ++i) {
6689 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6690 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6691 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6692 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6693 SRegs.push_back(SGPR);
6694 }
6695
6696 MachineInstrBuilder MIB =
6697 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6698 get(AMDGPU::REG_SEQUENCE), DstReg);
6699 for (unsigned i = 0; i < SubRegs; ++i) {
6700 MIB.addReg(SRegs[i]);
6701 MIB.addImm(RI.getSubRegFromChannel(i));
6702 }
6703 return DstReg;
6704}
6705
6706void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6707 MachineInstr &MI) const {
6708
6709 // If the pointer is stored in VGPRs, then we need to move it to
6710 // SGPRs using v_readfirstlane. This is safe because we only select
6711 // loads with uniform pointers to SMRD instructions, so we know the
6712 // pointer value is uniform.
6713 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6714 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6715 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6716 SBase->setReg(SGPR);
6717 }
6718 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6719 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6720 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6721 SOff->setReg(SGPR);
6722 }
6723}
6724
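// If the "scalar" address of a saddr-form FLAT-family instruction turns out
// to live in a VGPR (and vaddr is zero or absent), switch to the vaddr-form
// opcode and move that register into the vaddr slot.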
6725bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6726 unsigned Opc = Inst.getOpcode();
6727 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6728 if (OldSAddrIdx < 0)
6729 return false;
6730
6731 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6732
6733 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6734 if (NewOpc < 0)
6735 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6736 if (NewOpc < 0)
6737 return false;
6738
6739 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6740 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6741 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6742 return false;
6743
6744 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6745 if (NewVAddrIdx < 0)
6746 return false;
6747
6748 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6749
6750 // Check vaddr; it must be zero or absent.
6751 MachineInstr *VAddrDef = nullptr;
6752 if (OldVAddrIdx >= 0) {
6753 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6754 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6755 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6756 !VAddrDef->getOperand(1).isImm() ||
6757 VAddrDef->getOperand(1).getImm() != 0)
6758 return false;
6759 }
6760
6761 const MCInstrDesc &NewDesc = get(NewOpc);
6762 Inst.setDesc(NewDesc);
6763
6764 // Callers expect the iterator to be valid after this call, so modify the
6765 // instruction in place.
6766 if (OldVAddrIdx == NewVAddrIdx) {
6767 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6768 // Clear use list from the old vaddr holding a zero register.
6769 MRI.removeRegOperandFromUseList(&NewVAddr);
6770 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6771 Inst.removeOperand(OldSAddrIdx);
6772 // Update the use list with the pointer we have just moved from the saddr to
6773 // the vaddr position. Otherwise the new vaddr will be missing from the use list.
6774 MRI.removeRegOperandFromUseList(&NewVAddr);
6775 MRI.addRegOperandToUseList(&NewVAddr);
6776 } else {
6777 assert(OldSAddrIdx == NewVAddrIdx);
6778
6779 if (OldVAddrIdx >= 0) {
6780 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6781 AMDGPU::OpName::vdst_in);
6782
6783 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6784 // it asserts. Untie the operands for now and retie them afterwards.
6785 if (NewVDstIn != -1) {
6786 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6787 Inst.untieRegOperand(OldVDstIn);
6788 }
6789
6790 Inst.removeOperand(OldVAddrIdx);
6791
6792 if (NewVDstIn != -1) {
6793 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6794 Inst.tieOperands(NewVDst, NewVDstIn);
6795 }
6796 }
6797 }
6798
6799 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6800 VAddrDef->eraseFromParent();
6801
6802 return true;
6803}
6804
6805// FIXME: Remove this when SelectionDAG is obsoleted.
6806void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6807 MachineInstr &MI) const {
6808 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6809 return;
6810
6811 // Fix up SGPR operands that ended up in VGPRs. We only select these when the
6812 // DAG divergence analysis thinks they are uniform, so a readfirstlane should be valid.
6813 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6814 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6815 return;
6816
6817 if (moveFlatAddrToVGPR(MI))
6818 return;
6819
6820 const TargetRegisterClass *DeclaredRC =
6821 getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);
6822
6823 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6824 SAddr->setReg(ToSGPR);
6825}
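// Sketch of the fix-up above: when the selected saddr register is not an SGPR
// (and the instruction cannot be moved to a pure-VGPR addressing form), a
//   %s:sgpr = V_READFIRSTLANE_B32 %v:vgpr
// is inserted and the saddr operand is rewritten to %s. This relies on the
// DAG's uniformity assumption; readfirstlane only copies the value from the
// first active lane.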
6826
6829 const TargetRegisterClass *DstRC,
6832 const DebugLoc &DL) const {
6833 Register OpReg = Op.getReg();
6834 unsigned OpSubReg = Op.getSubReg();
6835
6836 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6837 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6838
6839 // Check if operand is already the correct register class.
6840 if (DstRC == OpRC)
6841 return;
6842
6843 Register DstReg = MRI.createVirtualRegister(DstRC);
6844 auto Copy =
6845 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6846 Op.setReg(DstReg);
6847
6848 MachineInstr *Def = MRI.getVRegDef(OpReg);
6849 if (!Def)
6850 return;
6851
6852 // Try to eliminate the copy if it is copying an immediate value.
6853 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6854 foldImmediate(*Copy, *Def, OpReg, &MRI);
6855
6856 bool ImpDef = Def->isImplicitDef();
6857 while (!ImpDef && Def && Def->isCopy()) {
6858 if (Def->getOperand(1).getReg().isPhysical())
6859 break;
6860 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6861 ImpDef = Def && Def->isImplicitDef();
6862 }
6863 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6864 !ImpDef)
6865 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6866}
6867
6868// Emit the actual waterfall loop, executing the wrapped instruction for each
6869// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6870 // iteration; in the worst case we execute one iteration per lane (64 on wave64).
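// Shape of the emitted loop, roughly (shown for a single 32-bit ScalarOp):
//   LoopBB:
//     %s    = V_READFIRSTLANE_B32 %v       ; value of the first active lane
//     %cond = V_CMP_EQ_U32 %s, %v          ; lanes that hold the same value
//     exec, %save = S_AND_SAVEEXEC %cond   ; restrict exec to those lanes
//   BodyBB:
//     ... wrapped instruction, now using %s ...
//     exec = S_XOR_term exec, %save        ; retire the lanes just handled
//     SI_WATERFALL_LOOP %LoopBB            ; branch back while lanes remain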
6871static void
6874 MachineBasicBlock &LoopBB,
6875 MachineBasicBlock &BodyBB,
6876 const DebugLoc &DL,
6877 ArrayRef<MachineOperand *> ScalarOps) {
6878 MachineFunction &MF = *LoopBB.getParent();
6879 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6880 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6882 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6883
6885 Register CondReg;
6886
6887 for (MachineOperand *ScalarOp : ScalarOps) {
6888 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6889 unsigned NumSubRegs = RegSize / 32;
6890 Register VScalarOp = ScalarOp->getReg();
6891
6892 if (NumSubRegs == 1) {
6893 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6894
6895 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6896 .addReg(VScalarOp);
6897
6898 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6899
6900 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6901 .addReg(CurReg)
6902 .addReg(VScalarOp);
6903
6904 // Combine the comparison results with AND.
6905 if (!CondReg) // First.
6906 CondReg = NewCondReg;
6907 else { // If not the first, we create an AND.
6908 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6909 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6910 .addReg(CondReg)
6911 .addReg(NewCondReg);
6912 CondReg = AndReg;
6913 }
6914
6915 // Update ScalarOp operand to use the SGPR ScalarOp.
6916 ScalarOp->setReg(CurReg);
6917 ScalarOp->setIsKill();
6918 } else {
6919 SmallVector<Register, 8> ReadlanePieces;
6920 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6921 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6922 "Unhandled register size");
6923
6924 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6925 Register CurRegLo =
6926 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6927 Register CurRegHi =
6928 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6929
6930 // Read the next variant <- also loop target.
6931 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6932 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6933
6934 // Read the next variant <- also loop target.
6935 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6936 .addReg(VScalarOp, VScalarOpUndef,
6937 TRI->getSubRegFromChannel(Idx + 1));
6938
6939 ReadlanePieces.push_back(CurRegLo);
6940 ReadlanePieces.push_back(CurRegHi);
6941
6942 // Comparison is to be done as 64-bit.
6943 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6944 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6945 .addReg(CurRegLo)
6946 .addImm(AMDGPU::sub0)
6947 .addReg(CurRegHi)
6948 .addImm(AMDGPU::sub1);
6949
6950 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6951 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6952 NewCondReg)
6953 .addReg(CurReg);
6954 if (NumSubRegs <= 2)
6955 Cmp.addReg(VScalarOp);
6956 else
6957 Cmp.addReg(VScalarOp, VScalarOpUndef,
6958 TRI->getSubRegFromChannel(Idx, 2));
6959
6960 // Combine the comparison results with AND.
6961 if (!CondReg) // First.
6962 CondReg = NewCondReg;
6963 else { // If not the first, we create an AND.
6964 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6965 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6966 .addReg(CondReg)
6967 .addReg(NewCondReg);
6968 CondReg = AndReg;
6969 }
6970 } // End for loop.
6971
6972 const auto *SScalarOpRC =
6973 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6974 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6975
6976 // Build scalar ScalarOp.
6977 auto Merge =
6978 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6979 unsigned Channel = 0;
6980 for (Register Piece : ReadlanePieces) {
6981 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6982 }
6983
6984 // Update ScalarOp operand to use the SGPR ScalarOp.
6985 ScalarOp->setReg(SScalarOp);
6986 ScalarOp->setIsKill();
6987 }
6988 }
6989
6990 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6991 MRI.setSimpleHint(SaveExec, CondReg);
6992
6993 // Update EXEC to matching lanes, saving original to SaveExec.
6994 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
6995 .addReg(CondReg, RegState::Kill);
6996
6997 // The original instruction is here; we insert the terminators after it.
6998 I = BodyBB.end();
6999
7000 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7001 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7002 .addReg(LMC.ExecReg)
7003 .addReg(SaveExec);
7004
7005 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7006}
7007
7008// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7009// with SGPRs by iterating over all unique values across all lanes.
7010// Returns the loop basic block that now contains \p MI.
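// Control flow produced below (sketch):
//
//   MBB --> LoopBB --> BodyBB --> RemainderBB --> (former successors of MBB)
//             ^           |
//             +-----------+  back-edge taken until every lane's value is done
//
// EXEC and (if live) SCC are saved before LoopBB and restored in RemainderBB.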
7011static MachineBasicBlock *
7015 MachineBasicBlock::iterator Begin = nullptr,
7016 MachineBasicBlock::iterator End = nullptr) {
7017 MachineBasicBlock &MBB = *MI.getParent();
7018 MachineFunction &MF = *MBB.getParent();
7019 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7020 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7022 if (!Begin.isValid())
7023 Begin = &MI;
7024 if (!End.isValid()) {
7025 End = &MI;
7026 ++End;
7027 }
7028 const DebugLoc &DL = MI.getDebugLoc();
7030 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7031
7032 // Save SCC. Waterfall Loop may overwrite SCC.
7033 Register SaveSCCReg;
7034
7035 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7036 // rather than an unlimited scan everywhere.
7037 bool SCCNotDead =
7038 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7039 std::numeric_limits<unsigned>::max()) !=
7041 if (SCCNotDead) {
7042 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7043 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7044 .addImm(1)
7045 .addImm(0);
7046 }
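// The S_CSELECT above materializes SCC into a plain SGPR (1 if SCC was set,
// 0 otherwise); the S_CMP_LG_U32 against 0 emitted after the loop recreates
// SCC from that saved value.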
7047
7048 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7049
7050 // Save the EXEC mask
7051 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7052
7053 // Killed uses in the instruction we are waterfalling around will be
7054 // incorrect due to the added control-flow.
7055 MachineBasicBlock::iterator AfterMI = MI;
7056 ++AfterMI;
7057 for (auto I = Begin; I != AfterMI; I++) {
7058 for (auto &MO : I->all_uses())
7059 MRI.clearKillFlags(MO.getReg());
7060 }
7061
7062 // To insert the loop we need to split the block. Move everything after this
7063 // point to a new block, and insert a new empty block between the two.
7064 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7065 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7066 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7067 MachineFunction::iterator MBBI(MBB);
7068 ++MBBI;
7069
7070 MF.insert(MBBI, LoopBB);
7071 MF.insert(MBBI, BodyBB);
7072 MF.insert(MBBI, RemainderBB);
7073
7074 LoopBB->addSuccessor(BodyBB);
7075 BodyBB->addSuccessor(LoopBB);
7076 BodyBB->addSuccessor(RemainderBB);
7077
7078 // Move the instructions from Begin to MI into BodyBB, and the remainder of
7079 // the block to RemainderBB.
7080 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7081 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7082 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7083
7084 MBB.addSuccessor(LoopBB);
7085
7086 // Update dominators. We know that MBB immediately dominates LoopBB, that
7087 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7088 // RemainderBB. RemainderBB immediately dominates all of the successors
7089 // transferred to it from MBB that MBB used to properly dominate.
7090 if (MDT) {
7091 MDT->addNewBlock(LoopBB, &MBB);
7092 MDT->addNewBlock(BodyBB, LoopBB);
7093 MDT->addNewBlock(RemainderBB, BodyBB);
7094 for (auto &Succ : RemainderBB->successors()) {
7095 if (MDT->properlyDominates(&MBB, Succ)) {
7096 MDT->changeImmediateDominator(Succ, RemainderBB);
7097 }
7098 }
7099 }
7100
7101 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7102
7103 MachineBasicBlock::iterator First = RemainderBB->begin();
7104 // Restore SCC
7105 if (SCCNotDead) {
7106 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7107 .addReg(SaveSCCReg, RegState::Kill)
7108 .addImm(0);
7109 }
7110
7111 // Restore the EXEC mask
7112 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7113 .addReg(SaveExec);
7114 return BodyBB;
7115}
7116
7117// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7118static std::tuple<unsigned, unsigned>
7120 MachineBasicBlock &MBB = *MI.getParent();
7121 MachineFunction &MF = *MBB.getParent();
7123
7124 // Extract the ptr from the resource descriptor.
7125 unsigned RsrcPtr =
7126 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7127 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7128
7129 // Create an empty resource descriptor
7130 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7131 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7132 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7133 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7134 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7135
7136 // Zero64 = 0
7137 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7138 .addImm(0);
7139
7140 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7141 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7142 .addImm(Lo_32(RsrcDataFormat));
7143
7144 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7145 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7146 .addImm(Hi_32(RsrcDataFormat));
7147
7148 // NewSRsrc = {Zero64, SRsrcFormat}
7149 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7150 .addReg(Zero64)
7151 .addImm(AMDGPU::sub0_sub1)
7152 .addReg(SRsrcFormatLo)
7153 .addImm(AMDGPU::sub2)
7154 .addReg(SRsrcFormatHi)
7155 .addImm(AMDGPU::sub3);
7156
7157 return std::tuple(RsrcPtr, NewSRsrc);
7158}
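// Layout of the replacement descriptor built above (sketch):
//   NewSRsrc = { 0 (64-bit base, from Zero64),   // sub0_sub1
//                RSRC_DATA_FORMAT[31:0],         // sub2
//                RSRC_DATA_FORMAT[63:32] }       // sub3
// The real base pointer is returned separately in RsrcPtr so the caller can
// fold it into the 64-bit VAddr of an ADDR64-style access.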
7159
7162 MachineDominatorTree *MDT) const {
7163 MachineFunction &MF = *MI.getParent()->getParent();
7165 MachineBasicBlock *CreatedBB = nullptr;
7166
7167 // Legalize VOP2
7168 if (isVOP2(MI) || isVOPC(MI)) {
7170 return CreatedBB;
7171 }
7172
7173 // Legalize VOP3
7174 if (isVOP3(MI)) {
7176 return CreatedBB;
7177 }
7178
7179 // Legalize SMRD
7180 if (isSMRD(MI)) {
7182 return CreatedBB;
7183 }
7184
7185 // Legalize FLAT
7186 if (isFLAT(MI)) {
7188 return CreatedBB;
7189 }
7190
7191 // Legalize REG_SEQUENCE and PHI
7192 // The register class of the operands must be the same type as the register
7193 // class of the output.
7194 if (MI.getOpcode() == AMDGPU::PHI) {
7195 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7196 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7197 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7198 continue;
7199 const TargetRegisterClass *OpRC =
7200 MRI.getRegClass(MI.getOperand(i).getReg());
7201 if (RI.hasVectorRegisters(OpRC)) {
7202 VRC = OpRC;
7203 } else {
7204 SRC = OpRC;
7205 }
7206 }
7207
7208 // If any of the operands are VGPR registers, then they all must be VGPRs,
7209 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7210 // them.
7211 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7212 if (!VRC) {
7213 assert(SRC);
7214 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7215 VRC = &AMDGPU::VReg_1RegClass;
7216 } else
7217 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7218 ? RI.getEquivalentAGPRClass(SRC)
7219 : RI.getEquivalentVGPRClass(SRC);
7220 } else {
7221 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7222 ? RI.getEquivalentAGPRClass(VRC)
7223 : RI.getEquivalentVGPRClass(VRC);
7224 }
7225 RC = VRC;
7226 } else {
7227 RC = SRC;
7228 }
7229
7230 // Update all the operands so they have the same type.
7231 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7232 MachineOperand &Op = MI.getOperand(I);
7233 if (!Op.isReg() || !Op.getReg().isVirtual())
7234 continue;
7235
7236 // MI is a PHI instruction.
7237 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7238 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7239
7240 // Avoid creating no-op copies with the same src and dst reg class. These
7241 // confuse some of the machine passes.
7242 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7243 }
7244 }
7245
7246 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7247 // VGPR dest type and SGPR sources, insert copies so all operands are
7248 // VGPRs. This seems to help operand folding / the register coalescer.
7249 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7250 MachineBasicBlock *MBB = MI.getParent();
7251 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7252 if (RI.hasVGPRs(DstRC)) {
7253 // Update all the operands so they are VGPR register classes. These may
7254 // not be the same register class because REG_SEQUENCE supports mixing
7255 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7256 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7257 MachineOperand &Op = MI.getOperand(I);
7258 if (!Op.isReg() || !Op.getReg().isVirtual())
7259 continue;
7260
7261 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7262 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7263 if (VRC == OpRC)
7264 continue;
7265
7266 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7267 Op.setIsKill();
7268 }
7269 }
7270
7271 return CreatedBB;
7272 }
7273
7274 // Legalize INSERT_SUBREG
7275 // src0 must have the same register class as dst
7276 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7277 Register Dst = MI.getOperand(0).getReg();
7278 Register Src0 = MI.getOperand(1).getReg();
7279 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7280 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7281 if (DstRC != Src0RC) {
7282 MachineBasicBlock *MBB = MI.getParent();
7283 MachineOperand &Op = MI.getOperand(1);
7284 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7285 }
7286 return CreatedBB;
7287 }
7288
7289 // Legalize SI_INIT_M0
7290 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7291 MachineOperand &Src = MI.getOperand(0);
7292 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7293 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7294 return CreatedBB;
7295 }
7296
7297 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7298 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7299 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7300 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7301 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7302 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7303 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7304 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7305 MachineOperand &Src = MI.getOperand(1);
7306 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7307 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7308 return CreatedBB;
7309 }
7310
7311 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7312 //
7313 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7314 // scratch memory access. In both cases, the legalization never involves
7315 // conversion to the addr64 form.
7317 (isMUBUF(MI) || isMTBUF(MI)))) {
7318 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7319 ? AMDGPU::OpName::rsrc
7320 : AMDGPU::OpName::srsrc;
7321 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7322 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7323 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7324
7325 AMDGPU::OpName SampOpName =
7326 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7327 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7328 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7329 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7330
7331 return CreatedBB;
7332 }
7333
7334 // Legalize SI_CALL
7335 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7336 MachineOperand *Dest = &MI.getOperand(0);
7337 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7338 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, plus the
7339 // following copies, into the loop block; we also need to move copies from
7340 // and to physical registers.
7341 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7342 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7343
7344 // Also move the copies to physical registers into the loop block
7345 MachineBasicBlock &MBB = *MI.getParent();
7347 while (Start->getOpcode() != FrameSetupOpcode)
7348 --Start;
7350 while (End->getOpcode() != FrameDestroyOpcode)
7351 ++End;
7352 // Also include following copies of the return value
7353 ++End;
7354 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7355 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7356 ++End;
7357 CreatedBB =
7358 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7359 }
7360 }
7361
7362 // Legalize s_sleep_var.
7363 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7364 const DebugLoc &DL = MI.getDebugLoc();
7365 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7366 int Src0Idx =
7367 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7368 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7369 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7370 .add(Src0);
7371 Src0.ChangeToRegister(Reg, false);
7372 return nullptr;
7373 }
7374
7375 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7376 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7377 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7378 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7379 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7380 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7381 for (MachineOperand &Src : MI.explicit_operands()) {
7382 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7383 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7384 }
7385 return CreatedBB;
7386 }
7387
7388 // Legalize MUBUF instructions.
7389 bool isSoffsetLegal = true;
7390 int SoffsetIdx =
7391 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7392 if (SoffsetIdx != -1) {
7393 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7394 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7395 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7396 isSoffsetLegal = false;
7397 }
7398 }
7399
7400 bool isRsrcLegal = true;
7401 int RsrcIdx =
7402 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7403 if (RsrcIdx != -1) {
7404 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7405 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7406 isRsrcLegal = false;
7407 }
7408
7409 // The operands are legal.
7410 if (isRsrcLegal && isSoffsetLegal)
7411 return CreatedBB;
7412
7413 if (!isRsrcLegal) {
7414 // Legalize a VGPR Rsrc
7415 //
7416 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7417 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7418 // a zero-value SRsrc.
7419 //
7420 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7421 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7422 // above.
7423 //
7424 // Otherwise we are on non-ADDR64 hardware, and/or we have
7425 // idxen/offen/bothen and we fall back to a waterfall loop.
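// In short (sketch of the three paths handled below):
//   - already _ADDR64:           VAddr += RsrcPtr, Rsrc := zero-base descriptor
//   - _OFFSET and hasAddr64():   convert to the _ADDR64 opcode, then as above
//   - anything else:             waterfall loop over the VGPR Rsrc (and a VGPR
//                                soffset, if that is also illegal)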
7426
7427 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7428 MachineBasicBlock &MBB = *MI.getParent();
7429
7430 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7431 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7432 // This is already an ADDR64 instruction so we need to add the pointer
7433 // extracted from the resource descriptor to the current value of VAddr.
7434 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7435 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7436 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7437
7438 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7439 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7440 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7441
7442 unsigned RsrcPtr, NewSRsrc;
7443 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7444
7445 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7446 const DebugLoc &DL = MI.getDebugLoc();
7447 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7448 .addDef(CondReg0)
7449 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7450 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7451 .addImm(0);
7452
7453 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7454 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7455 .addDef(CondReg1, RegState::Dead)
7456 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7457 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7458 .addReg(CondReg0, RegState::Kill)
7459 .addImm(0);
7460
7461 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7462 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7463 .addReg(NewVAddrLo)
7464 .addImm(AMDGPU::sub0)
7465 .addReg(NewVAddrHi)
7466 .addImm(AMDGPU::sub1);
7467
7468 VAddr->setReg(NewVAddr);
7469 Rsrc->setReg(NewSRsrc);
7470 } else if (!VAddr && ST.hasAddr64()) {
7471 // This instruction is the _OFFSET variant, so we need to convert it to
7472 // ADDR64.
7473 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7474 "FIXME: Need to emit flat atomics here");
7475
7476 unsigned RsrcPtr, NewSRsrc;
7477 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7478
7479 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7480 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7481 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7482 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7483 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7484
7485 // Atomics with return have an additional tied operand and are
7486 // missing some of the special bits.
7487 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7488 MachineInstr *Addr64;
7489
7490 if (!VDataIn) {
7491 // Regular buffer load / store.
7492 MachineInstrBuilder MIB =
7493 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7494 .add(*VData)
7495 .addReg(NewVAddr)
7496 .addReg(NewSRsrc)
7497 .add(*SOffset)
7498 .add(*Offset);
7499
7500 if (const MachineOperand *CPol =
7501 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7502 MIB.addImm(CPol->getImm());
7503 }
7504
7505 if (const MachineOperand *TFE =
7506 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7507 MIB.addImm(TFE->getImm());
7508 }
7509
7510 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7511
7512 MIB.cloneMemRefs(MI);
7513 Addr64 = MIB;
7514 } else {
7515 // Atomics with return.
7516 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7517 .add(*VData)
7518 .add(*VDataIn)
7519 .addReg(NewVAddr)
7520 .addReg(NewSRsrc)
7521 .add(*SOffset)
7522 .add(*Offset)
7523 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7524 .cloneMemRefs(MI);
7525 }
7526
7527 MI.removeFromParent();
7528
7529 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7530 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7531 NewVAddr)
7532 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7533 .addImm(AMDGPU::sub0)
7534 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7535 .addImm(AMDGPU::sub1);
7536 } else {
7537 // Legalize a VGPR Rsrc and soffset together.
7538 if (!isSoffsetLegal) {
7539 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7540 CreatedBB =
7541 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7542 return CreatedBB;
7543 }
7544 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7545 return CreatedBB;
7546 }
7547 }
7548
7549 // Legalize a VGPR soffset.
7550 if (!isSoffsetLegal) {
7551 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7552 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7553 return CreatedBB;
7554 }
7555 return CreatedBB;
7556}
7557
7559 InstrList.insert(MI);
7560 // Add MBUF instructions to the deferred list.
7561 int RsrcIdx =
7562 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7563 if (RsrcIdx != -1) {
7564 DeferredList.insert(MI);
7565 }
7566}
7567
7569 return DeferredList.contains(MI);
7570}
7571
7572 // Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7573 // lowering (changing sgpr to vgpr).
7574 // This is mainly caused by 16-bit SALU and 16-bit VALU using registers with
7575 // different sizes. We need to legalize the size of the operands during the
7576 // vgpr lowering chain. This can be removed once sgpr16 is in place.
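// Illustrative example (hypothetical virtual registers): a 16-bit VGPR value
// feeding an operand that expects a 32-bit VGPR is widened as
//   %undef:vgpr_16 = IMPLICIT_DEF
//   %wide:vgpr_32  = REG_SEQUENCE %val:vgpr_16, lo16, %undef, hi16
// whereas a 32-bit VGPR feeding a 16-bit operand is simply accessed through
// its lo16 subregister.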
7578 MachineRegisterInfo &MRI) const {
7579 if (!ST.useRealTrue16Insts())
7580 return;
7581
7582 unsigned Opcode = MI.getOpcode();
7583 MachineBasicBlock *MBB = MI.getParent();
7584 // Legalize operands and check for size mismatch
7585 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7586 OpIdx >= get(Opcode).getNumOperands() ||
7587 get(Opcode).operands()[OpIdx].RegClass == -1)
7588 return;
7589
7590 MachineOperand &Op = MI.getOperand(OpIdx);
7591 if (!Op.isReg() || !Op.getReg().isVirtual())
7592 return;
7593
7594 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7595 if (!RI.isVGPRClass(CurrRC))
7596 return;
7597
7598 unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
7599 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7600 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7601 Op.setSubReg(AMDGPU::lo16);
7602 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7603 const DebugLoc &DL = MI.getDebugLoc();
7604 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7605 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7606 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7607 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7608 .addReg(Op.getReg())
7609 .addImm(AMDGPU::lo16)
7610 .addReg(Undef)
7611 .addImm(AMDGPU::hi16);
7612 Op.setReg(NewDstReg);
7613 }
7614}
7616 MachineRegisterInfo &MRI) const {
7617 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7618 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7619}
7620
7622 MachineDominatorTree *MDT) const {
7623
7624 while (!Worklist.empty()) {
7625 MachineInstr &Inst = *Worklist.top();
7626 Worklist.erase_top();
7627 // Skip MachineInstr in the deferred list.
7628 if (Worklist.isDeferred(&Inst))
7629 continue;
7630 moveToVALUImpl(Worklist, MDT, Inst);
7631 }
7632
7633 // Deferred list of instructions will be processed once
7634 // all the MachineInstrs in the worklist are done.
7635 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7636 moveToVALUImpl(Worklist, MDT, *Inst);
7637 assert(Worklist.empty() &&
7638 "Deferred MachineInstr are not supposed to re-populate worklist");
7639 }
7640}
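// Driver sketch: instructions are popped off the worklist and converted one at
// a time; MUBUF-style instructions carrying an srsrc operand are parked on the
// deferred list so their (potentially block-splitting) waterfall legalization
// only runs after all ordinary conversions have finished.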
7641
7644 MachineInstr &Inst) const {
7645
7647 if (!MBB)
7648 return;
7649 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7650 unsigned Opcode = Inst.getOpcode();
7651 unsigned NewOpcode = getVALUOp(Inst);
7652 // Handle some special cases
7653 switch (Opcode) {
7654 default:
7655 break;
7656 case AMDGPU::S_ADD_I32:
7657 case AMDGPU::S_SUB_I32: {
7658 // FIXME: The u32 versions currently selected use the carry.
7659 bool Changed;
7660 MachineBasicBlock *CreatedBBTmp = nullptr;
7661 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7662 if (Changed)
7663 return;
7664
7665 // Default handling
7666 break;
7667 }
7668
7669 case AMDGPU::S_MUL_U64:
7670 if (ST.hasVectorMulU64()) {
7671 NewOpcode = AMDGPU::V_MUL_U64_e64;
7672 break;
7673 }
7674 // Split s_mul_u64 into 32-bit vector multiplications.
7675 splitScalarSMulU64(Worklist, Inst, MDT);
7676 Inst.eraseFromParent();
7677 return;
7678
7679 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7680 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7681 // This is a special case of s_mul_u64 where all the operands are either
7682 // zero extended or sign extended.
7683 splitScalarSMulPseudo(Worklist, Inst, MDT);
7684 Inst.eraseFromParent();
7685 return;
7686
7687 case AMDGPU::S_AND_B64:
7688 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7689 Inst.eraseFromParent();
7690 return;
7691
7692 case AMDGPU::S_OR_B64:
7693 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7694 Inst.eraseFromParent();
7695 return;
7696
7697 case AMDGPU::S_XOR_B64:
7698 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7699 Inst.eraseFromParent();
7700 return;
7701
7702 case AMDGPU::S_NAND_B64:
7703 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7704 Inst.eraseFromParent();
7705 return;
7706
7707 case AMDGPU::S_NOR_B64:
7708 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7709 Inst.eraseFromParent();
7710 return;
7711
7712 case AMDGPU::S_XNOR_B64:
7713 if (ST.hasDLInsts())
7714 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7715 else
7716 splitScalar64BitXnor(Worklist, Inst, MDT);
7717 Inst.eraseFromParent();
7718 return;
7719
7720 case AMDGPU::S_ANDN2_B64:
7721 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7722 Inst.eraseFromParent();
7723 return;
7724
7725 case AMDGPU::S_ORN2_B64:
7726 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7727 Inst.eraseFromParent();
7728 return;
7729
7730 case AMDGPU::S_BREV_B64:
7731 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7732 Inst.eraseFromParent();
7733 return;
7734
7735 case AMDGPU::S_NOT_B64:
7736 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7737 Inst.eraseFromParent();
7738 return;
7739
7740 case AMDGPU::S_BCNT1_I32_B64:
7741 splitScalar64BitBCNT(Worklist, Inst);
7742 Inst.eraseFromParent();
7743 return;
7744
7745 case AMDGPU::S_BFE_I64:
7746 splitScalar64BitBFE(Worklist, Inst);
7747 Inst.eraseFromParent();
7748 return;
7749
7750 case AMDGPU::S_FLBIT_I32_B64:
7751 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7752 Inst.eraseFromParent();
7753 return;
7754 case AMDGPU::S_FF1_I32_B64:
7755 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7756 Inst.eraseFromParent();
7757 return;
7758
7759 case AMDGPU::S_LSHL_B32:
7760 if (ST.hasOnlyRevVALUShifts()) {
7761 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7762 swapOperands(Inst);
7763 }
7764 break;
7765 case AMDGPU::S_ASHR_I32:
7766 if (ST.hasOnlyRevVALUShifts()) {
7767 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7768 swapOperands(Inst);
7769 }
7770 break;
7771 case AMDGPU::S_LSHR_B32:
7772 if (ST.hasOnlyRevVALUShifts()) {
7773 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7774 swapOperands(Inst);
7775 }
7776 break;
7777 case AMDGPU::S_LSHL_B64:
7778 if (ST.hasOnlyRevVALUShifts()) {
7779 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7780 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7781 : AMDGPU::V_LSHLREV_B64_e64;
7782 swapOperands(Inst);
7783 }
7784 break;
7785 case AMDGPU::S_ASHR_I64:
7786 if (ST.hasOnlyRevVALUShifts()) {
7787 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7788 swapOperands(Inst);
7789 }
7790 break;
7791 case AMDGPU::S_LSHR_B64:
7792 if (ST.hasOnlyRevVALUShifts()) {
7793 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7794 swapOperands(Inst);
7795 }
7796 break;
7797
7798 case AMDGPU::S_ABS_I32:
7799 lowerScalarAbs(Worklist, Inst);
7800 Inst.eraseFromParent();
7801 return;
7802
7803 case AMDGPU::S_CBRANCH_SCC0:
7804 case AMDGPU::S_CBRANCH_SCC1: {
7805 // Clear unused bits of vcc
7806 Register CondReg = Inst.getOperand(1).getReg();
7807 bool IsSCC = CondReg == AMDGPU::SCC;
7809 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7810 .addReg(LMC.ExecReg)
7811 .addReg(IsSCC ? LMC.VccReg : CondReg);
7812 Inst.removeOperand(1);
7813 } break;
7814
7815 case AMDGPU::S_BFE_U64:
7816 case AMDGPU::S_BFM_B64:
7817 llvm_unreachable("Moving this op to VALU not implemented");
7818
7819 case AMDGPU::S_PACK_LL_B32_B16:
7820 case AMDGPU::S_PACK_LH_B32_B16:
7821 case AMDGPU::S_PACK_HL_B32_B16:
7822 case AMDGPU::S_PACK_HH_B32_B16:
7823 movePackToVALU(Worklist, MRI, Inst);
7824 Inst.eraseFromParent();
7825 return;
7826
7827 case AMDGPU::S_XNOR_B32:
7828 lowerScalarXnor(Worklist, Inst);
7829 Inst.eraseFromParent();
7830 return;
7831
7832 case AMDGPU::S_NAND_B32:
7833 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7834 Inst.eraseFromParent();
7835 return;
7836
7837 case AMDGPU::S_NOR_B32:
7838 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7839 Inst.eraseFromParent();
7840 return;
7841
7842 case AMDGPU::S_ANDN2_B32:
7843 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7844 Inst.eraseFromParent();
7845 return;
7846
7847 case AMDGPU::S_ORN2_B32:
7848 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7849 Inst.eraseFromParent();
7850 return;
7851
7852 // TODO: remove as soon as everything is ready
7853 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7854 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7855 // can only be selected from the uniform SDNode.
7856 case AMDGPU::S_ADD_CO_PSEUDO:
7857 case AMDGPU::S_SUB_CO_PSEUDO: {
7858 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7859 ? AMDGPU::V_ADDC_U32_e64
7860 : AMDGPU::V_SUBB_U32_e64;
7861 const auto *CarryRC = RI.getWaveMaskRegClass();
7862
7863 Register CarryInReg = Inst.getOperand(4).getReg();
7864 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7865 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7866 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7867 .addReg(CarryInReg);
7868 }
7869
7870 Register CarryOutReg = Inst.getOperand(1).getReg();
7871
7872 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7873 MRI.getRegClass(Inst.getOperand(0).getReg())));
7874 MachineInstr *CarryOp =
7875 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7876 .addReg(CarryOutReg, RegState::Define)
7877 .add(Inst.getOperand(2))
7878 .add(Inst.getOperand(3))
7879 .addReg(CarryInReg)
7880 .addImm(0);
7881 legalizeOperands(*CarryOp);
7882 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7883 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7884 Inst.eraseFromParent();
7885 }
7886 return;
7887 case AMDGPU::S_UADDO_PSEUDO:
7888 case AMDGPU::S_USUBO_PSEUDO: {
7889 const DebugLoc &DL = Inst.getDebugLoc();
7890 MachineOperand &Dest0 = Inst.getOperand(0);
7891 MachineOperand &Dest1 = Inst.getOperand(1);
7892 MachineOperand &Src0 = Inst.getOperand(2);
7893 MachineOperand &Src1 = Inst.getOperand(3);
7894
7895 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7896 ? AMDGPU::V_ADD_CO_U32_e64
7897 : AMDGPU::V_SUB_CO_U32_e64;
7898 const TargetRegisterClass *NewRC =
7899 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7900 Register DestReg = MRI.createVirtualRegister(NewRC);
7901 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7902 .addReg(Dest1.getReg(), RegState::Define)
7903 .add(Src0)
7904 .add(Src1)
7905 .addImm(0); // clamp bit
7906
7907 legalizeOperands(*NewInstr, MDT);
7908 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7909 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7910 Worklist);
7911 Inst.eraseFromParent();
7912 }
7913 return;
7914
7915 case AMDGPU::S_CSELECT_B32:
7916 case AMDGPU::S_CSELECT_B64:
7917 lowerSelect(Worklist, Inst, MDT);
7918 Inst.eraseFromParent();
7919 return;
7920 case AMDGPU::S_CMP_EQ_I32:
7921 case AMDGPU::S_CMP_LG_I32:
7922 case AMDGPU::S_CMP_GT_I32:
7923 case AMDGPU::S_CMP_GE_I32:
7924 case AMDGPU::S_CMP_LT_I32:
7925 case AMDGPU::S_CMP_LE_I32:
7926 case AMDGPU::S_CMP_EQ_U32:
7927 case AMDGPU::S_CMP_LG_U32:
7928 case AMDGPU::S_CMP_GT_U32:
7929 case AMDGPU::S_CMP_GE_U32:
7930 case AMDGPU::S_CMP_LT_U32:
7931 case AMDGPU::S_CMP_LE_U32:
7932 case AMDGPU::S_CMP_EQ_U64:
7933 case AMDGPU::S_CMP_LG_U64:
7934 case AMDGPU::S_CMP_LT_F32:
7935 case AMDGPU::S_CMP_EQ_F32:
7936 case AMDGPU::S_CMP_LE_F32:
7937 case AMDGPU::S_CMP_GT_F32:
7938 case AMDGPU::S_CMP_LG_F32:
7939 case AMDGPU::S_CMP_GE_F32:
7940 case AMDGPU::S_CMP_O_F32:
7941 case AMDGPU::S_CMP_U_F32:
7942 case AMDGPU::S_CMP_NGE_F32:
7943 case AMDGPU::S_CMP_NLG_F32:
7944 case AMDGPU::S_CMP_NGT_F32:
7945 case AMDGPU::S_CMP_NLE_F32:
7946 case AMDGPU::S_CMP_NEQ_F32:
7947 case AMDGPU::S_CMP_NLT_F32: {
7948 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7949 auto NewInstr =
7950 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7951 .setMIFlags(Inst.getFlags());
7952 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7953 0) {
7954 NewInstr
7955 .addImm(0) // src0_modifiers
7956 .add(Inst.getOperand(0)) // src0
7957 .addImm(0) // src1_modifiers
7958 .add(Inst.getOperand(1)) // src1
7959 .addImm(0); // clamp
7960 } else {
7961 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7962 }
7963 legalizeOperands(*NewInstr, MDT);
7964 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7965 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7966 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7967 Inst.eraseFromParent();
7968 return;
7969 }
7970 case AMDGPU::S_CMP_LT_F16:
7971 case AMDGPU::S_CMP_EQ_F16:
7972 case AMDGPU::S_CMP_LE_F16:
7973 case AMDGPU::S_CMP_GT_F16:
7974 case AMDGPU::S_CMP_LG_F16:
7975 case AMDGPU::S_CMP_GE_F16:
7976 case AMDGPU::S_CMP_O_F16:
7977 case AMDGPU::S_CMP_U_F16:
7978 case AMDGPU::S_CMP_NGE_F16:
7979 case AMDGPU::S_CMP_NLG_F16:
7980 case AMDGPU::S_CMP_NGT_F16:
7981 case AMDGPU::S_CMP_NLE_F16:
7982 case AMDGPU::S_CMP_NEQ_F16:
7983 case AMDGPU::S_CMP_NLT_F16: {
7984 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7985 auto NewInstr =
7986 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7987 .setMIFlags(Inst.getFlags());
7988 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7989 NewInstr
7990 .addImm(0) // src0_modifiers
7991 .add(Inst.getOperand(0)) // src0
7992 .addImm(0) // src1_modifiers
7993 .add(Inst.getOperand(1)) // src1
7994 .addImm(0); // clamp
7995 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7996 NewInstr.addImm(0); // op_sel0
7997 } else {
7998 NewInstr
7999 .add(Inst.getOperand(0))
8000 .add(Inst.getOperand(1));
8001 }
8002 legalizeOperandsVALUt16(*NewInstr, MRI);
8003 legalizeOperands(*NewInstr, MDT);
8004 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8005 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
8006 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8007 Inst.eraseFromParent();
8008 return;
8009 }
8010 case AMDGPU::S_CVT_HI_F32_F16: {
8011 const DebugLoc &DL = Inst.getDebugLoc();
8012 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8013 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8014 if (ST.useRealTrue16Insts()) {
8015 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8016 .add(Inst.getOperand(1));
8017 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8018 .addImm(0) // src0_modifiers
8019 .addReg(TmpReg, 0, AMDGPU::hi16)
8020 .addImm(0) // clamp
8021 .addImm(0) // omod
8022 .addImm(0); // op_sel0
8023 } else {
8024 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8025 .addImm(16)
8026 .add(Inst.getOperand(1));
8027 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8028 .addImm(0) // src0_modifiers
8029 .addReg(TmpReg)
8030 .addImm(0) // clamp
8031 .addImm(0); // omod
8032 }
8033
8034 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8035 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8036 Inst.eraseFromParent();
8037 return;
8038 }
8039 case AMDGPU::S_MINIMUM_F32:
8040 case AMDGPU::S_MAXIMUM_F32: {
8041 const DebugLoc &DL = Inst.getDebugLoc();
8042 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8043 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8044 .addImm(0) // src0_modifiers
8045 .add(Inst.getOperand(1))
8046 .addImm(0) // src1_modifiers
8047 .add(Inst.getOperand(2))
8048 .addImm(0) // clamp
8049 .addImm(0); // omod
8050 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8051
8052 legalizeOperands(*NewInstr, MDT);
8053 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8054 Inst.eraseFromParent();
8055 return;
8056 }
8057 case AMDGPU::S_MINIMUM_F16:
8058 case AMDGPU::S_MAXIMUM_F16: {
8059 const DebugLoc &DL = Inst.getDebugLoc();
8060 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8061 ? &AMDGPU::VGPR_16RegClass
8062 : &AMDGPU::VGPR_32RegClass);
8063 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8064 .addImm(0) // src0_modifiers
8065 .add(Inst.getOperand(1))
8066 .addImm(0) // src1_modifiers
8067 .add(Inst.getOperand(2))
8068 .addImm(0) // clamp
8069 .addImm(0) // omod
8070 .addImm(0); // opsel0
8071 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8072 legalizeOperandsVALUt16(*NewInstr, MRI);
8073 legalizeOperands(*NewInstr, MDT);
8074 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8075 Inst.eraseFromParent();
8076 return;
8077 }
8078 case AMDGPU::V_S_EXP_F16_e64:
8079 case AMDGPU::V_S_LOG_F16_e64:
8080 case AMDGPU::V_S_RCP_F16_e64:
8081 case AMDGPU::V_S_RSQ_F16_e64:
8082 case AMDGPU::V_S_SQRT_F16_e64: {
8083 const DebugLoc &DL = Inst.getDebugLoc();
8084 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8085 ? &AMDGPU::VGPR_16RegClass
8086 : &AMDGPU::VGPR_32RegClass);
8087 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8088 .add(Inst.getOperand(1)) // src0_modifiers
8089 .add(Inst.getOperand(2))
8090 .add(Inst.getOperand(3)) // clamp
8091 .add(Inst.getOperand(4)) // omod
8092 .setMIFlags(Inst.getFlags());
8093 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8094 NewInstr.addImm(0); // opsel0
8095 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8096 legalizeOperandsVALUt16(*NewInstr, MRI);
8097 legalizeOperands(*NewInstr, MDT);
8098 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8099 Inst.eraseFromParent();
8100 return;
8101 }
8102 }
8103
8104 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8105 // We cannot move this instruction to the VALU, so we should try to
8106 // legalize its operands instead.
8107 legalizeOperands(Inst, MDT);
8108 return;
8109 }
8110 // Handle converting generic instructions like COPY-to-SGPR into
8111 // COPY-to-VGPR.
8112 if (NewOpcode == Opcode) {
8113 Register DstReg = Inst.getOperand(0).getReg();
8114 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8115
8116 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8117 // hope for the best.
8118 if (Inst.isCopy() && DstReg.isPhysical() &&
8119 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8120 // TODO: Only works for 32 bit registers.
8121 if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) {
8122 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8123 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
8124 .add(Inst.getOperand(1));
8125 } else {
8126 Register NewDst =
8127 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8128 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8129 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8130 .add(Inst.getOperand(1));
8131 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8132 DstReg)
8133 .addReg(NewDst);
8134 }
8135 Inst.eraseFromParent();
8136 return;
8137 }
8138
8139 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
8140 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
8141 // Instead of creating a copy where src and dst are the same register
8142 // class, we just replace all uses of dst with src. These kinds of
8143 // copies interfere with the heuristics MachineSink uses to decide
8144 // whether or not to split a critical edge, since the pass assumes
8145 // that copies will end up as machine instructions and not be
8146 // eliminated.
8147 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8148 Register NewDstReg = Inst.getOperand(1).getReg();
8149 MRI.replaceRegWith(DstReg, NewDstReg);
8150 MRI.clearKillFlags(NewDstReg);
8151 Inst.getOperand(0).setReg(DstReg);
8152 Inst.eraseFromParent();
8153 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8154 for (MachineOperand &MO :
8155 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8156 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8157 }
8158 return;
8159 }
8160
8161 // If this is a v2s copy between 16-bit and 32-bit registers, replace the
8162 // vgpr copy with a reg_sequence/extract_subreg.
8163 // This can be removed after we have sgpr16 in place.
8164 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8165 Inst.getOperand(1).getReg().isVirtual() &&
8166 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8167 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8168 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8169 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8170 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8171 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8172 get(AMDGPU::IMPLICIT_DEF), Undef);
8173 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8174 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8175 .addReg(Inst.getOperand(1).getReg())
8176 .addImm(AMDGPU::lo16)
8177 .addReg(Undef)
8178 .addImm(AMDGPU::hi16);
8179 Inst.eraseFromParent();
8180 MRI.replaceRegWith(DstReg, NewDstReg);
8181 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8182 return;
8183 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8184 AMDGPU::lo16)) {
8185 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8186 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8187 MRI.replaceRegWith(DstReg, NewDstReg);
8188 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8189 return;
8190 }
8191 }
8192
8193 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8194 MRI.replaceRegWith(DstReg, NewDstReg);
8195 legalizeOperands(Inst, MDT);
8196 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8197 return;
8198 }
8199
8200 // Use the new VALU Opcode.
8201 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8202 .setMIFlags(Inst.getFlags());
8203 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8204 // Intersperse VOP3 modifiers among the SALU operands.
8205 NewInstr->addOperand(Inst.getOperand(0));
8206 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8207 AMDGPU::OpName::src0_modifiers) >= 0)
8208 NewInstr.addImm(0);
8209 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8210 MachineOperand Src = Inst.getOperand(1);
8211 NewInstr->addOperand(Src);
8212 }
8213
8214 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8215 // We are converting these to a BFE, so we need to add the missing
8216 // operands for the size and offset.
8217 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8218 NewInstr.addImm(0);
8219 NewInstr.addImm(Size);
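// e.g. (sketch) S_SEXT_I32_I8 %x becomes V_BFE_I32 %x, 0 /*offset*/, 8 /*width*/.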
8220 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8221 // The VALU version adds the second operand to the result, so insert an
8222 // extra 0 operand.
8223 NewInstr.addImm(0);
8224 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8225 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8226 // If we need to move this to VGPRs, we need to unpack the second
8227 // operand back into the 2 separate ones for bit offset and width.
8228 assert(OffsetWidthOp.isImm() &&
8229 "Scalar BFE is only implemented for constant width and offset");
8230 uint32_t Imm = OffsetWidthOp.getImm();
8231
8232 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8233 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8234 NewInstr.addImm(Offset);
8235 NewInstr.addImm(BitWidth);
8236 } else {
8237 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8238 AMDGPU::OpName::src1_modifiers) >= 0)
8239 NewInstr.addImm(0);
8240 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8241 NewInstr->addOperand(Inst.getOperand(2));
8242 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8243 AMDGPU::OpName::src2_modifiers) >= 0)
8244 NewInstr.addImm(0);
8245 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8246 NewInstr->addOperand(Inst.getOperand(3));
8247 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8248 NewInstr.addImm(0);
8249 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8250 NewInstr.addImm(0);
8251 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8252 NewInstr.addImm(0);
8253 }
8254 } else {
8255 // Just copy the SALU operands.
8256 for (const MachineOperand &Op : Inst.explicit_operands())
8257 NewInstr->addOperand(Op);
8258 }
8259
8260 // Remove any references to SCC. Vector instructions can't read from it, and
8261 // we're just about to add the implicit use / defs of VCC, and we don't want
8262 // both.
8263 for (MachineOperand &Op : Inst.implicit_operands()) {
8264 if (Op.getReg() == AMDGPU::SCC) {
8265 // Only propagate through live-def of SCC.
8266 if (Op.isDef() && !Op.isDead())
8267 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8268 if (Op.isUse())
8269 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8270 }
8271 }
8272 Inst.eraseFromParent();
8273 Register NewDstReg;
8274 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8275 Register DstReg = NewInstr->getOperand(0).getReg();
8276 assert(DstReg.isVirtual());
8277 // Update the destination register class.
8278 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8279 assert(NewDstRC);
8280 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8281 MRI.replaceRegWith(DstReg, NewDstReg);
8282 }
8283 fixImplicitOperands(*NewInstr);
8284
8285 legalizeOperandsVALUt16(*NewInstr, MRI);
8286
8287 // Legalize the operands
8288 legalizeOperands(*NewInstr, MDT);
8289 if (NewDstReg)
8290 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8291}
8292
8293// Add/sub require special handling to deal with carry outs.
8294std::pair<bool, MachineBasicBlock *>
8295SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8296 MachineDominatorTree *MDT) const {
8297 if (ST.hasAddNoCarry()) {
8298 // Assume there is no user of scc since we don't select this in that case.
8299 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8300 // is used.
8301
8302 MachineBasicBlock &MBB = *Inst.getParent();
8303 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8304
8305 Register OldDstReg = Inst.getOperand(0).getReg();
8306 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8307
8308 unsigned Opc = Inst.getOpcode();
8309 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8310
8311 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8312 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8313
8314 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8315 Inst.removeOperand(3);
8316
8317 Inst.setDesc(get(NewOpc));
8318 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8319 Inst.addImplicitDefUseOperands(*MBB.getParent());
8320 MRI.replaceRegWith(OldDstReg, ResultReg);
8321 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8322
8323 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8324 return std::pair(true, NewBB);
8325 }
8326
8327 return std::pair(false, nullptr);
8328}
8329
8330void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8331 MachineDominatorTree *MDT) const {
8332
8333 MachineBasicBlock &MBB = *Inst.getParent();
8334 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8335 MachineBasicBlock::iterator MII = Inst;
8336 DebugLoc DL = Inst.getDebugLoc();
8337
8338 MachineOperand &Dest = Inst.getOperand(0);
8339 MachineOperand &Src0 = Inst.getOperand(1);
8340 MachineOperand &Src1 = Inst.getOperand(2);
8341 MachineOperand &Cond = Inst.getOperand(3);
8342
8343 Register CondReg = Cond.getReg();
8344 bool IsSCC = (CondReg == AMDGPU::SCC);
8345
8346 // If this is a trivial select where the condition is effectively not SCC
8347 // (CondReg is a source of copy to SCC), then the select is semantically
8348 // equivalent to copying CondReg. Hence, there is no need to create
8349 // V_CNDMASK, we can just use that and bail out.
8350 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8351 (Src1.getImm() == 0)) {
8352 MRI.replaceRegWith(Dest.getReg(), CondReg);
8353 return;
8354 }
8355
8356 Register NewCondReg = CondReg;
8357 if (IsSCC) {
8358 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8359 NewCondReg = MRI.createVirtualRegister(TC);
8360
8361 // Now look for the closest SCC def; if it is a copy, use the COPY's source
8362 // register in place of CondReg.
8363 bool CopyFound = false;
8364 for (MachineInstr &CandI :
8366 Inst.getParent()->rend())) {
8367 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8368 -1) {
8369 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8370 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8371 .addReg(CandI.getOperand(1).getReg());
8372 CopyFound = true;
8373 }
8374 break;
8375 }
8376 }
8377 if (!CopyFound) {
8378 // SCC def is not a copy
8379 // Insert a trivial select instead of creating a copy, because a copy from
8380 // SCC would semantically mean just copying a single bit, but we may need
8381 // the result to be a vector condition mask that needs preserving.
8382 unsigned Opcode =
8383 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8384 auto NewSelect =
8385 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8386 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8387 }
8388 }
8389
8390 Register NewDestReg = MRI.createVirtualRegister(
8391 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8392 MachineInstr *NewInst;
8393 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8394 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8395 .addImm(0)
8396 .add(Src1) // False
8397 .addImm(0)
8398 .add(Src0) // True
8399 .addReg(NewCondReg);
8400 } else {
8401 NewInst =
8402 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8403 .add(Src1) // False
8404 .add(Src0) // True
8405 .addReg(NewCondReg);
8406 }
8407 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8408 legalizeOperands(*NewInst, MDT);
8409 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8410}
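// Net effect, roughly (illustrative register names): a uniform
//   %d:sgpr = S_CSELECT_B32 %t, %f               ; reads SCC
// becomes a per-lane select on a wave-wide mask
//   %m:sreg = S_CSELECT_B32/_B64 -1, 0           ; SCC materialized as a mask
//   %d:vgpr = V_CNDMASK_B32_e64 0, %f, 0, %t, %m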
8411
8412void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8413 MachineInstr &Inst) const {
8414 MachineBasicBlock &MBB = *Inst.getParent();
8415 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8416 MachineBasicBlock::iterator MII = Inst;
8417 DebugLoc DL = Inst.getDebugLoc();
8418
8419 MachineOperand &Dest = Inst.getOperand(0);
8420 MachineOperand &Src = Inst.getOperand(1);
8421 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8422 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8423
8424 unsigned SubOp = ST.hasAddNoCarry() ?
8425 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8426
8427 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8428 .addImm(0)
8429 .addReg(Src.getReg());
8430
8431 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8432 .addReg(Src.getReg())
8433 .addReg(TmpReg);
8434
8435 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8436 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8437}
8438
8439void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8440 MachineInstr &Inst) const {
8441 MachineBasicBlock &MBB = *Inst.getParent();
8442 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8443 MachineBasicBlock::iterator MII = Inst;
8444 const DebugLoc &DL = Inst.getDebugLoc();
8445
8446 MachineOperand &Dest = Inst.getOperand(0);
8447 MachineOperand &Src0 = Inst.getOperand(1);
8448 MachineOperand &Src1 = Inst.getOperand(2);
8449
8450 if (ST.hasDLInsts()) {
8451 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8452 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8453 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8454
8455 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8456 .add(Src0)
8457 .add(Src1);
8458
8459 MRI.replaceRegWith(Dest.getReg(), NewDest);
8460 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8461 } else {
8462 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8463 // invert either source and then perform the XOR. If either source is a
8464 // scalar register, then we can leave the inversion on the scalar unit to
8465 // achieve a better distribution of scalar and vector instructions.
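// A sketch of the SGPR case (hypothetical registers): lowering
//   %d = S_XNOR_B32 %sgpr, %vgpr
// on a target without V_XNOR becomes
//   %t = S_NOT_B32 %sgpr
//   %d = S_XOR_B32 %t, %vgpr
// and the S_XOR is re-queued so a later iteration over the worklist turns it
// into V_XOR_B32, while the inversion stays on the scalar unit.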
8466 bool Src0IsSGPR = Src0.isReg() &&
8467 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8468 bool Src1IsSGPR = Src1.isReg() &&
8469 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8470 MachineInstr *Xor;
8471 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8472 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8473
8474 // Build a pair of scalar instructions and add them to the work list.
8475 // The next iteration over the work list will lower these to the vector
8476 // unit as necessary.
8477 if (Src0IsSGPR) {
8478 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8479 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8480 .addReg(Temp)
8481 .add(Src1);
8482 } else if (Src1IsSGPR) {
8483 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8484 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8485 .add(Src0)
8486 .addReg(Temp);
8487 } else {
8488 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8489 .add(Src0)
8490 .add(Src1);
8491 MachineInstr *Not =
8492 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8493 Worklist.insert(Not);
8494 }
8495
8496 MRI.replaceRegWith(Dest.getReg(), NewDest);
8497
8498 Worklist.insert(Xor);
8499
8500 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8501 }
8502}
8503
8504void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8505 MachineInstr &Inst,
8506 unsigned Opcode) const {
8507 MachineBasicBlock &MBB = *Inst.getParent();
8508 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8509 MachineBasicBlock::iterator MII = Inst;
8510 const DebugLoc &DL = Inst.getDebugLoc();
8511
8512 MachineOperand &Dest = Inst.getOperand(0);
8513 MachineOperand &Src0 = Inst.getOperand(1);
8514 MachineOperand &Src1 = Inst.getOperand(2);
8515
8516 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8517 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8518
8519 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8520 .add(Src0)
8521 .add(Src1);
8522
8523 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8524 .addReg(Interm);
8525
8526 Worklist.insert(&Op);
8527 Worklist.insert(&Not);
8528
8529 MRI.replaceRegWith(Dest.getReg(), NewDest);
8530 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8531}
8532
8533void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8534 MachineInstr &Inst,
8535 unsigned Opcode) const {
8536 MachineBasicBlock &MBB = *Inst.getParent();
8537 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8538 MachineBasicBlock::iterator MII = Inst;
8539 const DebugLoc &DL = Inst.getDebugLoc();
8540
8541 MachineOperand &Dest = Inst.getOperand(0);
8542 MachineOperand &Src0 = Inst.getOperand(1);
8543 MachineOperand &Src1 = Inst.getOperand(2);
8544
8545 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8546 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8547
8548 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8549 .add(Src1);
8550
8551 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8552 .add(Src0)
8553 .addReg(Interm);
8554
8555 Worklist.insert(&Not);
8556 Worklist.insert(&Op);
8557
8558 MRI.replaceRegWith(Dest.getReg(), NewDest);
8559 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8560}
8561
8562void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8563 MachineInstr &Inst, unsigned Opcode,
8564 bool Swap) const {
8565 MachineBasicBlock &MBB = *Inst.getParent();
8566 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8567
8568 MachineOperand &Dest = Inst.getOperand(0);
8569 MachineOperand &Src0 = Inst.getOperand(1);
8570 DebugLoc DL = Inst.getDebugLoc();
8571
8572 MachineBasicBlock::iterator MII = Inst;
8573
8574 const MCInstrDesc &InstDesc = get(Opcode);
8575 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8576 MRI.getRegClass(Src0.getReg()) :
8577 &AMDGPU::SGPR_32RegClass;
8578
8579 const TargetRegisterClass *Src0SubRC =
8580 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8581
8582 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8583 AMDGPU::sub0, Src0SubRC);
8584
8585 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8586 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8587 const TargetRegisterClass *NewDestSubRC =
8588 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8589
8590 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8591 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8592
8593 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8594 AMDGPU::sub1, Src0SubRC);
8595
8596 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8597 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8598
8599 if (Swap)
8600 std::swap(DestSub0, DestSub1);
8601
8602 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8603 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8604 .addReg(DestSub0)
8605 .addImm(AMDGPU::sub0)
8606 .addReg(DestSub1)
8607 .addImm(AMDGPU::sub1);
8608
8609 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8610
8611 Worklist.insert(&LoHalf);
8612 Worklist.insert(&HiHalf);
8613
8614 // We don't need to legalizeOperands here because for a single operand, src0
8615 // will support any kind of input.
8616
8617 // Move all users of this moved value.
8618 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8619}
8620
8621// There is no vector equivalent of s_mul_u64. For this reason, we need to
8622// split the s_mul_u64 into 32-bit vector multiplications.
8623void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8624 MachineInstr &Inst,
8625 MachineDominatorTree *MDT) const {
8626 MachineBasicBlock &MBB = *Inst.getParent();
8627 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8628
8629 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8630 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8631 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8632
8633 MachineOperand &Dest = Inst.getOperand(0);
8634 MachineOperand &Src0 = Inst.getOperand(1);
8635 MachineOperand &Src1 = Inst.getOperand(2);
8636 const DebugLoc &DL = Inst.getDebugLoc();
8637 MachineBasicBlock::iterator MII = Inst;
8638
8639 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8640 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8641 const TargetRegisterClass *Src0SubRC =
8642 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8643 if (RI.isSGPRClass(Src0SubRC))
8644 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8645 const TargetRegisterClass *Src1SubRC =
8646 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8647 if (RI.isSGPRClass(Src1SubRC))
8648 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8649
8650 // First, we extract the low 32-bit and high 32-bit values from each of the
8651 // operands.
8652 MachineOperand Op0L =
8653 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8654 MachineOperand Op1L =
8655 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8656 MachineOperand Op0H =
8657 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8658 MachineOperand Op1H =
8659 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8660
8661 // The multiplication is done as follows:
8662 //
8663 // Op1H Op1L
8664 // * Op0H Op0L
8665 // --------------------
8666 // Op1H*Op0L Op1L*Op0L
8667 // + Op1H*Op0H Op1L*Op0H
8668 // -----------------------------------------
8669 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8670 //
8671 // We drop Op1H*Op0H because it only contributes to bits above bit 63,
8672 // which are discarded from the 64-bit result.
8673 // The low 32-bit value is Op1L*Op0L.
8674 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
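// Worked example (illustrative values): for Op0 = 0x00000001_00000005 and
// Op1 = 0x00000002_00000003,
//   lo = mul_lo(3, 5) = 0xf
//   hi = mul_lo(2, 5) + mul_lo(3, 1) + mul_hi(3, 5) = 0xa + 0x3 + 0x0 = 0xd
// giving 0x0000000d_0000000f, which is the full product truncated to 64 bits.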
8675
8676 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8677 MachineInstr *Op1L_Op0H =
8678 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8679 .add(Op1L)
8680 .add(Op0H);
8681
8682 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8683 MachineInstr *Op1H_Op0L =
8684 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8685 .add(Op1H)
8686 .add(Op0L);
8687
8688 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8689 MachineInstr *Carry =
8690 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8691 .add(Op1L)
8692 .add(Op0L);
8693
8694 MachineInstr *LoHalf =
8695 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8696 .add(Op1L)
8697 .add(Op0L);
8698
8699 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8700 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8701 .addReg(Op1L_Op0H_Reg)
8702 .addReg(Op1H_Op0L_Reg);
8703
8704 MachineInstr *HiHalf =
8705 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8706 .addReg(AddReg)
8707 .addReg(CarryReg);
8708
8709 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8710 .addReg(DestSub0)
8711 .addImm(AMDGPU::sub0)
8712 .addReg(DestSub1)
8713 .addImm(AMDGPU::sub1);
8714
8715 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8716
8717 // Try to legalize the operands in case we need to swap the order to keep it
8718 // valid.
8719 legalizeOperands(*Op1L_Op0H, MDT);
8720 legalizeOperands(*Op1H_Op0L, MDT);
8721 legalizeOperands(*Carry, MDT);
8722 legalizeOperands(*LoHalf, MDT);
8723 legalizeOperands(*Add, MDT);
8724 legalizeOperands(*HiHalf, MDT);
8725
8726 // Move all users of this moved value.
8727 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8728}
8729
8730// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8731// multiplications.
8732void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8733 MachineInstr &Inst,
8734 MachineDominatorTree *MDT) const {
8735 MachineBasicBlock &MBB = *Inst.getParent();
8736 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8737
8738 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8739 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8740 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8741
8742 MachineOperand &Dest = Inst.getOperand(0);
8743 MachineOperand &Src0 = Inst.getOperand(1);
8744 MachineOperand &Src1 = Inst.getOperand(2);
8745 const DebugLoc &DL = Inst.getDebugLoc();
8746 MachineBasicBlock::iterator MII = Inst;
8747
8748 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8749 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8750 const TargetRegisterClass *Src0SubRC =
8751 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8752 if (RI.isSGPRClass(Src0SubRC))
8753 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8754 const TargetRegisterClass *Src1SubRC =
8755 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8756 if (RI.isSGPRClass(Src1SubRC))
8757 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8758
8759 // First, we extract the low 32-bit and high 32-bit values from each of the
8760 // operands.
8761 MachineOperand Op0L =
8762 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8763 MachineOperand Op1L =
8764 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8765
8766 unsigned Opc = Inst.getOpcode();
8767 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8768 ? AMDGPU::V_MUL_HI_U32_e64
8769 : AMDGPU::V_MUL_HI_I32_e64;
8770 MachineInstr *HiHalf =
8771 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8772
8773 MachineInstr *LoHalf =
8774 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8775 .add(Op1L)
8776 .add(Op0L);
8777
8778 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8779 .addReg(DestSub0)
8780 .addImm(AMDGPU::sub0)
8781 .addReg(DestSub1)
8782 .addImm(AMDGPU::sub1);
8783
8784 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8785
8786 // Try to legalize the operands in case we need to swap the order to keep it
8787 // valid.
8788 legalizeOperands(*HiHalf, MDT);
8789 legalizeOperands(*LoHalf, MDT);
8790
8791 // Move all users of this moved value.
8792 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8793}
8794
8795void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8796 MachineInstr &Inst, unsigned Opcode,
8797 MachineDominatorTree *MDT) const {
8798 MachineBasicBlock &MBB = *Inst.getParent();
8799 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8800
8801 MachineOperand &Dest = Inst.getOperand(0);
8802 MachineOperand &Src0 = Inst.getOperand(1);
8803 MachineOperand &Src1 = Inst.getOperand(2);
8804 DebugLoc DL = Inst.getDebugLoc();
8805
8806 MachineBasicBlock::iterator MII = Inst;
8807
8808 const MCInstrDesc &InstDesc = get(Opcode);
8809 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8810 MRI.getRegClass(Src0.getReg()) :
8811 &AMDGPU::SGPR_32RegClass;
8812
8813 const TargetRegisterClass *Src0SubRC =
8814 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8815 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8816 MRI.getRegClass(Src1.getReg()) :
8817 &AMDGPU::SGPR_32RegClass;
8818
8819 const TargetRegisterClass *Src1SubRC =
8820 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8821
8822 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8823 AMDGPU::sub0, Src0SubRC);
8824 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8825 AMDGPU::sub0, Src1SubRC);
8826 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8827 AMDGPU::sub1, Src0SubRC);
8828 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8829 AMDGPU::sub1, Src1SubRC);
8830
8831 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8832 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8833 const TargetRegisterClass *NewDestSubRC =
8834 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8835
8836 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8837 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8838 .add(SrcReg0Sub0)
8839 .add(SrcReg1Sub0);
8840
8841 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8842 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8843 .add(SrcReg0Sub1)
8844 .add(SrcReg1Sub1);
8845
8846 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8847 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8848 .addReg(DestSub0)
8849 .addImm(AMDGPU::sub0)
8850 .addReg(DestSub1)
8851 .addImm(AMDGPU::sub1);
8852
8853 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8854
8855 Worklist.insert(&LoHalf);
8856 Worklist.insert(&HiHalf);
8857
8858 // Move all users of this moved value.
8859 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8860}
8861
8862void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8863 MachineInstr &Inst,
8864 MachineDominatorTree *MDT) const {
8865 MachineBasicBlock &MBB = *Inst.getParent();
8866 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8867
8868 MachineOperand &Dest = Inst.getOperand(0);
8869 MachineOperand &Src0 = Inst.getOperand(1);
8870 MachineOperand &Src1 = Inst.getOperand(2);
8871 const DebugLoc &DL = Inst.getDebugLoc();
8872
8873 MachineBasicBlock::iterator MII = Inst;
8874
8875 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8876
8877 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8878
8879 MachineOperand* Op0;
8880 MachineOperand* Op1;
8881
8882 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8883 Op0 = &Src0;
8884 Op1 = &Src1;
8885 } else {
8886 Op0 = &Src1;
8887 Op1 = &Src0;
8888 }
8889
8890 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8891 .add(*Op0);
8892
8893 Register NewDest = MRI.createVirtualRegister(DestRC);
8894
8895 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8896 .addReg(Interm)
8897 .add(*Op1);
8898
8899 MRI.replaceRegWith(Dest.getReg(), NewDest);
8900
8901 Worklist.insert(&Xor);
8902}
8903
8904void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8905 MachineInstr &Inst) const {
8906 MachineBasicBlock &MBB = *Inst.getParent();
8907 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8908
8909 MachineBasicBlock::iterator MII = Inst;
8910 const DebugLoc &DL = Inst.getDebugLoc();
8911
8912 MachineOperand &Dest = Inst.getOperand(0);
8913 MachineOperand &Src = Inst.getOperand(1);
8914
8915 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8916 const TargetRegisterClass *SrcRC = Src.isReg() ?
8917 MRI.getRegClass(Src.getReg()) :
8918 &AMDGPU::SGPR_32RegClass;
8919
8920 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8921 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8922
8923 const TargetRegisterClass *SrcSubRC =
8924 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8925
8926 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8927 AMDGPU::sub0, SrcSubRC);
8928 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8929 AMDGPU::sub1, SrcSubRC);
8930
8931 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8932
8933 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8934
8935 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8936
8937 // We don't need to legalize operands here. src0 for either instruction can be
8938 // an SGPR, and the second input is unused or determined here.
8939 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8940}
8941
8942void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8943 MachineInstr &Inst) const {
8944 MachineBasicBlock &MBB = *Inst.getParent();
8945 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8946 MachineBasicBlock::iterator MII = Inst;
8947 const DebugLoc &DL = Inst.getDebugLoc();
8948
8949 MachineOperand &Dest = Inst.getOperand(0);
8950 uint32_t Imm = Inst.getOperand(2).getImm();
8951 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8952 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8953
8954 (void) Offset;
8955
8956 // Only sext_inreg cases handled.
8957 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8958 Offset == 0 && "Not implemented");
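// As an illustration (made-up immediate): Imm = 0x100000 encodes Offset = 0
// and BitWidth = 16, i.e. sign-extend the low 16 bits of the source to 64 bits.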
8959
8960 if (BitWidth < 32) {
8961 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8962 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8963 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8964
8965 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8966 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8967 .addImm(0)
8968 .addImm(BitWidth);
8969
8970 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8971 .addImm(31)
8972 .addReg(MidRegLo);
8973
8974 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8975 .addReg(MidRegLo)
8976 .addImm(AMDGPU::sub0)
8977 .addReg(MidRegHi)
8978 .addImm(AMDGPU::sub1);
8979
8980 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8981 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8982 return;
8983 }
8984
8985 MachineOperand &Src = Inst.getOperand(1);
8986 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8987 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8988
8989 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8990 .addImm(31)
8991 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8992
8993 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8994 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8995 .addImm(AMDGPU::sub0)
8996 .addReg(TmpReg)
8997 .addImm(AMDGPU::sub1);
8998
8999 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9000 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9001}
9002
9003void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9004 MachineInstr &Inst, unsigned Opcode,
9005 MachineDominatorTree *MDT) const {
9006 // (S_FLBIT_I32_B64 hi:lo) ->
9007 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9008 // (S_FF1_I32_B64 hi:lo) ->
9009 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
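// As an illustration (made-up value): for hi:lo = 0x00000000_80000000 the
// ctlz expansion computes umin(ffbh(hi) = 0xffffffff, uaddsat(ffbh(lo), 32)
// = uaddsat(0, 32) = 32) = 32, the correct count of leading zeros of the
// 64-bit value.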
9010
9011 MachineBasicBlock &MBB = *Inst.getParent();
9012 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9013 MachineBasicBlock::iterator MII = Inst;
9014 const DebugLoc &DL = Inst.getDebugLoc();
9015
9016 MachineOperand &Dest = Inst.getOperand(0);
9017 MachineOperand &Src = Inst.getOperand(1);
9018
9019 const MCInstrDesc &InstDesc = get(Opcode);
9020
9021 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9022 unsigned OpcodeAdd =
9023 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
9024
9025 const TargetRegisterClass *SrcRC =
9026 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9027 const TargetRegisterClass *SrcSubRC =
9028 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9029
9030 MachineOperand SrcRegSub0 =
9031 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9032 MachineOperand SrcRegSub1 =
9033 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9034
9035 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9036 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9037 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9038 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9039
9040 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9041
9042 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9043
9044 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9045 .addReg(IsCtlz ? MidReg1 : MidReg2)
9046 .addImm(32)
9047 .addImm(1); // enable clamp
9048
9049 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9050 .addReg(MidReg3)
9051 .addReg(IsCtlz ? MidReg2 : MidReg1);
9052
9053 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9054
9055 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9056}
9057
9058void SIInstrInfo::addUsersToMoveToVALUWorklist(
9059 Register DstReg, MachineRegisterInfo &MRI,
9060 SIInstrWorklist &Worklist) const {
9061 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9062 MachineInstr &UseMI = *MO.getParent();
9063
9064 unsigned OpNo = 0;
9065
9066 switch (UseMI.getOpcode()) {
9067 case AMDGPU::COPY:
9068 case AMDGPU::WQM:
9069 case AMDGPU::SOFT_WQM:
9070 case AMDGPU::STRICT_WWM:
9071 case AMDGPU::STRICT_WQM:
9072 case AMDGPU::REG_SEQUENCE:
9073 case AMDGPU::PHI:
9074 case AMDGPU::INSERT_SUBREG:
9075 break;
9076 default:
9077 OpNo = MO.getOperandNo();
9078 break;
9079 }
9080
9081 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9082 MRI.constrainRegClass(DstReg, OpRC);
9083
9084 if (!RI.hasVectorRegisters(OpRC))
9085 Worklist.insert(&UseMI);
9086 else
9087 // Legalization could change user list.
9088 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9089 }
9090}
9091
9092void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9093 MachineRegisterInfo &MRI,
9094 MachineInstr &Inst) const {
9095 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9096 MachineBasicBlock *MBB = Inst.getParent();
9097 MachineOperand &Src0 = Inst.getOperand(1);
9098 MachineOperand &Src1 = Inst.getOperand(2);
9099 const DebugLoc &DL = Inst.getDebugLoc();
9100
9101 switch (Inst.getOpcode()) {
9102 case AMDGPU::S_PACK_LL_B32_B16: {
9103 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9104 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9105
9106 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9107 // 0.
9108 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9109 .addImm(0xffff);
9110
9111 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9112 .addReg(ImmReg, RegState::Kill)
9113 .add(Src0);
9114
9115 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9116 .add(Src1)
9117 .addImm(16)
9118 .addReg(TmpReg, RegState::Kill);
9119 break;
9120 }
9121 case AMDGPU::S_PACK_LH_B32_B16: {
9122 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9123 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9124 .addImm(0xffff);
9125 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9126 .addReg(ImmReg, RegState::Kill)
9127 .add(Src0)
9128 .add(Src1);
9129 break;
9130 }
9131 case AMDGPU::S_PACK_HL_B32_B16: {
9132 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9133 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9134 .addImm(16)
9135 .add(Src0);
9136 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9137 .add(Src1)
9138 .addImm(16)
9139 .addReg(TmpReg, RegState::Kill);
9140 break;
9141 }
9142 case AMDGPU::S_PACK_HH_B32_B16: {
9143 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9144 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9145 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9146 .addImm(16)
9147 .add(Src0);
9148 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9149 .addImm(0xffff0000);
9150 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9151 .add(Src1)
9152 .addReg(ImmReg, RegState::Kill)
9153 .addReg(TmpReg, RegState::Kill);
9154 break;
9155 }
9156 default:
9157 llvm_unreachable("unhandled s_pack_* instruction");
9158 }
9159
9160 MachineOperand &Dest = Inst.getOperand(0);
9161 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9162 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9163}
9164
9165void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
9166 MachineInstr &SCCDefInst,
9167 SIInstrWorklist &Worklist,
9168 Register NewCond) const {
9169
9170 // Ensure that def inst defines SCC, which is still live.
9171 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9172 !Op.isDead() && Op.getParent() == &SCCDefInst);
9173 SmallVector<MachineInstr *, 4> CopyToDelete;
9174 // This assumes that all the users of SCC are in the same block
9175 // as the SCC def.
9176 for (MachineInstr &MI : // Skip the def inst itself.
9177 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9178 SCCDefInst.getParent()->end())) {
9179 // Check if SCC is used first.
9180 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9181 if (SCCIdx != -1) {
9182 if (MI.isCopy()) {
9183 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9184 Register DestReg = MI.getOperand(0).getReg();
9185
9186 MRI.replaceRegWith(DestReg, NewCond);
9187 CopyToDelete.push_back(&MI);
9188 } else {
9189
9190 if (NewCond.isValid())
9191 MI.getOperand(SCCIdx).setReg(NewCond);
9192
9193 Worklist.insert(&MI);
9194 }
9195 }
9196 // Exit if we find another SCC def.
9197 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9198 break;
9199 }
9200 for (auto &Copy : CopyToDelete)
9201 Copy->eraseFromParent();
9202}
9203
9204// Instructions that use SCC may be converted to VALU instructions. When that
9205// happens, the SCC register is changed to VCC_LO. The instruction that defines
9206// SCC must be changed to an instruction that defines VCC. This function makes
9207// sure that the instruction that defines SCC is added to the moveToVALU
9208// worklist.
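// A sketch of the situation (hypothetical registers): given
//   S_CMP_LT_I32 %a, %b        (defines SCC)
//   %d = S_CSELECT_B32 %x, %y  (uses SCC, already queued for VALU lowering)
// the S_CMP must also be queued so that it can be rewritten into a V_CMP
// that defines VCC instead of SCC.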
9209void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9210 SIInstrWorklist &Worklist) const {
9211 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9212 // then there is nothing to do because the defining instruction has been
9213 // converted to a VALU already. If SCC then that instruction needs to be
9214 // converted to a VALU.
9215 for (MachineInstr &MI :
9216 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9217 SCCUseInst->getParent()->rend())) {
9218 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9219 break;
9220 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9221 Worklist.insert(&MI);
9222 break;
9223 }
9224 }
9225}
9226
9227const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9228 const MachineInstr &Inst) const {
9229 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9230
9231 switch (Inst.getOpcode()) {
9232 // For target instructions, getOpRegClass just returns the virtual register
9233 // class associated with the operand, so we need to find an equivalent VGPR
9234 // register class in order to move the instruction to the VALU.
9235 case AMDGPU::COPY:
9236 case AMDGPU::PHI:
9237 case AMDGPU::REG_SEQUENCE:
9238 case AMDGPU::INSERT_SUBREG:
9239 case AMDGPU::WQM:
9240 case AMDGPU::SOFT_WQM:
9241 case AMDGPU::STRICT_WWM:
9242 case AMDGPU::STRICT_WQM: {
9243 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9244 if (RI.isAGPRClass(SrcRC)) {
9245 if (RI.isAGPRClass(NewDstRC))
9246 return nullptr;
9247
9248 switch (Inst.getOpcode()) {
9249 case AMDGPU::PHI:
9250 case AMDGPU::REG_SEQUENCE:
9251 case AMDGPU::INSERT_SUBREG:
9252 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9253 break;
9254 default:
9255 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9256 }
9257
9258 if (!NewDstRC)
9259 return nullptr;
9260 } else {
9261 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9262 return nullptr;
9263
9264 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9265 if (!NewDstRC)
9266 return nullptr;
9267 }
9268
9269 return NewDstRC;
9270 }
9271 default:
9272 return NewDstRC;
9273 }
9274}
9275
9276// Find the one SGPR operand we are allowed to use.
9277Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9278 int OpIndices[3]) const {
9279 const MCInstrDesc &Desc = MI.getDesc();
9280
9281 // Find the one SGPR operand we are allowed to use.
9282 //
9283 // First we need to consider the instruction's operand requirements before
9284 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9285 // of VCC, but we are still bound by the constant bus requirement to only use
9286 // one.
9287 //
9288 // If the operand's class is an SGPR, we can never move it.
9289
9290 Register SGPRReg = findImplicitSGPRRead(MI);
9291 if (SGPRReg)
9292 return SGPRReg;
9293
9294 Register UsedSGPRs[3] = {Register()};
9295 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9296
9297 for (unsigned i = 0; i < 3; ++i) {
9298 int Idx = OpIndices[i];
9299 if (Idx == -1)
9300 break;
9301
9302 const MachineOperand &MO = MI.getOperand(Idx);
9303 if (!MO.isReg())
9304 continue;
9305
9306 // Is this operand statically required to be an SGPR based on the operand
9307 // constraints?
9308 const TargetRegisterClass *OpRC =
9309 RI.getRegClass(Desc.operands()[Idx].RegClass);
9310 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9311 if (IsRequiredSGPR)
9312 return MO.getReg();
9313
9314 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9315 Register Reg = MO.getReg();
9316 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9317 if (RI.isSGPRClass(RegRC))
9318 UsedSGPRs[i] = Reg;
9319 }
9320
9321 // We don't have a required SGPR operand, so we have a bit more freedom in
9322 // selecting operands to move.
9323
9324 // Try to select the most used SGPR. If an SGPR is equal to one of the
9325 // others, we choose that.
9326 //
9327 // e.g.
9328 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9329 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9330
9331 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9332 // prefer those.
9333
9334 if (UsedSGPRs[0]) {
9335 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9336 SGPRReg = UsedSGPRs[0];
9337 }
9338
9339 if (!SGPRReg && UsedSGPRs[1]) {
9340 if (UsedSGPRs[1] == UsedSGPRs[2])
9341 SGPRReg = UsedSGPRs[1];
9342 }
9343
9344 return SGPRReg;
9345}
9346
9347MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9348 AMDGPU::OpName OperandName) const {
9349 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9350 return nullptr;
9351
9352 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9353 if (Idx == -1)
9354 return nullptr;
9355
9356 return &MI.getOperand(Idx);
9357}
9358
9359uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9360 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9361 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9362 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9363 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9364 return (Format << 44) |
9365 (1ULL << 56) | // RESOURCE_LEVEL = 1
9366 (3ULL << 60); // OOB_SELECT = 3
9367 }
9368
9369 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9370 if (ST.isAmdHsaOS()) {
9371 // Set ATC = 1. GFX9 doesn't have this bit.
9372 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9373 RsrcDataFormat |= (1ULL << 56);
9374
9375 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9376 // BTW, it disables TC L2 and therefore decreases performance.
9377 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9378 RsrcDataFormat |= (2ULL << 59);
9379 }
9380
9381 return RsrcDataFormat;
9382}
9383
9387 0xffffffff; // Size;
9388
9389 // GFX9 doesn't have ELEMENT_SIZE.
9390 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9391 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9392 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9393 }
9394
9395 // IndexStride = 64 / 32.
9396 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9397 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9398
9399 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9400 // Clear them unless we want a huge stride.
9401 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9402 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9403 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9404
9405 return Rsrc23;
9406}
9407
9408bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9409 unsigned Opc = MI.getOpcode();
9410
9411 return isSMRD(Opc);
9412}
9413
9414bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9415 return get(Opc).mayLoad() &&
9416 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9417}
9418
9419Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9420 int &FrameIndex) const {
9421 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9422 if (!Addr || !Addr->isFI())
9423 return Register();
9424
9425 assert(!MI.memoperands_empty() &&
9426 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9427
9428 FrameIndex = Addr->getIndex();
9429 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9430}
9431
9432Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9433 int &FrameIndex) const {
9434 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9435 assert(Addr && Addr->isFI());
9436 FrameIndex = Addr->getIndex();
9437 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9438}
9439
9440Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9441 int &FrameIndex) const {
9442 if (!MI.mayLoad())
9443 return Register();
9444
9445 if (isMUBUF(MI) || isVGPRSpill(MI))
9446 return isStackAccess(MI, FrameIndex);
9447
9448 if (isSGPRSpill(MI))
9449 return isSGPRStackAccess(MI, FrameIndex);
9450
9451 return Register();
9452}
9453
9454Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9455 int &FrameIndex) const {
9456 if (!MI.mayStore())
9457 return Register();
9458
9459 if (isMUBUF(MI) || isVGPRSpill(MI))
9460 return isStackAccess(MI, FrameIndex);
9461
9462 if (isSGPRSpill(MI))
9463 return isSGPRStackAccess(MI, FrameIndex);
9464
9465 return Register();
9466}
9467
9468unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9469 unsigned Size = 0;
9470 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9471 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9472 while (++I != E && I->isInsideBundle()) {
9473 assert(!I->isBundle() && "No nested bundle!");
9474 Size += getInstSizeInBytes(*I);
9475 }
9476
9477 return Size;
9478}
9479
9480unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9481 unsigned Opc = MI.getOpcode();
9482 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9483 unsigned DescSize = Desc.getSize();
9484
9485 // If we have a definitive size, we can use it. Otherwise we need to inspect
9486 // the operands to know the size.
9487 if (isFixedSize(MI)) {
9488 unsigned Size = DescSize;
9489
9490 // If we hit the buggy offset, an extra nop will be inserted in MC so
9491 // estimate the worst case.
9492 if (MI.isBranch() && ST.hasOffset3fBug())
9493 Size += 4;
9494
9495 return Size;
9496 }
9497
9498 // Instructions may have a 32-bit literal encoded after them. Check
9499 // operands that could ever be literals.
9500 if (isVALU(MI) || isSALU(MI)) {
9501 if (isDPP(MI))
9502 return DescSize;
9503 bool HasLiteral = false;
9504 unsigned LiteralSize = 4;
9505 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9506 const MachineOperand &Op = MI.getOperand(I);
9507 const MCOperandInfo &OpInfo = Desc.operands()[I];
9508 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9509 HasLiteral = true;
9510 if (ST.has64BitLiterals()) {
9511 switch (OpInfo.OperandType) {
9512 default:
9513 break;
9515 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9516 LiteralSize = 8;
9517 break;
9519 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9520 LiteralSize = 8;
9521 break;
9522 }
9523 }
9524 break;
9525 }
9526 }
9527 return HasLiteral ? DescSize + LiteralSize : DescSize;
9528 }
9529
9530 // Check whether we have extra NSA words.
9531 if (isMIMG(MI)) {
9532 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9533 if (VAddr0Idx < 0)
9534 return 8;
9535
9536 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
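// As an illustration (made-up operand count): with 5 vaddr operands,
// RSrcIdx - VAddr0Idx == 5 and the formula below yields
// 8 + 4 * ((5 + 2) / 4) = 12 bytes, i.e. the base 8-byte encoding plus one
// extra NSA dword.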
9537 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9538 }
9539
9540 switch (Opc) {
9541 case TargetOpcode::BUNDLE:
9542 return getInstBundleSize(MI);
9543 case TargetOpcode::INLINEASM:
9544 case TargetOpcode::INLINEASM_BR: {
9545 const MachineFunction *MF = MI.getParent()->getParent();
9546 const char *AsmStr = MI.getOperand(0).getSymbolName();
9547 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9548 }
9549 default:
9550 if (MI.isMetaInstruction())
9551 return 0;
9552
9553 // If D16 Pseudo inst, get correct MC code size
9554 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9555 if (D16Info) {
9556 // Assume the d16_lo and d16_hi variants are always the same size.
9557 unsigned LoInstOpcode = D16Info->LoOp;
9558 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9559 DescSize = Desc.getSize();
9560 }
9561
9562 // If FMA Pseudo inst, get correct MC code size
9563 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9564 // All potential lowerings are the same size; arbitrarily pick one.
9565 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9566 DescSize = Desc.getSize();
9567 }
9568
9569 return DescSize;
9570 }
9571}
9572
9573bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9574 if (!isFLAT(MI))
9575 return false;
9576
9577 if (MI.memoperands_empty())
9578 return true;
9579
9580 for (const MachineMemOperand *MMO : MI.memoperands()) {
9581 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9582 return true;
9583 }
9584 return false;
9585}
9586
9587ArrayRef<std::pair<int, const char *>>
9588SIInstrInfo::getSerializableTargetIndices() const {
9589 static const std::pair<int, const char *> TargetIndices[] = {
9590 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9591 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9592 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9593 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9594 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9595 return ArrayRef(TargetIndices);
9596}
9597
9598/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9599/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9600ScheduleHazardRecognizer *
9601SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9602 const ScheduleDAG *DAG) const {
9603 return new GCNHazardRecognizer(DAG->MF);
9604}
9605
9606/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9607/// pass.
9608ScheduleHazardRecognizer *
9609SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
9610 return new GCNHazardRecognizer(MF);
9611}
9612
9613// Called during:
9614// - pre-RA scheduling and post-RA scheduling
9615ScheduleHazardRecognizer *
9616SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9617 const ScheduleDAGMI *DAG) const {
9618 // Borrowed from Arm Target
9619 // We would like to restrict this hazard recognizer to only
9620 // post-RA scheduling; we can tell that we're post-RA because we don't
9621 // track VRegLiveness.
9622 if (!DAG->hasVRegLiveness())
9623 return new GCNHazardRecognizer(DAG->MF);
9624 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9625}
9626
9627std::pair<unsigned, unsigned>
9628SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9629 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9630}
9631
9632ArrayRef<std::pair<unsigned, const char *>>
9633SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9634 static const std::pair<unsigned, const char *> TargetFlags[] = {
9635 {MO_GOTPCREL, "amdgpu-gotprel"},
9636 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9637 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9638 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9639 {MO_REL32_LO, "amdgpu-rel32-lo"},
9640 {MO_REL32_HI, "amdgpu-rel32-hi"},
9641 {MO_REL64, "amdgpu-rel64"},
9642 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9643 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9644 {MO_ABS64, "amdgpu-abs64"},
9645 };
9646
9647 return ArrayRef(TargetFlags);
9648}
9649
9650ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9651SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9652 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9653 {
9654 {MONoClobber, "amdgpu-noclobber"},
9655 {MOLastUse, "amdgpu-last-use"},
9656 {MOCooperative, "amdgpu-cooperative"},
9657 };
9658
9659 return ArrayRef(TargetFlags);
9660}
9661
9662unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9663 const MachineFunction &MF) const {
9664 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9665 assert(SrcReg.isVirtual());
9666 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9667 return AMDGPU::WWM_COPY;
9668
9669 return AMDGPU::COPY;
9670}
9671
9672bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9673 Register Reg) const {
9674 // We need to handle instructions which may be inserted during register
9675 // allocation to handle the prolog. The initial prolog instruction may have
9676 // been separated from the start of the block by spills and copies inserted
9677 // needed by the prolog. However, the insertions for scalar registers can
9678 // always be placed at the BB top as they are independent of the exec mask
9679 // value.
9680 const MachineFunction *MF = MI.getParent()->getParent();
9681 bool IsNullOrVectorRegister = true;
9682 if (Reg) {
9683 const MachineRegisterInfo &MRI = MF->getRegInfo();
9684 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9685 }
9686
9687 uint16_t Opcode = MI.getOpcode();
9688 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9689 return IsNullOrVectorRegister &&
9690 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9691 (Opcode == AMDGPU::IMPLICIT_DEF &&
9692 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9693 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9694 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9695}
9696
9696
9697MachineInstrBuilder
9698SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9699 MachineBasicBlock::iterator I,
9700 const DebugLoc &DL,
9701 Register DestReg) const {
9702 if (ST.hasAddNoCarry())
9703 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9704
9705 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9706 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9707 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9708
9709 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9710 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9711}
9712
9713MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9714 MachineBasicBlock::iterator I,
9715 const DebugLoc &DL,
9716 Register DestReg,
9717 RegScavenger &RS) const {
9718 if (ST.hasAddNoCarry())
9719 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9720
9721 // If available, prefer to use vcc.
9722 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9723 ? Register(RI.getVCC())
9725 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9726 0, /* AllowSpill */ false);
9727
9728 // TODO: Users need to deal with this.
9729 if (!UnusedCarry.isValid())
9730 return MachineInstrBuilder();
9731
9732 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9733 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9734}
9735
9736bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9737 switch (Opcode) {
9738 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9739 case AMDGPU::SI_KILL_I1_TERMINATOR:
9740 return true;
9741 default:
9742 return false;
9743 }
9744}
9745
9746const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
9747 switch (Opcode) {
9748 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9749 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9750 case AMDGPU::SI_KILL_I1_PSEUDO:
9751 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9752 default:
9753 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9754 }
9755}
9756
9757bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9758 return Imm <= getMaxMUBUFImmOffset(ST);
9759}
9760
9761unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9762 // The GFX12 field is a 24-bit signed byte offset, but only its non-negative range is used here.
9763 const unsigned OffsetBits =
9764 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9765 return (1 << OffsetBits) - 1;
9766}
9767
9768void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9769 if (!ST.isWave32())
9770 return;
9771
9772 if (MI.isInlineAsm())
9773 return;
9774
9775 for (auto &Op : MI.implicit_operands()) {
9776 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9777 Op.setReg(AMDGPU::VCC_LO);
9778 }
9779}
9780
9781bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9782 if (!isSMRD(MI))
9783 return false;
9784
9785 // Check that it is using a buffer resource.
9786 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9787 if (Idx == -1) // e.g. s_memtime
9788 return false;
9789
9790 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9791 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9792}
9793
9794// Given Imm, split it into the values to put into the SOffset and ImmOffset
9795// fields in an MUBUF instruction. Return false if it is not possible (due to a
9796// hardware bug needing a workaround).
9797//
9798// The required alignment ensures that individual address components remain
9799// aligned if they are aligned to begin with. It also ensures that additional
9800// offsets within the given alignment can be added to the resulting ImmOffset.
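// Worked example (made-up values, assuming a pre-GFX12 immediate limit of
// 4095 and 4-byte alignment): Imm = 4100 exceeds MaxImm = 4092 and splits
// into SOffset = 8, ImmOffset = 4092; Imm = 8192 takes the low-bits path and
// splits into SOffset = 8188, ImmOffset = 4.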
9801bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9802 uint32_t &ImmOffset, Align Alignment) const {
9803 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9804 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9805 uint32_t Overflow = 0;
9806
9807 if (Imm > MaxImm) {
9808 if (Imm <= MaxImm + 64) {
9809 // Use an SOffset inline constant for 4..64
9810 Overflow = Imm - MaxImm;
9811 Imm = MaxImm;
9812 } else {
9813 // Try to keep the same value in SOffset for adjacent loads, so that
9814 // the corresponding register contents can be re-used.
9815 //
9816 // Load values with all low-bits (except for alignment bits) set into
9817 // SOffset, so that a larger range of values can be covered using
9818 // s_movk_i32.
9819 //
9820 // Atomic operations fail to work correctly when individual address
9821 // components are unaligned, even if their sum is aligned.
9822 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9823 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9824 Imm = Low;
9825 Overflow = High - Alignment.value();
9826 }
9827 }
9828
9829 if (Overflow > 0) {
9830 // There is a hardware bug in SI and CI which prevents address clamping in
9831 // MUBUF instructions from working correctly with SOffsets. The immediate
9832 // offset is unaffected.
9833 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9834 return false;
9835
9836 // It is not possible to set immediate in SOffset field on some targets.
9837 if (ST.hasRestrictedSOffset())
9838 return false;
9839 }
9840
9841 ImmOffset = Imm;
9842 SOffset = Overflow;
9843 return true;
9844}
9845
9846// Depending on the used address space and instructions, some immediate offsets
9847// are allowed and some are not.
9848// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9849// scratch instruction offsets can also be negative. On GFX12, offsets can be
9850// negative for all variants.
9851//
9852// There are several bugs related to these offsets:
9853// On gfx10.1, flat instructions that go into the global address space cannot
9854// use an offset.
9855//
9856// For scratch instructions, the address can be either an SGPR or a VGPR.
9857// The following offsets can be used, depending on the architecture (x means
9858// cannot be used):
9859// +----------------------------+------+------+
9860// | Address-Mode | SGPR | VGPR |
9861// +----------------------------+------+------+
9862// | gfx9 | | |
9863// | negative, 4-aligned offset | x | ok |
9864// | negative, unaligned offset | x | ok |
9865// +----------------------------+------+------+
9866// | gfx10 | | |
9867// | negative, 4-aligned offset | ok | ok |
9868// | negative, unaligned offset | ok | x |
9869// +----------------------------+------+------+
9870// | gfx10.3 | | |
9871// | negative, 4-aligned offset | ok | ok |
9872// | negative, unaligned offset | ok | ok |
9873// +----------------------------+------+------+
9874//
9875// This function ignores the addressing mode, so if an offset cannot be used in
9876// one addressing mode, it is considered illegal.
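// As an illustration (made-up offsets): on a target with the
// negative-unaligned-scratch bug, a scratch offset of -6 is rejected here
// (negative and not a multiple of 4), while -8 is accepted as long as it
// fits in the target's signed offset field.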
9877bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9878 uint64_t FlatVariant) const {
9879 // TODO: Should 0 be special cased?
9880 if (!ST.hasFlatInstOffsets())
9881 return false;
9882
9883 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9884 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9885 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9886 return false;
9887
9888 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9889 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9890 (Offset % 4) != 0) {
9891 return false;
9892 }
9893
9894 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9895 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9896 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9897}
9898
9899// See the comment on SIInstrInfo::isLegalFLATOffset for what is legal and what is not.
9900std::pair<int64_t, int64_t>
9901SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9902 uint64_t FlatVariant) const {
9903 int64_t RemainderOffset = COffsetVal;
9904 int64_t ImmField = 0;
9905
9906 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9907 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9908
9909 if (AllowNegative) {
9910 // Use signed division by a power of two to truncate towards 0.
9911 int64_t D = 1LL << NumBits;
9912 RemainderOffset = (COffsetVal / D) * D;
9913 ImmField = COffsetVal - RemainderOffset;
9914
9915 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9916 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9917 (ImmField % 4) != 0) {
9918 // Make ImmField a multiple of 4
9919 RemainderOffset += ImmField % 4;
9920 ImmField -= ImmField % 4;
9921 }
9922 } else if (COffsetVal >= 0) {
9923 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9924 RemainderOffset = COffsetVal - ImmField;
9925 }
9926
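// Worked example (assuming an illustrative 13-bit signed immediate, so
// NumBits = 12 and D = 4096): COffsetVal = -5000 splits into
// RemainderOffset = -4096 and ImmField = -904, which is 4-aligned and fits
// the field.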
9927 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9928 assert(RemainderOffset + ImmField == COffsetVal);
9929 return {ImmField, RemainderOffset};
9930}
9931
9932bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
9933 if (ST.hasNegativeScratchOffsetBug() &&
9934 FlatVariant == SIInstrFlags::FlatScratch)
9935 return false;
9936
9937 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9938}
9939
9940static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9941 switch (ST.getGeneration()) {
9942 default:
9943 break;
9944 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
9945 case AMDGPUSubtarget::SEA_ISLANDS:
9946 return SIEncodingFamily::SI;
9947 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
9948 case AMDGPUSubtarget::GFX9:
9949 return SIEncodingFamily::VI;
9950 case AMDGPUSubtarget::GFX10:
9951 return SIEncodingFamily::GFX10;
9952 case AMDGPUSubtarget::GFX11:
9953 return SIEncodingFamily::GFX11;
9954 case AMDGPUSubtarget::GFX12:
9955 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
9956 : SIEncodingFamily::GFX12;
9957 }
9958 llvm_unreachable("Unknown subtarget generation!");
9959}
9960
9961bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9962 switch(MCOp) {
9963 // These opcodes use indirect register addressing so
9964 // they need special handling by codegen (currently missing).
9965 // Therefore it is too risky to allow these opcodes
9966 // to be selected by the DPP combiner or the SDWA peephole pass.
9967 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9968 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9969 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9970 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9971 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9972 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9973 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9974 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9975 return true;
9976 default:
9977 return false;
9978 }
9979}
9980
9981#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9982 case OPCODE##_dpp: \
9983 case OPCODE##_e32: \
9984 case OPCODE##_e64: \
9985 case OPCODE##_e64_dpp: \
9986 case OPCODE##_sdwa:
9987
9988static bool isRenamedInGFX9(int Opcode) {
9989 switch (Opcode) {
9990 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9991 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9992 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9993 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9994 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9995 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9996 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9997 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9998 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9999 //
10000 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10001 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10002 case AMDGPU::V_FMA_F16_gfx9_e64:
10003 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10004 case AMDGPU::V_INTERP_P2_F16:
10005 case AMDGPU::V_MAD_F16_e64:
10006 case AMDGPU::V_MAD_U16_e64:
10007 case AMDGPU::V_MAD_I16_e64:
10008 return true;
10009 default:
10010 return false;
10011 }
10012}
10013
10014int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10015 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
10016
10017 unsigned Gen = subtargetEncodingFamily(ST);
10018
10019 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10020 Gen = SIEncodingFamily::GFX9;
10021
10022 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10023 // subtarget has UnpackedD16VMem feature.
10024 // TODO: remove this when we discard GFX80 encoding.
10025 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10026 Gen = SIEncodingFamily::GFX80;
10027
10028 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10029 switch (ST.getGeneration()) {
10030 default:
10031 Gen = SIEncodingFamily::SDWA;
10032 break;
10033 case AMDGPUSubtarget::GFX9:
10034 Gen = SIEncodingFamily::SDWA9;
10035 break;
10036 case AMDGPUSubtarget::GFX10:
10037 Gen = SIEncodingFamily::SDWA10;
10038 break;
10039 }
10040 }
10041
10042 if (isMAI(Opcode)) {
10043 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10044 if (MFMAOp != -1)
10045 Opcode = MFMAOp;
10046 }
10047
10048 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10049
10050 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10051 MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);
10052
10053 // -1 means that Opcode is already a native instruction.
10054 if (MCOp == -1)
10055 return Opcode;
10056
10057 if (ST.hasGFX90AInsts()) {
10058 uint16_t NMCOp = (uint16_t)-1;
10059 if (ST.hasGFX940Insts())
10060 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
10061 if (NMCOp == (uint16_t)-1)
10062 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
10063 if (NMCOp == (uint16_t)-1)
10064 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
10065 if (NMCOp != (uint16_t)-1)
10066 MCOp = NMCOp;
10067 }
10068
10069 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10070 // no encoding in the given subtarget generation.
10071 if (MCOp == (uint16_t)-1)
10072 return -1;
10073
10074 if (isAsmOnlyOpcode(MCOp))
10075 return -1;
10076
10077 return MCOp;
10078}
10079
10080static
10081TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10082 assert(RegOpnd.isReg());
10083 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10084 getRegSubRegPair(RegOpnd);
10085}
10086
10087TargetInstrInfo::RegSubRegPair
10088llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
10089 assert(MI.isRegSequence());
10090 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10091 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10092 auto &RegOp = MI.getOperand(1 + 2 * I);
10093 return getRegOrUndef(RegOp);
10094 }
10095 return TargetInstrInfo::RegSubRegPair();
10096}
10097
10098// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10099// Following a subreg of reg:subreg isn't supported
10100static bool followSubRegDef(MachineInstr &MI,
10101 TargetInstrInfo::RegSubRegPair &RSR) {
10102 if (!RSR.SubReg)
10103 return false;
10104 switch (MI.getOpcode()) {
10105 default: break;
10106 case AMDGPU::REG_SEQUENCE:
10107 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10108 return true;
10109 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
10110 case AMDGPU::INSERT_SUBREG:
10111 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10112 // inserted the subreg we're looking for
10113 RSR = getRegOrUndef(MI.getOperand(2));
10114 else { // the subreg in the rest of the reg
10115 auto R1 = getRegOrUndef(MI.getOperand(1));
10116 if (R1.SubReg) // subreg of subreg isn't supported
10117 return false;
10118 RSR.Reg = R1.Reg;
10119 }
10120 return true;
10121 }
10122 return false;
10123}
10124
10127 assert(MRI.isSSA());
10128 if (!P.Reg.isVirtual())
10129 return nullptr;
10130
10131 auto RSR = P;
10132 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10133 while (auto *MI = DefInst) {
10134 DefInst = nullptr;
10135 switch (MI->getOpcode()) {
10136 case AMDGPU::COPY:
10137 case AMDGPU::V_MOV_B32_e32: {
10138 auto &Op1 = MI->getOperand(1);
10139 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10140 if (Op1.isUndef())
10141 return nullptr;
10142 RSR = getRegSubRegPair(Op1);
10143 DefInst = MRI.getVRegDef(RSR.Reg);
10144 }
10145 break;
10146 }
10147 default:
10148 if (followSubRegDef(*MI, RSR)) {
10149 if (!RSR.Reg)
10150 return nullptr;
10151 DefInst = MRI.getVRegDef(RSR.Reg);
10152 }
10153 }
10154 if (!DefInst)
10155 return MI;
10156 }
10157 return nullptr;
10158}
10159
10161 Register VReg,
10162 const MachineInstr &DefMI,
10163 const MachineInstr &UseMI) {
10164 assert(MRI.isSSA() && "Must be run on SSA");
10165
10166 auto *TRI = MRI.getTargetRegisterInfo();
10167 auto *DefBB = DefMI.getParent();
10168
10169 // Don't bother searching between blocks, although it is possible this block
10170 // doesn't modify exec.
10171 if (UseMI.getParent() != DefBB)
10172 return true;
10173
10174 const int MaxInstScan = 20;
10175 int NumInst = 0;
10176
10177 // Stop scan at the use.
10178 auto E = UseMI.getIterator();
10179 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10180 if (I->isDebugInstr())
10181 continue;
10182
10183 if (++NumInst > MaxInstScan)
10184 return true;
10185
10186 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10187 return true;
10188 }
10189
10190 return false;
10191}
10192
10194 Register VReg,
10195 const MachineInstr &DefMI) {
10196 assert(MRI.isSSA() && "Must be run on SSA");
10197
10198 auto *TRI = MRI.getTargetRegisterInfo();
10199 auto *DefBB = DefMI.getParent();
10200
10201 const int MaxUseScan = 10;
10202 int NumUse = 0;
10203
10204 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10205 auto &UseInst = *Use.getParent();
10206 // Don't bother searching between blocks, although it is possible this block
10207 // doesn't modify exec.
10208 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10209 return true;
10210
10211 if (++NumUse > MaxUseScan)
10212 return true;
10213 }
10214
10215 if (NumUse == 0)
10216 return false;
10217
10218 const int MaxInstScan = 20;
10219 int NumInst = 0;
10220
10221 // Stop scan when we have seen all the uses.
10222 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10223 assert(I != DefBB->end());
10224
10225 if (I->isDebugInstr())
10226 continue;
10227
10228 if (++NumInst > MaxInstScan)
10229 return true;
10230
10231 for (const MachineOperand &Op : I->operands()) {
10232 // We don't check reg masks here as they're used only on calls:
10233 // 1. EXEC is only considered const within one BB
10234 // 2. Call should be a terminator instruction if present in a BB
10235
10236 if (!Op.isReg())
10237 continue;
10238
10239 Register Reg = Op.getReg();
10240 if (Op.isUse()) {
10241 if (Reg == VReg && --NumUse == 0)
10242 return false;
10243 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10244 return true;
10245 }
10246 }
10247}
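// Editorial sketch (not part of SIInstrInfo.cpp): the two EXEC-modification
// checks above answer "may EXEC change before the use(s) of this value?"
// conservatively: they scan only a bounded window inside a single block and
// return true on anything they cannot prove safe. A standalone model of that
// pattern, with a toy ToyInst record standing in for MachineInstr
// (hypothetical, illustration only):
#include <vector>

struct ToyInst {
  bool ReadsVReg;  // does this instruction read the value of interest?
  bool WritesExec; // does this instruction write the exec mask?
};

// True if exec *may* be modified before all NumUses readers are reached.
static bool execMayChangeBeforeUses(const std::vector<ToyInst> &AfterDef,
                                    int NumUses) {
  const int MaxInstScan = 20;
  int NumInst = 0;
  for (const ToyInst &I : AfterDef) {
    if (++NumInst > MaxInstScan)
      return true;  // scanned too far; assume the worst
    if (I.ReadsVReg && --NumUses == 0)
      return false; // every use seen while exec is provably unchanged
    if (I.WritesExec)
      return true;  // exec clobbered before the remaining uses
  }
  return true;      // ran off the window without seeing every use
}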
10248
10251 const DebugLoc &DL, Register Src, Register Dst) const {
10252 auto Cur = MBB.begin();
10253 if (Cur != MBB.end())
10254 do {
10255 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10256 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10257 ++Cur;
10258 } while (Cur != MBB.end() && Cur != LastPHIIt);
10259
10260 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10261 Dst);
10262}
10263
10266 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10267 if (InsPt != MBB.end() &&
10268 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10269 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10270 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10271 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10272 InsPt++;
10273 return BuildMI(MBB, InsPt, DL,
10274 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10275 .addReg(Src, 0, SrcSubReg)
10276 .addReg(AMDGPU::EXEC, RegState::Implicit);
10277 }
10278 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10279 Dst);
10280}
10281
10282bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10283
10286 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10287 VirtRegMap *VRM) const {
10288 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10289 //
10290 // %0:sreg_32 = COPY $m0
10291 //
10292 // We explicitly chose SReg_32 for the virtual register so such a copy might
10293 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10294 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10295 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10296 // TargetInstrInfo::foldMemoryOperand() is going to try.
10297 // A similar issue also exists with spilling and reloading $exec registers.
10298 //
10299 // To prevent that, constrain the %0 register class here.
10300 if (isFullCopyInstr(MI)) {
10301 Register DstReg = MI.getOperand(0).getReg();
10302 Register SrcReg = MI.getOperand(1).getReg();
10303 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10304 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10306 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10307 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10308 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10309 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10310 return nullptr;
10311 }
10312 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10313 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10314 return nullptr;
10315 }
10316 }
10317 }
10318
10319 return nullptr;
10320}
10321
10323 const MachineInstr &MI,
10324 unsigned *PredCost) const {
10325 if (MI.isBundle()) {
10327 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10328 unsigned Lat = 0, Count = 0;
10329 for (++I; I != E && I->isBundledWithPred(); ++I) {
10330 ++Count;
10331 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10332 }
10333 return Lat + Count - 1;
10334 }
10335
10336 return SchedModel.computeInstrLatency(&MI);
10337}
10338
10341 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10342 unsigned Opcode = MI.getOpcode();
10343
10344 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10345 Register Dst = MI.getOperand(0).getReg();
10346 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10347 : MI.getOperand(1).getReg();
10348 LLT DstTy = MRI.getType(Dst);
10349 LLT SrcTy = MRI.getType(Src);
10350 unsigned DstAS = DstTy.getAddressSpace();
10351 unsigned SrcAS = SrcTy.getAddressSpace();
10352 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10353 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10354 ST.hasGloballyAddressableScratch()
10357 };
10358
10359 // If the target supports globally addressable scratch, the mapping from
10360 // scratch memory to the flat aperture changes, and therefore an address space
10361 // cast is no longer uniform.
10362 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10363 return HandleAddrSpaceCast(MI);
10364
10365 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10366 auto IID = GI->getIntrinsicID();
10371
10372 switch (IID) {
10373 case Intrinsic::amdgcn_addrspacecast_nonnull:
10374 return HandleAddrSpaceCast(MI);
10375 case Intrinsic::amdgcn_if:
10376 case Intrinsic::amdgcn_else:
10377 // FIXME: Uniform if second result
10378 break;
10379 }
10380
10382 }
10383
10384 // Loads from the private and flat address spaces are divergent, because
10385 // threads can execute the load instruction with the same inputs and get
10386 // different results.
10387 //
10388 // All other loads are not divergent, because if threads issue loads with the
10389 // same arguments, they will always get the same result.
10390 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10391 Opcode == AMDGPU::G_SEXTLOAD) {
10392 if (MI.memoperands_empty())
10393 return InstructionUniformity::NeverUniform; // conservative assumption
10394
10395 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10396 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10397 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10398 })) {
10399 // At least one MMO in a non-global address space.
10401 }
10403 }
10404
10405 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10406 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10407 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10408 AMDGPU::isGenericAtomic(Opcode)) {
10410 }
10412}
10413
10416
10417 if (isNeverUniform(MI))
10419
10420 unsigned opcode = MI.getOpcode();
10421 if (opcode == AMDGPU::V_READLANE_B32 ||
10422 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10423 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10425
10426 if (isCopyInstr(MI)) {
10427 const MachineOperand &srcOp = MI.getOperand(1);
10428 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10429 const TargetRegisterClass *regClass =
10430 RI.getPhysRegBaseClass(srcOp.getReg());
10431 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10433 }
10435 }
10436
10437 // GMIR handling
10438 if (MI.isPreISelOpcode())
10440
10441 // Atomics are divergent because they are executed sequentially: when an
10442 // atomic operation refers to the same address in each thread, then each
10443 // thread after the first sees the value written by the previous thread as
10444 // its original value.
10445
10446 if (isAtomic(MI))
10448
10449 // Loads from the private and flat address spaces are divergent, because
10450 // threads can execute the load instruction with the same inputs and get
10451 // different results.
10452 if (isFLAT(MI) && MI.mayLoad()) {
10453 if (MI.memoperands_empty())
10454 return InstructionUniformity::NeverUniform; // conservative assumption
10455
10456 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10457 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10458 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10459 })) {
10460 // At least one MMO in a non-global address space.
10462 }
10463
10465 }
10466
10467 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10468 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10469
10470 // FIXME: It's conceptually broken to report this for an instruction, and not
10471 // a specific def operand. For inline asm in particular, there could be mixed
10472 // uniform and divergent results.
10473 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10474 const MachineOperand &SrcOp = MI.getOperand(I);
10475 if (!SrcOp.isReg())
10476 continue;
10477
10478 Register Reg = SrcOp.getReg();
10479 if (!Reg || !SrcOp.readsReg())
10480 continue;
10481
10482 // If RegBank is null, this is unassigned or an unallocatable special
10483 // register, which are all scalars.
10484 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10485 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10487 }
10488
10489 // TODO: Uniformity check conditions above can be rearranged for more
10490 // readability.
10491
10492 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10493 // currently turned into no-op COPYs by SelectionDAG ISel and are
10494 // therefore no longer recognizable.
10495
10497}
10498
10500 switch (MF.getFunction().getCallingConv()) {
10502 return 1;
10504 return 2;
10506 return 3;
10510 const Function &F = MF.getFunction();
10511 F.getContext().diagnose(DiagnosticInfoUnsupported(
10512 F, "ds_ordered_count unsupported for this calling conv"));
10513 [[fallthrough]];
10514 }
10517 case CallingConv::C:
10518 case CallingConv::Fast:
10519 default:
10520 // Assume other calling conventions are various compute callable functions
10521 return 0;
10522 }
10523}
10524
10526 Register &SrcReg2, int64_t &CmpMask,
10527 int64_t &CmpValue) const {
10528 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10529 return false;
10530
10531 switch (MI.getOpcode()) {
10532 default:
10533 break;
10534 case AMDGPU::S_CMP_EQ_U32:
10535 case AMDGPU::S_CMP_EQ_I32:
10536 case AMDGPU::S_CMP_LG_U32:
10537 case AMDGPU::S_CMP_LG_I32:
10538 case AMDGPU::S_CMP_LT_U32:
10539 case AMDGPU::S_CMP_LT_I32:
10540 case AMDGPU::S_CMP_GT_U32:
10541 case AMDGPU::S_CMP_GT_I32:
10542 case AMDGPU::S_CMP_LE_U32:
10543 case AMDGPU::S_CMP_LE_I32:
10544 case AMDGPU::S_CMP_GE_U32:
10545 case AMDGPU::S_CMP_GE_I32:
10546 case AMDGPU::S_CMP_EQ_U64:
10547 case AMDGPU::S_CMP_LG_U64:
10548 SrcReg = MI.getOperand(0).getReg();
10549 if (MI.getOperand(1).isReg()) {
10550 if (MI.getOperand(1).getSubReg())
10551 return false;
10552 SrcReg2 = MI.getOperand(1).getReg();
10553 CmpValue = 0;
10554 } else if (MI.getOperand(1).isImm()) {
10555 SrcReg2 = Register();
10556 CmpValue = MI.getOperand(1).getImm();
10557 } else {
10558 return false;
10559 }
10560 CmpMask = ~0;
10561 return true;
10562 case AMDGPU::S_CMPK_EQ_U32:
10563 case AMDGPU::S_CMPK_EQ_I32:
10564 case AMDGPU::S_CMPK_LG_U32:
10565 case AMDGPU::S_CMPK_LG_I32:
10566 case AMDGPU::S_CMPK_LT_U32:
10567 case AMDGPU::S_CMPK_LT_I32:
10568 case AMDGPU::S_CMPK_GT_U32:
10569 case AMDGPU::S_CMPK_GT_I32:
10570 case AMDGPU::S_CMPK_LE_U32:
10571 case AMDGPU::S_CMPK_LE_I32:
10572 case AMDGPU::S_CMPK_GE_U32:
10573 case AMDGPU::S_CMPK_GE_I32:
10574 SrcReg = MI.getOperand(0).getReg();
10575 SrcReg2 = Register();
10576 CmpValue = MI.getOperand(1).getImm();
10577 CmpMask = ~0;
10578 return true;
10579 }
10580
10581 return false;
10582}
10583
10585 Register SrcReg2, int64_t CmpMask,
10586 int64_t CmpValue,
10587 const MachineRegisterInfo *MRI) const {
10588 if (!SrcReg || SrcReg.isPhysical())
10589 return false;
10590
10591 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10592 return false;
10593
10594 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10595 this](int64_t ExpectedValue, unsigned SrcSize,
10596 bool IsReversible, bool IsSigned) -> bool {
10597 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10598 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10599 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10600 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10601 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10602 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10603 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10604 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10605 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10606 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10607 //
10608 // Signed ge/gt are not used for the sign bit.
10609 //
10610 // If result of the AND is unused except in the compare:
10611 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10612 //
10613 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10614 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10615 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10616 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10617 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10618 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
10619
10620 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10621 if (!Def || Def->getParent() != CmpInstr.getParent())
10622 return false;
10623
10624 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10625 Def->getOpcode() != AMDGPU::S_AND_B64)
10626 return false;
10627
10628 int64_t Mask;
10629 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10630 if (MO->isImm())
10631 Mask = MO->getImm();
10632 else if (!getFoldableImm(MO, Mask))
10633 return false;
10634 Mask &= maxUIntN(SrcSize);
10635 return isPowerOf2_64(Mask);
10636 };
10637
10638 MachineOperand *SrcOp = &Def->getOperand(1);
10639 if (isMask(SrcOp))
10640 SrcOp = &Def->getOperand(2);
10641 else if (isMask(&Def->getOperand(2)))
10642 SrcOp = &Def->getOperand(1);
10643 else
10644 return false;
10645
10646 // A valid Mask is required to have a single bit set, hence a non-zero and
10647 // power-of-two value. This verifies that we will not do a 64-bit shift below.
10648 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10649 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10650 if (IsSigned && BitNo == SrcSize - 1)
10651 return false;
10652
10653 ExpectedValue <<= BitNo;
10654
10655 bool IsReversedCC = false;
10656 if (CmpValue != ExpectedValue) {
10657 if (!IsReversible)
10658 return false;
10659 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10660 if (!IsReversedCC)
10661 return false;
10662 }
10663
10664 Register DefReg = Def->getOperand(0).getReg();
10665 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10666 return false;
10667
10668 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10669 I != E; ++I) {
10670 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10671 I->killsRegister(AMDGPU::SCC, &RI))
10672 return false;
10673 }
10674
10675 MachineOperand *SccDef =
10676 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10677 SccDef->setIsDead(false);
10678 CmpInstr.eraseFromParent();
10679
10680 if (!MRI->use_nodbg_empty(DefReg)) {
10681 assert(!IsReversedCC);
10682 return true;
10683 }
10684
10685 // Replace an AND whose result is unused with an S_BITCMP.
10686 MachineBasicBlock *MBB = Def->getParent();
10687
10688 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10689 : AMDGPU::S_BITCMP1_B32
10690 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10691 : AMDGPU::S_BITCMP1_B64;
10692
10693 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10694 .add(*SrcOp)
10695 .addImm(BitNo);
10696 Def->eraseFromParent();
10697
10698 return true;
10699 };
10700
10701 switch (CmpInstr.getOpcode()) {
10702 default:
10703 break;
10704 case AMDGPU::S_CMP_EQ_U32:
10705 case AMDGPU::S_CMP_EQ_I32:
10706 case AMDGPU::S_CMPK_EQ_U32:
10707 case AMDGPU::S_CMPK_EQ_I32:
10708 return optimizeCmpAnd(1, 32, true, false);
10709 case AMDGPU::S_CMP_GE_U32:
10710 case AMDGPU::S_CMPK_GE_U32:
10711 return optimizeCmpAnd(1, 32, false, false);
10712 case AMDGPU::S_CMP_GE_I32:
10713 case AMDGPU::S_CMPK_GE_I32:
10714 return optimizeCmpAnd(1, 32, false, true);
10715 case AMDGPU::S_CMP_EQ_U64:
10716 return optimizeCmpAnd(1, 64, true, false);
10717 case AMDGPU::S_CMP_LG_U32:
10718 case AMDGPU::S_CMP_LG_I32:
10719 case AMDGPU::S_CMPK_LG_U32:
10720 case AMDGPU::S_CMPK_LG_I32:
10721 return optimizeCmpAnd(0, 32, true, false);
10722 case AMDGPU::S_CMP_GT_U32:
10723 case AMDGPU::S_CMPK_GT_U32:
10724 return optimizeCmpAnd(0, 32, false, false);
10725 case AMDGPU::S_CMP_GT_I32:
10726 case AMDGPU::S_CMPK_GT_I32:
10727 return optimizeCmpAnd(0, 32, false, true);
10728 case AMDGPU::S_CMP_LG_U64:
10729 return optimizeCmpAnd(0, 64, true, false);
10730 }
10731
10732 return false;
10733}
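// Editorial sketch (not part of SIInstrInfo.cpp): the optimizeCmpAnd folds
// above rely on single-bit identities; comparing (src & (1 << n)) against
// (1 << n) or against 0 is just a test of bit n, which is what the listed
// s_bitcmp1 / s_bitcmp0 rewrites encode. A self-contained check of those
// identities in plain C++ (the bit position and test values are arbitrary):
#include <cassert>
#include <cstdint>
#include <initializer_list>

static void checkBitTestIdentities() {
  const unsigned N = 4;          // any bit position below the operand width
  const uint32_t Mask = 1u << N; // single-bit mask, as the fold requires
  for (uint32_t Src : {0x0u, 0x10u, 0x12345678u, 0xFFFFFFEFu, 0xFFFFFFFFu}) {
    const bool BitSet = (Src >> N) & 1;        // what s_bitcmp1 tests
    assert(((Src & Mask) == Mask) == BitSet);  // s_cmp_eq (and), Mask
    assert(((Src & Mask) != 0) == BitSet);     // s_cmp_lg (and), 0
    assert(((Src & Mask) == 0) == !BitSet);    // reversed form: s_bitcmp0
    assert(((Src & Mask) != Mask) == !BitSet); // reversed form: s_bitcmp0
  }
}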
10734
10736 AMDGPU::OpName OpName) const {
10737 if (!ST.needsAlignedVGPRs())
10738 return;
10739
10740 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10741 if (OpNo < 0)
10742 return;
10743 MachineOperand &Op = MI.getOperand(OpNo);
10744 if (getOpSize(MI, OpNo) > 4)
10745 return;
10746
10747 // Add implicit aligned super-reg to force alignment on the data operand.
10748 const DebugLoc &DL = MI.getDebugLoc();
10749 MachineBasicBlock *BB = MI.getParent();
10751 Register DataReg = Op.getReg();
10752 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10753 Register Undef = MRI.createVirtualRegister(
10754 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10755 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10756 Register NewVR =
10757 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10758 : &AMDGPU::VReg_64_Align2RegClass);
10759 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10760 .addReg(DataReg, 0, Op.getSubReg())
10761 .addImm(AMDGPU::sub0)
10762 .addReg(Undef)
10763 .addImm(AMDGPU::sub1);
10764 Op.setReg(NewVR);
10765 Op.setSubReg(AMDGPU::sub0);
10766 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10767}
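// Editorial sketch (not part of SIInstrInfo.cpp; the virtual register names
// below are hypothetical): on subtargets that need even-aligned VGPR (or AGPR)
// tuples, enforceOperandRCAlignment() rewrites a 32-bit data operand
// %data:vgpr_32 roughly as
//   %undef:vgpr_32 = IMPLICIT_DEF
//   %pair:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0, %undef, %subreg.sub1
// and then points the original operand at %pair.sub0 while adding %pair as an
// extra implicit operand, so the register allocator has to place the data
// register in the low half of an aligned pair.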
10768
10770 if (isIGLP(*MI))
10771 return false;
10772
10774}
10775
10777 if (!isWMMA(MI) && !isSWMMAC(MI))
10778 return false;
10779
10780 if (AMDGPU::isGFX1250(ST))
10781 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
10782
10783 return true;
10784}
10785
10787 unsigned Opcode = MI.getOpcode();
10788
10789 if (AMDGPU::isGFX12Plus(ST))
10790 return isDOT(MI) || isXDLWMMA(MI);
10791
10792 if (!isMAI(MI) || isDGEMM(Opcode) ||
10793 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10794 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10795 return false;
10796
10797 if (!ST.hasGFX940Insts())
10798 return true;
10799
10800 return AMDGPU::getMAIIsGFX940XDL(Opcode);
10801}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MCInstrDesc &TID, unsigned RCID)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasAddNoCarry() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
bool mayStore() const
Return true if this instruction could possibly modify memory.
bool mayLoad() const
Return true if this instruction could possibly read memory.
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:87
uint8_t OperandType
Information about the type of the operand.
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:96
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:102
constexpr bool isValid() const
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
bool isSpill(uint16_t Opcode) const
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI) const override
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is a instruction that moves/copies value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset of a FLAT-encoded instruction with the giv...
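A hedged sketch of how a caller might consult this hook before folding an immediate offset into a global access. The helper name is illustrative, the include set is abbreviated, and TII is assumed to be the subtarget's SIInstrInfo.
// Sketch only: assumes the AMDGPU backend's usual headers are reachable.
#include "SIInstrInfo.h"
#include <cstdint>

static bool canFoldGlobalOffset(const llvm::SIInstrInfo *TII, int64_t Offset) {
  // GLOBAL_ADDRESS plus FlatGlobal select the global (FLAT-global) encoding.
  return TII->isLegalFLATOffset(Offset, llvm::AMDGPUAS::GLOBAL_ADDRESS,
                                llvm::SIInstrFlags::FlatGlobal);
}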
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
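A hedged sketch of the typical query pattern: getNamedOperand returns a pointer that is null when the instruction has no such operand, so callers test it before use. The helper below is illustrative and assumes TII points at the subtarget's SIInstrInfo.
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstr.h"

static bool src1IsZeroImm(const llvm::SIInstrInfo *TII,
                          llvm::MachineInstr &MI) {
  using namespace llvm;
  if (const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1))
    return Src1->isImm() && Src1->getImm() == 0;
  return false; // the instruction has no src1 operand
}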
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU-specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point operands.
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:569
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:571
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:568
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:570
@ TI_CONSTDATA_START
Definition AMDGPU.h:567
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
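For illustration, a small wrapper over the 64-bit inline-literal query; HasInv2Pi reflects an assumption about the subtarget (true for gfx8 and later), and the wrapper name is made up.
// Sketch only: assumes the AMDGPU backend include paths.
#include "Utils/AMDGPUBaseInfo.h"
#include <cstdint>

static bool fitsInline64(int64_t Imm, bool HasInv2Pi) {
  // Narrower operands use isInlinableLiteral32 or the FP16/BF16 variants above.
  return llvm::AMDGPU::isInlinableLiteral64(Imm, HasInv2Pi);
}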
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:72
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:70
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:71
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:62
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:73
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
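A small, self-contained illustration of drop_begin (nothing AMDGPU-specific): iterate a container while skipping its first element.
#include "llvm/ADT/STLExtras.h"
#include <vector>

int sumAfterFirst(const std::vector<int> &V) {
  int S = 0;
  for (int X : llvm::drop_begin(V)) // skips V[0]
    S += X;
  return S;
}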
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
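A minimal example of the range-based wrapper; the container and predicate are arbitrary.
#include "llvm/ADT/STLExtras.h"
#include <vector>

bool allNonNegative(const std::vector<int> &V) {
  // No begin()/end() boilerplate: the whole range is passed directly.
  return llvm::all_of(V, [](int X) { return X >= 0; });
}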
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:216
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
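A hedged sketch of the builder pattern: emit an S_MOV_B32 of zero at a given insertion point. All parameters are assumed to come from surrounding backend code, and the include set is abbreviated.
// Sketch only: assumes the AMDGPU backend's usual headers are reachable.
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

static void emitZero(llvm::MachineBasicBlock &MBB,
                     llvm::MachineBasicBlock::iterator I,
                     const llvm::DebugLoc &DL, const llvm::SIInstrInfo *TII,
                     llvm::Register Reg) {
  // S_MOV_B32 Reg, 0
  llvm::BuildMI(MBB, I, DL, TII->get(llvm::AMDGPU::S_MOV_B32), Reg).addImm(0);
}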
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
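Worked values for the width-checking helpers (the same family used when validating branch offsets and immediate fields):
#include "llvm/Support/MathExtras.h"
#include <cassert>

void widthCheckExamples() {
  assert(llvm::isInt<16>(32767));      // fits a signed 16-bit field
  assert(!llvm::isInt<16>(32768));     // one past the signed 16-bit maximum
  assert(llvm::isIntN(8, -128));       // dynamic-width variant
  assert(llvm::maxUIntN(12) == 4095u); // largest unsigned 12-bit value
}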
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
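A minimal illustration of enumerate pairing each element with its index:
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <vector>

void printIndexed(const std::vector<int> &V) {
  for (const auto &En : llvm::enumerate(V))
    llvm::outs() << En.index() << ": " << En.value() << "\n";
}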
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
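A hedged sketch of the usual use: erase instructions from a block while iterating it. The early increment happens before the loop body runs, so eraseFromParent() does not invalidate the traversal; the choice of KILL pseudos is only an example.
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"

static void removeKillPseudos(llvm::MachineBasicBlock &MBB) {
  for (llvm::MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
    if (MI.isKill())        // KILL pseudos emit no code
      MI.eraseFromParent(); // safe: the iterator already moved past MI
  }
}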
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition MathExtras.h:557
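Worked values, including the skewed form:
#include "llvm/Support/MathExtras.h"
#include <cassert>

void alignDownExamples() {
  assert(llvm::alignDown(37u, 8u) == 32u);     // round down to a multiple of 8
  assert(llvm::alignDown(37u, 8u, 5u) == 37u); // largest value that is 5 (mod 8)
}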
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:186
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
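A few worked values for the power-of-two and bit-counting helpers listed above:
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

void bitHelperExamples() {
  assert(llvm::isPowerOf2_64(UINT64_C(1) << 40));
  assert(llvm::Log2_32(64u) == 6u);
  assert(llvm::countr_zero(0x50u) == 4); // 0b1010000 has four trailing zeros
  assert(llvm::popcount(0xF0F0u) == 8);  // eight bits set
  assert(llvm::has_single_bit(0x80u));   // exactly one bit set
}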
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair, skipping copy-like instructions and subre...
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
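Lo_32/Hi_32 split a 64-bit value into its halves, the usual step when a 64-bit constant has to be materialized as two 32-bit pieces; worked values:
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

void splitImm64Example() {
  uint64_t Imm = 0x1122334455667788ULL;
  assert(llvm::Lo_32(Imm) == 0x55667788u);
  assert(llvm::Hi_32(Imm) == 0x11223344u);
}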
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
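For example, the number of dwords needed to cover a byte count:
#include "llvm/Support/MathExtras.h"
#include <cassert>

void divideCeilExample() {
  assert(llvm::divideCeil(70u, 4u) == 18u); // 70 bytes need 18 dwords
}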
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:127
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:583
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
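Worked values for SignExtend64 and maskTrailingOnes; the 13-bit width is only an example of a narrow signed field:
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

void signExtendExamples() {
  assert(llvm::SignExtend64<13>(0x1FFF) == -1);            // all-ones 13-bit field
  assert(llvm::maskTrailingOnes<uint32_t>(13) == 0x1FFFu); // low 13 bits set
}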
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:219
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks that this value is live all the way through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
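A hedged sketch of the spill-MMO pattern built on getFixedStack, assuming MF and FrameIndex come from surrounding code; the helper name is illustrative.
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

static llvm::MachineMemOperand *makeSpillStoreMMO(llvm::MachineFunction &MF,
                                                  int FrameIndex) {
  llvm::MachineFrameInfo &MFI = MF.getFrameInfo();
  llvm::MachinePointerInfo PtrInfo =
      llvm::MachinePointerInfo::getFixedStack(MF, FrameIndex);
  // Size and alignment are taken from the frame object itself.
  return MF.getMachineMemOperand(PtrInfo, llvm::MachineMemOperand::MOStore,
                                 MFI.getObjectSize(FrameIndex),
                                 MFI.getObjectAlign(FrameIndex));
}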
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.