1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
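// For example, a lit test can pass something like -amdgpu-s-branch-bits=5 to
// llc to shrink the assumed branch range, so only a handful of instructions
// are needed to force the long-branch expansion path.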
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
59static cl::opt<bool> Fix16BitCopies(
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
63 cl::ReallyHidden);
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
67 AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
86 AMDGPU::OpName OpName) {
87 unsigned Opc0 = N0->getMachineOpcode();
88 unsigned Opc1 = N1->getMachineOpcode();
89
90 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
91 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
92
93 if (Op0Idx == -1 && Op1Idx == -1)
94 return true;
95
96
97 if ((Op0Idx == -1 && Op1Idx != -1) ||
98 (Op1Idx == -1 && Op0Idx != -1))
99 return false;
100
101 // getNamedOperandIdx returns the index for the MachineInstr's operands,
102 // which includes the result as the first operand. We are indexing into the
103 // MachineSDNode's operands, so we need to skip the result operand to get
104 // the real index.
105 --Op0Idx;
106 --Op1Idx;
107
108 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
109}
110
111static bool canRemat(const MachineInstr &MI) {
112
113 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
114 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
115 SIInstrInfo::isSALU(MI))
116 return true;
117
118 if (SIInstrInfo::isSMRD(MI)) {
119 return !MI.memoperands_empty() &&
120 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
121 return MMO->isLoad() && MMO->isInvariant();
122 });
123 }
124
125 return false;
126}
127
128bool SIInstrInfo::isReallyTriviallyReMaterializable(
129 const MachineInstr &MI) const {
130
131 if (canRemat(MI)) {
132 // Normally a VALU use of exec would block rematerialization, but an
133 // implicit exec read is OK in this case, as all VALU instructions have one.
134 // We really want all of the generic logic for this except for this check.
135
136 // Another potential implicit use is the mode register. The core logic of
137 // the RA will not attempt rematerialization if mode is set anywhere
138 // in the function; otherwise it is safe since mode is not changed.
139
140 // This differs from the generic method, which does not allow
141 // rematerialization if there are virtual register uses. We allow this;
142 // therefore this method also includes SOP instructions.
143 if (!MI.hasImplicitDef() &&
144 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
145 !MI.mayRaiseFPException())
146 return true;
147 }
148
149 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
150}
151
152// Returns true if the scalar result of a VALU instruction depends on exec.
153bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
154 // Ignore comparisons which are only used masked with exec.
155 // This allows some hoisting/sinking of VALU comparisons.
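// For example, a V_CMP whose only users are S_AND_B32/B64 with $exec (or an
// S_AND_SAVEEXEC) is insensitive to which lanes are currently active, so the
// compare itself may be moved across exec-mask changes.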
156 if (MI.isCompare()) {
157 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
158 if (!Dst)
159 return true;
160
161 Register DstReg = Dst->getReg();
162 if (!DstReg.isVirtual())
163 return true;
164
165 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
166 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
167 switch (Use.getOpcode()) {
168 case AMDGPU::S_AND_SAVEEXEC_B32:
169 case AMDGPU::S_AND_SAVEEXEC_B64:
170 break;
171 case AMDGPU::S_AND_B32:
172 case AMDGPU::S_AND_B64:
173 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
174 return true;
175 break;
176 default:
177 return true;
178 }
179 }
180 return false;
181 }
182
183 switch (MI.getOpcode()) {
184 default:
185 break;
186 case AMDGPU::V_READFIRSTLANE_B32:
187 return true;
188 }
189
190 return false;
191}
192
193bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
194 // Any implicit use of exec by VALU is not a real register read.
195 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
196 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
197}
198
199bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
200 MachineBasicBlock *SuccToSinkTo,
201 MachineCycleInfo *CI) const {
202 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
203 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
204 return true;
205
206 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
207 // Check if sinking of MI would create temporal divergent use.
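// (A temporal divergent use arises when an SGPR defined inside a cycle with a
// divergent exit condition is read outside that cycle: lanes may leave the
// cycle on different iterations and would observe different SGPR values.)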
208 for (auto Op : MI.uses()) {
209 if (Op.isReg() && Op.getReg().isVirtual() &&
210 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
211 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
212
213 // SgprDef defined inside cycle
214 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
215 if (FromCycle == nullptr)
216 continue;
217
218 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
219 // Check if there is a FromCycle that contains SgprDef's basic block but
220 // does not contain SuccToSinkTo and also has divergent exit condition.
221 while (FromCycle && !FromCycle->contains(ToCycle)) {
222 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
223 FromCycle->getExitingBlocks(ExitingBlocks);
224
225 // FromCycle has divergent exit condition.
226 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
227 if (hasDivergentBranch(ExitingBlock))
228 return false;
229 }
230
231 FromCycle = FromCycle->getParentCycle();
232 }
233 }
234 }
235
236 return true;
237}
238
240 int64_t &Offset0,
241 int64_t &Offset1) const {
242 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
243 return false;
244
245 unsigned Opc0 = Load0->getMachineOpcode();
246 unsigned Opc1 = Load1->getMachineOpcode();
247
248 // Make sure both are actually loads.
249 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
250 return false;
251
252 // A mayLoad instruction without a def is not a load. Likely a prefetch.
253 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
254 return false;
255
256 if (isDS(Opc0) && isDS(Opc1)) {
257
258 // FIXME: Handle this case:
259 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
260 return false;
261
262 // Check base reg.
263 if (Load0->getOperand(0) != Load1->getOperand(0))
264 return false;
265
266 // Skip read2 / write2 variants for simplicity.
267 // TODO: We should report true if the used offsets are adjacent (excluding
268 // st64 versions).
269 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
270 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
271 if (Offset0Idx == -1 || Offset1Idx == -1)
272 return false;
273
274 // XXX - be careful of dataless loads
275 // getNamedOperandIdx returns the index for MachineInstrs. Since they
276 // include the output in the operand list, but SDNodes don't, we need to
277 // subtract the index by one.
278 Offset0Idx -= get(Opc0).NumDefs;
279 Offset1Idx -= get(Opc1).NumDefs;
280 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
281 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
282 return true;
283 }
284
285 if (isSMRD(Opc0) && isSMRD(Opc1)) {
286 // Skip time and cache invalidation instructions.
287 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
288 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
289 return false;
290
291 unsigned NumOps = getNumOperandsNoGlue(Load0);
292 if (NumOps != getNumOperandsNoGlue(Load1))
293 return false;
294
295 // Check base reg.
296 if (Load0->getOperand(0) != Load1->getOperand(0))
297 return false;
298
299 // Match register offsets, if both register and immediate offsets present.
300 assert(NumOps == 4 || NumOps == 5);
301 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
302 return false;
303
304 const ConstantSDNode *Load0Offset =
305 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
306 const ConstantSDNode *Load1Offset =
307 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
308
309 if (!Load0Offset || !Load1Offset)
310 return false;
311
312 Offset0 = Load0Offset->getZExtValue();
313 Offset1 = Load1Offset->getZExtValue();
314 return true;
315 }
316
317 // MUBUF and MTBUF can access the same addresses.
318 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
319
320 // MUBUF and MTBUF have vaddr at different indices.
321 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
323 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
324 return false;
325
326 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
327 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
328
329 if (OffIdx0 == -1 || OffIdx1 == -1)
330 return false;
331
332 // getNamedOperandIdx returns the index for MachineInstrs. Since they
333 // include the output in the operand list, but SDNodes don't, we need to
334 // subtract the index by one.
335 OffIdx0 -= get(Opc0).NumDefs;
336 OffIdx1 -= get(Opc1).NumDefs;
337
338 SDValue Off0 = Load0->getOperand(OffIdx0);
339 SDValue Off1 = Load1->getOperand(OffIdx1);
340
341 // The offset might be a FrameIndexSDNode.
342 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
343 return false;
344
345 Offset0 = Off0->getAsZExtVal();
346 Offset1 = Off1->getAsZExtVal();
347 return true;
348 }
349
350 return false;
351}
352
353static bool isStride64(unsigned Opc) {
354 switch (Opc) {
355 case AMDGPU::DS_READ2ST64_B32:
356 case AMDGPU::DS_READ2ST64_B64:
357 case AMDGPU::DS_WRITE2ST64_B32:
358 case AMDGPU::DS_WRITE2ST64_B64:
359 return true;
360 default:
361 return false;
362 }
363}
364
365bool SIInstrInfo::getMemOperandsWithOffsetWidth(
366 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
367 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
368 const TargetRegisterInfo *TRI) const {
369 if (!LdSt.mayLoadOrStore())
370 return false;
371
372 unsigned Opc = LdSt.getOpcode();
373 OffsetIsScalable = false;
374 const MachineOperand *BaseOp, *OffsetOp;
375 int DataOpIdx;
376
377 if (isDS(LdSt)) {
378 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
379 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
380 if (OffsetOp) {
381 // Normal, single offset LDS instruction.
382 if (!BaseOp) {
383 // DS_CONSUME/DS_APPEND use M0 for the base address.
384 // TODO: find the implicit use operand for M0 and use that as BaseOp?
385 return false;
386 }
387 BaseOps.push_back(BaseOp);
388 Offset = OffsetOp->getImm();
389 // Get appropriate operand, and compute width accordingly.
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
391 if (DataOpIdx == -1)
392 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
393 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
394 Width = LocationSize::precise(64);
395 else
396 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
397 } else {
398 // The 2 offset instructions use offset0 and offset1 instead. We can treat
399 // these as a load with a single offset if the 2 offsets are consecutive.
400 // We will use this for some partially aligned loads.
401 const MachineOperand *Offset0Op =
402 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
403 const MachineOperand *Offset1Op =
404 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
405
406 unsigned Offset0 = Offset0Op->getImm() & 0xff;
407 unsigned Offset1 = Offset1Op->getImm() & 0xff;
408 if (Offset0 + 1 != Offset1)
409 return false;
410
411 // Each of these offsets is in element sized units, so we need to convert
412 // to bytes of the individual reads.
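// For example, a DS_READ2_B32 with offset0 = 4 and offset1 = 5 has a 4-byte
// element size and is reported as a single 8-byte access at byte offset 16.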
413
414 unsigned EltSize;
415 if (LdSt.mayLoad())
416 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
417 else {
418 assert(LdSt.mayStore());
419 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
421 }
422
423 if (isStride64(Opc))
424 EltSize *= 64;
425
426 BaseOps.push_back(BaseOp);
427 Offset = EltSize * Offset0;
428 // Get appropriate operand(s), and compute width accordingly.
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
430 if (DataOpIdx == -1) {
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
432 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
433 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
434 Width = LocationSize::precise(
435 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
436 } else {
437 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
438 }
439 }
440 return true;
441 }
442
443 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
444 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
445 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
446 return false;
447 BaseOps.push_back(RSrc);
448 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
449 if (BaseOp && !BaseOp->isFI())
450 BaseOps.push_back(BaseOp);
451 const MachineOperand *OffsetImm =
452 getNamedOperand(LdSt, AMDGPU::OpName::offset);
453 Offset = OffsetImm->getImm();
454 const MachineOperand *SOffset =
455 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
456 if (SOffset) {
457 if (SOffset->isReg())
458 BaseOps.push_back(SOffset);
459 else
460 Offset += SOffset->getImm();
461 }
462 // Get appropriate operand, and compute width accordingly.
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
464 if (DataOpIdx == -1)
465 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
466 if (DataOpIdx == -1) // LDS DMA
467 return false;
468 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
469 return true;
470 }
471
472 if (isImage(LdSt)) {
473 auto RsrcOpName =
474 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
475 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
476 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
477 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
478 if (VAddr0Idx >= 0) {
479 // GFX10 possible NSA encoding.
480 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
481 BaseOps.push_back(&LdSt.getOperand(I));
482 } else {
483 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
484 }
485 Offset = 0;
486 // Get appropriate operand, and compute width accordingly.
487 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
488 if (DataOpIdx == -1)
489 return false; // no return sampler
490 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
491 return true;
492 }
493
494 if (isSMRD(LdSt)) {
495 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
496 if (!BaseOp) // e.g. S_MEMTIME
497 return false;
498 BaseOps.push_back(BaseOp);
499 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
500 Offset = OffsetOp ? OffsetOp->getImm() : 0;
501 // Get appropriate operand, and compute width accordingly.
502 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
503 if (DataOpIdx == -1)
504 return false;
505 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
506 return true;
507 }
508
509 if (isFLAT(LdSt)) {
510 // Instructions have either vaddr or saddr or both or none.
511 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
512 if (BaseOp)
513 BaseOps.push_back(BaseOp);
514 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
515 if (BaseOp)
516 BaseOps.push_back(BaseOp);
517 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
518 // Get appropriate operand, and compute width accordingly.
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
520 if (DataOpIdx == -1)
521 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
522 if (DataOpIdx == -1) // LDS DMA
523 return false;
524 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
525 return true;
526 }
527
528 return false;
529}
530
531static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
532 ArrayRef<const MachineOperand *> BaseOps1,
533 const MachineInstr &MI2,
534 ArrayRef<const MachineOperand *> BaseOps2) {
535 // Only examine the first "base" operand of each instruction, on the
536 // assumption that it represents the real base address of the memory access.
537 // Other operands are typically offsets or indices from this base address.
538 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
539 return true;
540
541 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
542 return false;
543
544 auto *MO1 = *MI1.memoperands_begin();
545 auto *MO2 = *MI2.memoperands_begin();
546 if (MO1->getAddrSpace() != MO2->getAddrSpace())
547 return false;
548
549 const auto *Base1 = MO1->getValue();
550 const auto *Base2 = MO2->getValue();
551 if (!Base1 || !Base2)
552 return false;
553 Base1 = getUnderlyingObject(Base1);
554 Base2 = getUnderlyingObject(Base2);
555
556 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
557 return false;
558
559 return Base1 == Base2;
560}
561
562bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
563 int64_t Offset1, bool OffsetIsScalable1,
564 ArrayRef<const MachineOperand *> BaseOps2,
565 int64_t Offset2, bool OffsetIsScalable2,
566 unsigned ClusterSize,
567 unsigned NumBytes) const {
568 // If the mem ops (to be clustered) do not have the same base ptr, then they
569 // should not be clustered
570 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
571 if (!BaseOps1.empty() && !BaseOps2.empty()) {
572 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
573 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
574 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
575 return false;
576
577 const SIMachineFunctionInfo *MFI =
578 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
579 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
580 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
581 // If only one base op is empty, they do not have the same base ptr
582 return false;
583 }
584
585 // To avoid excessive register pressure, the number of DWORDs loaded
586 // together by all clustered mem ops should not, on average, exceed
587 // MaxMemoryClusterDWords. This is an empirical value based on certain
588 // observations and performance related experiments.
589 // The benefit of this heuristic is that it avoids clustering too many
590 // sub-word loads while also avoiding clustering of wide loads. Below is a
591 // brief summary of how the heuristic behaves for various `LoadSize` values
592 // when MaxMemoryClusterDWords is 8.
593 //
594 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
595 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
596 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
597 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
598 // (5) LoadSize >= 17: do not cluster
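// Worked example: with ClusterSize = 4 and NumBytes = 48, LoadSize is 12 and
// NumDWords is ((12 + 3) / 4) * 4 = 12, which exceeds 8, so these ops are not
// clustered.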
599 const unsigned LoadSize = NumBytes / ClusterSize;
600 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
601 return NumDWords <= MaxMemoryClusterDWords;
602}
603
604// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
605// the first 16 loads will be interleaved with the stores, and the next 16 will
606// be clustered as expected. It should really split into two batches of 16.
607//
608// Loads are clustered until this returns false, rather than trying to schedule
609// groups of stores. This also means we have to deal with saying different
610// address space loads should be clustered, and ones which might cause bank
611// conflicts.
612//
613// This might be deprecated so it might not be worth that much effort to fix.
614bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
615 int64_t Offset0, int64_t Offset1,
616 unsigned NumLoads) const {
617 assert(Offset1 > Offset0 &&
618 "Second offset should be larger than first offset!");
619 // If we have less than 16 loads in a row, and the offsets are within 64
620 // bytes, then schedule together.
621
622 // A cacheline is 64 bytes (for global memory).
623 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
624}
625
626static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
627 MachineBasicBlock::iterator MI,
628 const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 const char *Msg = "illegal VGPR to SGPR copy") {
631 MachineFunction *MF = MBB.getParent();
632
633 LLVMContext &C = MF->getFunction().getContext();
634 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
635
636 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
637 .addReg(SrcReg, getKillRegState(KillSrc));
638}
639
640/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
641/// possible to have a direct copy in these cases on GFX908, so an intermediate
642/// VGPR copy is required.
643static void indirectCopyToAGPR(const SIInstrInfo &TII,
644 MachineBasicBlock &MBB,
645 MachineBasicBlock::iterator MI,
646 const DebugLoc &DL, MCRegister DestReg,
647 MCRegister SrcReg, bool KillSrc,
648 RegScavenger &RS, bool RegsOverlap,
649 Register ImpDefSuperReg = Register(),
650 Register ImpUseSuperReg = Register()) {
651 assert((TII.getSubtarget().hasMAIInsts() &&
652 !TII.getSubtarget().hasGFX90AInsts()) &&
653 "Expected GFX908 subtarget.");
654
655 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
656 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
657 "Source register of the copy should be either an SGPR or an AGPR.");
658
659 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
660 "Destination register of the copy should be an AGPR.");
661
662 const SIRegisterInfo &RI = TII.getRegisterInfo();
663
664 // First try to find defining accvgpr_write to avoid temporary registers.
665 // In the case of copies of overlapping AGPRs, we conservatively do not
666 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
667 // an accvgpr_write used for this same copy due to implicit-defs
668 if (!RegsOverlap) {
669 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
670 --Def;
671
672 if (!Def->modifiesRegister(SrcReg, &RI))
673 continue;
674
675 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
676 Def->getOperand(0).getReg() != SrcReg)
677 break;
678
679 MachineOperand &DefOp = Def->getOperand(1);
680 assert(DefOp.isReg() || DefOp.isImm());
681
682 if (DefOp.isReg()) {
683 bool SafeToPropagate = true;
684 // Check that register source operand is not clobbered before MI.
685 // Immediate operands are always safe to propagate.
686 for (auto I = Def; I != MI && SafeToPropagate; ++I)
687 if (I->modifiesRegister(DefOp.getReg(), &RI))
688 SafeToPropagate = false;
689
690 if (!SafeToPropagate)
691 break;
692
693 for (auto I = Def; I != MI; ++I)
694 I->clearRegisterKills(DefOp.getReg(), &RI);
695 }
696
697 MachineInstrBuilder Builder =
698 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
699 .add(DefOp);
700 if (ImpDefSuperReg)
701 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
702
703 if (ImpUseSuperReg) {
704 Builder.addReg(ImpUseSuperReg,
705 getKillRegState(KillSrc) | RegState::Implicit);
706 }
707
708 return;
709 }
710 }
711
712 RS.enterBasicBlockEnd(MBB);
713 RS.backward(std::next(MI));
714
715 // Ideally we want to have three registers for a long reg_sequence copy
716 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
717 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
718 *MBB.getParent());
719
720 // Registers in the sequence are allocated contiguously so we can just
721 // use register number to pick one of three round-robin temps.
722 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
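// For example, destinations AGPR0, AGPR1 and AGPR2 map to temp indices 0, 1
// and 2, and AGPR3 wraps back to 0, so neighbouring accvgpr_writes in a long
// reg_sequence copy do not reuse the same temporary VGPR back to back.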
723 Register Tmp =
724 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
725 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
726 "VGPR used for an intermediate copy should have been reserved.");
727
728 // Only loop through if there are any free registers left. We don't want to
729 // spill.
730 while (RegNo--) {
731 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
732 /* RestoreAfter */ false, 0,
733 /* AllowSpill */ false);
734 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
735 break;
736 Tmp = Tmp2;
737 RS.setRegUsed(Tmp);
738 }
739
740 // Insert copy to temporary VGPR.
741 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
742 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
743 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
744 } else {
745 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
746 }
747
748 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
749 .addReg(SrcReg, getKillRegState(KillSrc));
750 if (ImpUseSuperReg) {
751 UseBuilder.addReg(ImpUseSuperReg,
752 getKillRegState(KillSrc) | RegState::Implicit);
753 }
754
755 MachineInstrBuilder DefBuilder
756 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
757 .addReg(Tmp, RegState::Kill);
758
759 if (ImpDefSuperReg)
760 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
761}
762
763static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
764 MachineBasicBlock::iterator MI, const DebugLoc &DL,
765 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
766 const TargetRegisterClass *RC, bool Forward) {
767 const SIRegisterInfo &RI = TII.getRegisterInfo();
768 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
769 MachineBasicBlock::iterator I = MI;
770 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
771
772 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
773 int16_t SubIdx = BaseIndices[Idx];
774 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
775 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
776 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
777 unsigned Opcode = AMDGPU::S_MOV_B32;
778
779 // Is SGPR aligned? If so try to combine with next.
780 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
781 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
782 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
783 // Can use SGPR64 copy
784 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
785 SubIdx = RI.getSubRegFromChannel(Channel, 2);
786 DestSubReg = RI.getSubReg(DestReg, SubIdx);
787 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
788 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
789 Opcode = AMDGPU::S_MOV_B64;
790 Idx++;
791 }
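// For example, a 128-bit copy between aligned SGPR tuples such as s[4:7] and
// s[8:11] is emitted as two S_MOV_B64s rather than four S_MOV_B32s.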
792
793 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
794 .addReg(SrcSubReg)
795 .addReg(SrcReg, RegState::Implicit);
796
797 if (!FirstMI)
798 FirstMI = LastMI;
799
800 if (!Forward)
801 I--;
802 }
803
804 assert(FirstMI && LastMI);
805 if (!Forward)
806 std::swap(FirstMI, LastMI);
807
808 FirstMI->addOperand(
809 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
810
811 if (KillSrc)
812 LastMI->addRegisterKilled(SrcReg, &RI);
813}
814
815void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
816 MachineBasicBlock::iterator MI,
817 const DebugLoc &DL, Register DestReg,
818 Register SrcReg, bool KillSrc, bool RenamableDest,
819 bool RenamableSrc) const {
820 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
821 unsigned Size = RI.getRegSizeInBits(*RC);
822 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
823 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
824
825 // The rest of copyPhysReg assumes Src and Dst size are the same size.
826 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
827 // we remove Fix16BitCopies and this code block?
828 if (Fix16BitCopies) {
829 if (((Size == 16) != (SrcSize == 16))) {
830 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
831 assert(ST.useRealTrue16Insts());
832 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
833 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
834 RegToFix = SubReg;
835
836 if (DestReg == SrcReg) {
837 // Identity copy. Insert empty bundle since ExpandPostRA expects an
838 // instruction here.
839 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
840 return;
841 }
842 RC = RI.getPhysRegBaseClass(DestReg);
843 Size = RI.getRegSizeInBits(*RC);
844 SrcRC = RI.getPhysRegBaseClass(SrcReg);
845 SrcSize = RI.getRegSizeInBits(*SrcRC);
846 }
847 }
848
849 if (RC == &AMDGPU::VGPR_32RegClass) {
850 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
851 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
852 AMDGPU::AGPR_32RegClass.contains(SrcReg));
853 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
854 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
855 BuildMI(MBB, MI, DL, get(Opc), DestReg)
856 .addReg(SrcReg, getKillRegState(KillSrc));
857 return;
858 }
859
860 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
861 RC == &AMDGPU::SReg_32RegClass) {
862 if (SrcReg == AMDGPU::SCC) {
863 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
864 .addImm(1)
865 .addImm(0);
866 return;
867 }
868
869 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
870 if (DestReg == AMDGPU::VCC_LO) {
871 // FIXME: Hack until VReg_1 removed.
872 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
873 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
874 .addImm(0)
875 .addReg(SrcReg, getKillRegState(KillSrc));
876 return;
877 }
878
879 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
880 return;
881 }
882
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 if (RC == &AMDGPU::SReg_64RegClass) {
889 if (SrcReg == AMDGPU::SCC) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
891 .addImm(1)
892 .addImm(0);
893 return;
894 }
895
896 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
897 if (DestReg == AMDGPU::VCC) {
898 // FIXME: Hack until VReg_1 removed.
899 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
900 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
901 .addImm(0)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
907 return;
908 }
909
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
911 .addReg(SrcReg, getKillRegState(KillSrc));
912 return;
913 }
914
915 if (DestReg == AMDGPU::SCC) {
916 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
917 // but SelectionDAG emits such copies for i1 sources.
918 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
919 // This copy can only be produced by patterns
920 // with explicit SCC, which are known to be enabled
921 // only for subtargets with S_CMP_LG_U64 present.
922 assert(ST.hasScalarCompareEq64());
923 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
924 .addReg(SrcReg, getKillRegState(KillSrc))
925 .addImm(0);
926 } else {
927 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
928 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
929 .addReg(SrcReg, getKillRegState(KillSrc))
930 .addImm(0);
931 }
932
933 return;
934 }
935
936 if (RC == &AMDGPU::AGPR_32RegClass) {
937 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
938 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
939 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
940 .addReg(SrcReg, getKillRegState(KillSrc));
941 return;
942 }
943
944 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
945 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
946 .addReg(SrcReg, getKillRegState(KillSrc));
947 return;
948 }
949
950 // FIXME: Pass should maintain scavenger to avoid scan through the block on
951 // every AGPR spill.
952 RegScavenger RS;
953 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
954 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
955 return;
956 }
957
958 if (Size == 16) {
959 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
960 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
961 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
962
963 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
964 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
965 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
966 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
967 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
968 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
969 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
970 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
971
972 if (IsSGPRDst) {
973 if (!IsSGPRSrc) {
974 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
975 return;
976 }
977
978 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
979 .addReg(NewSrcReg, getKillRegState(KillSrc));
980 return;
981 }
982
983 if (IsAGPRDst || IsAGPRSrc) {
984 if (!DstLow || !SrcLow) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
986 "Cannot use hi16 subreg with an AGPR!");
987 }
988
989 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
990 return;
991 }
992
993 if (ST.useRealTrue16Insts()) {
994 if (IsSGPRSrc) {
995 assert(SrcLow);
996 SrcReg = NewSrcReg;
997 }
998 // Use the smaller instruction encoding if possible.
999 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1000 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1001 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1002 .addReg(SrcReg);
1003 } else {
1004 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1005 .addImm(0) // src0_modifiers
1006 .addReg(SrcReg)
1007 .addImm(0); // op_sel
1008 }
1009 return;
1010 }
1011
1012 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1013 if (!DstLow || !SrcLow) {
1014 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1015 "Cannot use hi16 subreg on VI!");
1016 }
1017
1018 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1019 .addReg(NewSrcReg, getKillRegState(KillSrc));
1020 return;
1021 }
1022
1023 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1024 .addImm(0) // src0_modifiers
1025 .addReg(NewSrcReg)
1026 .addImm(0) // clamp
1033 // First implicit operand is $exec.
1034 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1035 return;
1036 }
1037
1038 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1039 if (ST.hasMovB64()) {
1040 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1041 .addReg(SrcReg, getKillRegState(KillSrc));
1042 return;
1043 }
1044 if (ST.hasPkMovB32()) {
1045 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1046 .addImm(SISrcMods::OP_SEL_1)
1047 .addReg(SrcReg)
1048 .addImm(SISrcMods::OP_SEL_1)
1049 .addReg(SrcReg)
1050 .addImm(0) // op_sel_lo
1051 .addImm(0) // op_sel_hi
1052 .addImm(0) // neg_lo
1053 .addImm(0) // neg_hi
1054 .addImm(0) // clamp
1055 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1056 return;
1057 }
1058 }
1059
1060 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1061 if (RI.isSGPRClass(RC)) {
1062 if (!RI.isSGPRClass(SrcRC)) {
1063 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1064 return;
1065 }
1066 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1067 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1068 Forward);
1069 return;
1070 }
1071
1072 unsigned EltSize = 4;
1073 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1074 if (RI.isAGPRClass(RC)) {
1075 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1076 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1077 else if (RI.hasVGPRs(SrcRC) ||
1078 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1079 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1080 else
1081 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1082 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1083 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1084 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1085 (RI.isProperlyAlignedRC(*RC) &&
1086 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1087 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1088 if (ST.hasMovB64()) {
1089 Opcode = AMDGPU::V_MOV_B64_e32;
1090 EltSize = 8;
1091 } else if (ST.hasPkMovB32()) {
1092 Opcode = AMDGPU::V_PK_MOV_B32;
1093 EltSize = 8;
1094 }
1095 }
1096
1097 // For the cases where we need an intermediate instruction/temporary register
1098 // (destination is an AGPR), we need a scavenger.
1099 //
1100 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1101 // whole block for every handled copy.
1102 std::unique_ptr<RegScavenger> RS;
1103 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1104 RS = std::make_unique<RegScavenger>();
1105
1106 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1107
1108 // If there is an overlap, we can't kill the super-register on the last
1109 // instruction, since it will also kill the components made live by this def.
1110 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1111 const bool CanKillSuperReg = KillSrc && !Overlap;
1112
1113 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1114 unsigned SubIdx;
1115 if (Forward)
1116 SubIdx = SubIndices[Idx];
1117 else
1118 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1119 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1120 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1121 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1122
1123 bool IsFirstSubreg = Idx == 0;
1124 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1125
1126 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1127 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1128 Register ImpUseSuper = SrcReg;
1129 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1130 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1131 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1132 MachineInstrBuilder MIB =
1133 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1134 .addImm(SISrcMods::OP_SEL_1)
1135 .addReg(SrcSubReg)
1136 .addImm(SISrcMods::OP_SEL_1)
1137 .addReg(SrcSubReg)
1138 .addImm(0) // op_sel_lo
1139 .addImm(0) // op_sel_hi
1140 .addImm(0) // neg_lo
1141 .addImm(0) // neg_hi
1142 .addImm(0) // clamp
1143 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144 if (IsFirstSubreg)
1145 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1146 } else {
1147 MachineInstrBuilder Builder =
1148 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1149 if (IsFirstSubreg)
1150 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1151
1152 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1153 }
1154 }
1155}
1156
1157int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1158 int NewOpc;
1159
1160 // Try to map original to commuted opcode
1161 NewOpc = AMDGPU::getCommuteRev(Opcode);
1162 if (NewOpc != -1)
1163 // Check if the commuted (REV) opcode exists on the target.
1164 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1165
1166 // Try to map commuted to original opcode
1167 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1168 if (NewOpc != -1)
1169 // Check if the original (non-REV) opcode exists on the target.
1170 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1171
1172 return Opcode;
1173}
1174
1175const TargetRegisterClass *
1176SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1177 return &AMDGPU::VGPR_32RegClass;
1178}
1179
1180void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1181 MachineBasicBlock::iterator I,
1182 const DebugLoc &DL, Register DstReg,
1183 ArrayRef<MachineOperand> Cond,
1184 Register TrueReg,
1185 Register FalseReg) const {
1186 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1187 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1189 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1190 "Not a VGPR32 reg");
1191
1192 if (Cond.size() == 1) {
1193 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1194 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1195 .add(Cond[0]);
1196 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1197 .addImm(0)
1198 .addReg(FalseReg)
1199 .addImm(0)
1200 .addReg(TrueReg)
1201 .addReg(SReg);
1202 } else if (Cond.size() == 2) {
1203 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1204 switch (Cond[0].getImm()) {
1205 case SIInstrInfo::SCC_TRUE: {
1206 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1207 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1208 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1209 .addImm(0)
1210 .addReg(FalseReg)
1211 .addImm(0)
1212 .addReg(TrueReg)
1213 .addReg(SReg);
1214 break;
1215 }
1216 case SIInstrInfo::SCC_FALSE: {
1217 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1218 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1219 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1220 .addImm(0)
1221 .addReg(FalseReg)
1222 .addImm(0)
1223 .addReg(TrueReg)
1224 .addReg(SReg);
1225 break;
1226 }
1227 case SIInstrInfo::VCCNZ: {
1228 MachineOperand RegOp = Cond[1];
1229 RegOp.setImplicit(false);
1230 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1231 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1232 .add(RegOp);
1233 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1234 .addImm(0)
1235 .addReg(FalseReg)
1236 .addImm(0)
1237 .addReg(TrueReg)
1238 .addReg(SReg);
1239 break;
1240 }
1241 case SIInstrInfo::VCCZ: {
1242 MachineOperand RegOp = Cond[1];
1243 RegOp.setImplicit(false);
1244 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1245 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1246 .add(RegOp);
1247 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1248 .addImm(0)
1249 .addReg(TrueReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addReg(SReg);
1253 break;
1254 }
1255 case SIInstrInfo::EXECNZ: {
1256 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1257 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1258 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1259 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1260 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addReg(SReg);
1266 break;
1267 }
1268 case SIInstrInfo::EXECZ: {
1269 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1270 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1271 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1272 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1273 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1274 .addImm(0)
1275 .addReg(FalseReg)
1276 .addImm(0)
1277 .addReg(TrueReg)
1278 .addReg(SReg);
1279 llvm_unreachable("Unhandled branch predicate EXECZ");
1280 break;
1281 }
1282 default:
1283 llvm_unreachable("invalid branch predicate");
1284 }
1285 } else {
1286 llvm_unreachable("Can only handle Cond size 1 or 2");
1287 }
1288}
1289
1290Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1291 MachineBasicBlock::iterator I,
1292 const DebugLoc &DL,
1293 Register SrcReg, int Value) const {
1294 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1295 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1296 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1297 .addImm(Value)
1298 .addReg(SrcReg);
1299
1300 return Reg;
1301}
1302
1303Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1304 MachineBasicBlock::iterator I,
1305 const DebugLoc &DL,
1306 Register SrcReg, int Value) const {
1307 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1308 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1309 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1310 .addImm(Value)
1311 .addReg(SrcReg);
1312
1313 return Reg;
1314}
1315
1316bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1317 const Register Reg,
1318 int64_t &ImmVal) const {
1319 switch (MI.getOpcode()) {
1320 case AMDGPU::V_MOV_B32_e32:
1321 case AMDGPU::S_MOV_B32:
1322 case AMDGPU::S_MOVK_I32:
1323 case AMDGPU::S_MOV_B64:
1324 case AMDGPU::V_MOV_B64_e32:
1325 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1326 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1327 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1328 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1329 case AMDGPU::V_MOV_B64_PSEUDO: {
1330 const MachineOperand &Src0 = MI.getOperand(1);
1331 if (Src0.isImm()) {
1332 ImmVal = Src0.getImm();
1333 return MI.getOperand(0).getReg() == Reg;
1334 }
1335
1336 return false;
1337 }
1338 case AMDGPU::S_BREV_B32:
1339 case AMDGPU::V_BFREV_B32_e32:
1340 case AMDGPU::V_BFREV_B32_e64: {
1341 const MachineOperand &Src0 = MI.getOperand(1);
1342 if (Src0.isImm()) {
1343 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1344 return MI.getOperand(0).getReg() == Reg;
1345 }
1346
1347 return false;
1348 }
1349 case AMDGPU::S_NOT_B32:
1350 case AMDGPU::V_NOT_B32_e32:
1351 case AMDGPU::V_NOT_B32_e64: {
1352 const MachineOperand &Src0 = MI.getOperand(1);
1353 if (Src0.isImm()) {
1354 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1355 return MI.getOperand(0).getReg() == Reg;
1356 }
1357
1358 return false;
1359 }
1360 default:
1361 return false;
1362 }
1363}
1364
1365unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1366
1367 if (RI.isAGPRClass(DstRC))
1368 return AMDGPU::COPY;
1369 if (RI.getRegSizeInBits(*DstRC) == 16) {
1370 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1371 // before RA.
1372 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1373 }
1374 if (RI.getRegSizeInBits(*DstRC) == 32)
1375 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1376 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1377 return AMDGPU::S_MOV_B64;
1378 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1379 return AMDGPU::V_MOV_B64_PSEUDO;
1380 return AMDGPU::COPY;
1381}
1382
1383const MCInstrDesc &
1384SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1385 bool IsIndirectSrc) const {
1386 if (IsIndirectSrc) {
1387 if (VecSize <= 32) // 4 bytes
1388 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1389 if (VecSize <= 64) // 8 bytes
1390 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1391 if (VecSize <= 96) // 12 bytes
1392 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1393 if (VecSize <= 128) // 16 bytes
1394 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1395 if (VecSize <= 160) // 20 bytes
1396 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1397 if (VecSize <= 256) // 32 bytes
1398 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1399 if (VecSize <= 288) // 36 bytes
1400 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1401 if (VecSize <= 320) // 40 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1403 if (VecSize <= 352) // 44 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1405 if (VecSize <= 384) // 48 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1407 if (VecSize <= 512) // 64 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1409 if (VecSize <= 1024) // 128 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1411
1412 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1413 }
1414
1415 if (VecSize <= 32) // 4 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1417 if (VecSize <= 64) // 8 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1419 if (VecSize <= 96) // 12 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1421 if (VecSize <= 128) // 16 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1423 if (VecSize <= 160) // 20 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1425 if (VecSize <= 256) // 32 bytes
1426 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1427 if (VecSize <= 288) // 36 bytes
1428 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1429 if (VecSize <= 320) // 40 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1431 if (VecSize <= 352) // 44 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1433 if (VecSize <= 384) // 48 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1435 if (VecSize <= 512) // 64 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1437 if (VecSize <= 1024) // 128 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1439
1440 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1441}
1442
1443static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1444 if (VecSize <= 32) // 4 bytes
1445 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1446 if (VecSize <= 64) // 8 bytes
1447 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1448 if (VecSize <= 96) // 12 bytes
1449 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1450 if (VecSize <= 128) // 16 bytes
1451 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1452 if (VecSize <= 160) // 20 bytes
1453 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1454 if (VecSize <= 256) // 32 bytes
1455 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1456 if (VecSize <= 288) // 36 bytes
1457 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1458 if (VecSize <= 320) // 40 bytes
1459 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1460 if (VecSize <= 352) // 44 bytes
1461 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1462 if (VecSize <= 384) // 48 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1464 if (VecSize <= 512) // 64 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1466 if (VecSize <= 1024) // 128 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1468
1469 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1470}
1471
1472static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1473 if (VecSize <= 32) // 4 bytes
1474 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1475 if (VecSize <= 64) // 8 bytes
1476 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1477 if (VecSize <= 96) // 12 bytes
1478 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1479 if (VecSize <= 128) // 16 bytes
1480 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1481 if (VecSize <= 160) // 20 bytes
1482 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1483 if (VecSize <= 256) // 32 bytes
1484 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1485 if (VecSize <= 288) // 36 bytes
1486 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1487 if (VecSize <= 320) // 40 bytes
1488 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1489 if (VecSize <= 352) // 44 bytes
1490 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1491 if (VecSize <= 384) // 48 bytes
1492 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1493 if (VecSize <= 512) // 64 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1495 if (VecSize <= 1024) // 128 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1497
1498 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1499}
1500
1501static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1502 if (VecSize <= 64) // 8 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1504 if (VecSize <= 128) // 16 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1506 if (VecSize <= 256) // 32 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1508 if (VecSize <= 512) // 64 bytes
1509 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1510 if (VecSize <= 1024) // 128 bytes
1511 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1512
1513 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1514}
1515
1516const MCInstrDesc &
1517SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1518 bool IsSGPR) const {
1519 if (IsSGPR) {
1520 switch (EltSize) {
1521 case 32:
1522 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1523 case 64:
1524 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1525 default:
1526 llvm_unreachable("invalid reg indexing elt size");
1527 }
1528 }
1529
1530 assert(EltSize == 32 && "invalid reg indexing elt size");
1531 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1532}
1533
1534static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1535 switch (Size) {
1536 case 4:
1537 return AMDGPU::SI_SPILL_S32_SAVE;
1538 case 8:
1539 return AMDGPU::SI_SPILL_S64_SAVE;
1540 case 12:
1541 return AMDGPU::SI_SPILL_S96_SAVE;
1542 case 16:
1543 return AMDGPU::SI_SPILL_S128_SAVE;
1544 case 20:
1545 return AMDGPU::SI_SPILL_S160_SAVE;
1546 case 24:
1547 return AMDGPU::SI_SPILL_S192_SAVE;
1548 case 28:
1549 return AMDGPU::SI_SPILL_S224_SAVE;
1550 case 32:
1551 return AMDGPU::SI_SPILL_S256_SAVE;
1552 case 36:
1553 return AMDGPU::SI_SPILL_S288_SAVE;
1554 case 40:
1555 return AMDGPU::SI_SPILL_S320_SAVE;
1556 case 44:
1557 return AMDGPU::SI_SPILL_S352_SAVE;
1558 case 48:
1559 return AMDGPU::SI_SPILL_S384_SAVE;
1560 case 64:
1561 return AMDGPU::SI_SPILL_S512_SAVE;
1562 case 128:
1563 return AMDGPU::SI_SPILL_S1024_SAVE;
1564 default:
1565 llvm_unreachable("unknown register size");
1566 }
1567}
1568
1569static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1570 switch (Size) {
1571 case 2:
1572 return AMDGPU::SI_SPILL_V16_SAVE;
1573 case 4:
1574 return AMDGPU::SI_SPILL_V32_SAVE;
1575 case 8:
1576 return AMDGPU::SI_SPILL_V64_SAVE;
1577 case 12:
1578 return AMDGPU::SI_SPILL_V96_SAVE;
1579 case 16:
1580 return AMDGPU::SI_SPILL_V128_SAVE;
1581 case 20:
1582 return AMDGPU::SI_SPILL_V160_SAVE;
1583 case 24:
1584 return AMDGPU::SI_SPILL_V192_SAVE;
1585 case 28:
1586 return AMDGPU::SI_SPILL_V224_SAVE;
1587 case 32:
1588 return AMDGPU::SI_SPILL_V256_SAVE;
1589 case 36:
1590 return AMDGPU::SI_SPILL_V288_SAVE;
1591 case 40:
1592 return AMDGPU::SI_SPILL_V320_SAVE;
1593 case 44:
1594 return AMDGPU::SI_SPILL_V352_SAVE;
1595 case 48:
1596 return AMDGPU::SI_SPILL_V384_SAVE;
1597 case 64:
1598 return AMDGPU::SI_SPILL_V512_SAVE;
1599 case 128:
1600 return AMDGPU::SI_SPILL_V1024_SAVE;
1601 default:
1602 llvm_unreachable("unknown register size");
1603 }
1604}
1605
1606static unsigned getAVSpillSaveOpcode(unsigned Size) {
1607 switch (Size) {
1608 case 4:
1609 return AMDGPU::SI_SPILL_AV32_SAVE;
1610 case 8:
1611 return AMDGPU::SI_SPILL_AV64_SAVE;
1612 case 12:
1613 return AMDGPU::SI_SPILL_AV96_SAVE;
1614 case 16:
1615 return AMDGPU::SI_SPILL_AV128_SAVE;
1616 case 20:
1617 return AMDGPU::SI_SPILL_AV160_SAVE;
1618 case 24:
1619 return AMDGPU::SI_SPILL_AV192_SAVE;
1620 case 28:
1621 return AMDGPU::SI_SPILL_AV224_SAVE;
1622 case 32:
1623 return AMDGPU::SI_SPILL_AV256_SAVE;
1624 case 36:
1625 return AMDGPU::SI_SPILL_AV288_SAVE;
1626 case 40:
1627 return AMDGPU::SI_SPILL_AV320_SAVE;
1628 case 44:
1629 return AMDGPU::SI_SPILL_AV352_SAVE;
1630 case 48:
1631 return AMDGPU::SI_SPILL_AV384_SAVE;
1632 case 64:
1633 return AMDGPU::SI_SPILL_AV512_SAVE;
1634 case 128:
1635 return AMDGPU::SI_SPILL_AV1024_SAVE;
1636 default:
1637 llvm_unreachable("unknown register size");
1638 }
1639}
1640
1641static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1642 bool IsVectorSuperClass) {
1643 // Currently, only 32-bit WWM register spills are needed.
1644 if (Size != 4)
1645 llvm_unreachable("unknown wwm register spill size");
1646
1647 if (IsVectorSuperClass)
1648 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1649
1650 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1651}
1652
1653unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1654 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1655 const SIMachineFunctionInfo &MFI) const {
1656 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1657
1658 // Choose the right opcode if spilling a WWM register.
1659 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1660 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1661
1662 // TODO: Check if AGPRs are available
1663 if (ST.hasMAIInsts())
1664 return getAVSpillSaveOpcode(Size);
1665
1667}
1668
1669void SIInstrInfo::storeRegToStackSlot(
1670 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1671 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1672 MachineInstr::MIFlag Flags) const {
1673 MachineFunction *MF = MBB.getParent();
1674 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1675 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1676 const DebugLoc &DL = MBB.findDebugLoc(MI);
1677
1678 MachinePointerInfo PtrInfo
1679 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1680 MachineMemOperand *MMO = MF->getMachineMemOperand(
1681 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1682 FrameInfo.getObjectAlign(FrameIndex));
1683 unsigned SpillSize = RI.getSpillSize(*RC);
1684
1685 MachineRegisterInfo &MRI = MF->getRegInfo();
1686 if (RI.isSGPRClass(RC)) {
1687 MFI->setHasSpilledSGPRs();
1688 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1689 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1690 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1691
1692 // We are only allowed to create one new instruction when spilling
1693 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1694 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1695
1696 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1697 // to make sure we are using the correct register class.
1698 if (SrcReg.isVirtual() && SpillSize == 4) {
1699 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1700 }
1701
1702 BuildMI(MBB, MI, DL, OpDesc)
1703 .addReg(SrcReg, getKillRegState(isKill)) // data
1704 .addFrameIndex(FrameIndex) // addr
1705 .addMemOperand(MMO)
1707
1708 if (RI.spillSGPRToVGPR())
1709 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1710 return;
1711 }
1712
1713 unsigned Opcode =
1714 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1715 MFI->setHasSpilledVGPRs();
1716
1717 BuildMI(MBB, MI, DL, get(Opcode))
1718 .addReg(SrcReg, getKillRegState(isKill)) // data
1719 .addFrameIndex(FrameIndex) // addr
1720 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1721 .addImm(0) // offset
1722 .addMemOperand(MMO);
1723}
1724
1725static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1726 switch (Size) {
1727 case 4:
1728 return AMDGPU::SI_SPILL_S32_RESTORE;
1729 case 8:
1730 return AMDGPU::SI_SPILL_S64_RESTORE;
1731 case 12:
1732 return AMDGPU::SI_SPILL_S96_RESTORE;
1733 case 16:
1734 return AMDGPU::SI_SPILL_S128_RESTORE;
1735 case 20:
1736 return AMDGPU::SI_SPILL_S160_RESTORE;
1737 case 24:
1738 return AMDGPU::SI_SPILL_S192_RESTORE;
1739 case 28:
1740 return AMDGPU::SI_SPILL_S224_RESTORE;
1741 case 32:
1742 return AMDGPU::SI_SPILL_S256_RESTORE;
1743 case 36:
1744 return AMDGPU::SI_SPILL_S288_RESTORE;
1745 case 40:
1746 return AMDGPU::SI_SPILL_S320_RESTORE;
1747 case 44:
1748 return AMDGPU::SI_SPILL_S352_RESTORE;
1749 case 48:
1750 return AMDGPU::SI_SPILL_S384_RESTORE;
1751 case 64:
1752 return AMDGPU::SI_SPILL_S512_RESTORE;
1753 case 128:
1754 return AMDGPU::SI_SPILL_S1024_RESTORE;
1755 default:
1756 llvm_unreachable("unknown register size");
1757 }
1758}
1759
1760static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1761 switch (Size) {
1762 case 2:
1763 return AMDGPU::SI_SPILL_V16_RESTORE;
1764 case 4:
1765 return AMDGPU::SI_SPILL_V32_RESTORE;
1766 case 8:
1767 return AMDGPU::SI_SPILL_V64_RESTORE;
1768 case 12:
1769 return AMDGPU::SI_SPILL_V96_RESTORE;
1770 case 16:
1771 return AMDGPU::SI_SPILL_V128_RESTORE;
1772 case 20:
1773 return AMDGPU::SI_SPILL_V160_RESTORE;
1774 case 24:
1775 return AMDGPU::SI_SPILL_V192_RESTORE;
1776 case 28:
1777 return AMDGPU::SI_SPILL_V224_RESTORE;
1778 case 32:
1779 return AMDGPU::SI_SPILL_V256_RESTORE;
1780 case 36:
1781 return AMDGPU::SI_SPILL_V288_RESTORE;
1782 case 40:
1783 return AMDGPU::SI_SPILL_V320_RESTORE;
1784 case 44:
1785 return AMDGPU::SI_SPILL_V352_RESTORE;
1786 case 48:
1787 return AMDGPU::SI_SPILL_V384_RESTORE;
1788 case 64:
1789 return AMDGPU::SI_SPILL_V512_RESTORE;
1790 case 128:
1791 return AMDGPU::SI_SPILL_V1024_RESTORE;
1792 default:
1793 llvm_unreachable("unknown register size");
1794 }
1795}
1796
1797static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1798 switch (Size) {
1799 case 4:
1800 return AMDGPU::SI_SPILL_AV32_RESTORE;
1801 case 8:
1802 return AMDGPU::SI_SPILL_AV64_RESTORE;
1803 case 12:
1804 return AMDGPU::SI_SPILL_AV96_RESTORE;
1805 case 16:
1806 return AMDGPU::SI_SPILL_AV128_RESTORE;
1807 case 20:
1808 return AMDGPU::SI_SPILL_AV160_RESTORE;
1809 case 24:
1810 return AMDGPU::SI_SPILL_AV192_RESTORE;
1811 case 28:
1812 return AMDGPU::SI_SPILL_AV224_RESTORE;
1813 case 32:
1814 return AMDGPU::SI_SPILL_AV256_RESTORE;
1815 case 36:
1816 return AMDGPU::SI_SPILL_AV288_RESTORE;
1817 case 40:
1818 return AMDGPU::SI_SPILL_AV320_RESTORE;
1819 case 44:
1820 return AMDGPU::SI_SPILL_AV352_RESTORE;
1821 case 48:
1822 return AMDGPU::SI_SPILL_AV384_RESTORE;
1823 case 64:
1824 return AMDGPU::SI_SPILL_AV512_RESTORE;
1825 case 128:
1826 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1827 default:
1828 llvm_unreachable("unknown register size");
1829 }
1830}
1831
1832static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1833 bool IsVectorSuperClass) {
1834 // Currently, only 32-bit WWM register spills are needed.
1835 if (Size != 4)
1836 llvm_unreachable("unknown wwm register spill size");
1837
1838 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1839 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1840
1841 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1842}
1843
1844unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1845 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1846 const SIMachineFunctionInfo &MFI) const {
1847 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1848
1849 // Choose the right opcode if restoring a WWM register.
1850 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1851 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1852
1853 // TODO: Check if AGPRs are available
1854 if (ST.hasMAIInsts())
1855 return getAVSpillRestoreOpcode(Size);
1856
1857 assert(!RI.isAGPRClass(RC));
1858 return getVGPRSpillRestoreOpcode(Size);
1859}
1860
1861void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1862 MachineBasicBlock::iterator MI,
1863 Register DestReg, int FrameIndex,
1864 const TargetRegisterClass *RC,
1865 Register VReg,
1866 MachineInstr::MIFlag Flags) const {
1867 MachineFunction *MF = MBB.getParent();
1868 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1869 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1870 const DebugLoc &DL = MBB.findDebugLoc(MI);
1871 unsigned SpillSize = RI.getSpillSize(*RC);
1872
1873 MachinePointerInfo PtrInfo
1874 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1875
1876 MachineMemOperand *MMO = MF->getMachineMemOperand(
1877 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1878 FrameInfo.getObjectAlign(FrameIndex));
1879
1880 if (RI.isSGPRClass(RC)) {
1881 MFI->setHasSpilledSGPRs();
1882 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1883 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1884 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1885
1886 // FIXME: Maybe this should not include a memoperand because it will be
1887 // lowered to non-memory instructions.
1888 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1889 if (DestReg.isVirtual() && SpillSize == 4) {
1890 MachineRegisterInfo &MRI = MF->getRegInfo();
1891 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1892 }
1893
1894 if (RI.spillSGPRToVGPR())
1895 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1896 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1897 .addFrameIndex(FrameIndex) // addr
1898 .addMemOperand(MMO)
1899 .setMIFlag(Flags);
1900
1901 return;
1902 }
1903
1904 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1905 SpillSize, *MFI);
1906 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1907 .addFrameIndex(FrameIndex) // vaddr
1908 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1909 .addImm(0) // offset
1910 .addMemOperand(MMO);
1911}
1912
1913void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1914 MachineBasicBlock::iterator MI) const {
1915 insertNoops(MBB, MI, 1);
1916}
1917
1918void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1919 MachineBasicBlock::iterator MI,
1920 unsigned Quantity) const {
1921 DebugLoc DL = MBB.findDebugLoc(MI);
1922 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1923 while (Quantity > 0) {
1924 unsigned Arg = std::min(Quantity, MaxSNopCount);
1925 Quantity -= Arg;
1926 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1927 }
1928}
1929
1930void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1931 auto *MF = MBB.getParent();
1932 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1933
1934 assert(Info->isEntryFunction());
1935
1936 if (MBB.succ_empty()) {
1937 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1938 if (HasNoTerminator) {
1939 if (Info->returnsVoid()) {
1940 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1941 } else {
1942 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1943 }
1944 }
1945 }
1946}
1947
1948MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1949 MachineBasicBlock &MBB,
1950 MachineInstr &MI,
1951 const DebugLoc &DL) const {
1952 MachineFunction *MF = MBB.getParent();
1953 constexpr unsigned DoorbellIDMask = 0x3ff;
1954 constexpr unsigned ECQueueWaveAbort = 0x400;
1955
1956 MachineBasicBlock *TrapBB = &MBB;
1957 MachineBasicBlock *ContBB = &MBB;
1958 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1959
1960 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1961 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1962 TrapBB = MF->CreateMachineBasicBlock();
1963 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1964 MF->push_back(TrapBB);
1965 MBB.addSuccessor(TrapBB);
1966 }
1967
1968 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1969 // this will be a nop.
1970 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1971 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1972 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1973 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1974 DoorbellReg)
1976 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
1977 .addUse(AMDGPU::M0);
1978 Register DoorbellRegMasked =
1979 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1980 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
1981 .addUse(DoorbellReg)
1982 .addImm(DoorbellIDMask);
1983 Register SetWaveAbortBit =
1984 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1985 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
1986 .addUse(DoorbellRegMasked)
1987 .addImm(ECQueueWaveAbort);
1988 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1989 .addUse(SetWaveAbortBit);
1990 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
1991 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
1992 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1993 .addUse(AMDGPU::TTMP2);
1994 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
1995 TrapBB->addSuccessor(HaltLoopBB);
1996
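// Keep the wave halted in a tight self-loop so it makes no further progress
// once the abort has been signalled.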
1997 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
1998 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
1999 .addMBB(HaltLoopBB);
2000 MF->push_back(HaltLoopBB);
2001 HaltLoopBB->addSuccessor(HaltLoopBB);
2002
2003 return ContBB;
2004}
2005
2006unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2007 switch (MI.getOpcode()) {
2008 default:
2009 if (MI.isMetaInstruction())
2010 return 0;
2011 return 1; // FIXME: Do wait states equal cycles?
2012
2013 case AMDGPU::S_NOP:
2014 return MI.getOperand(0).getImm() + 1;
2015 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2016 // hazard, even if one exists, won't really be visible. Should we handle it?
2017 }
2018}
2019
2020bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2021 MachineBasicBlock &MBB = *MI.getParent();
2022 DebugLoc DL = MBB.findDebugLoc(MI);
2024 switch (MI.getOpcode()) {
2025 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2026 case AMDGPU::S_MOV_B64_term:
2027 // This is only a terminator to get the correct spill code placement during
2028 // register allocation.
2029 MI.setDesc(get(AMDGPU::S_MOV_B64));
2030 break;
2031
2032 case AMDGPU::S_MOV_B32_term:
2033 // This is only a terminator to get the correct spill code placement during
2034 // register allocation.
2035 MI.setDesc(get(AMDGPU::S_MOV_B32));
2036 break;
2037
2038 case AMDGPU::S_XOR_B64_term:
2039 // This is only a terminator to get the correct spill code placement during
2040 // register allocation.
2041 MI.setDesc(get(AMDGPU::S_XOR_B64));
2042 break;
2043
2044 case AMDGPU::S_XOR_B32_term:
2045 // This is only a terminator to get the correct spill code placement during
2046 // register allocation.
2047 MI.setDesc(get(AMDGPU::S_XOR_B32));
2048 break;
2049 case AMDGPU::S_OR_B64_term:
2050 // This is only a terminator to get the correct spill code placement during
2051 // register allocation.
2052 MI.setDesc(get(AMDGPU::S_OR_B64));
2053 break;
2054 case AMDGPU::S_OR_B32_term:
2055 // This is only a terminator to get the correct spill code placement during
2056 // register allocation.
2057 MI.setDesc(get(AMDGPU::S_OR_B32));
2058 break;
2059
2060 case AMDGPU::S_ANDN2_B64_term:
2061 // This is only a terminator to get the correct spill code placement during
2062 // register allocation.
2063 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2064 break;
2065
2066 case AMDGPU::S_ANDN2_B32_term:
2067 // This is only a terminator to get the correct spill code placement during
2068 // register allocation.
2069 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2070 break;
2071
2072 case AMDGPU::S_AND_B64_term:
2073 // This is only a terminator to get the correct spill code placement during
2074 // register allocation.
2075 MI.setDesc(get(AMDGPU::S_AND_B64));
2076 break;
2077
2078 case AMDGPU::S_AND_B32_term:
2079 // This is only a terminator to get the correct spill code placement during
2080 // register allocation.
2081 MI.setDesc(get(AMDGPU::S_AND_B32));
2082 break;
2083
2084 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2085 // This is only a terminator to get the correct spill code placement during
2086 // register allocation.
2087 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2088 break;
2089
2090 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2091 // This is only a terminator to get the correct spill code placement during
2092 // register allocation.
2093 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2094 break;
2095
2096 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2097 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2098 break;
2099
2100 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2101 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2102 break;
2103 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2104 Register Dst = MI.getOperand(0).getReg();
2105 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2106 MI.setDesc(
2107 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2108 break;
2109 }
2110 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2111 Register Dst = MI.getOperand(0).getReg();
2112 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2113 int64_t Imm = MI.getOperand(1).getImm();
2114
2115 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2116 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2117 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2118 .addImm(SignExtend64<32>(Imm))
2119 .addReg(Dst, RegState::Implicit | RegState::Define);
2120 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2121 .addImm(SignExtend64<32>(Imm >> 32))
2122 .addReg(Dst, RegState::Implicit | RegState::Define);
2123 MI.eraseFromParent();
2124 break;
2125 }
2126
2127 [[fallthrough]];
2128 }
2129 case AMDGPU::V_MOV_B64_PSEUDO: {
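// Lower the 64-bit move pseudo: use a real v_mov_b64 or v_pk_mov_b32 when the
// subtarget provides one and the operand is legal for it, otherwise split the
// move into two 32-bit moves of the low and high halves.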
2130 Register Dst = MI.getOperand(0).getReg();
2131 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2132 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2133
2134 const MachineOperand &SrcOp = MI.getOperand(1);
2135 // FIXME: Will this work for 64-bit floating point immediates?
2136 assert(!SrcOp.isFPImm());
2137 if (ST.hasMovB64()) {
2138 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2139 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2140 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2141 break;
2142 }
2143 if (SrcOp.isImm()) {
2144 APInt Imm(64, SrcOp.getImm());
2145 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2146 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2147 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2148 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2149 .addImm(SISrcMods::OP_SEL_1)
2150 .addImm(Lo.getSExtValue())
2151 .addImm(SISrcMods::OP_SEL_1)
2152 .addImm(Lo.getSExtValue())
2153 .addImm(0) // op_sel_lo
2154 .addImm(0) // op_sel_hi
2155 .addImm(0) // neg_lo
2156 .addImm(0) // neg_hi
2157 .addImm(0); // clamp
2158 } else {
2159 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2160 .addImm(Lo.getSExtValue())
2161 .addReg(Dst, RegState::Implicit | RegState::Define);
2162 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2163 .addImm(Hi.getSExtValue())
2164 .addReg(Dst, RegState::Implicit | RegState::Define);
2165 }
2166 } else {
2167 assert(SrcOp.isReg());
2168 if (ST.hasPkMovB32() &&
2169 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2170 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2171 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2172 .addReg(SrcOp.getReg())
2173 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2174 .addReg(SrcOp.getReg())
2175 .addImm(0) // op_sel_lo
2176 .addImm(0) // op_sel_hi
2177 .addImm(0) // neg_lo
2178 .addImm(0) // neg_hi
2179 .addImm(0); // clamp
2180 } else {
2181 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2182 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2183 .addReg(Dst, RegState::Implicit | RegState::Define);
2184 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2185 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2186 .addReg(Dst, RegState::Implicit | RegState::Define);
2187 }
2188 }
2189 MI.eraseFromParent();
2190 break;
2191 }
2192 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2193 expandMovDPP64(MI);
2194 break;
2195 }
2196 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2197 const MachineOperand &SrcOp = MI.getOperand(1);
2198 assert(!SrcOp.isFPImm());
2199
2200 if (ST.has64BitLiterals()) {
2201 MI.setDesc(get(AMDGPU::S_MOV_B64));
2202 break;
2203 }
2204
2205 APInt Imm(64, SrcOp.getImm());
2206 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2207 MI.setDesc(get(AMDGPU::S_MOV_B64));
2208 break;
2209 }
2210
2211 Register Dst = MI.getOperand(0).getReg();
2212 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2213 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2214
2215 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2216 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2217 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2218 .addImm(Lo.getSExtValue())
2219 .addReg(Dst, RegState::Implicit | RegState::Define);
2220 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2221 .addImm(Hi.getSExtValue())
2222 .addReg(Dst, RegState::Implicit | RegState::Define);
2223 MI.eraseFromParent();
2224 break;
2225 }
2226 case AMDGPU::V_SET_INACTIVE_B32: {
2227 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2228 Register DstReg = MI.getOperand(0).getReg();
2229 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2230 .add(MI.getOperand(3))
2231 .add(MI.getOperand(4))
2232 .add(MI.getOperand(1))
2233 .add(MI.getOperand(2))
2234 .add(MI.getOperand(5));
2235 MI.eraseFromParent();
2236 break;
2237 }
2238 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2239 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2240 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2241 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2242 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2243 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2244 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2245 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2246 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2247 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2248 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2249 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2250 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2251 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2252 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2253 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2254 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2255 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2256 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2257 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2258 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2259 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2260 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2261 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2262 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2263 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2264 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2265 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2266 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2267 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2268
2269 unsigned Opc;
2270 if (RI.hasVGPRs(EltRC)) {
2271 Opc = AMDGPU::V_MOVRELD_B32_e32;
2272 } else {
2273 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2274 : AMDGPU::S_MOVRELD_B32;
2275 }
2276
2277 const MCInstrDesc &OpDesc = get(Opc);
2278 Register VecReg = MI.getOperand(0).getReg();
2279 bool IsUndef = MI.getOperand(1).isUndef();
2280 unsigned SubReg = MI.getOperand(3).getImm();
2281 assert(VecReg == MI.getOperand(1).getReg());
2282
2283 MachineInstrBuilder MIB =
2284 BuildMI(MBB, MI, DL, OpDesc)
2285 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2286 .add(MI.getOperand(2))
2287 .addReg(VecReg, RegState::ImplicitDefine)
2288 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2289
2290 const int ImpDefIdx =
2291 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2292 const int ImpUseIdx = ImpDefIdx + 1;
2293 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2294 MI.eraseFromParent();
2295 break;
2296 }
2297 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2298 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2305 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2306 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2307 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2308 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2309 assert(ST.useVGPRIndexMode());
2310 Register VecReg = MI.getOperand(0).getReg();
2311 bool IsUndef = MI.getOperand(1).isUndef();
2312 MachineOperand &Idx = MI.getOperand(3);
2313 Register SubReg = MI.getOperand(4).getImm();
2314
2315 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2316 .add(Idx)
2317 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2318 SetOn->getOperand(3).setIsUndef();
2319
2320 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2321 MachineInstrBuilder MIB =
2322 BuildMI(MBB, MI, DL, OpDesc)
2323 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2324 .add(MI.getOperand(2))
2325 .addReg(VecReg, RegState::ImplicitDefine)
2326 .addReg(VecReg,
2327 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2328
2329 const int ImpDefIdx =
2330 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2331 const int ImpUseIdx = ImpDefIdx + 1;
2332 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2333
2334 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2335
2336 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2337
2338 MI.eraseFromParent();
2339 break;
2340 }
2341 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2342 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2343 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2344 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2345 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2346 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2347 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2348 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2349 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2350 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2351 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2352 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2353 assert(ST.useVGPRIndexMode());
2354 Register Dst = MI.getOperand(0).getReg();
2355 Register VecReg = MI.getOperand(1).getReg();
2356 bool IsUndef = MI.getOperand(1).isUndef();
2357 Register Idx = MI.getOperand(2).getReg();
2358 Register SubReg = MI.getOperand(3).getImm();
2359
2360 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2361 .addReg(Idx)
2362 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2363 SetOn->getOperand(3).setIsUndef();
2364
2365 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2366 .addDef(Dst)
2367 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2368 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2369
2370 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2371
2372 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2373
2374 MI.eraseFromParent();
2375 break;
2376 }
2377 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2378 MachineFunction &MF = *MBB.getParent();
2379 Register Reg = MI.getOperand(0).getReg();
2380 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2381 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2382 MachineOperand OpLo = MI.getOperand(1);
2383 MachineOperand OpHi = MI.getOperand(2);
2384
2385 // Create a bundle so these instructions won't be re-ordered by the
2386 // post-RA scheduler.
2387 MIBundleBuilder Bundler(MBB, MI);
2388 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2389
2390 // What we want here is an offset from the value returned by s_getpc (which
2391 // is the address of the s_add_u32 instruction) to the global variable, but
2392 // since the encoding of $symbol starts 4 bytes after the start of the
2393 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2394 // small. This requires us to add 4 to the global variable offset in order
2395 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2396 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2397 // instruction.
2398
2399 int64_t Adjust = 0;
2400 if (ST.hasGetPCZeroExtension()) {
2401 // Fix up hardware that does not sign-extend the 48-bit PC value by
2402 // inserting: s_sext_i32_i16 reghi, reghi
2403 Bundler.append(
2404 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2405 Adjust += 4;
2406 }
2407
2408 if (OpLo.isGlobal())
2409 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2410 Bundler.append(
2411 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2412
2413 if (OpHi.isGlobal())
2414 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2415 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2416 .addReg(RegHi)
2417 .add(OpHi));
2418
2419 finalizeBundle(MBB, Bundler.begin());
2420
2421 MI.eraseFromParent();
2422 break;
2423 }
2424 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2425 MachineFunction &MF = *MBB.getParent();
2426 Register Reg = MI.getOperand(0).getReg();
2427 MachineOperand Op = MI.getOperand(1);
2428
2429 // Create a bundle so these instructions won't be re-ordered by the
2430 // post-RA scheduler.
2431 MIBundleBuilder Bundler(MBB, MI);
2432 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2433 if (Op.isGlobal())
2434 Op.setOffset(Op.getOffset() + 4);
2435 Bundler.append(
2436 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2437
2438 finalizeBundle(MBB, Bundler.begin());
2439
2440 MI.eraseFromParent();
2441 break;
2442 }
2443 case AMDGPU::ENTER_STRICT_WWM: {
2444 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2445 // Whole Wave Mode is entered.
2446 MI.setDesc(get(LMC.OrSaveExecOpc));
2447 break;
2448 }
2449 case AMDGPU::ENTER_STRICT_WQM: {
2450 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2451 // STRICT_WQM is entered.
2452 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2453 .addReg(LMC.ExecReg);
2454 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2455
2456 MI.eraseFromParent();
2457 break;
2458 }
2459 case AMDGPU::EXIT_STRICT_WWM:
2460 case AMDGPU::EXIT_STRICT_WQM: {
2461 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2462 // WWM/STRICT_WQM is exited.
2463 MI.setDesc(get(LMC.MovOpc));
2464 break;
2465 }
2466 case AMDGPU::SI_RETURN: {
2467 const MachineFunction *MF = MBB.getParent();
2468 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2469 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2470 // Hiding the return address use with SI_RETURN may lead to extra kills in
2471 // the function and missing live-ins. We are fine in practice because callee
2472 // saved register handling ensures the register value is restored before
2473 // RET, but we need the undef flag here to appease the MachineVerifier
2474 // liveness checks.
2475 MachineInstrBuilder MIB =
2476 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2477 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2478
2479 MIB.copyImplicitOps(MI);
2480 MI.eraseFromParent();
2481 break;
2482 }
2483
2484 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2485 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2486 MI.setDesc(get(AMDGPU::S_MUL_U64));
2487 break;
2488
2489 case AMDGPU::S_GETPC_B64_pseudo:
2490 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2491 if (ST.hasGetPCZeroExtension()) {
2492 Register Dst = MI.getOperand(0).getReg();
2493 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2494 // Fix up hardware that does not sign-extend the 48-bit PC value by
2495 // inserting: s_sext_i32_i16 dsthi, dsthi
2496 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2497 DstHi)
2498 .addReg(DstHi);
2499 }
2500 break;
2501
2502 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2503 assert(ST.hasBF16PackedInsts());
2504 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2505 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2506 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2507 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2508 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2509 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2510 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2511 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2512 break;
2513 }
2514
2515 return true;
2516}
2517
2518void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2519 MachineBasicBlock::iterator I, Register DestReg,
2520 unsigned SubIdx,
2521 const MachineInstr &Orig) const {
2522
2523 // Try shrinking the instruction to remat only the part needed for current
2524 // context.
2525 // TODO: Handle more cases.
2526 unsigned Opcode = Orig.getOpcode();
2527 switch (Opcode) {
2528 case AMDGPU::S_LOAD_DWORDX16_IMM:
2529 case AMDGPU::S_LOAD_DWORDX8_IMM: {
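// If the only user reads a single 128-bit or 256-bit subregister of the
// loaded value, rematerialize a narrower load that covers just that subrange
// and fold the subregister offset into the load offset.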
2530 if (SubIdx != 0)
2531 break;
2532
2533 if (I == MBB.end())
2534 break;
2535
2536 if (I->isBundled())
2537 break;
2538
2539 // Look for a single use of the register that is also a subreg.
2540 Register RegToFind = Orig.getOperand(0).getReg();
2541 MachineOperand *UseMO = nullptr;
2542 for (auto &CandMO : I->operands()) {
2543 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2544 continue;
2545 if (UseMO) {
2546 UseMO = nullptr;
2547 break;
2548 }
2549 UseMO = &CandMO;
2550 }
2551 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2552 break;
2553
2554 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2555 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2556
2557 MachineFunction *MF = MBB.getParent();
2558 MachineRegisterInfo &MRI = MF->getRegInfo();
2559 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2560
2561 unsigned NewOpcode = -1;
2562 if (SubregSize == 256)
2563 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2564 else if (SubregSize == 128)
2565 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2566 else
2567 break;
2568
2569 const MCInstrDesc &TID = get(NewOpcode);
2570 const TargetRegisterClass *NewRC =
2571 RI.getAllocatableClass(getRegClass(TID, 0));
2572 MRI.setRegClass(DestReg, NewRC);
2573
2574 UseMO->setReg(DestReg);
2575 UseMO->setSubReg(AMDGPU::NoSubRegister);
2576
2577 // Use a smaller load with the desired size, possibly with updated offset.
2578 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2579 MI->setDesc(TID);
2580 MI->getOperand(0).setReg(DestReg);
2581 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2582 if (Offset) {
2583 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2584 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2585 OffsetMO->setImm(FinalOffset);
2586 }
2587 SmallVector<MachineMemOperand *> NewMMOs;
2588 for (const MachineMemOperand *MemOp : Orig.memoperands())
2589 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2590 SubregSize / 8));
2591 MI->setMemRefs(*MF, NewMMOs);
2592
2593 MBB.insert(I, MI);
2594 return;
2595 }
2596
2597 default:
2598 break;
2599 }
2600
2601 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
2602}
2603
2604std::pair<MachineInstr*, MachineInstr*>
2605SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2606 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2607
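// Use a single v_mov_b64_dpp when the subtarget supports DPP on 64-bit ALU
// operations and the DPP control value is legal for it; otherwise split the
// pseudo into two v_mov_b32_dpp halves and recombine them with REG_SEQUENCE.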
2608 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2609 AMDGPU::isLegalDPALU_DPPControl(
2610 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2611 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2612 return std::pair(&MI, nullptr);
2613 }
2614
2615 MachineBasicBlock &MBB = *MI.getParent();
2616 DebugLoc DL = MBB.findDebugLoc(MI);
2617 MachineFunction *MF = MBB.getParent();
2618 MachineRegisterInfo &MRI = MF->getRegInfo();
2619 Register Dst = MI.getOperand(0).getReg();
2620 unsigned Part = 0;
2621 MachineInstr *Split[2];
2622
2623 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2624 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2625 if (Dst.isPhysical()) {
2626 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2627 } else {
2628 assert(MRI.isSSA());
2629 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2630 MovDPP.addDef(Tmp);
2631 }
2632
2633 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2634 const MachineOperand &SrcOp = MI.getOperand(I);
2635 assert(!SrcOp.isFPImm());
2636 if (SrcOp.isImm()) {
2637 APInt Imm(64, SrcOp.getImm());
2638 Imm.ashrInPlace(Part * 32);
2639 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2640 } else {
2641 assert(SrcOp.isReg());
2642 Register Src = SrcOp.getReg();
2643 if (Src.isPhysical())
2644 MovDPP.addReg(RI.getSubReg(Src, Sub));
2645 else
2646 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2647 }
2648 }
2649
2650 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2651 MovDPP.addImm(MO.getImm());
2652
2653 Split[Part] = MovDPP;
2654 ++Part;
2655 }
2656
2657 if (Dst.isVirtual())
2658 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2659 .addReg(Split[0]->getOperand(0).getReg())
2660 .addImm(AMDGPU::sub0)
2661 .addReg(Split[1]->getOperand(0).getReg())
2662 .addImm(AMDGPU::sub1);
2663
2664 MI.eraseFromParent();
2665 return std::pair(Split[0], Split[1]);
2666}
2667
2668std::optional<DestSourcePair>
2669SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2670 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2671 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2672
2673 return std::nullopt;
2674}
2675
2676bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2677 AMDGPU::OpName Src0OpName,
2678 MachineOperand &Src1,
2679 AMDGPU::OpName Src1OpName) const {
2680 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2681 if (!Src0Mods)
2682 return false;
2683
2684 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2685 assert(Src1Mods &&
2686 "All commutable instructions have both src0 and src1 modifiers");
2687
2688 int Src0ModsVal = Src0Mods->getImm();
2689 int Src1ModsVal = Src1Mods->getImm();
2690
2691 Src1Mods->setImm(Src0ModsVal);
2692 Src0Mods->setImm(Src1ModsVal);
2693 return true;
2694}
2695
2696static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2697 MachineOperand &RegOp,
2698 MachineOperand &NonRegOp) {
2699 Register Reg = RegOp.getReg();
2700 unsigned SubReg = RegOp.getSubReg();
2701 bool IsKill = RegOp.isKill();
2702 bool IsDead = RegOp.isDead();
2703 bool IsUndef = RegOp.isUndef();
2704 bool IsDebug = RegOp.isDebug();
2705
2706 if (NonRegOp.isImm())
2707 RegOp.ChangeToImmediate(NonRegOp.getImm());
2708 else if (NonRegOp.isFI())
2709 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2710 else if (NonRegOp.isGlobal()) {
2711 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2712 NonRegOp.getTargetFlags());
2713 } else
2714 return nullptr;
2715
2716 // Make sure we don't reinterpret a subreg index in the target flags.
2717 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2718
2719 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2720 NonRegOp.setSubReg(SubReg);
2721
2722 return &MI;
2723}
2724
2725static MachineInstr *swapImmOperands(MachineInstr &MI,
2726 MachineOperand &NonRegOp1,
2727 MachineOperand &NonRegOp2) {
2728 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2729 int64_t NonRegVal = NonRegOp1.getImm();
2730
2731 NonRegOp1.setImm(NonRegOp2.getImm());
2732 NonRegOp2.setImm(NonRegVal);
2733 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2734 NonRegOp2.setTargetFlags(TargetFlags);
2735 return &MI;
2736}
2737
2738bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2739 unsigned OpIdx1) const {
2740 const MCInstrDesc &InstDesc = MI.getDesc();
2741 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2742 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2743
2744 unsigned Opc = MI.getOpcode();
2745 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2746
2747 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2748 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2749
2750 // Check that the swap doesn't breach the constant bus or literal limits.
2751 // It may move a literal to a position other than src0, which is not allowed
2752 // pre-gfx10. However, most test cases need literals in Src0 for VOP.
2753 // FIXME: After gfx9, a literal can be in a place other than Src0.
2754 if (isVALU(MI)) {
2755 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2756 !isInlineConstant(MO0, OpInfo1))
2757 return false;
2758 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2759 !isInlineConstant(MO1, OpInfo0))
2760 return false;
2761 }
2762
2763 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2764 if (OpInfo1.RegClass == -1)
2765 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2766 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2767 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2768 }
2769 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2770 if (OpInfo0.RegClass == -1)
2771 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2772 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2773 isLegalRegOperand(MI, OpIdx0, MO1);
2774 }
2775
2776 // No need to check 64-bit literals since swapping does not bring new
2777 // 64-bit literals into the current instruction to fold to 32-bit.
2778
2779 return isImmOperandLegal(MI, OpIdx1, MO0);
2780}
2781
2782MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2783 unsigned Src0Idx,
2784 unsigned Src1Idx) const {
2785 assert(!NewMI && "this should never be used");
2786
2787 unsigned Opc = MI.getOpcode();
2788 int CommutedOpcode = commuteOpcode(Opc);
2789 if (CommutedOpcode == -1)
2790 return nullptr;
2791
2792 if (Src0Idx > Src1Idx)
2793 std::swap(Src0Idx, Src1Idx);
2794
2795 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2796 static_cast<int>(Src0Idx) &&
2797 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2798 static_cast<int>(Src1Idx) &&
2799 "inconsistency with findCommutedOpIndices");
2800
2801 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2802 return nullptr;
2803
2804 MachineInstr *CommutedMI = nullptr;
2805 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2806 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2807 if (Src0.isReg() && Src1.isReg()) {
2808 // Be sure to copy the source modifiers to the right place.
2809 CommutedMI =
2810 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2811 } else if (Src0.isReg() && !Src1.isReg()) {
2812 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2813 } else if (!Src0.isReg() && Src1.isReg()) {
2814 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2815 } else if (Src0.isImm() && Src1.isImm()) {
2816 CommutedMI = swapImmOperands(MI, Src0, Src1);
2817 } else {
2818 // FIXME: Found two non registers to commute. This does happen.
2819 return nullptr;
2820 }
2821
2822 if (CommutedMI) {
2823 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2824 Src1, AMDGPU::OpName::src1_modifiers);
2825
2826 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2827 AMDGPU::OpName::src1_sel);
2828
2829 CommutedMI->setDesc(get(CommutedOpcode));
2830 }
2831
2832 return CommutedMI;
2833}
2834
2835// This needs to be implemented because the source modifiers may be inserted
2836// between the true commutable operands, and the base
2837// TargetInstrInfo::commuteInstruction uses it.
2838bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2839 unsigned &SrcOpIdx0,
2840 unsigned &SrcOpIdx1) const {
2841 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2842}
2843
2844bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2845 unsigned &SrcOpIdx0,
2846 unsigned &SrcOpIdx1) const {
2847 if (!Desc.isCommutable())
2848 return false;
2849
2850 unsigned Opc = Desc.getOpcode();
2851 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2852 if (Src0Idx == -1)
2853 return false;
2854
2855 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2856 if (Src1Idx == -1)
2857 return false;
2858
2859 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2860}
2861
2862bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2863 int64_t BrOffset) const {
2864 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2865 // because its dest block is unanalyzable.
2866 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2867
2868 // Convert to dwords.
2869 BrOffset /= 4;
2870
2871 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2872 // from the next instruction.
2873 BrOffset -= 1;
2874
2875 return isIntN(BranchOffsetBits, BrOffset);
2876}
2877
2878MachineBasicBlock *
2879SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2880 return MI.getOperand(0).getMBB();
2881}
2882
2884 for (const MachineInstr &MI : MBB->terminators()) {
2885 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2886 MI.getOpcode() == AMDGPU::SI_LOOP)
2887 return true;
2888 }
2889 return false;
2890}
2891
2892void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2893 MachineBasicBlock &DestBB,
2894 MachineBasicBlock &RestoreBB,
2895 const DebugLoc &DL, int64_t BrOffset,
2896 RegScavenger *RS) const {
2897 assert(MBB.empty() &&
2898 "new block should be inserted for expanding unconditional branch");
2899 assert(MBB.pred_size() == 1);
2900 assert(RestoreBB.empty() &&
2901 "restore block should be inserted for restoring clobbered registers");
2902
2903 MachineFunction *MF = MBB.getParent();
2904 MachineRegisterInfo &MRI = MF->getRegInfo();
2905 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2906 auto I = MBB.end();
2907 auto &MCCtx = MF->getContext();
2908
2909 if (ST.hasAddPC64Inst()) {
2910 MCSymbol *Offset =
2911 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2912 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2913 .addSym(Offset, MO_FAR_BRANCH_OFFSET);
2914 MCSymbol *PostAddPCLabel =
2915 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2916 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2917 auto *OffsetExpr = MCBinaryExpr::createSub(
2918 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2919 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2920 Offset->setVariableValue(OffsetExpr);
2921 return;
2922 }
2923
2924 assert(RS && "RegScavenger required for long branching");
2925
2926 // FIXME: Virtual register workaround for RegScavenger not working with empty
2927 // blocks.
2928 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2929
2930 // Note: as this is used after hazard recognizer we need to apply some hazard
2931 // workarounds directly.
2932 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2933 ST.hasVALUReadSGPRHazard();
2934 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2935 if (FlushSGPRWrites)
2936 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2937 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2938 };
2939
2940 // We need to compute the offset relative to the instruction immediately after
2941 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2942 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2943 ApplyHazardWorkarounds();
2944
2945 MCSymbol *PostGetPCLabel =
2946 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2947 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2948
2949 MCSymbol *OffsetLo =
2950 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2951 MCSymbol *OffsetHi =
2952 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2953 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2954 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2955 .addReg(PCReg, 0, AMDGPU::sub0)
2956 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2957 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2958 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2959 .addReg(PCReg, 0, AMDGPU::sub1)
2960 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2961 ApplyHazardWorkarounds();
2962
2963 // Insert the indirect branch after the other terminator.
2964 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2965 .addReg(PCReg);
2966
2967 // If a spill is needed for the pc register pair, we need to insert a spill
2968 // restore block right before the destination block, and insert a short branch
2969 // into the old destination block's fallthrough predecessor.
2970 // e.g.:
2971 //
2972 // s_cbranch_scc0 skip_long_branch:
2973 //
2974 // long_branch_bb:
2975 // spill s[8:9]
2976 // s_getpc_b64 s[8:9]
2977 // s_add_u32 s8, s8, restore_bb
2978 // s_addc_u32 s9, s9, 0
2979 // s_setpc_b64 s[8:9]
2980 //
2981 // skip_long_branch:
2982 // foo;
2983 //
2984 // .....
2985 //
2986 // dest_bb_fallthrough_predecessor:
2987 // bar;
2988 // s_branch dest_bb
2989 //
2990 // restore_bb:
2991 // restore s[8:9]
2992 // fallthrough dest_bb
2993 //
2994 // dest_bb:
2995 // buzz;
2996
2997 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2998 Register Scav;
2999
3000 // If we've previously reserved a register for long branches,
3001 // avoid running the scavenger and just use that register.
3002 if (LongBranchReservedReg) {
3003 RS->enterBasicBlock(MBB);
3004 Scav = LongBranchReservedReg;
3005 } else {
3006 RS->enterBasicBlockEnd(MBB);
3007 Scav = RS->scavengeRegisterBackwards(
3008 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3009 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3010 }
3011 if (Scav) {
3012 RS->setRegUsed(Scav);
3013 MRI.replaceRegWith(PCReg, Scav);
3014 MRI.clearVirtRegs();
3015 } else {
3016 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3017 // SGPR spill.
3018 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3019 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3020 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3021 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3022 MRI.clearVirtRegs();
3023 }
3024
3025 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3026 // Now, the distance can be defined.
3027 auto *Offset = MCBinaryExpr::createSub(
3028 MCSymbolRefExpr::create(DestLabel, MCCtx),
3029 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3030 // Add offset assignments.
3031 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3032 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3033 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3034 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3035}
3036
3037unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3038 switch (Cond) {
3039 case SIInstrInfo::SCC_TRUE:
3040 return AMDGPU::S_CBRANCH_SCC1;
3041 case SIInstrInfo::SCC_FALSE:
3042 return AMDGPU::S_CBRANCH_SCC0;
3043 case SIInstrInfo::VCCNZ:
3044 return AMDGPU::S_CBRANCH_VCCNZ;
3045 case SIInstrInfo::VCCZ:
3046 return AMDGPU::S_CBRANCH_VCCZ;
3047 case SIInstrInfo::EXECNZ:
3048 return AMDGPU::S_CBRANCH_EXECNZ;
3049 case SIInstrInfo::EXECZ:
3050 return AMDGPU::S_CBRANCH_EXECZ;
3051 default:
3052 llvm_unreachable("invalid branch predicate");
3053 }
3054}
3055
3056SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3057 switch (Opcode) {
3058 case AMDGPU::S_CBRANCH_SCC0:
3059 return SCC_FALSE;
3060 case AMDGPU::S_CBRANCH_SCC1:
3061 return SCC_TRUE;
3062 case AMDGPU::S_CBRANCH_VCCNZ:
3063 return VCCNZ;
3064 case AMDGPU::S_CBRANCH_VCCZ:
3065 return VCCZ;
3066 case AMDGPU::S_CBRANCH_EXECNZ:
3067 return EXECNZ;
3068 case AMDGPU::S_CBRANCH_EXECZ:
3069 return EXECZ;
3070 default:
3071 return INVALID_BR;
3072 }
3073}
3074
3075bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3076 MachineBasicBlock::iterator I,
3077 MachineBasicBlock *&TBB,
3078 MachineBasicBlock *&FBB,
3079 SmallVectorImpl<MachineOperand> &Cond,
3080 bool AllowModify) const {
3081 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3082 // Unconditional Branch
3083 TBB = I->getOperand(0).getMBB();
3084 return false;
3085 }
3086
3087 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3088 if (Pred == INVALID_BR)
3089 return true;
3090
3091 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3092 Cond.push_back(MachineOperand::CreateImm(Pred));
3093 Cond.push_back(I->getOperand(1)); // Save the branch register.
3094
3095 ++I;
3096
3097 if (I == MBB.end()) {
3098 // Conditional branch followed by fall-through.
3099 TBB = CondBB;
3100 return false;
3101 }
3102
3103 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3104 TBB = CondBB;
3105 FBB = I->getOperand(0).getMBB();
3106 return false;
3107 }
3108
3109 return true;
3110}
3111
3112bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3113 MachineBasicBlock *&FBB,
3114 SmallVectorImpl<MachineOperand> &Cond,
3115 bool AllowModify) const {
3116 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3117 auto E = MBB.end();
3118 if (I == E)
3119 return false;
3120
3121 // Skip over the instructions that are artificially terminators for special
3122 // exec management.
3123 while (I != E && !I->isBranch() && !I->isReturn()) {
3124 switch (I->getOpcode()) {
3125 case AMDGPU::S_MOV_B64_term:
3126 case AMDGPU::S_XOR_B64_term:
3127 case AMDGPU::S_OR_B64_term:
3128 case AMDGPU::S_ANDN2_B64_term:
3129 case AMDGPU::S_AND_B64_term:
3130 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3131 case AMDGPU::S_MOV_B32_term:
3132 case AMDGPU::S_XOR_B32_term:
3133 case AMDGPU::S_OR_B32_term:
3134 case AMDGPU::S_ANDN2_B32_term:
3135 case AMDGPU::S_AND_B32_term:
3136 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3137 break;
3138 case AMDGPU::SI_IF:
3139 case AMDGPU::SI_ELSE:
3140 case AMDGPU::SI_KILL_I1_TERMINATOR:
3141 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3142 // FIXME: It's messy that these need to be considered here at all.
3143 return true;
3144 default:
3145 llvm_unreachable("unexpected non-branch terminator inst");
3146 }
3147
3148 ++I;
3149 }
3150
3151 if (I == E)
3152 return false;
3153
3154 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3155}
3156
3157unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3158 int *BytesRemoved) const {
3159 unsigned Count = 0;
3160 unsigned RemovedSize = 0;
3161 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3162 // Skip over artificial terminators when removing instructions.
3163 if (MI.isBranch() || MI.isReturn()) {
3164 RemovedSize += getInstSizeInBytes(MI);
3165 MI.eraseFromParent();
3166 ++Count;
3167 }
3168 }
3169
3170 if (BytesRemoved)
3171 *BytesRemoved = RemovedSize;
3172
3173 return Count;
3174}
3175
3176// Copy the flags onto the implicit condition register operand.
3177static void preserveCondRegFlags(MachineOperand &CondReg,
3178 const MachineOperand &OrigCond) {
3179 CondReg.setIsUndef(OrigCond.isUndef());
3180 CondReg.setIsKill(OrigCond.isKill());
3181}
3182
3183unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3184 MachineBasicBlock *TBB,
3185 MachineBasicBlock *FBB,
3186 ArrayRef<MachineOperand> Cond,
3187 const DebugLoc &DL,
3188 int *BytesAdded) const {
3189 if (!FBB && Cond.empty()) {
3190 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3191 .addMBB(TBB);
3192 if (BytesAdded)
3193 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3194 return 1;
3195 }
3196
3197 assert(TBB && Cond[0].isImm());
3198
3199 unsigned Opcode
3200 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3201
3202 if (!FBB) {
3203 MachineInstr *CondBr =
3204 BuildMI(&MBB, DL, get(Opcode))
3205 .addMBB(TBB);
3206
3207 // Copy the flags onto the implicit condition register operand.
3208 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3209 fixImplicitOperands(*CondBr);
3210
3211 if (BytesAdded)
3212 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3213 return 1;
3214 }
3215
3216 assert(TBB && FBB);
3217
3218 MachineInstr *CondBr =
3219 BuildMI(&MBB, DL, get(Opcode))
3220 .addMBB(TBB);
3221 fixImplicitOperands(*CondBr);
3222 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3223 .addMBB(FBB);
3224
3225 MachineOperand &CondReg = CondBr->getOperand(1);
3226 CondReg.setIsUndef(Cond[1].isUndef());
3227 CondReg.setIsKill(Cond[1].isKill());
3228
3229 if (BytesAdded)
3230 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3231
3232 return 2;
3233}
3234
3235bool SIInstrInfo::reverseBranchCondition(
3236 SmallVectorImpl<MachineOperand> &Cond) const {
3237 if (Cond.size() != 2) {
3238 return true;
3239 }
3240
3241 if (Cond[0].isImm()) {
3242 Cond[0].setImm(-Cond[0].getImm());
3243 return false;
3244 }
3245
3246 return true;
3247}
3248
3249bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3250 ArrayRef<MachineOperand> Cond,
3251 Register DstReg, Register TrueReg,
3252 Register FalseReg, int &CondCycles,
3253 int &TrueCycles, int &FalseCycles) const {
3254 switch (Cond[0].getImm()) {
3255 case VCCNZ:
3256 case VCCZ: {
3257 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3258 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3259 if (MRI.getRegClass(FalseReg) != RC)
3260 return false;
3261
3262 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3263 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3264
3265 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3266 return RI.hasVGPRs(RC) && NumInsts <= 6;
3267 }
3268 case SCC_TRUE:
3269 case SCC_FALSE: {
3270 // FIXME: We could insert for VGPRs if we could replace the original compare
3271 // with a vector one.
3272 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3273 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3274 if (MRI.getRegClass(FalseReg) != RC)
3275 return false;
3276
3277 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3278
3279 // Multiples of 8 can do s_cselect_b64
3280 if (NumInsts % 2 == 0)
3281 NumInsts /= 2;
3282
3283 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3284 return RI.isSGPRClass(RC);
3285 }
3286 default:
3287 return false;
3288 }
3289}
3290
3291void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3292 MachineBasicBlock::iterator I, const DebugLoc &DL,
3293 Register DstReg, ArrayRef<MachineOperand> Cond,
3294 Register TrueReg, Register FalseReg) const {
3295 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3296 if (Pred == VCCZ || Pred == SCC_FALSE) {
3297 Pred = static_cast<BranchPredicate>(-Pred);
3298 std::swap(TrueReg, FalseReg);
3299 }
3300
3301 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3302 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3303 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3304
3305 if (DstSize == 32) {
3306 MachineInstr *Select;
3307 if (Pred == SCC_TRUE) {
3308 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3309 .addReg(TrueReg)
3310 .addReg(FalseReg);
3311 } else {
3312 // Instruction's operands are backwards from what is expected.
3313 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3314 .addReg(FalseReg)
3315 .addReg(TrueReg);
3316 }
3317
3318 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3319 return;
3320 }
3321
3322 if (DstSize == 64 && Pred == SCC_TRUE) {
3323 MachineInstr *Select =
3324 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3325 .addReg(TrueReg)
3326 .addReg(FalseReg);
3327
3328 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3329 return;
3330 }
3331
3332 static const int16_t Sub0_15[] = {
3333 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3334 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3335 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3336 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3337 };
3338
3339 static const int16_t Sub0_15_64[] = {
3340 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3341 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3342 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3343 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3344 };
3345
3346 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3347 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3348 const int16_t *SubIndices = Sub0_15;
3349 int NElts = DstSize / 32;
3350
3351 // 64-bit select is only available for SALU.
3352 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3353 if (Pred == SCC_TRUE) {
3354 if (NElts % 2) {
3355 SelOp = AMDGPU::S_CSELECT_B32;
3356 EltRC = &AMDGPU::SGPR_32RegClass;
3357 } else {
3358 SelOp = AMDGPU::S_CSELECT_B64;
3359 EltRC = &AMDGPU::SGPR_64RegClass;
3360 SubIndices = Sub0_15_64;
3361 NElts /= 2;
3362 }
3363 }
3364
3365 MachineInstrBuilder MIB = BuildMI(
3366 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3367
3368 I = MIB->getIterator();
3369
3370 SmallVector<Register, 8> Regs;
3371 for (int Idx = 0; Idx != NElts; ++Idx) {
3372 Register DstElt = MRI.createVirtualRegister(EltRC);
3373 Regs.push_back(DstElt);
3374
3375 unsigned SubIdx = SubIndices[Idx];
3376
3377 MachineInstr *Select;
3378 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3379 Select =
3380 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3381 .addReg(FalseReg, 0, SubIdx)
3382 .addReg(TrueReg, 0, SubIdx);
3383 } else {
3384 Select =
3385 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3386 .addReg(TrueReg, 0, SubIdx)
3387 .addReg(FalseReg, 0, SubIdx);
3388 }
3389
3390 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3392
3393 MIB.addReg(DstElt)
3394 .addImm(SubIdx);
3395 }
3396}
3397
3398bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3399 switch (MI.getOpcode()) {
3400 case AMDGPU::V_MOV_B16_t16_e32:
3401 case AMDGPU::V_MOV_B16_t16_e64:
3402 case AMDGPU::V_MOV_B32_e32:
3403 case AMDGPU::V_MOV_B32_e64:
3404 case AMDGPU::V_MOV_B64_PSEUDO:
3405 case AMDGPU::V_MOV_B64_e32:
3406 case AMDGPU::V_MOV_B64_e64:
3407 case AMDGPU::S_MOV_B32:
3408 case AMDGPU::S_MOV_B64:
3409 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3410 case AMDGPU::COPY:
3411 case AMDGPU::WWM_COPY:
3412 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3413 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3414 case AMDGPU::V_ACCVGPR_MOV_B32:
3415 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3416 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3417 return true;
3418 default:
3419 return false;
3420 }
3421}
3422
3424 switch (MI.getOpcode()) {
3425 case AMDGPU::V_MOV_B16_t16_e32:
3426 case AMDGPU::V_MOV_B16_t16_e64:
3427 return 2;
3428 case AMDGPU::V_MOV_B32_e32:
3429 case AMDGPU::V_MOV_B32_e64:
3430 case AMDGPU::V_MOV_B64_PSEUDO:
3431 case AMDGPU::V_MOV_B64_e32:
3432 case AMDGPU::V_MOV_B64_e64:
3433 case AMDGPU::S_MOV_B32:
3434 case AMDGPU::S_MOV_B64:
3435 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3436 case AMDGPU::COPY:
3437 case AMDGPU::WWM_COPY:
3438 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3439 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3440 case AMDGPU::V_ACCVGPR_MOV_B32:
3441 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3442 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3443 return 1;
3444 default:
3445 llvm_unreachable("MI is not a foldable copy");
3446 }
3447}
3448
3449static constexpr AMDGPU::OpName ModifierOpNames[] = {
3450 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3451 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3452 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3453
3454void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3455 unsigned Opc = MI.getOpcode();
3456 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3457 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3458 if (Idx >= 0)
3459 MI.removeOperand(Idx);
3460 }
3461}
3462
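/// Return the slice of \p Imm selected by \p SubRegIndex, sign-extended to
/// 64 bits, or std::nullopt for a subregister index that is not handled.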
3463std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3464 unsigned SubRegIndex) {
3465 switch (SubRegIndex) {
3466 case AMDGPU::NoSubRegister:
3467 return Imm;
3468 case AMDGPU::sub0:
3469 return SignExtend64<32>(Imm);
3470 case AMDGPU::sub1:
3471 return SignExtend64<32>(Imm >> 32);
3472 case AMDGPU::lo16:
3473 return SignExtend64<16>(Imm);
3474 case AMDGPU::hi16:
3475 return SignExtend64<16>(Imm >> 16);
3476 case AMDGPU::sub1_lo16:
3477 return SignExtend64<16>(Imm >> 32);
3478 case AMDGPU::sub1_hi16:
3479 return SignExtend64<16>(Imm >> 48);
3480 default:
3481 return std::nullopt;
3482 }
3483
3484 llvm_unreachable("covered subregister switch");
3485}
3486
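// Map a MAC/MAD/FMA opcode to the corresponding MADAK/FMAAK form, where the
// constant K is the final addend operand.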
3487static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3488 switch (Opc) {
3489 case AMDGPU::V_MAC_F16_e32:
3490 case AMDGPU::V_MAC_F16_e64:
3491 case AMDGPU::V_MAD_F16_e64:
3492 return AMDGPU::V_MADAK_F16;
3493 case AMDGPU::V_MAC_F32_e32:
3494 case AMDGPU::V_MAC_F32_e64:
3495 case AMDGPU::V_MAD_F32_e64:
3496 return AMDGPU::V_MADAK_F32;
3497 case AMDGPU::V_FMAC_F32_e32:
3498 case AMDGPU::V_FMAC_F32_e64:
3499 case AMDGPU::V_FMA_F32_e64:
3500 return AMDGPU::V_FMAAK_F32;
3501 case AMDGPU::V_FMAC_F16_e32:
3502 case AMDGPU::V_FMAC_F16_e64:
3503 case AMDGPU::V_FMAC_F16_t16_e64:
3504 case AMDGPU::V_FMAC_F16_fake16_e64:
3505 case AMDGPU::V_FMA_F16_e64:
3506 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3507 ? AMDGPU::V_FMAAK_F16_t16
3508 : AMDGPU::V_FMAAK_F16_fake16
3509 : AMDGPU::V_FMAAK_F16;
3510 case AMDGPU::V_FMAC_F64_e32:
3511 case AMDGPU::V_FMAC_F64_e64:
3512 case AMDGPU::V_FMA_F64_e64:
3513 return AMDGPU::V_FMAAK_F64;
3514 default:
3515 llvm_unreachable("invalid instruction");
3516 }
3517}
3518
3519static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3520 switch (Opc) {
3521 case AMDGPU::V_MAC_F16_e32:
3522 case AMDGPU::V_MAC_F16_e64:
3523 case AMDGPU::V_MAD_F16_e64:
3524 return AMDGPU::V_MADMK_F16;
3525 case AMDGPU::V_MAC_F32_e32:
3526 case AMDGPU::V_MAC_F32_e64:
3527 case AMDGPU::V_MAD_F32_e64:
3528 return AMDGPU::V_MADMK_F32;
3529 case AMDGPU::V_FMAC_F32_e32:
3530 case AMDGPU::V_FMAC_F32_e64:
3531 case AMDGPU::V_FMA_F32_e64:
3532 return AMDGPU::V_FMAMK_F32;
3533 case AMDGPU::V_FMAC_F16_e32:
3534 case AMDGPU::V_FMAC_F16_e64:
3535 case AMDGPU::V_FMAC_F16_t16_e64:
3536 case AMDGPU::V_FMAC_F16_fake16_e64:
3537 case AMDGPU::V_FMA_F16_e64:
3538 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3539 ? AMDGPU::V_FMAMK_F16_t16
3540 : AMDGPU::V_FMAMK_F16_fake16
3541 : AMDGPU::V_FMAMK_F16;
3542 case AMDGPU::V_FMAC_F64_e32:
3543 case AMDGPU::V_FMAC_F64_e64:
3544 case AMDGPU::V_FMA_F64_e64:
3545 return AMDGPU::V_FMAMK_F64;
3546 default:
3547 llvm_unreachable("invalid instruction");
3548 }
3549}
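// The two fixed-constant forms differ only in which source the 32-bit
// literal K replaces, roughly:
//   v_fmaak_f32 vdst, src0, src1, K   ; vdst = src0 * src1 + K
//   v_fmamk_f32 vdst, src0, K, src1   ; vdst = src0 * K + src1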
3550
3551 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3552 Register Reg, MachineRegisterInfo *MRI) const {
3553 int64_t Imm;
3554 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3555 return false;
3556
3557 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3558
3559 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3560
3561 unsigned Opc = UseMI.getOpcode();
3562 if (Opc == AMDGPU::COPY) {
3563 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3564
3565 Register DstReg = UseMI.getOperand(0).getReg();
3566 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3567
3568 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3569
3570 if (HasMultipleUses) {
3571 // TODO: This should fold in more cases with multiple uses, but we need to
3572 // consider more carefully what those uses are.
3573 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3574
3575 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3576 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3577 return false;
3578
3579 // Most of the time folding a 32-bit inline constant is free (though this
3580 // might not be true if we can't later fold it into a real user).
3581 //
3582 // FIXME: This isInlineConstant check is imprecise if
3583 // getConstValDefinedInReg handled the tricky non-mov cases.
3584 if (ImmDefSize == 32 &&
3585 !isInlineConstant(APInt(32, Imm, /*isSigned=*/true)))
3586 return false;
3587 }
3588
3589 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3590 RI.getSubRegIdxSize(UseSubReg) == 16;
3591
3592 if (Is16Bit) {
3593 if (RI.hasVGPRs(DstRC))
3594 return false; // Do not clobber vgpr_hi16
3595
3596 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3597 return false;
3598 }
3599
3600 MachineFunction *MF = UseMI.getMF();
3601
3602 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3603 MCRegister MovDstPhysReg =
3604 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3605
3606 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3607
3608 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3609 for (unsigned MovOp :
3610 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3611 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3612 const MCInstrDesc &MovDesc = get(MovOp);
3613
3614 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3615 if (Is16Bit) {
3616 // We just need to find a correctly sized register class, so the
3617 // subregister index compatibility doesn't matter since we're statically
3618 // extracting the immediate value.
3619 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3620 if (!MovDstRC)
3621 continue;
3622
3623 if (MovDstPhysReg) {
3624 // FIXME: We probably should not do this. If there is a live value in
3625 // the high half of the register, it will be corrupted.
3626 MovDstPhysReg =
3627 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3628 if (!MovDstPhysReg)
3629 continue;
3630 }
3631 }
3632
3633 // Result class isn't the right size, try the next instruction.
3634 if (MovDstPhysReg) {
3635 if (!MovDstRC->contains(MovDstPhysReg))
3636 return false;
3637 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3638 // TODO: This will be overly conservative in the case of 16-bit virtual
3639 // SGPRs. We could hack up the virtual register uses to use a compatible
3640 // 32-bit class.
3641 continue;
3642 }
3643
3644 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3645
3646 // Ensure the interpreted immediate value is a valid operand in the new
3647 // mov.
3648 //
3649 // FIXME: isImmOperandLegal should have form that doesn't require existing
3650 // MachineInstr or MachineOperand
3651 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3652 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3653 break;
3654
3655 NewOpc = MovOp;
3656 break;
3657 }
3658
3659 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3660 return false;
3661
3662 if (Is16Bit) {
3663 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3664 if (MovDstPhysReg)
3665 UseMI.getOperand(0).setReg(MovDstPhysReg);
3666 assert(UseMI.getOperand(1).getReg().isVirtual());
3667 }
3668
3669 const MCInstrDesc &NewMCID = get(NewOpc);
3670 UseMI.setDesc(NewMCID);
3671 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3672 UseMI.addImplicitDefUseOperands(*MF);
3673 return true;
3674 }
3675
3676 if (HasMultipleUses)
3677 return false;
3678
3679 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3680 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3681 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3682 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3683 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3684 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3685 Opc == AMDGPU::V_FMAC_F64_e64) {
3686 // Don't fold if we are using source or output modifiers. The new VOP2
3687 // instructions don't have them.
3688 if (hasAnyModifiersSet(UseMI))
3689 return false;
3690
3691 // If this is a free constant, there's no reason to do this.
3692 // TODO: We could fold this here instead of letting SIFoldOperands do it
3693 // later.
3694 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3695
3696 // Any src operand can be used for the legality check.
3697 if (isInlineConstant(UseMI, Src0Idx, Imm))
3698 return false;
3699
3700 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3701
3702 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3703 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3704
3705 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3706 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3707 (Src1->isReg() && Src1->getReg() == Reg)) {
3708 MachineOperand *RegSrc =
3709 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3710 if (!RegSrc->isReg())
3711 return false;
3712 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3713 ST.getConstantBusLimit(Opc) < 2)
3714 return false;
3715
3716 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3717 return false;
3718
3719 // If src2 is also a literal constant then we have to choose which one to
3720 // fold. In general it is better to choose madak so that the other literal
3721 // can be materialized in an sgpr instead of a vgpr:
3722 // s_mov_b32 s0, literal
3723 // v_madak_f32 v0, s0, v0, literal
3724 // Instead of:
3725 // v_mov_b32 v1, literal
3726 // v_madmk_f32 v0, v0, literal, v1
3727 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3728 if (Def && Def->isMoveImmediate() &&
3729 !isInlineConstant(Def->getOperand(1)))
3730 return false;
3731
3732 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3733 if (pseudoToMCOpcode(NewOpc) == -1)
3734 return false;
3735
3736 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3737 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3738 // restricting their register classes. For now just bail out.
3739 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3740 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3741 return false;
3742
3743 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3744 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3745
3746 // FIXME: This would be a lot easier if we could return a new instruction
3747 // instead of having to modify in place.
3748
3749 Register SrcReg = RegSrc->getReg();
3750 unsigned SrcSubReg = RegSrc->getSubReg();
3751 Src0->setReg(SrcReg);
3752 Src0->setSubReg(SrcSubReg);
3753 Src0->setIsKill(RegSrc->isKill());
3754
3755 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3756 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3757 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3758 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3759 UseMI.untieRegOperand(
3760 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3761
3762 Src1->ChangeToImmediate(*SubRegImm);
3763
3764 removeModOperands(UseMI);
3765 UseMI.setDesc(get(NewOpc));
3766
3767 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3768 if (DeleteDef)
3769 DefMI.eraseFromParent();
3770
3771 return true;
3772 }
3773
3774 // Added part is the constant: Use v_madak_{f16, f32}.
3775 if (Src2->isReg() && Src2->getReg() == Reg) {
3776 if (ST.getConstantBusLimit(Opc) < 2) {
3777 // Not allowed to use constant bus for another operand.
3778 // We can however allow an inline immediate as src0.
3779 bool Src0Inlined = false;
3780 if (Src0->isReg()) {
3781 // Try to inline constant if possible.
3782 // If the def is a move-immediate and it has a single use, inlining it
3783 // here saves a VGPR.
3784 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3785 if (Def && Def->isMoveImmediate() &&
3786 isInlineConstant(Def->getOperand(1)) &&
3787 MRI->hasOneNonDBGUse(Src0->getReg())) {
3788 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3789 Src0Inlined = true;
3790 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3791 RI.isSGPRReg(*MRI, Src0->getReg())) {
3792 return false;
3793 }
3794 // VGPR is okay as Src0 - fallthrough
3795 }
3796
3797 if (Src1->isReg() && !Src0Inlined) {
3798 // We have one slot for inlinable constant so far - try to fill it
3799 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3800 if (Def && Def->isMoveImmediate() &&
3801 isInlineConstant(Def->getOperand(1)) &&
3802 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3803 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3804 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3805 return false;
3806 // VGPR is okay as Src1 - fallthrough
3807 }
3808 }
3809
3810 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3811 if (pseudoToMCOpcode(NewOpc) == -1)
3812 return false;
3813
3814 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3815 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3816 // restricting their register classes. For now just bail out.
3817 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3818 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3819 return false;
3820
3821 // FIXME: This would be a lot easier if we could return a new instruction
3822 // instead of having to modify in place.
3823
3824 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3825 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3826 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3827 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3828 UseMI.untieRegOperand(
3829 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3830
3831 const std::optional<int64_t> SubRegImm =
3832 extractSubregFromImm(Imm, Src2->getSubReg());
3833
3834 // ChangeToImmediate adds Src2 back to the instruction.
3835 Src2->ChangeToImmediate(*SubRegImm);
3836
3837 // These come before src2.
3838 removeModOperands(UseMI);
3839 UseMI.setDesc(get(NewOpc));
3840 // It might happen that UseMI was commuted and we now have an SGPR as
3841 // src1. If so, two inline constants plus an SGPR would be illegal, so
3842 // legalize the operands.
3843 legalizeOperands(UseMI);
3844
3845 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3846 if (DeleteDef)
3847 DefMI.eraseFromParent();
3848
3849 return true;
3850 }
3851 }
3852
3853 return false;
3854}
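// A minimal sketch of the COPY path above, assuming %imm has the single use
// shown (register classes illustrative):
//   %imm:sreg_64 = S_MOV_B64_IMM_PSEUDO 0x100000002
//   %use:sgpr_32 = COPY %imm.sub0
// folds to
//   %use:sgpr_32 = S_MOV_B32 2
// using the value extractSubregFromImm returns for sub0.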
3855
3856 static bool
3857 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3858 ArrayRef<const MachineOperand *> BaseOps2) {
3859 if (BaseOps1.size() != BaseOps2.size())
3860 return false;
3861 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3862 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3863 return false;
3864 }
3865 return true;
3866}
3867
3868static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3869 LocationSize WidthB, int OffsetB) {
3870 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3871 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3872 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3873 return LowWidth.hasValue() &&
3874 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3875}
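// For example, accesses at (OffsetA = 0, WidthA = 4) and (OffsetB = 4,
// WidthB = 8) do not overlap because 0 + 4 <= 4; with OffsetB = 2 they would.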
3876
3877bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3878 const MachineInstr &MIb) const {
3879 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3880 int64_t Offset0, Offset1;
3881 LocationSize Dummy0 = LocationSize::precise(0);
3882 LocationSize Dummy1 = LocationSize::precise(0);
3883 bool Offset0IsScalable, Offset1IsScalable;
3884 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3885 Dummy0, &RI) ||
3886 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3887 Dummy1, &RI))
3888 return false;
3889
3890 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3891 return false;
3892
3893 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3894 // FIXME: Handle ds_read2 / ds_write2.
3895 return false;
3896 }
3897 LocationSize Width0 = MIa.memoperands().front()->getSize();
3898 LocationSize Width1 = MIb.memoperands().front()->getSize();
3899 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3900}
3901
3902 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3903 const MachineInstr &MIb) const {
3904 assert(MIa.mayLoadOrStore() &&
3905 "MIa must load from or modify a memory location");
3906 assert(MIb.mayLoadOrStore() &&
3907 "MIb must load from or modify a memory location");
3908
3909 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3910 return false;
3911
3912 // XXX - Can we relax this between address spaces?
3913 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3914 return false;
3915
3916 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3917 return false;
3918
3919 if (MIa.isBundle() || MIb.isBundle())
3920 return false;
3921
3922 // TODO: Should we check the address space from the MachineMemOperand? That
3923 // would allow us to distinguish objects we know don't alias based on the
3924 // underlying address space, even if it was lowered to a different one,
3925 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3926 // buffer.
3927 if (isDS(MIa)) {
3928 if (isDS(MIb))
3929 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3930
3931 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3932 }
3933
3934 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3935 if (isMUBUF(MIb) || isMTBUF(MIb))
3936 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3937
3938 if (isFLAT(MIb))
3939 return isFLATScratch(MIb);
3940
3941 return !isSMRD(MIb);
3942 }
3943
3944 if (isSMRD(MIa)) {
3945 if (isSMRD(MIb))
3946 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3947
3948 if (isFLAT(MIb))
3949 return isFLATScratch(MIb);
3950
3951 return !isMUBUF(MIb) && !isMTBUF(MIb);
3952 }
3953
3954 if (isFLAT(MIa)) {
3955 if (isFLAT(MIb)) {
3956 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3957 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3958 return true;
3959
3960 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3961 }
3962
3963 return false;
3964 }
3965
3966 return false;
3967}
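// For example, a DS (LDS) access and a segment-specific global FLAT access
// are treated as trivially disjoint, while a DS access and a generic FLAT
// access are not, since a flat address may point into LDS.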
3968
3969 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3970 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3971 if (Reg.isPhysical())
3972 return false;
3973 auto *Def = MRI.getUniqueVRegDef(Reg);
3974 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3975 Imm = Def->getOperand(1).getImm();
3976 if (DefMI)
3977 *DefMI = Def;
3978 return true;
3979 }
3980 return false;
3981}
3982
3983static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3984 MachineInstr **DefMI = nullptr) {
3985 if (!MO->isReg())
3986 return false;
3987 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3988 const MachineRegisterInfo &MRI = MF->getRegInfo();
3989 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3990}
3991
3992 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3993 MachineInstr &NewMI) {
3994 if (LV) {
3995 unsigned NumOps = MI.getNumOperands();
3996 for (unsigned I = 1; I < NumOps; ++I) {
3997 MachineOperand &Op = MI.getOperand(I);
3998 if (Op.isReg() && Op.isKill())
3999 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4000 }
4001 }
4002}
4003
4004static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4005 switch (Opc) {
4006 case AMDGPU::V_MAC_F16_e32:
4007 case AMDGPU::V_MAC_F16_e64:
4008 return AMDGPU::V_MAD_F16_e64;
4009 case AMDGPU::V_MAC_F32_e32:
4010 case AMDGPU::V_MAC_F32_e64:
4011 return AMDGPU::V_MAD_F32_e64;
4012 case AMDGPU::V_MAC_LEGACY_F32_e32:
4013 case AMDGPU::V_MAC_LEGACY_F32_e64:
4014 return AMDGPU::V_MAD_LEGACY_F32_e64;
4015 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4016 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4017 return AMDGPU::V_FMA_LEGACY_F32_e64;
4018 case AMDGPU::V_FMAC_F16_e32:
4019 case AMDGPU::V_FMAC_F16_e64:
4020 case AMDGPU::V_FMAC_F16_t16_e64:
4021 case AMDGPU::V_FMAC_F16_fake16_e64:
4022 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4023 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4024 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4025 : AMDGPU::V_FMA_F16_gfx9_e64;
4026 case AMDGPU::V_FMAC_F32_e32:
4027 case AMDGPU::V_FMAC_F32_e64:
4028 return AMDGPU::V_FMA_F32_e64;
4029 case AMDGPU::V_FMAC_F64_e32:
4030 case AMDGPU::V_FMAC_F64_e64:
4031 return AMDGPU::V_FMA_F64_e64;
4032 default:
4033 llvm_unreachable("invalid instruction");
4034 }
4035}
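// For example, V_FMAC_F32_e32 (whose vdst is tied to the src2 accumulator)
// maps to V_FMA_F32_e64, which takes all three sources explicitly with no
// tied accumulator.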
4036
4037/// Helper struct for the implementation of 3-address conversion to communicate
4038/// updates made to instruction operands.
4039 struct ThreeAddressUpdates {
4040 /// Other instruction whose def is no longer used by the converted
4041 /// instruction.
4042 MachineInstr *RemoveMIUse = nullptr;
4043};
4044
4045 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4046 LiveVariables *LV,
4047 LiveIntervals *LIS) const {
4048 MachineBasicBlock &MBB = *MI.getParent();
4049 ThreeAddressUpdates U;
4050 MachineInstr *NewMI = convertToThreeAddressImpl(MI, U);
4051
4052 if (NewMI) {
4053 updateLiveVariables(LV, MI, *NewMI);
4054 if (LIS) {
4055 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4056 // SlotIndex of defs needs to be updated when converting to early-clobber
4057 MachineOperand &Def = NewMI->getOperand(0);
4058 if (Def.isEarlyClobber() && Def.isReg() &&
4059 LIS->hasInterval(Def.getReg())) {
4060 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4061 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4062 auto &LI = LIS->getInterval(Def.getReg());
4063 auto UpdateDefIndex = [&](LiveRange &LR) {
4064 auto *S = LR.find(OldIndex);
4065 if (S != LR.end() && S->start == OldIndex) {
4066 assert(S->valno && S->valno->def == OldIndex);
4067 S->start = NewIndex;
4068 S->valno->def = NewIndex;
4069 }
4070 };
4071 UpdateDefIndex(LI);
4072 for (auto &SR : LI.subranges())
4073 UpdateDefIndex(SR);
4074 }
4075 }
4076 }
4077
4078 if (U.RemoveMIUse) {
4079 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4080 // The only user is the instruction which will be killed.
4081 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4082
4083 if (MRI.hasOneNonDBGUse(DefReg)) {
4084 // We cannot just remove the DefMI here, calling pass will crash.
4085 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4086 U.RemoveMIUse->getOperand(0).setIsDead(true);
4087 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4088 U.RemoveMIUse->removeOperand(I);
4089 if (LV)
4090 LV->getVarInfo(DefReg).AliveBlocks.clear();
4091 }
4092
4093 if (LIS) {
4094 LiveInterval &DefLI = LIS->getInterval(DefReg);
4095
4096 // We cannot delete the original instruction here, so hack out the use
4097 // in the original instruction with a dummy register so we can use
4098 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4099 // not have the complexity of deleting a use to consider here.
4100 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4101 for (MachineOperand &MIOp : MI.uses()) {
4102 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4103 MIOp.setIsUndef(true);
4104 MIOp.setReg(DummyReg);
4105 }
4106 }
4107
4108 LIS->shrinkToUses(&DefLI);
4109 }
4110 }
4111
4112 return NewMI;
4113}
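// A minimal sketch of the fold done here, with modifiers omitted and %k
// having a single use:
//   %k:vgpr_32 = V_MOV_B32_e32 1077936128   ; 3.0f, not an inline constant
//   %d:vgpr_32 = V_FMAC_F32_e64 %a, %b, %k
// converts to
//   %d:vgpr_32 = V_FMAAK_F32 %a, %b, 1077936128
// after which %k's def is shrunk away (or rewritten to IMPLICIT_DEF).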
4114
4115 MachineInstr *
4116 SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4117 ThreeAddressUpdates &U) const {
4118 MachineBasicBlock &MBB = *MI.getParent();
4119 unsigned Opc = MI.getOpcode();
4120
4121 // Handle MFMA.
4122 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4123 if (NewMFMAOpc != -1) {
4124 MachineInstrBuilder MIB =
4125 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4126 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4127 MIB.add(MI.getOperand(I));
4128 return MIB;
4129 }
4130
4131 if (SIInstrInfo::isWMMA(MI)) {
4132 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4133 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4134 .setMIFlags(MI.getFlags());
4135 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4136 MIB->addOperand(MI.getOperand(I));
4137 return MIB;
4138 }
4139
4140 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4141 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4142 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4143 "present pre-RA");
4144
4145 // Handle MAC/FMAC.
4146 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4147 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4148 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4149 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4150 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4151 bool Src0Literal = false;
4152
4153 switch (Opc) {
4154 default:
4155 return nullptr;
4156 case AMDGPU::V_MAC_F16_e64:
4157 case AMDGPU::V_FMAC_F16_e64:
4158 case AMDGPU::V_FMAC_F16_t16_e64:
4159 case AMDGPU::V_FMAC_F16_fake16_e64:
4160 case AMDGPU::V_MAC_F32_e64:
4161 case AMDGPU::V_MAC_LEGACY_F32_e64:
4162 case AMDGPU::V_FMAC_F32_e64:
4163 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4164 case AMDGPU::V_FMAC_F64_e64:
4165 break;
4166 case AMDGPU::V_MAC_F16_e32:
4167 case AMDGPU::V_FMAC_F16_e32:
4168 case AMDGPU::V_MAC_F32_e32:
4169 case AMDGPU::V_MAC_LEGACY_F32_e32:
4170 case AMDGPU::V_FMAC_F32_e32:
4171 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4172 case AMDGPU::V_FMAC_F64_e32: {
4173 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4174 AMDGPU::OpName::src0);
4175 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4176 if (!Src0->isReg() && !Src0->isImm())
4177 return nullptr;
4178
4179 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4180 Src0Literal = true;
4181
4182 break;
4183 }
4184 }
4185
4186 MachineInstrBuilder MIB;
4187 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4188 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4189 const MachineOperand *Src0Mods =
4190 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4191 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4192 const MachineOperand *Src1Mods =
4193 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4194 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4195 const MachineOperand *Src2Mods =
4196 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4197 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4198 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4199 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4200
4201 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4202 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4203 // If we have an SGPR input, we will violate the constant bus restriction.
4204 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4205 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4206 MachineInstr *DefMI;
4207
4208 int64_t Imm;
4209 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4210 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4211 if (pseudoToMCOpcode(NewOpc) != -1) {
4212 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4213 .add(*Dst)
4214 .add(*Src0)
4215 .add(*Src1)
4216 .addImm(Imm)
4217 .setMIFlags(MI.getFlags());
4218 U.RemoveMIUse = DefMI;
4219 return MIB;
4220 }
4221 }
4222 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4223 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4224 if (pseudoToMCOpcode(NewOpc) != -1) {
4225 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4226 .add(*Dst)
4227 .add(*Src0)
4228 .addImm(Imm)
4229 .add(*Src2)
4230 .setMIFlags(MI.getFlags());
4231 U.RemoveMIUse = DefMI;
4232 return MIB;
4233 }
4234 }
4235 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4236 if (Src0Literal) {
4237 Imm = Src0->getImm();
4238 DefMI = nullptr;
4239 }
4240 if (pseudoToMCOpcode(NewOpc) != -1 &&
4241 isOperandLegal(
4242 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4243 Src1)) {
4244 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4245 .add(*Dst)
4246 .add(*Src1)
4247 .addImm(Imm)
4248 .add(*Src2)
4249 .setMIFlags(MI.getFlags());
4250 U.RemoveMIUse = DefMI;
4251 return MIB;
4252 }
4253 }
4254 }
4255
4256 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4257 // if VOP3 does not allow a literal operand.
4258 if (Src0Literal && !ST.hasVOP3Literal())
4259 return nullptr;
4260
4261 unsigned NewOpc = getNewFMAInst(ST, Opc);
4262
4263 if (pseudoToMCOpcode(NewOpc) == -1)
4264 return nullptr;
4265
4266 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4267 .add(*Dst)
4268 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4269 .add(*Src0)
4270 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4271 .add(*Src1)
4272 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4273 .add(*Src2)
4274 .addImm(Clamp ? Clamp->getImm() : 0)
4275 .addImm(Omod ? Omod->getImm() : 0)
4276 .setMIFlags(MI.getFlags());
4277 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4278 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4279 return MIB;
4280}
4281
4282// It's not generally safe to move VALU instructions across these since it will
4283// start using the register as a base index rather than directly.
4284// XXX - Why isn't hasSideEffects sufficient for these?
4285 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4286 switch (MI.getOpcode()) {
4287 case AMDGPU::S_SET_GPR_IDX_ON:
4288 case AMDGPU::S_SET_GPR_IDX_MODE:
4289 case AMDGPU::S_SET_GPR_IDX_OFF:
4290 return true;
4291 default:
4292 return false;
4293 }
4294}
4295
4296 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4297 const MachineBasicBlock *MBB,
4298 const MachineFunction &MF) const {
4299 // Skip the base implementation's check for SP writes; it was apparently
4300 // added due to compile-time concerns.
4301 //
4302 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4303 // but is probably avoidable.
4304
4305 // Copied from base implementation.
4306 // Terminators and labels can't be scheduled around.
4307 if (MI.isTerminator() || MI.isPosition())
4308 return true;
4309
4310 // INLINEASM_BR can jump to another block
4311 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4312 return true;
4313
4314 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4315 return true;
4316
4317 // Target-independent instructions do not have an implicit-use of EXEC, even
4318 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4319 // boundaries prevents incorrect movements of such instructions.
4320 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4321 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4322 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4323 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4324 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4325 changesVGPRIndexingMode(MI);
4326}
4327
4328 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4329 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4330 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4331 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4332}
4333
4334 bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4335 if (!isFLAT(MI) || isFLATGlobal(MI))
4336 return false;
4337
4338 // If scratch is not initialized, we can never access it.
4339 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4340 return false;
4341
4342 // SCRATCH instructions always access scratch.
4343 if (isFLATScratch(MI))
4344 return true;
4345
4346 // If there are no memory operands then conservatively assume the flat
4347 // operation may access scratch.
4348 if (MI.memoperands_empty())
4349 return true;
4350
4351 // See if any memory operand specifies an address space that involves scratch.
4352 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4353 unsigned AS = Memop->getAddrSpace();
4354 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4355 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4356 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4357 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4358 }
4359 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4360 });
4361}
4362
4363 bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4364 assert(isFLAT(MI));
4365
4366 // All flat instructions use the VMEM counter except prefetch.
4367 if (!usesVM_CNT(MI))
4368 return false;
4369
4370 // If there are no memory operands then conservatively assume the flat
4371 // operation may access VMEM.
4372 if (MI.memoperands_empty())
4373 return true;
4374
4375 // See if any memory operand specifies an address space that involves VMEM.
4376 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4377 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4378 // (GDS) address space is not supported by flat operations. Therefore, simply
4379 // return true unless only the LDS address space is found.
4380 for (const MachineMemOperand *Memop : MI.memoperands()) {
4381 unsigned AS = Memop->getAddrSpace();
4383 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4384 return true;
4385 }
4386
4387 return false;
4388}
4389
4390 bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4391 assert(isFLAT(MI));
4392
4393 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4394 if (!usesLGKM_CNT(MI))
4395 return false;
4396
4397 // If in tgsplit mode then there can be no use of LDS.
4398 if (ST.isTgSplitEnabled())
4399 return false;
4400
4401 // If there are no memory operands then conservatively assume the flat
4402 // operation may access LDS.
4403 if (MI.memoperands_empty())
4404 return true;
4405
4406 // See if any memory operand specifies an address space that involves LDS.
4407 for (const MachineMemOperand *Memop : MI.memoperands()) {
4408 unsigned AS = Memop->getAddrSpace();
4409 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4410 return true;
4411 }
4412
4413 return false;
4414}
4415
4416 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4417 // Skip the full operand and register alias search modifiesRegister
4418 // does. There's only a handful of instructions that touch this, it's only an
4419 // implicit def, and doesn't alias any other registers.
4420 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4421}
4422
4423 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4424 unsigned Opcode = MI.getOpcode();
4425
4426 if (MI.mayStore() && isSMRD(MI))
4427 return true; // scalar store or atomic
4428
4429 // This will terminate the function when other lanes may need to continue.
4430 if (MI.isReturn())
4431 return true;
4432
4433 // These instructions cause shader I/O that may cause hardware lockups
4434 // when executed with an empty EXEC mask.
4435 //
4436 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4437 // EXEC = 0, but checking for that case here seems not worth it
4438 // given the typical code patterns.
4439 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4440 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4441 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4442 return true;
4443
4444 if (MI.isCall() || MI.isInlineAsm())
4445 return true; // conservative assumption
4446
4447 // Assume that barrier interactions are only intended with active lanes.
4448 if (isBarrier(Opcode))
4449 return true;
4450
4451 // A mode change is a scalar operation that influences vector instructions.
4452 if (modifiesModeRegister(MI))
4453 return true;
4454
4455 // These are like SALU instructions in terms of effects, so it's questionable
4456 // whether we should return true for those.
4457 //
4458 // However, executing them with EXEC = 0 causes them to operate on undefined
4459 // data, which we avoid by returning true here.
4460 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4461 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4462 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4463 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4464 return true;
4465
4466 return false;
4467}
4468
4469 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4470 const MachineInstr &MI) const {
4471 if (MI.isMetaInstruction())
4472 return false;
4473
4474 // This won't read exec if this is an SGPR->SGPR copy.
4475 if (MI.isCopyLike()) {
4476 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4477 return true;
4478
4479 // Make sure this isn't copying exec as a normal operand
4480 return MI.readsRegister(AMDGPU::EXEC, &RI);
4481 }
4482
4483 // Make a conservative assumption about the callee.
4484 if (MI.isCall())
4485 return true;
4486
4487 // Be conservative with any unhandled generic opcodes.
4488 if (!isTargetSpecificOpcode(MI.getOpcode()))
4489 return true;
4490
4491 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4492}
4493
4494bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4495 switch (Imm.getBitWidth()) {
4496 case 1: // This likely will be a condition code mask.
4497 return true;
4498
4499 case 32:
4500 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4501 ST.hasInv2PiInlineImm());
4502 case 64:
4503 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4504 ST.hasInv2PiInlineImm());
4505 case 16:
4506 return ST.has16BitInsts() &&
4507 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4508 ST.hasInv2PiInlineImm());
4509 default:
4510 llvm_unreachable("invalid bitwidth");
4511 }
4512}
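// For reference, the 32-bit inline constants are the integers -16..64 plus
// +/-0.5, +/-1.0, +/-2.0, +/-4.0 and, when the subtarget has it, 1/(2*pi).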
4513
4514 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4515 APInt IntImm = Imm.bitcastToAPInt();
4516 int64_t IntImmVal = IntImm.getSExtValue();
4517 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4518 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4519 default:
4520 llvm_unreachable("invalid fltSemantics");
4521 case APFloat::S_IEEEsingle:
4522 case APFloat::S_IEEEdouble:
4523 return isInlineConstant(IntImm);
4524 case APFloat::S_BFloat:
4525 return ST.has16BitInsts() &&
4526 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4527 case APFloat::S_IEEEhalf:
4528 return ST.has16BitInsts() &&
4529 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4530 }
4531}
4532
4533bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4534 // MachineOperand provides no way to tell the true operand size, since it only
4535 // records a 64-bit value. We need to know the size to determine if a 32-bit
4536 // floating point immediate bit pattern is legal for an integer immediate. It
4537 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4538 switch (OperandType) {
4548 int32_t Trunc = static_cast<int32_t>(Imm);
4549 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4550 }
4556 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4559 // We would expect inline immediates to not be concerned with an integer/fp
4560 // distinction. However, in the case of 16-bit integer operations, the
4561 // "floating point" values appear to not work. It seems read the low 16-bits
4562 // of 32-bit immediates, which happens to always work for the integer
4563 // values.
4564 //
4565 // See llvm bugzilla 46302.
4566 //
4567 // TODO: Theoretically we could use op-sel to use the high bits of the
4568 // 32-bit FP values.
4580 return false;
4583 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4584 // A few special case instructions have 16-bit operands on subtargets
4585 // where 16-bit instructions are not legal.
4586 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4587 // constants in these cases
4588 int16_t Trunc = static_cast<int16_t>(Imm);
4589 return ST.has16BitInsts() &&
4590 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4591 }
4592
4593 return false;
4594 }
4597 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4598 int16_t Trunc = static_cast<int16_t>(Imm);
4599 return ST.has16BitInsts() &&
4600 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4601 }
4602 return false;
4603 }
4607 return false;
4609 return isLegalAV64PseudoImm(Imm);
4612 // Always embedded in the instruction for free.
4613 return true;
4623 // Just ignore anything else.
4624 return true;
4625 default:
4626 llvm_unreachable("invalid operand type");
4627 }
4628}
4629
4630static bool compareMachineOp(const MachineOperand &Op0,
4631 const MachineOperand &Op1) {
4632 if (Op0.getType() != Op1.getType())
4633 return false;
4634
4635 switch (Op0.getType()) {
4636 case MachineOperand::MO_Register:
4637 return Op0.getReg() == Op1.getReg();
4638 case MachineOperand::MO_Immediate:
4639 return Op0.getImm() == Op1.getImm();
4640 default:
4641 llvm_unreachable("Didn't expect to be comparing these operand types");
4642 }
4643}
4644
4645 bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4646 const MCOperandInfo &OpInfo) const {
4647 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4648 return true;
4649
4650 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4651 return false;
4652
4653 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4654 return true;
4655
4656 return ST.hasVOP3Literal();
4657}
4658
4659bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4660 int64_t ImmVal) const {
4661 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4662 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4663 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4664 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4665 AMDGPU::OpName::src2))
4666 return false;
4667 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4668 }
4669
4670 return isLiteralOperandLegal(InstDesc, OpInfo);
4671}
4672
4673bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4674 const MachineOperand &MO) const {
4675 if (MO.isImm())
4676 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4677
4678 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4679 "unexpected imm-like operand kind");
4680 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4681 return isLiteralOperandLegal(InstDesc, OpInfo);
4682}
4683
4684 bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4685 // 2 32-bit inline constants packed into one.
4686 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4687 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4688}
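// For example, 0x4000000040000000 is legal here (both halves are the inline
// constant 2.0f), while 0x3FF0000000000000 (double 1.0) is not, since its
// high half 0x3FF00000 is not an inline constant.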
4689
4690bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4691 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4692 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4693 return false;
4694
4695 int Op32 = AMDGPU::getVOPe32(Opcode);
4696 if (Op32 == -1)
4697 return false;
4698
4699 return pseudoToMCOpcode(Op32) != -1;
4700}
4701
4702bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4703 // The src0_modifier operand is present on all instructions
4704 // that have modifiers.
4705
4706 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4707}
4708
4709 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4710 AMDGPU::OpName OpName) const {
4711 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4712 return Mods && Mods->getImm();
4713}
4714
4715 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4716 return any_of(ModifierOpNames,
4717 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4718}
4719
4720 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4721 const MachineRegisterInfo &MRI) const {
4722 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4723 // Can't shrink instruction with three operands.
4724 if (Src2) {
4725 switch (MI.getOpcode()) {
4726 default: return false;
4727
4728 case AMDGPU::V_ADDC_U32_e64:
4729 case AMDGPU::V_SUBB_U32_e64:
4730 case AMDGPU::V_SUBBREV_U32_e64: {
4731 const MachineOperand *Src1
4732 = getNamedOperand(MI, AMDGPU::OpName::src1);
4733 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4734 return false;
4735 // Additional verification is needed for sdst/src2.
4736 return true;
4737 }
4738 case AMDGPU::V_MAC_F16_e64:
4739 case AMDGPU::V_MAC_F32_e64:
4740 case AMDGPU::V_MAC_LEGACY_F32_e64:
4741 case AMDGPU::V_FMAC_F16_e64:
4742 case AMDGPU::V_FMAC_F16_t16_e64:
4743 case AMDGPU::V_FMAC_F16_fake16_e64:
4744 case AMDGPU::V_FMAC_F32_e64:
4745 case AMDGPU::V_FMAC_F64_e64:
4746 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4747 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4748 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4749 return false;
4750 break;
4751
4752 case AMDGPU::V_CNDMASK_B32_e64:
4753 break;
4754 }
4755 }
4756
4757 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4758 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4759 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4760 return false;
4761
4762 // We don't need to check src0, all input types are legal, so just make sure
4763 // src0 isn't using any modifiers.
4764 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4765 return false;
4766
4767 // Can it be shrunk to a valid 32 bit opcode?
4768 if (!hasVALU32BitEncoding(MI.getOpcode()))
4769 return false;
4770
4771 // Check output modifiers
4772 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4773 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4774 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4775 // TODO: Can we avoid checking bound_ctrl/fi here?
4776 // They are only used by permlane*_swap special case.
4777 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4778 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4779}
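// For example, a V_ADD_F32_e64 with no source or output modifiers and a VGPR
// src1 can shrink to V_ADD_F32_e32; setting clamp or omod, or using an SGPR
// src1, blocks the shrink.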
4780
4781// Set VCC operand with all flags from \p Orig, except for setting it as
4782// implicit.
4783 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4784 const MachineOperand &Orig) {
4785
4786 for (MachineOperand &Use : MI.implicit_operands()) {
4787 if (Use.isUse() &&
4788 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4789 Use.setIsUndef(Orig.isUndef());
4790 Use.setIsKill(Orig.isKill());
4791 return;
4792 }
4793 }
4794}
4795
4796 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4797 unsigned Op32) const {
4798 MachineBasicBlock *MBB = MI.getParent();
4799
4800 const MCInstrDesc &Op32Desc = get(Op32);
4801 MachineInstrBuilder Inst32 =
4802 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4803 .setMIFlags(MI.getFlags());
4804
4805 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4806 // For VOPC instructions, this is replaced by an implicit def of vcc.
4807
4808 // We assume the defs of the shrunk opcode are in the same order, and the
4809 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4810 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4811 Inst32.add(MI.getOperand(I));
4812
4813 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4814
4815 int Idx = MI.getNumExplicitDefs();
4816 for (const MachineOperand &Use : MI.explicit_uses()) {
4817 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4818 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4819 continue;
4820
4821 if (&Use == Src2) {
4822 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4823 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4824 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4825 // of vcc was already added during the initial BuildMI, but we
4826 // 1) may need to change vcc to vcc_lo to preserve the original register
4827 // 2) have to preserve the original flags.
4828 copyFlagsToImplicitVCC(*Inst32, *Src2);
4829 continue;
4830 }
4831 }
4832
4833 Inst32.add(Use);
4834 }
4835
4836 // FIXME: Losing implicit operands
4837 fixImplicitOperands(*Inst32);
4838 return Inst32;
4839}
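// For example, shrinking V_CNDMASK_B32_e64 drops the explicit carry-in
// operand: the _e32 form reads it as an implicit use of vcc/vcc_lo, with the
// original operand's undef/kill flags copied over by copyFlagsToImplicitVCC.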
4840
4841 bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4842 // Null is free
4843 Register Reg = RegOp.getReg();
4844 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4845 return false;
4846
4847 // SGPRs use the constant bus
4848
4849 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4850 // physical register operands should also count, except for exec.
4851 if (RegOp.isImplicit())
4852 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4853
4854 // SGPRs use the constant bus
4855 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4856 AMDGPU::SReg_64RegClass.contains(Reg);
4857}
4858
4860 const MachineRegisterInfo &MRI) const {
4861 Register Reg = RegOp.getReg();
4862 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4863 : physRegUsesConstantBus(RegOp);
4864}
4865
4866 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4867 const MachineOperand &MO,
4868 const MCOperandInfo &OpInfo) const {
4869 // Literal constants use the constant bus.
4870 if (!MO.isReg())
4871 return !isInlineConstant(MO, OpInfo);
4872
4873 Register Reg = MO.getReg();
4874 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4875 : physRegUsesConstantBus(MO);
4876}
4877
4878 static Register findImplicitSGPRRead(const MachineInstr &MI) {
4879 for (const MachineOperand &MO : MI.implicit_operands()) {
4880 // We only care about reads.
4881 if (MO.isDef())
4882 continue;
4883
4884 switch (MO.getReg()) {
4885 case AMDGPU::VCC:
4886 case AMDGPU::VCC_LO:
4887 case AMDGPU::VCC_HI:
4888 case AMDGPU::M0:
4889 case AMDGPU::FLAT_SCR:
4890 return MO.getReg();
4891
4892 default:
4893 break;
4894 }
4895 }
4896
4897 return Register();
4898}
4899
4900static bool shouldReadExec(const MachineInstr &MI) {
4901 if (SIInstrInfo::isVALU(MI)) {
4902 switch (MI.getOpcode()) {
4903 case AMDGPU::V_READLANE_B32:
4904 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4905 case AMDGPU::V_WRITELANE_B32:
4906 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4907 return false;
4908 }
4909
4910 return true;
4911 }
4912
4913 if (MI.isPreISelOpcode() ||
4914 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4915 SIInstrInfo::isSALU(MI) ||
4916 SIInstrInfo::isSMRD(MI))
4917 return false;
4918
4919 return true;
4920}
4921
4922static bool isRegOrFI(const MachineOperand &MO) {
4923 return MO.isReg() || MO.isFI();
4924}
4925
4926static bool isSubRegOf(const SIRegisterInfo &TRI,
4927 const MachineOperand &SuperVec,
4928 const MachineOperand &SubReg) {
4929 if (SubReg.getReg().isPhysical())
4930 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4931
4932 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4933 SubReg.getReg() == SuperVec.getReg();
4934}
4935
4936// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4937bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4938 const MachineRegisterInfo &MRI,
4939 StringRef &ErrInfo) const {
4940 Register DstReg = MI.getOperand(0).getReg();
4941 Register SrcReg = MI.getOperand(1).getReg();
4942 // This is a check for copy from vector register to SGPR
4943 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4944 ErrInfo = "illegal copy from vector register to SGPR";
4945 return false;
4946 }
4947 return true;
4948}
4949
4950 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4951 StringRef &ErrInfo) const {
4952 uint16_t Opcode = MI.getOpcode();
4953 const MachineFunction *MF = MI.getParent()->getParent();
4954 const MachineRegisterInfo &MRI = MF->getRegInfo();
4955
4956 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4957 // Find a better property to recognize the point where instruction selection
4958 // is just done.
4959 // We can only enforce this check after SIFixSGPRCopies pass so that the
4960 // illegal copies are legalized and thereafter we don't expect a pass
4961 // inserting similar copies.
4962 if (!MRI.isSSA() && MI.isCopy())
4963 return verifyCopy(MI, MRI, ErrInfo);
4964
4965 if (SIInstrInfo::isGenericOpcode(Opcode))
4966 return true;
4967
4968 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4969 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4970 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4971 int Src3Idx = -1;
4972 if (Src0Idx == -1) {
4973 // VOPD V_DUAL_* instructions use different operand names.
4974 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4975 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4976 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4977 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4978 }
4979
4980 // Make sure the number of operands is correct.
4981 const MCInstrDesc &Desc = get(Opcode);
4982 if (!Desc.isVariadic() &&
4983 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4984 ErrInfo = "Instruction has wrong number of operands.";
4985 return false;
4986 }
4987
4988 if (MI.isInlineAsm()) {
4989 // Verify register classes for inlineasm constraints.
4990 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4991 I != E; ++I) {
4992 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4993 if (!RC)
4994 continue;
4995
4996 const MachineOperand &Op = MI.getOperand(I);
4997 if (!Op.isReg())
4998 continue;
4999
5000 Register Reg = Op.getReg();
5001 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5002 ErrInfo = "inlineasm operand has incorrect register class.";
5003 return false;
5004 }
5005 }
5006
5007 return true;
5008 }
5009
5010 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5011 ErrInfo = "missing memory operand from image instruction.";
5012 return false;
5013 }
5014
5015 // Make sure the register classes are correct.
5016 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5017 const MachineOperand &MO = MI.getOperand(i);
5018 if (MO.isFPImm()) {
5019 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5020 "all fp values to integers.";
5021 return false;
5022 }
5023
5024 const MCOperandInfo &OpInfo = Desc.operands()[i];
5025 int16_t RegClass = getOpRegClassID(OpInfo);
5026
5027 switch (OpInfo.OperandType) {
5028 case MCOI::OPERAND_REGISTER:
5029 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5030 ErrInfo = "Illegal immediate value for operand.";
5031 return false;
5032 }
5033 break;
5046 break;
5048 break;
5049 break;
5063 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5064 ErrInfo = "Illegal immediate value for operand.";
5065 return false;
5066 }
5067 break;
5068 }
5070 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5071 ErrInfo = "Expected inline constant for operand.";
5072 return false;
5073 }
5074 break;
5078 break;
5083 // Check if this operand is an immediate.
5084 // FrameIndex operands will be replaced by immediates, so they are
5085 // allowed.
5086 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5087 ErrInfo = "Expected immediate, but got non-immediate";
5088 return false;
5089 }
5090 break;
5094 break;
5095 default:
5096 if (OpInfo.isGenericType())
5097 continue;
5098 break;
5099 }
5100
5101 if (!MO.isReg())
5102 continue;
5103 Register Reg = MO.getReg();
5104 if (!Reg)
5105 continue;
5106
5107 // FIXME: Ideally we would have separate instruction definitions with the
5108 // aligned register constraint.
5109 // FIXME: We do not verify inline asm operands, but custom inline asm
5110 // verification is broken anyway
5111 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5112 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5113 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5114 if (const TargetRegisterClass *SubRC =
5115 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5116 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5117 if (RC)
5118 RC = SubRC;
5119 }
5120 }
5121
5122 // Check that this is the aligned version of the class.
5123 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5124 ErrInfo = "Subtarget requires even aligned vector registers";
5125 return false;
5126 }
5127 }
5128
5129 if (RegClass != -1) {
5130 if (Reg.isVirtual())
5131 continue;
5132
5133 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5134 if (!RC->contains(Reg)) {
5135 ErrInfo = "Operand has incorrect register class.";
5136 return false;
5137 }
5138 }
5139 }
5140
5141 // Verify SDWA
5142 if (isSDWA(MI)) {
5143 if (!ST.hasSDWA()) {
5144 ErrInfo = "SDWA is not supported on this target";
5145 return false;
5146 }
5147
5148 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5149 AMDGPU::OpName::dst_sel}) {
5150 const MachineOperand *MO = getNamedOperand(MI, Op);
5151 if (!MO)
5152 continue;
5153 int64_t Imm = MO->getImm();
5154 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5155 ErrInfo = "Invalid SDWA selection";
5156 return false;
5157 }
5158 }
5159
5160 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5161
5162 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5163 if (OpIdx == -1)
5164 continue;
5165 const MachineOperand &MO = MI.getOperand(OpIdx);
5166
5167 if (!ST.hasSDWAScalar()) {
5168 // Only VGPRS on VI
5169 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5170 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5171 return false;
5172 }
5173 } else {
5174 // No immediates on GFX9
5175 if (!MO.isReg()) {
5176 ErrInfo =
5177 "Only reg allowed as operands in SDWA instructions on GFX9+";
5178 return false;
5179 }
5180 }
5181 }
5182
5183 if (!ST.hasSDWAOmod()) {
5184 // No omod allowed on VI
5185 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5186 if (OMod != nullptr &&
5187 (!OMod->isImm() || OMod->getImm() != 0)) {
5188 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5189 return false;
5190 }
5191 }
5192
5193 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5194 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5195 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5196 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5197 const MachineOperand *Src0ModsMO =
5198 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5199 unsigned Mods = Src0ModsMO->getImm();
5200 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5201 Mods & SISrcMods::SEXT) {
5202 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5203 return false;
5204 }
5205 }
5206
5207 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5208 if (isVOPC(BasicOpcode)) {
5209 if (!ST.hasSDWASdst() && DstIdx != -1) {
5210 // Only vcc allowed as dst on VI for VOPC
5211 const MachineOperand &Dst = MI.getOperand(DstIdx);
5212 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5213 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5214 return false;
5215 }
5216 } else if (!ST.hasSDWAOutModsVOPC()) {
5217 // No clamp allowed on GFX9 for VOPC
5218 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5219 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5220 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5221 return false;
5222 }
5223
5224 // No omod allowed on GFX9 for VOPC
5225 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5226 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5227 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5228 return false;
5229 }
5230 }
5231 }
5232
5233 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5234 if (DstUnused && DstUnused->isImm() &&
5235 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5236 const MachineOperand &Dst = MI.getOperand(DstIdx);
5237 if (!Dst.isReg() || !Dst.isTied()) {
5238 ErrInfo = "Dst register should have tied register";
5239 return false;
5240 }
5241
5242 const MachineOperand &TiedMO =
5243 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5244 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5245 ErrInfo =
5246 "Dst register should be tied to implicit use of preserved register";
5247 return false;
5248 }
5249 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5250 ErrInfo = "Dst register should use same physical register as preserved";
5251 return false;
5252 }
5253 }
5254 }
5255
5256 // Verify MIMG / VIMAGE / VSAMPLE
5257 if (isImage(Opcode) && !MI.mayStore()) {
5258 // Ensure that the return type used is large enough for all the options
5259 // being used TFE/LWE require an extra result register.
5260 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5261 if (DMask) {
5262 uint64_t DMaskImm = DMask->getImm();
5263 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5264 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5265 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5266 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5267
5268 // Adjust for packed 16 bit values
5269 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5270 RegCount = divideCeil(RegCount, 2);
5271
5272 // Adjust if using LWE or TFE
5273 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5274 RegCount += 1;
5275
5276 const uint32_t DstIdx =
5277 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5278 const MachineOperand &Dst = MI.getOperand(DstIdx);
5279 if (Dst.isReg()) {
5280 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5281 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5282 if (RegCount > DstSize) {
5283 ErrInfo = "Image instruction returns too many registers for dst "
5284 "register class";
5285 return false;
5286 }
5287 }
5288 }
5289 }
5290
5291 // Verify VOP*. Ignore multiple sgpr operands on writelane.
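 // The "constant bus" is the scalar read path shared by SGPRs, literals and
 // special registers such as M0 or VCC. For example, before GFX10 a single VOP
 // instruction may perform only one such read, so using two different SGPRs
 // (or an SGPR plus a literal) is rejected by the checks below.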
5292 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5293 unsigned ConstantBusCount = 0;
5294 bool UsesLiteral = false;
5295 const MachineOperand *LiteralVal = nullptr;
5296
5297 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5298 if (ImmIdx != -1) {
5299 ++ConstantBusCount;
5300 UsesLiteral = true;
5301 LiteralVal = &MI.getOperand(ImmIdx);
5302 }
5303
5304 SmallVector<Register, 2> SGPRsUsed;
5305 Register SGPRUsed;
5306
5307 // Only look at the true operands. Only a real operand can use the constant
5308 // bus, and we don't want to check pseudo-operands like the source modifier
5309 // flags.
5310 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5311 if (OpIdx == -1)
5312 continue;
5313 const MachineOperand &MO = MI.getOperand(OpIdx);
5314 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5315 if (MO.isReg()) {
5316 SGPRUsed = MO.getReg();
5317 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5318 ++ConstantBusCount;
5319 SGPRsUsed.push_back(SGPRUsed);
5320 }
5321 } else if (!MO.isFI()) { // Treat FI like a register.
5322 if (!UsesLiteral) {
5323 ++ConstantBusCount;
5324 UsesLiteral = true;
5325 LiteralVal = &MO;
5326 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5327 assert(isVOP2(MI) || isVOP3(MI));
5328 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5329 return false;
5330 }
5331 }
5332 }
5333 }
5334
5335 SGPRUsed = findImplicitSGPRRead(MI);
5336 if (SGPRUsed) {
5337 // Implicit uses may safely overlap true operands
5338 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5339 return !RI.regsOverlap(SGPRUsed, SGPR);
5340 })) {
5341 ++ConstantBusCount;
5342 SGPRsUsed.push_back(SGPRUsed);
5343 }
5344 }
5345
5346 // v_writelane_b32 is an exception to the constant bus restriction:
5347 // vsrc0 can be an sgpr, a constant or m0, and the lane select an sgpr, m0 or an inline constant.
5348 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5349 Opcode != AMDGPU::V_WRITELANE_B32) {
5350 ErrInfo = "VOP* instruction violates constant bus restriction";
5351 return false;
5352 }
5353
5354 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5355 ErrInfo = "VOP3 instruction uses literal";
5356 return false;
5357 }
5358 }
5359
5360 // Special case for writelane - this can break the multiple constant bus rule,
5361 // but still can't use more than one SGPR register
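 // For example, with a constant bus limit of 1, "v_writelane_b32 v0, s1, s2"
 // uses two different SGPRs and is rejected, while "v_writelane_b32 v0, s1, m0"
 // is fine because m0 is not counted here.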
5362 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5363 unsigned SGPRCount = 0;
5364 Register SGPRUsed;
5365
5366 for (int OpIdx : {Src0Idx, Src1Idx}) {
5367 if (OpIdx == -1)
5368 break;
5369
5370 const MachineOperand &MO = MI.getOperand(OpIdx);
5371
5372 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5373 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5374 if (MO.getReg() != SGPRUsed)
5375 ++SGPRCount;
5376 SGPRUsed = MO.getReg();
5377 }
5378 }
5379 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5380 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5381 return false;
5382 }
5383 }
5384 }
5385
5386 // Verify misc. restrictions on specific instructions.
5387 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5388 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5389 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5390 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5391 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5392 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5393 if (!compareMachineOp(Src0, Src1) &&
5394 !compareMachineOp(Src0, Src2)) {
5395 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5396 return false;
5397 }
5398 }
5399 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5400 SISrcMods::ABS) ||
5401 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5402 SISrcMods::ABS) ||
5403 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5404 SISrcMods::ABS)) {
5405 ErrInfo = "ABS not allowed in VOP3B instructions";
5406 return false;
5407 }
5408 }
5409
5410 if (isSOP2(MI) || isSOPC(MI)) {
5411 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5412 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5413
5414 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5415 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5416 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5417 !Src0.isIdenticalTo(Src1)) {
5418 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5419 return false;
5420 }
5421 }
5422
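 // SOPK instructions encode a 16-bit immediate. For example, the sign-extended
 // forms accept -32768..32767 while the zero-extended forms (sopkIsZext)
 // accept 0..65535; branch forms instead take a basic block target.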
5423 if (isSOPK(MI)) {
5424 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5425 if (Desc.isBranch()) {
5426 if (!Op->isMBB()) {
5427 ErrInfo = "invalid branch target for SOPK instruction";
5428 return false;
5429 }
5430 } else {
5431 uint64_t Imm = Op->getImm();
5432 if (sopkIsZext(Opcode)) {
5433 if (!isUInt<16>(Imm)) {
5434 ErrInfo = "invalid immediate for SOPK instruction";
5435 return false;
5436 }
5437 } else {
5438 if (!isInt<16>(Imm)) {
5439 ErrInfo = "invalid immediate for SOPK instruction";
5440 return false;
5441 }
5442 }
5443 }
5444 }
5445
5446 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5447 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5448 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5449 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5450 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5451 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5452
5453 const unsigned StaticNumOps =
5454 Desc.getNumOperands() + Desc.implicit_uses().size();
5455 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5456
5457 // Allow additional implicit operands. This allows a fixup done by the post
5458 // RA scheduler where the main implicit operand is killed and implicit-defs
5459 // are added for sub-registers that remain live after this instruction.
5460 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5461 ErrInfo = "missing implicit register operands";
5462 return false;
5463 }
5464
5465 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5466 if (IsDst) {
5467 if (!Dst->isUse()) {
5468 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5469 return false;
5470 }
5471
5472 unsigned UseOpIdx;
5473 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5474 UseOpIdx != StaticNumOps + 1) {
5475 ErrInfo = "movrel implicit operands should be tied";
5476 return false;
5477 }
5478 }
5479
5480 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5481 const MachineOperand &ImpUse
5482 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5483 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5484 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5485 ErrInfo = "src0 should be subreg of implicit vector use";
5486 return false;
5487 }
5488 }
5489
5490 // Make sure we aren't losing exec uses in the td files. This mostly requires
5491 // being careful when using let Uses to try to add other use registers.
5492 if (shouldReadExec(MI)) {
5493 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5494 ErrInfo = "VALU instruction does not implicitly read exec mask";
5495 return false;
5496 }
5497 }
5498
5499 if (isSMRD(MI)) {
5500 if (MI.mayStore() &&
5501 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5502 // The register offset form of scalar stores may only use m0 as the
5503 // soffset register.
5504 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5505 if (Soff && Soff->getReg() != AMDGPU::M0) {
5506 ErrInfo = "scalar stores must use m0 as offset register";
5507 return false;
5508 }
5509 }
5510 }
5511
5512 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5513 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5514 if (Offset->getImm() != 0) {
5515 ErrInfo = "subtarget does not support offsets in flat instructions";
5516 return false;
5517 }
5518 }
5519
5520 if (isDS(MI) && !ST.hasGDS()) {
5521 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5522 if (GDSOp && GDSOp->getImm() != 0) {
5523 ErrInfo = "GDS is not supported on this subtarget";
5524 return false;
5525 }
5526 }
5527
5528 if (isImage(MI)) {
5529 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5530 if (DimOp) {
5531 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5532 AMDGPU::OpName::vaddr0);
5533 AMDGPU::OpName RSrcOpName =
5534 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5535 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5536 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5537 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5538 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5539 const AMDGPU::MIMGDimInfo *Dim =
5540 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5541
5542 if (!Dim) {
5543 ErrInfo = "dim is out of range";
5544 return false;
5545 }
5546
5547 bool IsA16 = false;
5548 if (ST.hasR128A16()) {
5549 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5550 IsA16 = R128A16->getImm() != 0;
5551 } else if (ST.hasA16()) {
5552 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5553 IsA16 = A16->getImm() != 0;
5554 }
5555
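 // With the NSA (non-sequential address) encoding each address component gets
 // its own operand between vaddr0 and the resource descriptor, so the operand
 // distance gives the number of address words; otherwise all address words are
 // packed into a single vaddr0 register tuple.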
5556 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5557
5558 unsigned AddrWords =
5559 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5560
5561 unsigned VAddrWords;
5562 if (IsNSA) {
5563 VAddrWords = RsrcIdx - VAddr0Idx;
5564 if (ST.hasPartialNSAEncoding() &&
5565 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5566 unsigned LastVAddrIdx = RsrcIdx - 1;
5567 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5568 }
5569 } else {
5570 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5571 if (AddrWords > 12)
5572 AddrWords = 16;
5573 }
5574
5575 if (VAddrWords != AddrWords) {
5576 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5577 << " but got " << VAddrWords << "\n");
5578 ErrInfo = "bad vaddr size";
5579 return false;
5580 }
5581 }
5582 }
5583
5584 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5585 if (DppCt) {
5586 using namespace AMDGPU::DPP;
5587
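 // dpp_ctrl selects the cross-lane data movement pattern (quad permutes, row
 // shifts/rotates, broadcasts, row_share/row_xmask, ...). Several encodings
 // are reserved or only valid on particular generations, which the checks
 // below enforce.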
5588 unsigned DC = DppCt->getImm();
5589 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5590 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5591 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5592 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5593 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5594 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5595 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5596 ErrInfo = "Invalid dpp_ctrl value";
5597 return false;
5598 }
5599 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5600 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5601 ErrInfo = "Invalid dpp_ctrl value: "
5602 "wavefront shifts are not supported on GFX10+";
5603 return false;
5604 }
5605 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5606 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5607 ErrInfo = "Invalid dpp_ctrl value: "
5608 "broadcasts are not supported on GFX10+";
5609 return false;
5610 }
5611 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5612 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5613 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5614 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5615 !ST.hasGFX90AInsts()) {
5616 ErrInfo = "Invalid dpp_ctrl value: "
5617 "row_newbroadcast/row_share is not supported before "
5618 "GFX90A/GFX10";
5619 return false;
5620 }
5621 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5622 ErrInfo = "Invalid dpp_ctrl value: "
5623 "row_share and row_xmask are not supported before GFX10";
5624 return false;
5625 }
5626 }
5627
5628 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5629 !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
5630 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5631 ErrInfo = "Invalid dpp_ctrl value: "
5632 "DP ALU dpp only support row_newbcast";
5633 return false;
5634 }
5635 }
5636
5637 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5638 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5639 AMDGPU::OpName DataName =
5640 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5641 const MachineOperand *Data = getNamedOperand(MI, DataName);
5642 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5643 if (Data && !Data->isReg())
5644 Data = nullptr;
5645
5646 if (ST.hasGFX90AInsts()) {
5647 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5648 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5649 ErrInfo = "Invalid register class: "
5650 "vdata and vdst should be both VGPR or AGPR";
5651 return false;
5652 }
5653 if (Data && Data2 &&
5654 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5655 ErrInfo = "Invalid register class: "
5656 "both data operands should be VGPR or AGPR";
5657 return false;
5658 }
5659 } else {
5660 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5661 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5662 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5663 ErrInfo = "Invalid register class: "
5664 "agpr loads and stores not supported on this GPU";
5665 return false;
5666 }
5667 }
5668 }
5669
5670 if (ST.needsAlignedVGPRs()) {
5671 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5673 if (!Op)
5674 return true;
5675 Register Reg = Op->getReg();
5676 if (Reg.isPhysical())
5677 return !(RI.getHWRegIndex(Reg) & 1);
5678 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5679 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5680 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5681 };
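 // For example, a 64-bit data0 operand living in v[1:2] starts at an odd
 // register index and is rejected here, whereas v[2:3] is acceptable.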
5682
5683 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5684 Opcode == AMDGPU::DS_GWS_BARRIER) {
5685
5686 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5687 ErrInfo = "Subtarget requires even aligned vector registers "
5688 "for DS_GWS instructions";
5689 return false;
5690 }
5691 }
5692
5693 if (isMIMG(MI)) {
5694 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5695 ErrInfo = "Subtarget requires even aligned vector registers "
5696 "for vaddr operand of image instructions";
5697 return false;
5698 }
5699 }
5700 }
5701
5702 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5703 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5704 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5705 ErrInfo = "Invalid register class: "
5706 "v_accvgpr_write with an SGPR is not supported on this GPU";
5707 return false;
5708 }
5709 }
5710
5711 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5712 const MachineOperand &SrcOp = MI.getOperand(1);
5713 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5714 ErrInfo = "pseudo expects only physical SGPRs";
5715 return false;
5716 }
5717 }
5718
5719 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5720 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5721 if (!ST.hasScaleOffset()) {
5722 ErrInfo = "Subtarget does not support offset scaling";
5723 return false;
5724 }
5725 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5726 ErrInfo = "Instruction does not support offset scaling";
5727 return false;
5728 }
5729 }
5730 }
5731
5732 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5733 // information.
5734 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5735 for (unsigned I = 0; I < 3; ++I) {
5736 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5737 return false;
5738 }
5739 }
5740
5741 return true;
5742}
5743
5744// It is more readable to list mapped opcodes on the same line.
5745// clang-format off
5746
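// Map a scalar (SALU) opcode to the VALU opcode used when the instruction has
// to be moved to the vector pipeline, e.g. S_ADD_F32 -> V_ADD_F32_e64.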
5747unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5748 switch (MI.getOpcode()) {
5749 default: return AMDGPU::INSTRUCTION_LIST_END;
5750 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5751 case AMDGPU::COPY: return AMDGPU::COPY;
5752 case AMDGPU::PHI: return AMDGPU::PHI;
5753 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5754 case AMDGPU::WQM: return AMDGPU::WQM;
5755 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5756 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5757 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5758 case AMDGPU::S_MOV_B32: {
5759 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5760 return MI.getOperand(1).isReg() ||
5761 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5762 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5763 }
5764 case AMDGPU::S_ADD_I32:
5765 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5766 case AMDGPU::S_ADDC_U32:
5767 return AMDGPU::V_ADDC_U32_e32;
5768 case AMDGPU::S_SUB_I32:
5769 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5770 // FIXME: These are not consistently handled, and selected when the carry is
5771 // used.
5772 case AMDGPU::S_ADD_U32:
5773 return AMDGPU::V_ADD_CO_U32_e32;
5774 case AMDGPU::S_SUB_U32:
5775 return AMDGPU::V_SUB_CO_U32_e32;
5776 case AMDGPU::S_ADD_U64_PSEUDO:
5777 return AMDGPU::V_ADD_U64_PSEUDO;
5778 case AMDGPU::S_SUB_U64_PSEUDO:
5779 return AMDGPU::V_SUB_U64_PSEUDO;
5780 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5781 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5782 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5783 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5784 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5785 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5786 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5787 case AMDGPU::S_XNOR_B32:
5788 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5789 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5790 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5791 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5792 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5793 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5794 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5795 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5796 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5797 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5798 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5799 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5800 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5801 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5802 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5803 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5804 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5805 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5806 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5807 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5808 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5809 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5810 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5811 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5812 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5813 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5814 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5815 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5816 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5817 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5818 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5819 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5820 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5821 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5822 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5823 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5824 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5825 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5826 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5827 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5828 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5829 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5830 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5831 case AMDGPU::S_CVT_F32_F16:
5832 case AMDGPU::S_CVT_HI_F32_F16:
5833 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5834 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5835 case AMDGPU::S_CVT_F16_F32:
5836 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5837 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5838 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5839 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5840 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5841 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5842 case AMDGPU::S_CEIL_F16:
5843 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5844 : AMDGPU::V_CEIL_F16_fake16_e64;
5845 case AMDGPU::S_FLOOR_F16:
5846 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5847 : AMDGPU::V_FLOOR_F16_fake16_e64;
5848 case AMDGPU::S_TRUNC_F16:
5849 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5850 : AMDGPU::V_TRUNC_F16_fake16_e64;
5851 case AMDGPU::S_RNDNE_F16:
5852 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5853 : AMDGPU::V_RNDNE_F16_fake16_e64;
5854 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5855 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5856 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5857 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5858 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5859 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5860 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5861 case AMDGPU::S_ADD_F16:
5862 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5863 : AMDGPU::V_ADD_F16_fake16_e64;
5864 case AMDGPU::S_SUB_F16:
5865 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5866 : AMDGPU::V_SUB_F16_fake16_e64;
5867 case AMDGPU::S_MIN_F16:
5868 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5869 : AMDGPU::V_MIN_F16_fake16_e64;
5870 case AMDGPU::S_MAX_F16:
5871 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5872 : AMDGPU::V_MAX_F16_fake16_e64;
5873 case AMDGPU::S_MINIMUM_F16:
5874 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5875 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5876 case AMDGPU::S_MAXIMUM_F16:
5877 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5878 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5879 case AMDGPU::S_MUL_F16:
5880 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5881 : AMDGPU::V_MUL_F16_fake16_e64;
5882 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5883 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5884 case AMDGPU::S_FMAC_F16:
5885 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5886 : AMDGPU::V_FMAC_F16_fake16_e64;
5887 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5888 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5889 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5890 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5891 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5892 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5893 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5894 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5895 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5896 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5897 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5898 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5899 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5900 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5901 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5902 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5903 case AMDGPU::S_CMP_LT_F16:
5904 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5905 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5906 case AMDGPU::S_CMP_EQ_F16:
5907 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5908 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5909 case AMDGPU::S_CMP_LE_F16:
5910 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5911 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5912 case AMDGPU::S_CMP_GT_F16:
5913 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5914 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5915 case AMDGPU::S_CMP_LG_F16:
5916 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5917 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5918 case AMDGPU::S_CMP_GE_F16:
5919 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5920 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5921 case AMDGPU::S_CMP_O_F16:
5922 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5923 : AMDGPU::V_CMP_O_F16_fake16_e64;
5924 case AMDGPU::S_CMP_U_F16:
5925 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5926 : AMDGPU::V_CMP_U_F16_fake16_e64;
5927 case AMDGPU::S_CMP_NGE_F16:
5928 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5929 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5930 case AMDGPU::S_CMP_NLG_F16:
5931 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5932 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5933 case AMDGPU::S_CMP_NGT_F16:
5934 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5935 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5936 case AMDGPU::S_CMP_NLE_F16:
5937 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5938 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5939 case AMDGPU::S_CMP_NEQ_F16:
5940 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5941 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5942 case AMDGPU::S_CMP_NLT_F16:
5943 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5944 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5945 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5946 case AMDGPU::V_S_EXP_F16_e64:
5947 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5948 : AMDGPU::V_EXP_F16_fake16_e64;
5949 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5950 case AMDGPU::V_S_LOG_F16_e64:
5951 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5952 : AMDGPU::V_LOG_F16_fake16_e64;
5953 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5954 case AMDGPU::V_S_RCP_F16_e64:
5955 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5956 : AMDGPU::V_RCP_F16_fake16_e64;
5957 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5958 case AMDGPU::V_S_RSQ_F16_e64:
5959 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5960 : AMDGPU::V_RSQ_F16_fake16_e64;
5961 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5962 case AMDGPU::V_S_SQRT_F16_e64:
5963 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5964 : AMDGPU::V_SQRT_F16_fake16_e64;
5965 }
5967 "Unexpected scalar opcode without corresponding vector one!");
5968}
5969
5970// clang-format on
5971
5972void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5973 MachineBasicBlock &MBB,
5974 MachineBasicBlock::iterator MBBI,
5975 const DebugLoc &DL, Register Reg,
5976 bool IsSCCLive,
5977 SlotIndexes *Indexes) const {
5978 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5979 const SIInstrInfo *TII = ST.getInstrInfo();
5981 if (IsSCCLive) {
5982 // Insert two move instructions, one to save the original value of EXEC and
5983 // the other to turn on all bits in EXEC. This is required as we can't use
5984 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5985 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
5986 .addReg(LMC.ExecReg);
5987 auto FlipExecMI =
5988 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
5989 if (Indexes) {
5990 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5991 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5992 }
5993 } else {
5994 auto SaveExec =
5995 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
5996 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5997 if (Indexes)
5998 Indexes->insertMachineInstrInMaps(*SaveExec);
5999 }
6000}
6001
6002void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6003 MachineBasicBlock::iterator MBBI,
6004 const DebugLoc &DL, Register Reg,
6005 SlotIndexes *Indexes) const {
6007 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6008 .addReg(Reg, RegState::Kill);
6009 if (Indexes)
6010 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6011}
6012
6016 "Not a whole wave func");
6017 MachineBasicBlock &MBB = *MF.begin();
6018 for (MachineInstr &MI : MBB)
6019 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6020 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6021 return &MI;
6022
6023 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6024}
6025
6026const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6027 unsigned OpNo) const {
6028 const MCInstrDesc &Desc = get(MI.getOpcode());
6029 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6030 Desc.operands()[OpNo].RegClass == -1) {
6031 Register Reg = MI.getOperand(OpNo).getReg();
6032
6033 if (Reg.isVirtual()) {
6034 const MachineRegisterInfo &MRI =
6035 MI.getParent()->getParent()->getRegInfo();
6036 return MRI.getRegClass(Reg);
6037 }
6038 return RI.getPhysRegBaseClass(Reg);
6039 }
6040
6041 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6042 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6043}
6044
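// Make the operand at OpIdx legal by materializing its value into a fresh
// virtual register with a move (or copy) inserted before MI, then rewriting
// the operand to use that register.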
6045void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6046 MachineBasicBlock::iterator I = MI;
6047 MachineBasicBlock *MBB = MI.getParent();
6048 MachineOperand &MO = MI.getOperand(OpIdx);
6049 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6050 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6051 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6052 unsigned Size = RI.getRegSizeInBits(*RC);
6053 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6054 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6055 : AMDGPU::V_MOV_B32_e32;
6056 if (MO.isReg())
6057 Opcode = AMDGPU::COPY;
6058 else if (RI.isSGPRClass(RC))
6059 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6060
6061 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6062 Register Reg = MRI.createVirtualRegister(VRC);
6063 DebugLoc DL = MBB->findDebugLoc(I);
6064 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6065 MO.ChangeToRegister(Reg, false);
6066}
6067
6068Register SIInstrInfo::buildExtractSubReg(
6069 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6070 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6071 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6072 if (!SuperReg.getReg().isVirtual())
6073 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6074
6075 MachineBasicBlock *MBB = MI->getParent();
6076 const DebugLoc &DL = MI->getDebugLoc();
6077 Register SubReg = MRI.createVirtualRegister(SubRC);
6078
6079 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6080 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6081 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6082 return SubReg;
6083}
6084
6085MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6086 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6087 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6088 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6089 if (Op.isImm()) {
6090 if (SubIdx == AMDGPU::sub0)
6091 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6092 if (SubIdx == AMDGPU::sub1)
6093 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6094
6095 llvm_unreachable("Unhandled register index for immediate");
6096 }
6097
6098 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6099 SubIdx, SubRC);
6100 return MachineOperand::CreateReg(SubReg, false);
6101}
6102
6103// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6104void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6105 assert(Inst.getNumExplicitOperands() == 3);
6106 MachineOperand Op1 = Inst.getOperand(1);
6107 Inst.removeOperand(1);
6108 Inst.addOperand(Op1);
6109}
6110
6111bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6112 const MCOperandInfo &OpInfo,
6113 const MachineOperand &MO) const {
6114 if (!MO.isReg())
6115 return false;
6116
6117 Register Reg = MO.getReg();
6118
6119 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6120 if (Reg.isPhysical())
6121 return DRC->contains(Reg);
6122
6123 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6124
6125 if (MO.getSubReg()) {
6126 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
6127 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6128 if (!SuperRC)
6129 return false;
6130 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6131 }
6132
6133 return RI.getCommonSubClass(DRC, RC) != nullptr;
6134}
6135
6136bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6137 const MachineOperand &MO) const {
6138 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6139 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6140 unsigned Opc = MI.getOpcode();
6141
6142 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6143 // information.
6144 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6145 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6146 constexpr AMDGPU::OpName OpNames[] = {
6147 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6148
6149 for (auto [I, OpName] : enumerate(OpNames)) {
6150 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6151 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6152 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6153 return false;
6154 }
6155 }
6156
6157 if (!isLegalRegOperand(MRI, OpInfo, MO))
6158 return false;
6159
6160 // check Accumulate GPR operand
6161 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6162 if (IsAGPR && !ST.hasMAIInsts())
6163 return false;
6164 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6165 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6166 return false;
6167 // Atomics should have both vdst and vdata either vgpr or agpr.
6168 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6169 const int DataIdx = AMDGPU::getNamedOperandIdx(
6170 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6171 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6172 MI.getOperand(DataIdx).isReg() &&
6173 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6174 return false;
6175 if ((int)OpIdx == DataIdx) {
6176 if (VDstIdx != -1 &&
6177 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6178 return false;
6179 // DS instructions with 2 src operands also must have tied RC.
6180 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6181 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6182 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6183 return false;
6184 }
6185
6186 // Check V_ACCVGPR_WRITE_B32_e64
6187 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6188 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6189 RI.isSGPRReg(MRI, MO.getReg()))
6190 return false;
6191 return true;
6192}
6193
6194bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6195 const MCOperandInfo &OpInfo,
6196 const MachineOperand &MO) const {
6197 if (MO.isReg())
6198 return isLegalRegOperand(MRI, OpInfo, MO);
6199
6200 // Handle non-register types that are treated like immediates.
6201 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6202 return true;
6203}
6204
6205bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6206 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6207 const MachineOperand *MO) const {
6208 constexpr unsigned NumOps = 3;
6209 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6210 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6211 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6212 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6213
6214 assert(SrcN < NumOps);
6215
6216 if (!MO) {
6217 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6218 if (SrcIdx == -1)
6219 return true;
6220 MO = &MI.getOperand(SrcIdx);
6221 }
6222
6223 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6224 return true;
6225
6226 int ModsIdx =
6227 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6228 if (ModsIdx == -1)
6229 return true;
6230
6231 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6232 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6233 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6234
6235 return !OpSel && !OpSelHi;
6236}
6237
6238bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6239 const MachineOperand *MO) const {
6240 const MachineFunction &MF = *MI.getParent()->getParent();
6241 const MachineRegisterInfo &MRI = MF.getRegInfo();
6242 const MCInstrDesc &InstDesc = MI.getDesc();
6243 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6244 int64_t RegClass = getOpRegClassID(OpInfo);
6245 const TargetRegisterClass *DefinedRC =
6246 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6247 if (!MO)
6248 MO = &MI.getOperand(OpIdx);
6249
6250 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6251
6252 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6253 const MachineOperand *UsedLiteral = nullptr;
6254
6255 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6256 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6257
6258 // TODO: Be more permissive with frame indexes.
6259 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6260 if (!LiteralLimit--)
6261 return false;
6262
6263 UsedLiteral = MO;
6264 }
6265
6266 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6267 if (MO->isReg())
6268 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6269
6270 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6271 if (i == OpIdx)
6272 continue;
6273 const MachineOperand &Op = MI.getOperand(i);
6274 if (Op.isReg()) {
6275 if (Op.isUse()) {
6276 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6277 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6278 if (--ConstantBusLimit <= 0)
6279 return false;
6280 }
6281 }
6282 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6283 !isInlineConstant(Op, InstDesc.operands()[i])) {
6284 // The same literal may be used multiple times.
6285 if (!UsedLiteral)
6286 UsedLiteral = &Op;
6287 else if (UsedLiteral->isIdenticalTo(Op))
6288 continue;
6289
6290 if (!LiteralLimit--)
6291 return false;
6292 if (--ConstantBusLimit <= 0)
6293 return false;
6294 }
6295 }
6296 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6297 // There can be at most one literal operand, but it can be repeated.
6298 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6299 if (i == OpIdx)
6300 continue;
6301 const MachineOperand &Op = MI.getOperand(i);
6302 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6303 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6304 !Op.isIdenticalTo(*MO))
6305 return false;
6306
6307 // Do not fold a non-inlineable, non-register operand into an
6308 // instruction that already has a frame index. The frame index handling
6309 // code does not cope well when a frame index co-exists with another
6310 // non-register operand, unless that operand is an inlineable immediate.
6311 if (Op.isFI())
6312 return false;
6313 }
6314 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6315 isF16PseudoScalarTrans(MI.getOpcode())) {
6316 return false;
6317 }
6318
6319 if (MO->isReg()) {
6320 if (!DefinedRC)
6321 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6322 return isLegalRegOperand(MI, OpIdx, *MO);
6323 }
6324
6325 if (MO->isImm()) {
6326 uint64_t Imm = MO->getImm();
6327 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6328 bool Is64BitOp = Is64BitFPOp ||
6329 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6330 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6331 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6332 if (Is64BitOp &&
6333 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6334 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6335 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6336 return false;
6337
6338 // FIXME: We can use sign extended 64-bit literals, but only for signed
6339 // operands. At the moment we do not know if an operand is signed.
6340 // Such operand will be encoded as its low 32 bits and then either
6341 // correctly sign extended or incorrectly zero extended by HW.
6342 // If 64-bit literals are supported and the literal will be encoded
6343 // as full 64 bit we still can use it.
6344 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6345 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6346 return false;
6347 }
6348 }
6349
6350 // Handle non-register types that are treated like immediates.
6351 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6352
6353 if (!DefinedRC) {
6354 // This operand expects an immediate.
6355 return true;
6356 }
6357
6358 return isImmOperandLegal(MI, OpIdx, *MO);
6359}
6360
6362 bool IsGFX950Only = ST.hasGFX950Insts();
6363 bool IsGFX940Only = ST.hasGFX940Insts();
6364
6365 if (!IsGFX950Only && !IsGFX940Only)
6366 return false;
6367
6368 if (!isVALU(MI))
6369 return false;
6370
6371 // V_COS, V_EXP, V_RCP, etc.
6372 if (isTRANS(MI))
6373 return true;
6374
6375 // DOT2, DOT2C, DOT4, etc.
6376 if (isDOT(MI))
6377 return true;
6378
6379 // MFMA, SMFMA
6380 if (isMFMA(MI))
6381 return true;
6382
6383 unsigned Opcode = MI.getOpcode();
6384 switch (Opcode) {
6385 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6386 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6387 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6388 case AMDGPU::V_MQSAD_U32_U8_e64:
6389 case AMDGPU::V_PK_ADD_F16:
6390 case AMDGPU::V_PK_ADD_F32:
6391 case AMDGPU::V_PK_ADD_I16:
6392 case AMDGPU::V_PK_ADD_U16:
6393 case AMDGPU::V_PK_ASHRREV_I16:
6394 case AMDGPU::V_PK_FMA_F16:
6395 case AMDGPU::V_PK_FMA_F32:
6396 case AMDGPU::V_PK_FMAC_F16_e32:
6397 case AMDGPU::V_PK_FMAC_F16_e64:
6398 case AMDGPU::V_PK_LSHLREV_B16:
6399 case AMDGPU::V_PK_LSHRREV_B16:
6400 case AMDGPU::V_PK_MAD_I16:
6401 case AMDGPU::V_PK_MAD_U16:
6402 case AMDGPU::V_PK_MAX_F16:
6403 case AMDGPU::V_PK_MAX_I16:
6404 case AMDGPU::V_PK_MAX_U16:
6405 case AMDGPU::V_PK_MIN_F16:
6406 case AMDGPU::V_PK_MIN_I16:
6407 case AMDGPU::V_PK_MIN_U16:
6408 case AMDGPU::V_PK_MOV_B32:
6409 case AMDGPU::V_PK_MUL_F16:
6410 case AMDGPU::V_PK_MUL_F32:
6411 case AMDGPU::V_PK_MUL_LO_U16:
6412 case AMDGPU::V_PK_SUB_I16:
6413 case AMDGPU::V_PK_SUB_U16:
6414 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6415 return true;
6416 default:
6417 return false;
6418 }
6419}
6420
6421void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6422 MachineInstr &MI) const {
6423 unsigned Opc = MI.getOpcode();
6424 const MCInstrDesc &InstrDesc = get(Opc);
6425
6426 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6427 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6428
6429 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6430 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6431
6432 // If there is an implicit SGPR use, such as the VCC use of
6433 // v_addc_u32/v_subb_u32, we may only have one constant bus use in total before GFX10.
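 // For example, with a constant bus limit of 1, "v_addc_u32 v0, vcc, s0, v1, vcc"
 // reads both VCC and s0 on the constant bus, so src0 is moved into a VGPR below.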
6434 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6435 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6436 RI.isSGPRReg(MRI, Src0.getReg()))
6437 legalizeOpWithMove(MI, Src0Idx);
6438
6439 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6440 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6441 // src0/src1 with V_READFIRSTLANE.
6442 if (Opc == AMDGPU::V_WRITELANE_B32) {
6443 const DebugLoc &DL = MI.getDebugLoc();
6444 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6445 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6446 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6447 .add(Src0);
6448 Src0.ChangeToRegister(Reg, false);
6449 }
6450 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6451 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6452 const DebugLoc &DL = MI.getDebugLoc();
6453 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6454 .add(Src1);
6455 Src1.ChangeToRegister(Reg, false);
6456 }
6457 return;
6458 }
6459
6460 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6461 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6462 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6463 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6464 legalizeOpWithMove(MI, Src2Idx);
6465 }
6466
6467 // VOP2 instructions accept any operand type for src0, so we don't need to
6468 // check its legality. If src1 is already legal, we don't need to do anything.
6469 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6470 return;
6471
6472 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6473 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6474 // select is uniform.
6475 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6476 RI.isVGPR(MRI, Src1.getReg())) {
6477 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6478 const DebugLoc &DL = MI.getDebugLoc();
6479 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6480 .add(Src1);
6481 Src1.ChangeToRegister(Reg, false);
6482 return;
6483 }
6484
6485 // We do not use commuteInstruction here because it is too aggressive and will
6486 // commute if it is possible. We only want to commute here if it improves
6487 // legality. This can be called a fairly large number of times so don't waste
6488 // compile time pointlessly swapping and checking legality again.
6489 if (HasImplicitSGPR || !MI.isCommutable()) {
6490 legalizeOpWithMove(MI, Src1Idx);
6491 return;
6492 }
6493
6494 // If src0 can be used as src1, commuting will make the operands legal.
6495 // Otherwise we have to give up and insert a move.
6496 //
6497 // TODO: Other immediate-like operand kinds could be commuted if there was a
6498 // MachineOperand::ChangeTo* for them.
6499 if ((!Src1.isImm() && !Src1.isReg()) ||
6500 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6501 legalizeOpWithMove(MI, Src1Idx);
6502 return;
6503 }
6504
6505 int CommutedOpc = commuteOpcode(MI);
6506 if (CommutedOpc == -1) {
6507 legalizeOpWithMove(MI, Src1Idx);
6508 return;
6509 }
6510
6511 MI.setDesc(get(CommutedOpc));
6512
6513 Register Src0Reg = Src0.getReg();
6514 unsigned Src0SubReg = Src0.getSubReg();
6515 bool Src0Kill = Src0.isKill();
6516
6517 if (Src1.isImm())
6518 Src0.ChangeToImmediate(Src1.getImm());
6519 else if (Src1.isReg()) {
6520 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6521 Src0.setSubReg(Src1.getSubReg());
6522 } else
6523 llvm_unreachable("Should only have register or immediate operands");
6524
6525 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6526 Src1.setSubReg(Src0SubReg);
6527 fixImplicitOperands(MI);
6528}
6529
6530// Legalize VOP3 operands. Any operand type is supported for any operand,
6531// but only one literal constant may be used, and only starting from GFX10.
6532void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6533 MachineInstr &MI) const {
6534 unsigned Opc = MI.getOpcode();
6535
6536 int VOP3Idx[3] = {
6537 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6538 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6539 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6540 };
6541
6542 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6543 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6544 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6545 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6546 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6547 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6548 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6549 // src1 and src2 must be scalar
6550 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6551 const DebugLoc &DL = MI.getDebugLoc();
6552 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6553 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6554 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6555 .add(Src1);
6556 Src1.ChangeToRegister(Reg, false);
6557 }
6558 if (VOP3Idx[2] != -1) {
6559 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6560 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6561 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6562 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6563 .add(Src2);
6564 Src2.ChangeToRegister(Reg, false);
6565 }
6566 }
6567 }
6568
6569 // Find the one SGPR operand we are allowed to use.
6570 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6571 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6572 SmallDenseSet<unsigned> SGPRsUsed;
6573 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6574 if (SGPRReg) {
6575 SGPRsUsed.insert(SGPRReg);
6576 --ConstantBusLimit;
6577 }
6578
6579 for (int Idx : VOP3Idx) {
6580 if (Idx == -1)
6581 break;
6582 MachineOperand &MO = MI.getOperand(Idx);
6583
6584 if (!MO.isReg()) {
6585 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6586 continue;
6587
6588 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6589 --LiteralLimit;
6590 --ConstantBusLimit;
6591 continue;
6592 }
6593
6594 --LiteralLimit;
6595 --ConstantBusLimit;
6596 legalizeOpWithMove(MI, Idx);
6597 continue;
6598 }
6599
6600 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6601 continue; // VGPRs are legal
6602
6603 // We can use one SGPR in each VOP3 instruction prior to GFX10
6604 // and two starting from GFX10.
6605 if (SGPRsUsed.count(MO.getReg()))
6606 continue;
6607 if (ConstantBusLimit > 0) {
6608 SGPRsUsed.insert(MO.getReg());
6609 --ConstantBusLimit;
6610 continue;
6611 }
6612
6613 // If we make it this far, then the operand is not legal and we must
6614 // legalize it.
6615 legalizeOpWithMove(MI, Idx);
6616 }
6617
6618 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6619 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6620 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6621 legalizeOpWithMove(MI, VOP3Idx[2]);
6622
6623 // Fix the register class of packed FP32 instructions on gfx12+. See
6624 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6625 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6626 for (unsigned I = 0; I < 3; ++I) {
6627 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6628 legalizeOpWithMove(MI, VOP3Idx[I]);
6629 }
6630 }
6631}
6632
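// Copy the value of a VGPR (or AGPR) into an SGPR of the same width by
// v_readfirstlane'ing each 32-bit piece and reassembling the pieces with a
// REG_SEQUENCE. This is only valid when the value is known to be uniform.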
6633Register SIInstrInfo::readlaneVGPRToSGPR(
6634 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6635 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6636 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6637 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6638 if (DstRC)
6639 SRC = RI.getCommonSubClass(SRC, DstRC);
6640
6641 Register DstReg = MRI.createVirtualRegister(SRC);
6642 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6643
6644 if (RI.hasAGPRs(VRC)) {
6645 VRC = RI.getEquivalentVGPRClass(VRC);
6646 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6647 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6648 get(TargetOpcode::COPY), NewSrcReg)
6649 .addReg(SrcReg);
6650 SrcReg = NewSrcReg;
6651 }
6652
6653 if (SubRegs == 1) {
6654 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6655 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6656 .addReg(SrcReg);
6657 return DstReg;
6658 }
6659
6660 SmallVector<Register, 8> SRegs;
6661 for (unsigned i = 0; i < SubRegs; ++i) {
6662 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6663 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6664 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6665 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6666 SRegs.push_back(SGPR);
6667 }
6668
6669 MachineInstrBuilder MIB =
6670 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6671 get(AMDGPU::REG_SEQUENCE), DstReg);
6672 for (unsigned i = 0; i < SubRegs; ++i) {
6673 MIB.addReg(SRegs[i]);
6674 MIB.addImm(RI.getSubRegFromChannel(i));
6675 }
6676 return DstReg;
6677}
6678
6679void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6680 MachineInstr &MI) const {
6681
6682 // If the pointer is stored in VGPRs, then we need to move it to
6683 // SGPRs using v_readfirstlane. This is safe because we only select
6684 // loads with uniform pointers to SMRD instructions, so we know the
6685 // pointer value is uniform.
6686 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6687 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6688 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6689 SBase->setReg(SGPR);
6690 }
6691 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6692 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6693 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6694 SOff->setReg(SGPR);
6695 }
6696}
6697
6698bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6699 unsigned Opc = Inst.getOpcode();
6700 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6701 if (OldSAddrIdx < 0)
6702 return false;
6703
6704 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6705
6706 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6707 if (NewOpc < 0)
6708 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6709 if (NewOpc < 0)
6710 return false;
6711
6712 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6713 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6714 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6715 return false;
6716
6717 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6718 if (NewVAddrIdx < 0)
6719 return false;
6720
6721 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6722
6723 // Check vaddr; it must be zero or absent.
6724 MachineInstr *VAddrDef = nullptr;
6725 if (OldVAddrIdx >= 0) {
6726 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6727 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6728 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6729 !VAddrDef->getOperand(1).isImm() ||
6730 VAddrDef->getOperand(1).getImm() != 0)
6731 return false;
6732 }
6733
6734 const MCInstrDesc &NewDesc = get(NewOpc);
6735 Inst.setDesc(NewDesc);
6736
6737 // Callers expect iterator to be valid after this call, so modify the
6738 // instruction in place.
6739 if (OldVAddrIdx == NewVAddrIdx) {
6740 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6741 // Clear use list from the old vaddr holding a zero register.
6742 MRI.removeRegOperandFromUseList(&NewVAddr);
6743 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6744 Inst.removeOperand(OldSAddrIdx);
6745 // Update the use list with the pointer we have just moved from vaddr to
6746 // the saddr position. Otherwise the new vaddr will be missing from the use list.
6747 MRI.removeRegOperandFromUseList(&NewVAddr);
6748 MRI.addRegOperandToUseList(&NewVAddr);
6749 } else {
6750 assert(OldSAddrIdx == NewVAddrIdx);
6751
6752 if (OldVAddrIdx >= 0) {
6753 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6754 AMDGPU::OpName::vdst_in);
6755
6756 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6757 // it asserts. Untie the operands for now and retie them afterwards.
6758 if (NewVDstIn != -1) {
6759 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6760 Inst.untieRegOperand(OldVDstIn);
6761 }
6762
6763 Inst.removeOperand(OldVAddrIdx);
6764
6765 if (NewVDstIn != -1) {
6766 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6767 Inst.tieOperands(NewVDst, NewVDstIn);
6768 }
6769 }
6770 }
6771
6772 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6773 VAddrDef->eraseFromParent();
6774
6775 return true;
6776}
6777
6778// FIXME: Remove this when SelectionDAG is obsoleted.
6779void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6780 MachineInstr &MI) const {
6781 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6782 return;
6783
6784 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6785 // thinks they are uniform, so a readfirstlane should be valid.
6786 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6787 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6788 return;
6789
6789
6790 if (moveFlatAddrToVGPR(MI))
6791 return;
6792
6793 const TargetRegisterClass *DeclaredRC =
6794 getRegClass(MI.getDesc(), SAddr->getOperandNo());
6795
6796 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6797 SAddr->setReg(ToSGPR);
6798}
6799
6800void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6801 MachineBasicBlock::iterator I,
6802 const TargetRegisterClass *DstRC,
6803 MachineOperand &Op,
6804 MachineRegisterInfo &MRI,
6805 const DebugLoc &DL) const {
6806 Register OpReg = Op.getReg();
6807 unsigned OpSubReg = Op.getSubReg();
6808
6809 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6810 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6811
6812 // Check if operand is already the correct register class.
6813 if (DstRC == OpRC)
6814 return;
6815
6816 Register DstReg = MRI.createVirtualRegister(DstRC);
6817 auto Copy =
6818 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6819 Op.setReg(DstReg);
6820
6821 MachineInstr *Def = MRI.getVRegDef(OpReg);
6822 if (!Def)
6823 return;
6824
6825 // Try to eliminate the copy if it is copying an immediate value.
6826 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6827 foldImmediate(*Copy, *Def, OpReg, &MRI);
6828
6829 bool ImpDef = Def->isImplicitDef();
6830 while (!ImpDef && Def && Def->isCopy()) {
6831 if (Def->getOperand(1).getReg().isPhysical())
6832 break;
6833 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6834 ImpDef = Def && Def->isImplicitDef();
6835 }
6836 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6837 !ImpDef)
6838 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6839}
6840
6841// Emit the actual waterfall loop, executing the wrapped instruction for each
6842// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6843// iteration, in the worst case we execute 64 (once per lane).
6844static void
6847 MachineBasicBlock &LoopBB,
6848 MachineBasicBlock &BodyBB,
6849 const DebugLoc &DL,
6850 ArrayRef<MachineOperand *> ScalarOps) {
6851 MachineFunction &MF = *LoopBB.getParent();
6852 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6853 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6855 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6856
6858 Register CondReg;
6859
6860 for (MachineOperand *ScalarOp : ScalarOps) {
6861 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6862 unsigned NumSubRegs = RegSize / 32;
6863 Register VScalarOp = ScalarOp->getReg();
6864
6865 if (NumSubRegs == 1) {
6866 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6867
6868 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6869 .addReg(VScalarOp);
6870
6871 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6872
6873 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6874 .addReg(CurReg)
6875 .addReg(VScalarOp);
6876
6877 // Combine the comparison results with AND.
6878 if (!CondReg) // First.
6879 CondReg = NewCondReg;
6880 else { // If not the first, we create an AND.
6881 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6882 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6883 .addReg(CondReg)
6884 .addReg(NewCondReg);
6885 CondReg = AndReg;
6886 }
6887
6888 // Update ScalarOp operand to use the SGPR ScalarOp.
6889 ScalarOp->setReg(CurReg);
6890 ScalarOp->setIsKill();
6891 } else {
6892 SmallVector<Register, 8> ReadlanePieces;
6893 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6894 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6895 "Unhandled register size");
6896
6897 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6898 Register CurRegLo =
6899 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6900 Register CurRegHi =
6901 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6902
6903 // Read the next variant <- also loop target.
6904 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6905 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6906
6907 // Read the next variant <- also loop target.
6908 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6909 .addReg(VScalarOp, VScalarOpUndef,
6910 TRI->getSubRegFromChannel(Idx + 1));
6911
6912 ReadlanePieces.push_back(CurRegLo);
6913 ReadlanePieces.push_back(CurRegHi);
6914
6915 // Comparison is to be done as 64-bit.
6916 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6917 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6918 .addReg(CurRegLo)
6919 .addImm(AMDGPU::sub0)
6920 .addReg(CurRegHi)
6921 .addImm(AMDGPU::sub1);
6922
6923 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6924 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6925 NewCondReg)
6926 .addReg(CurReg);
6927 if (NumSubRegs <= 2)
6928 Cmp.addReg(VScalarOp);
6929 else
6930 Cmp.addReg(VScalarOp, VScalarOpUndef,
6931 TRI->getSubRegFromChannel(Idx, 2));
6932
6933 // Combine the comparison results with AND.
6934 if (!CondReg) // First.
6935 CondReg = NewCondReg;
6936 else { // If not the first, we create an AND.
6937 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6938 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6939 .addReg(CondReg)
6940 .addReg(NewCondReg);
6941 CondReg = AndReg;
6942 }
6943 } // End for loop.
6944
6945 const auto *SScalarOpRC =
6946 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6947 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6948
6949 // Build scalar ScalarOp.
6950 auto Merge =
6951 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6952 unsigned Channel = 0;
6953 for (Register Piece : ReadlanePieces) {
6954 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6955 }
6956
6957 // Update ScalarOp operand to use the SGPR ScalarOp.
6958 ScalarOp->setReg(SScalarOp);
6959 ScalarOp->setIsKill();
6960 }
6961 }
6962
6963 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6964 MRI.setSimpleHint(SaveExec, CondReg);
6965
6966 // Update EXEC to matching lanes, saving original to SaveExec.
6967 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
6968 .addReg(CondReg, RegState::Kill);
6969
6970 // The original instruction is here; we insert the terminators after it.
6971 I = BodyBB.end();
6972
6973 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6974 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
6975 .addReg(LMC.ExecReg)
6976 .addReg(SaveExec);
6977
6978 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6979}
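// For a single 32-bit scalar operand on a wave64 target, the loop built above
// looks roughly like this (illustrative; register classes are approximate):
//   LoopBB:
//     %cur:sgpr_32   = V_READFIRSTLANE_B32 %vgpr_op
//     %cond:sreg_64  = V_CMP_EQ_U32_e64 %cur, %vgpr_op
//     %save:sreg_64  = S_AND_SAVEEXEC_B64 %cond
//   BodyBB:
//     ... the waterfalled instruction, now reading %cur ...
//     $exec = S_XOR_B64_term $exec, %save
//     SI_WATERFALL_LOOP %LoopBB
// Wider operands are read 64 bits at a time and compared with V_CMP_EQ_U64,
// and the per-piece conditions are combined with S_AND before the saveexec.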
6980
6981// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6982// with SGPRs by iterating over all unique values across all lanes.
6983// Returns the loop basic block that now contains \p MI.
6984static MachineBasicBlock *
6988 MachineBasicBlock::iterator Begin = nullptr,
6989 MachineBasicBlock::iterator End = nullptr) {
6990 MachineBasicBlock &MBB = *MI.getParent();
6991 MachineFunction &MF = *MBB.getParent();
6992 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6993 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6995 if (!Begin.isValid())
6996 Begin = &MI;
6997 if (!End.isValid()) {
6998 End = &MI;
6999 ++End;
7000 }
7001 const DebugLoc &DL = MI.getDebugLoc();
7003 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7004
7005 // Save SCC. Waterfall Loop may overwrite SCC.
7006 Register SaveSCCReg;
7007
7008 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7009  // rather than doing an unlimited scan everywhere.
7010 bool SCCNotDead =
7011 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7012 std::numeric_limits<unsigned>::max()) !=
7013      MachineBasicBlock::LQR_Dead;
7014  if (SCCNotDead) {
7015 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7016 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7017 .addImm(1)
7018 .addImm(0);
7019 }
7020
7021 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7022
7023 // Save the EXEC mask
7024 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7025
7026 // Killed uses in the instruction we are waterfalling around will be
7027 // incorrect due to the added control-flow.
7028  MachineBasicBlock::iterator AfterMI = MI;
7029  ++AfterMI;
7030 for (auto I = Begin; I != AfterMI; I++) {
7031 for (auto &MO : I->all_uses())
7032 MRI.clearKillFlags(MO.getReg());
7033 }
7034
7035 // To insert the loop we need to split the block. Move everything after this
7036 // point to a new block, and insert a new empty block between the two.
7037  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7038  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7039  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7040  MachineFunction::iterator MBBI(MBB);
7041  ++MBBI;
7042
7043 MF.insert(MBBI, LoopBB);
7044 MF.insert(MBBI, BodyBB);
7045 MF.insert(MBBI, RemainderBB);
7046
7047 LoopBB->addSuccessor(BodyBB);
7048 BodyBB->addSuccessor(LoopBB);
7049 BodyBB->addSuccessor(RemainderBB);
7050
7051  // Move the instructions from Begin to MI into BodyBB, and the remainder of
7052  // the block into RemainderBB.
7053 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7054 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7055 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7056
7057 MBB.addSuccessor(LoopBB);
7058
7059 // Update dominators. We know that MBB immediately dominates LoopBB, that
7060 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7061 // RemainderBB. RemainderBB immediately dominates all of the successors
7062 // transferred to it from MBB that MBB used to properly dominate.
7063 if (MDT) {
7064 MDT->addNewBlock(LoopBB, &MBB);
7065 MDT->addNewBlock(BodyBB, LoopBB);
7066 MDT->addNewBlock(RemainderBB, BodyBB);
7067 for (auto &Succ : RemainderBB->successors()) {
7068 if (MDT->properlyDominates(&MBB, Succ)) {
7069 MDT->changeImmediateDominator(Succ, RemainderBB);
7070 }
7071 }
7072 }
7073
7074 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7075
7076 MachineBasicBlock::iterator First = RemainderBB->begin();
7077 // Restore SCC
7078 if (SCCNotDead) {
7079 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7080 .addReg(SaveSCCReg, RegState::Kill)
7081 .addImm(0);
7082 }
7083
7084 // Restore the EXEC mask
7085 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7086 .addReg(SaveExec);
7087 return BodyBB;
7088}
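// The resulting control flow is, schematically:
//   MBB:         (save SCC via S_CSELECT if it is live, save EXEC) -> LoopBB
//   LoopBB:      readfirstlane / compare / and-saveexec for the scalar ops
//   BodyBB:      the waterfalled instruction(s); EXEC ^= saved mask;
//                loop back to LoopBB while any lane remains
//   RemainderBB: restore SCC (S_CMP_LG_U32) and EXEC, then the original tail
//                of MBB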
7089
7090// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7091static std::tuple<unsigned, unsigned>
7093 MachineBasicBlock &MBB = *MI.getParent();
7094 MachineFunction &MF = *MBB.getParent();
7096
7097 // Extract the ptr from the resource descriptor.
7098 unsigned RsrcPtr =
7099 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7100 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7101
7102 // Create an empty resource descriptor
7103 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7104 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7105 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7106 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7107 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7108
7109 // Zero64 = 0
7110 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7111 .addImm(0);
7112
7113 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7114 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7115 .addImm(Lo_32(RsrcDataFormat));
7116
7117 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7118 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7119 .addImm(Hi_32(RsrcDataFormat));
7120
7121 // NewSRsrc = {Zero64, SRsrcFormat}
7122 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7123 .addReg(Zero64)
7124 .addImm(AMDGPU::sub0_sub1)
7125 .addReg(SRsrcFormatLo)
7126 .addImm(AMDGPU::sub2)
7127 .addReg(SRsrcFormatHi)
7128 .addImm(AMDGPU::sub3);
7129
7130 return std::tuple(RsrcPtr, NewSRsrc);
7131}
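// Illustratively, for an original 128-bit rsrc { ptr_lo, ptr_hi, word2, word3 }
// this returns:
//   RsrcPtr  = rsrc.sub0_sub1                   (the 64-bit base pointer)
//   NewSRsrc = { 0, 0, RSRC_DATA_FORMAT[31:0], RSRC_DATA_FORMAT[63:32] }
// i.e. a descriptor with a null base whose remaining words come from
// getDefaultRsrcDataFormat().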
7132
7135 MachineDominatorTree *MDT) const {
7136 MachineFunction &MF = *MI.getParent()->getParent();
7138 MachineBasicBlock *CreatedBB = nullptr;
7139
7140 // Legalize VOP2
7141 if (isVOP2(MI) || isVOPC(MI)) {
7143 return CreatedBB;
7144 }
7145
7146 // Legalize VOP3
7147 if (isVOP3(MI)) {
7149 return CreatedBB;
7150 }
7151
7152 // Legalize SMRD
7153 if (isSMRD(MI)) {
7155 return CreatedBB;
7156 }
7157
7158 // Legalize FLAT
7159 if (isFLAT(MI)) {
7161 return CreatedBB;
7162 }
7163
7164 // Legalize REG_SEQUENCE and PHI
7165  // The register class of the operands must be the same type as the register
7166 // class of the output.
7167 if (MI.getOpcode() == AMDGPU::PHI) {
7168 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7169 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7170 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7171 continue;
7172 const TargetRegisterClass *OpRC =
7173 MRI.getRegClass(MI.getOperand(i).getReg());
7174 if (RI.hasVectorRegisters(OpRC)) {
7175 VRC = OpRC;
7176 } else {
7177 SRC = OpRC;
7178 }
7179 }
7180
7181    // If any of the operands are VGPR registers, then they all must be VGPRs;
7182    // otherwise we will create illegal VGPR->SGPR copies when legalizing
7183 // them.
7184 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7185 if (!VRC) {
7186 assert(SRC);
7187 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7188 VRC = &AMDGPU::VReg_1RegClass;
7189 } else
7190 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7191 ? RI.getEquivalentAGPRClass(SRC)
7192 : RI.getEquivalentVGPRClass(SRC);
7193 } else {
7194 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7195 ? RI.getEquivalentAGPRClass(VRC)
7196 : RI.getEquivalentVGPRClass(VRC);
7197 }
7198 RC = VRC;
7199 } else {
7200 RC = SRC;
7201 }
7202
7203 // Update all the operands so they have the same type.
7204 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7205 MachineOperand &Op = MI.getOperand(I);
7206 if (!Op.isReg() || !Op.getReg().isVirtual())
7207 continue;
7208
7209 // MI is a PHI instruction.
7210 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7212
7213 // Avoid creating no-op copies with the same src and dst reg class. These
7214 // confuse some of the machine passes.
7215 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7216 }
7217 }
7218
7219 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7220 // VGPR dest type and SGPR sources, insert copies so all operands are
7221 // VGPRs. This seems to help operand folding / the register coalescer.
7222 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7223 MachineBasicBlock *MBB = MI.getParent();
7224 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7225 if (RI.hasVGPRs(DstRC)) {
7226 // Update all the operands so they are VGPR register classes. These may
7227 // not be the same register class because REG_SEQUENCE supports mixing
7228 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7229 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7230 MachineOperand &Op = MI.getOperand(I);
7231 if (!Op.isReg() || !Op.getReg().isVirtual())
7232 continue;
7233
7234 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7235 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7236 if (VRC == OpRC)
7237 continue;
7238
7239 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7240 Op.setIsKill();
7241 }
7242 }
7243
7244 return CreatedBB;
7245 }
7246
7247 // Legalize INSERT_SUBREG
7248 // src0 must have the same register class as dst
7249 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7250 Register Dst = MI.getOperand(0).getReg();
7251 Register Src0 = MI.getOperand(1).getReg();
7252 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7253 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7254 if (DstRC != Src0RC) {
7255 MachineBasicBlock *MBB = MI.getParent();
7256 MachineOperand &Op = MI.getOperand(1);
7257 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7258 }
7259 return CreatedBB;
7260 }
7261
7262 // Legalize SI_INIT_M0
7263 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7264 MachineOperand &Src = MI.getOperand(0);
7265 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7266 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7267 return CreatedBB;
7268 }
7269
7270 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7271 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7272 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7273 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7274 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7275 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7276 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7277 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7278 MachineOperand &Src = MI.getOperand(1);
7279 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7280 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7281 return CreatedBB;
7282 }
7283
7284 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7285 //
7286 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7287 // scratch memory access. In both cases, the legalization never involves
7288 // conversion to the addr64 form.
7290 (isMUBUF(MI) || isMTBUF(MI)))) {
7291 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7292 ? AMDGPU::OpName::rsrc
7293 : AMDGPU::OpName::srsrc;
7294 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7295 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7296 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7297
7298 AMDGPU::OpName SampOpName =
7299 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7300 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7301 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7302 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7303
7304 return CreatedBB;
7305 }
7306
7307 // Legalize SI_CALL
7308 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7309 MachineOperand *Dest = &MI.getOperand(0);
7310 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7311      // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, plus the
7312      // following copies, into the loop block; copies from and to physical
7313      // registers have to be moved as well.
7314 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7315 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7316
7317 // Also move the copies to physical registers into the loop block
7318 MachineBasicBlock &MBB = *MI.getParent();
7319      MachineBasicBlock::iterator Start(&MI);
7320      while (Start->getOpcode() != FrameSetupOpcode)
7321 --Start;
7322      MachineBasicBlock::iterator End(&MI);
7323      while (End->getOpcode() != FrameDestroyOpcode)
7324 ++End;
7325 // Also include following copies of the return value
7326 ++End;
7327 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7328 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7329 ++End;
7330 CreatedBB =
7331 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7332 }
7333 }
7334
7335 // Legalize s_sleep_var.
7336 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7337 const DebugLoc &DL = MI.getDebugLoc();
7338 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7339 int Src0Idx =
7340 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7341 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7342 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7343 .add(Src0);
7344 Src0.ChangeToRegister(Reg, false);
7345 return nullptr;
7346 }
7347
7348 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7349 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7350 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7351 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7352 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7353 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7354 for (MachineOperand &Src : MI.explicit_operands()) {
7355 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7356 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7357 }
7358 return CreatedBB;
7359 }
7360
7361 // Legalize MUBUF instructions.
7362 bool isSoffsetLegal = true;
7363 int SoffsetIdx =
7364 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7365 if (SoffsetIdx != -1) {
7366 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7367 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7368 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7369 isSoffsetLegal = false;
7370 }
7371 }
7372
7373 bool isRsrcLegal = true;
7374 int RsrcIdx =
7375 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7376 if (RsrcIdx != -1) {
7377 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7378 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7379 isRsrcLegal = false;
7380 }
7381
7382 // The operands are legal.
7383 if (isRsrcLegal && isSoffsetLegal)
7384 return CreatedBB;
7385
7386 if (!isRsrcLegal) {
7387 // Legalize a VGPR Rsrc
7388 //
7389 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7390 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7391 // a zero-value SRsrc.
7392 //
7393 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7394 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7395 // above.
7396 //
7397 // Otherwise we are on non-ADDR64 hardware, and/or we have
7398 // idxen/offen/bothen and we fall back to a waterfall loop.
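    // As a concrete (illustrative) example of the ADDR64 path below, a
    // BUFFER_LOAD_DWORD_OFFSET whose rsrc ended up in VGPRs becomes roughly:
    //   vaddr = rsrc.sub0_sub1          (pointer pulled out of the descriptor)
    //   srsrc = { 0, RSRC_DATA_FORMAT } (zero base, default format)
    //   BUFFER_LOAD_DWORD_ADDR64 vdata, vaddr, srsrc, soffset, offset, ...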
7399
7400 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7401 MachineBasicBlock &MBB = *MI.getParent();
7402
7403 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7404 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7405 // This is already an ADDR64 instruction so we need to add the pointer
7406 // extracted from the resource descriptor to the current value of VAddr.
7407 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7408 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7409 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7410
7411 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7412 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7413 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7414
7415 unsigned RsrcPtr, NewSRsrc;
7416 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7417
7418 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7419 const DebugLoc &DL = MI.getDebugLoc();
7420 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7421 .addDef(CondReg0)
7422 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7423 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7424 .addImm(0);
7425
7426 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7427 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7428 .addDef(CondReg1, RegState::Dead)
7429 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7430 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7431 .addReg(CondReg0, RegState::Kill)
7432 .addImm(0);
7433
7434 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7435 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7436 .addReg(NewVAddrLo)
7437 .addImm(AMDGPU::sub0)
7438 .addReg(NewVAddrHi)
7439 .addImm(AMDGPU::sub1);
7440
7441 VAddr->setReg(NewVAddr);
7442 Rsrc->setReg(NewSRsrc);
7443 } else if (!VAddr && ST.hasAddr64()) {
7444      // This instruction is the _OFFSET variant, so we need to convert it to
7445 // ADDR64.
7446 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7447 "FIXME: Need to emit flat atomics here");
7448
7449 unsigned RsrcPtr, NewSRsrc;
7450 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7451
7452 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7453 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7454 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7455 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7456 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7457
7458 // Atomics with return have an additional tied operand and are
7459 // missing some of the special bits.
7460 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7461 MachineInstr *Addr64;
7462
7463 if (!VDataIn) {
7464 // Regular buffer load / store.
7465        MachineInstrBuilder MIB =
7466            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7467 .add(*VData)
7468 .addReg(NewVAddr)
7469 .addReg(NewSRsrc)
7470 .add(*SOffset)
7471 .add(*Offset);
7472
7473 if (const MachineOperand *CPol =
7474 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7475 MIB.addImm(CPol->getImm());
7476 }
7477
7478 if (const MachineOperand *TFE =
7479 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7480 MIB.addImm(TFE->getImm());
7481 }
7482
7483 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7484
7485 MIB.cloneMemRefs(MI);
7486 Addr64 = MIB;
7487 } else {
7488 // Atomics with return.
7489 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7490 .add(*VData)
7491 .add(*VDataIn)
7492 .addReg(NewVAddr)
7493 .addReg(NewSRsrc)
7494 .add(*SOffset)
7495 .add(*Offset)
7496 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7497 .cloneMemRefs(MI);
7498 }
7499
7500 MI.removeFromParent();
7501
7502 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7503 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7504 NewVAddr)
7505 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7506 .addImm(AMDGPU::sub0)
7507 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7508 .addImm(AMDGPU::sub1);
7509 } else {
7510 // Legalize a VGPR Rsrc and soffset together.
7511 if (!isSoffsetLegal) {
7512 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7513 CreatedBB =
7514 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7515 return CreatedBB;
7516 }
7517 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7518 return CreatedBB;
7519 }
7520 }
7521
7522 // Legalize a VGPR soffset.
7523 if (!isSoffsetLegal) {
7524 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7525 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7526 return CreatedBB;
7527 }
7528 return CreatedBB;
7529}
7530
7532 InstrList.insert(MI);
7533  // Add MBUF instructions to the deferred list.
7534 int RsrcIdx =
7535 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7536 if (RsrcIdx != -1) {
7537 DeferredList.insert(MI);
7538 }
7539}
7540
7542 return DeferredList.contains(MI);
7543}
7544
7545// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7546// lowering (change sgpr to vgpr).
7547// This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7548// different sizes. We need to legalize the operand sizes during the vgpr
7549// lowering chain. This can be removed after we have sgpr16 in place.
7551 MachineRegisterInfo &MRI) const {
7552 if (!ST.useRealTrue16Insts())
7553 return;
7554
7555 unsigned Opcode = MI.getOpcode();
7556 MachineBasicBlock *MBB = MI.getParent();
7557 // Legalize operands and check for size mismatch
7558 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7559 OpIdx >= get(Opcode).getNumOperands() ||
7560 get(Opcode).operands()[OpIdx].RegClass == -1)
7561 return;
7562
7563 MachineOperand &Op = MI.getOperand(OpIdx);
7564 if (!Op.isReg() || !Op.getReg().isVirtual())
7565 return;
7566
7567 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7568 if (!RI.isVGPRClass(CurrRC))
7569 return;
7570
7571 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7572 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7573 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7574 Op.setSubReg(AMDGPU::lo16);
7575 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7576 const DebugLoc &DL = MI.getDebugLoc();
7577 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7578 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7579 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7580 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7581 .addReg(Op.getReg())
7582 .addImm(AMDGPU::lo16)
7583 .addReg(Undef)
7584 .addImm(AMDGPU::hi16);
7585 Op.setReg(NewDstReg);
7586 }
7587}
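// For example (illustrative), when a 16-bit virtual register feeds an operand
// whose declared class is 32-bit, the operand is widened with
//   %undef:vgpr_16 = IMPLICIT_DEF
//   %wide:vgpr_32  = REG_SEQUENCE %val16, lo16, %undef, hi16
// while a 32-bit register feeding a declared 16-bit operand just gets the
// lo16 subregister index set on the use.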
7589 MachineRegisterInfo &MRI) const {
7590  for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7591    legalizeOperandsVALUt16(MI, OpIdx, MRI);
7592}
7593
7595 MachineDominatorTree *MDT) const {
7596
7597 while (!Worklist.empty()) {
7598 MachineInstr &Inst = *Worklist.top();
7599 Worklist.erase_top();
7600 // Skip MachineInstr in the deferred list.
7601 if (Worklist.isDeferred(&Inst))
7602 continue;
7603 moveToVALUImpl(Worklist, MDT, Inst);
7604 }
7605
7606  // The deferred list of instructions will be processed once all the
7607  // MachineInstrs in the worklist are done.
7608 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7609 moveToVALUImpl(Worklist, MDT, *Inst);
7610 assert(Worklist.empty() &&
7611 "Deferred MachineInstr are not supposed to re-populate worklist");
7612 }
7613}
7614
7617 MachineInstr &Inst) const {
7618
7619  MachineBasicBlock *MBB = Inst.getParent();
7620  if (!MBB)
7621 return;
7622 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7623 unsigned Opcode = Inst.getOpcode();
7624 unsigned NewOpcode = getVALUOp(Inst);
7625 const DebugLoc &DL = Inst.getDebugLoc();
7626
7627 // Handle some special cases
7628 switch (Opcode) {
7629 default:
7630 break;
7631 case AMDGPU::S_ADD_I32:
7632 case AMDGPU::S_SUB_I32: {
7633 // FIXME: The u32 versions currently selected use the carry.
7634 bool Changed;
7635 MachineBasicBlock *CreatedBBTmp = nullptr;
7636 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7637 if (Changed)
7638 return;
7639
7640 // Default handling
7641 break;
7642 }
7643
7644 case AMDGPU::S_MUL_U64:
7645 if (ST.hasVectorMulU64()) {
7646 NewOpcode = AMDGPU::V_MUL_U64_e64;
7647 break;
7648 }
7649 // Split s_mul_u64 in 32-bit vector multiplications.
7650 splitScalarSMulU64(Worklist, Inst, MDT);
7651 Inst.eraseFromParent();
7652 return;
7653
7654 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7655 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7656 // This is a special case of s_mul_u64 where all the operands are either
7657 // zero extended or sign extended.
7658 splitScalarSMulPseudo(Worklist, Inst, MDT);
7659 Inst.eraseFromParent();
7660 return;
7661
7662 case AMDGPU::S_AND_B64:
7663 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7664 Inst.eraseFromParent();
7665 return;
7666
7667 case AMDGPU::S_OR_B64:
7668 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7669 Inst.eraseFromParent();
7670 return;
7671
7672 case AMDGPU::S_XOR_B64:
7673 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7674 Inst.eraseFromParent();
7675 return;
7676
7677 case AMDGPU::S_NAND_B64:
7678 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7679 Inst.eraseFromParent();
7680 return;
7681
7682 case AMDGPU::S_NOR_B64:
7683 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7684 Inst.eraseFromParent();
7685 return;
7686
7687 case AMDGPU::S_XNOR_B64:
7688 if (ST.hasDLInsts())
7689 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7690 else
7691 splitScalar64BitXnor(Worklist, Inst, MDT);
7692 Inst.eraseFromParent();
7693 return;
7694
7695 case AMDGPU::S_ANDN2_B64:
7696 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7697 Inst.eraseFromParent();
7698 return;
7699
7700 case AMDGPU::S_ORN2_B64:
7701 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7702 Inst.eraseFromParent();
7703 return;
7704
7705 case AMDGPU::S_BREV_B64:
7706 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7707 Inst.eraseFromParent();
7708 return;
7709
7710 case AMDGPU::S_NOT_B64:
7711 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7712 Inst.eraseFromParent();
7713 return;
7714
7715 case AMDGPU::S_BCNT1_I32_B64:
7716 splitScalar64BitBCNT(Worklist, Inst);
7717 Inst.eraseFromParent();
7718 return;
7719
7720 case AMDGPU::S_BFE_I64:
7721 splitScalar64BitBFE(Worklist, Inst);
7722 Inst.eraseFromParent();
7723 return;
7724
7725 case AMDGPU::S_FLBIT_I32_B64:
7726 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7727 Inst.eraseFromParent();
7728 return;
7729 case AMDGPU::S_FF1_I32_B64:
7730 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7731 Inst.eraseFromParent();
7732 return;
7733
7734 case AMDGPU::S_LSHL_B32:
7735 if (ST.hasOnlyRevVALUShifts()) {
7736 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7737 swapOperands(Inst);
7738 }
7739 break;
7740 case AMDGPU::S_ASHR_I32:
7741 if (ST.hasOnlyRevVALUShifts()) {
7742 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7743 swapOperands(Inst);
7744 }
7745 break;
7746 case AMDGPU::S_LSHR_B32:
7747 if (ST.hasOnlyRevVALUShifts()) {
7748 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7749 swapOperands(Inst);
7750 }
7751 break;
7752 case AMDGPU::S_LSHL_B64:
7753 if (ST.hasOnlyRevVALUShifts()) {
7754 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7755 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7756 : AMDGPU::V_LSHLREV_B64_e64;
7757 swapOperands(Inst);
7758 }
7759 break;
7760 case AMDGPU::S_ASHR_I64:
7761 if (ST.hasOnlyRevVALUShifts()) {
7762 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7763 swapOperands(Inst);
7764 }
7765 break;
7766 case AMDGPU::S_LSHR_B64:
7767 if (ST.hasOnlyRevVALUShifts()) {
7768 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7769 swapOperands(Inst);
7770 }
7771 break;
7772
7773 case AMDGPU::S_ABS_I32:
7774 lowerScalarAbs(Worklist, Inst);
7775 Inst.eraseFromParent();
7776 return;
7777
7778 case AMDGPU::S_CBRANCH_SCC0:
7779 case AMDGPU::S_CBRANCH_SCC1: {
7780 // Clear unused bits of vcc
7781 Register CondReg = Inst.getOperand(1).getReg();
7782 bool IsSCC = CondReg == AMDGPU::SCC;
7784 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7785 .addReg(LMC.ExecReg)
7786 .addReg(IsSCC ? LMC.VccReg : CondReg);
7787 Inst.removeOperand(1);
7788 } break;
7789
7790 case AMDGPU::S_BFE_U64:
7791 case AMDGPU::S_BFM_B64:
7792 llvm_unreachable("Moving this op to VALU not implemented");
7793
7794 case AMDGPU::S_PACK_LL_B32_B16:
7795 case AMDGPU::S_PACK_LH_B32_B16:
7796 case AMDGPU::S_PACK_HL_B32_B16:
7797 case AMDGPU::S_PACK_HH_B32_B16:
7798 movePackToVALU(Worklist, MRI, Inst);
7799 Inst.eraseFromParent();
7800 return;
7801
7802 case AMDGPU::S_XNOR_B32:
7803 lowerScalarXnor(Worklist, Inst);
7804 Inst.eraseFromParent();
7805 return;
7806
7807 case AMDGPU::S_NAND_B32:
7808 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7809 Inst.eraseFromParent();
7810 return;
7811
7812 case AMDGPU::S_NOR_B32:
7813 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7814 Inst.eraseFromParent();
7815 return;
7816
7817 case AMDGPU::S_ANDN2_B32:
7818 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7819 Inst.eraseFromParent();
7820 return;
7821
7822 case AMDGPU::S_ORN2_B32:
7823 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7824 Inst.eraseFromParent();
7825 return;
7826
7827 // TODO: remove as soon as everything is ready
7828 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7829 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7830 // can only be selected from the uniform SDNode.
7831 case AMDGPU::S_ADD_CO_PSEUDO:
7832 case AMDGPU::S_SUB_CO_PSEUDO: {
7833 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7834 ? AMDGPU::V_ADDC_U32_e64
7835 : AMDGPU::V_SUBB_U32_e64;
7836 const auto *CarryRC = RI.getWaveMaskRegClass();
7837
7838 Register CarryInReg = Inst.getOperand(4).getReg();
7839 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7840 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7841 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7842 .addReg(CarryInReg);
7843 }
7844
7845 Register CarryOutReg = Inst.getOperand(1).getReg();
7846
7847 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7848 MRI.getRegClass(Inst.getOperand(0).getReg())));
7849 MachineInstr *CarryOp =
7850 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7851 .addReg(CarryOutReg, RegState::Define)
7852 .add(Inst.getOperand(2))
7853 .add(Inst.getOperand(3))
7854 .addReg(CarryInReg)
7855 .addImm(0);
7856 legalizeOperands(*CarryOp);
7857 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7858 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7859 Inst.eraseFromParent();
7860 }
7861 return;
7862 case AMDGPU::S_UADDO_PSEUDO:
7863 case AMDGPU::S_USUBO_PSEUDO: {
7864 MachineOperand &Dest0 = Inst.getOperand(0);
7865 MachineOperand &Dest1 = Inst.getOperand(1);
7866 MachineOperand &Src0 = Inst.getOperand(2);
7867 MachineOperand &Src1 = Inst.getOperand(3);
7868
7869 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7870 ? AMDGPU::V_ADD_CO_U32_e64
7871 : AMDGPU::V_SUB_CO_U32_e64;
7872 const TargetRegisterClass *NewRC =
7873 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7874 Register DestReg = MRI.createVirtualRegister(NewRC);
7875 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7876 .addReg(Dest1.getReg(), RegState::Define)
7877 .add(Src0)
7878 .add(Src1)
7879 .addImm(0); // clamp bit
7880
7881 legalizeOperands(*NewInstr, MDT);
7882 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7883 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7884 Inst.eraseFromParent();
7885 }
7886 return;
7887 case AMDGPU::S_LSHL1_ADD_U32:
7888 case AMDGPU::S_LSHL2_ADD_U32:
7889 case AMDGPU::S_LSHL3_ADD_U32:
7890 case AMDGPU::S_LSHL4_ADD_U32: {
7891 MachineOperand &Dest = Inst.getOperand(0);
7892 MachineOperand &Src0 = Inst.getOperand(1);
7893 MachineOperand &Src1 = Inst.getOperand(2);
7894 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
7895 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
7896 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
7897 : 4);
7898
7899 const TargetRegisterClass *NewRC =
7900 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
7901 Register DestReg = MRI.createVirtualRegister(NewRC);
7902 MachineInstr *NewInstr =
7903 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
7904 .add(Src0)
7905 .addImm(ShiftAmt)
7906 .add(Src1);
7907
7908 legalizeOperands(*NewInstr, MDT);
7909 MRI.replaceRegWith(Dest.getReg(), DestReg);
7910 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7911 Inst.eraseFromParent();
7912 }
7913 return;
7914 case AMDGPU::S_CSELECT_B32:
7915 case AMDGPU::S_CSELECT_B64:
7916 lowerSelect(Worklist, Inst, MDT);
7917 Inst.eraseFromParent();
7918 return;
7919 case AMDGPU::S_CMP_EQ_I32:
7920 case AMDGPU::S_CMP_LG_I32:
7921 case AMDGPU::S_CMP_GT_I32:
7922 case AMDGPU::S_CMP_GE_I32:
7923 case AMDGPU::S_CMP_LT_I32:
7924 case AMDGPU::S_CMP_LE_I32:
7925 case AMDGPU::S_CMP_EQ_U32:
7926 case AMDGPU::S_CMP_LG_U32:
7927 case AMDGPU::S_CMP_GT_U32:
7928 case AMDGPU::S_CMP_GE_U32:
7929 case AMDGPU::S_CMP_LT_U32:
7930 case AMDGPU::S_CMP_LE_U32:
7931 case AMDGPU::S_CMP_EQ_U64:
7932 case AMDGPU::S_CMP_LG_U64:
7933 case AMDGPU::S_CMP_LT_F32:
7934 case AMDGPU::S_CMP_EQ_F32:
7935 case AMDGPU::S_CMP_LE_F32:
7936 case AMDGPU::S_CMP_GT_F32:
7937 case AMDGPU::S_CMP_LG_F32:
7938 case AMDGPU::S_CMP_GE_F32:
7939 case AMDGPU::S_CMP_O_F32:
7940 case AMDGPU::S_CMP_U_F32:
7941 case AMDGPU::S_CMP_NGE_F32:
7942 case AMDGPU::S_CMP_NLG_F32:
7943 case AMDGPU::S_CMP_NGT_F32:
7944 case AMDGPU::S_CMP_NLE_F32:
7945 case AMDGPU::S_CMP_NEQ_F32:
7946 case AMDGPU::S_CMP_NLT_F32: {
7947 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7948 auto NewInstr =
7949 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7950 .setMIFlags(Inst.getFlags());
7951 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7952 0) {
7953 NewInstr
7954 .addImm(0) // src0_modifiers
7955 .add(Inst.getOperand(0)) // src0
7956 .addImm(0) // src1_modifiers
7957 .add(Inst.getOperand(1)) // src1
7958 .addImm(0); // clamp
7959 } else {
7960 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7961 }
7962 legalizeOperands(*NewInstr, MDT);
7963 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7964 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
7965 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7966 Inst.eraseFromParent();
7967 return;
7968 }
7969 case AMDGPU::S_CMP_LT_F16:
7970 case AMDGPU::S_CMP_EQ_F16:
7971 case AMDGPU::S_CMP_LE_F16:
7972 case AMDGPU::S_CMP_GT_F16:
7973 case AMDGPU::S_CMP_LG_F16:
7974 case AMDGPU::S_CMP_GE_F16:
7975 case AMDGPU::S_CMP_O_F16:
7976 case AMDGPU::S_CMP_U_F16:
7977 case AMDGPU::S_CMP_NGE_F16:
7978 case AMDGPU::S_CMP_NLG_F16:
7979 case AMDGPU::S_CMP_NGT_F16:
7980 case AMDGPU::S_CMP_NLE_F16:
7981 case AMDGPU::S_CMP_NEQ_F16:
7982 case AMDGPU::S_CMP_NLT_F16: {
7983 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7984 auto NewInstr =
7985 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7986 .setMIFlags(Inst.getFlags());
7987 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7988 NewInstr
7989 .addImm(0) // src0_modifiers
7990 .add(Inst.getOperand(0)) // src0
7991 .addImm(0) // src1_modifiers
7992 .add(Inst.getOperand(1)) // src1
7993 .addImm(0); // clamp
7994 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7995 NewInstr.addImm(0); // op_sel0
7996 } else {
7997 NewInstr
7998 .add(Inst.getOperand(0))
7999 .add(Inst.getOperand(1));
8000 }
8001 legalizeOperandsVALUt16(*NewInstr, MRI);
8002 legalizeOperands(*NewInstr, MDT);
8003 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8004 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8005 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8006 Inst.eraseFromParent();
8007 return;
8008 }
8009 case AMDGPU::S_CVT_HI_F32_F16: {
8010 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8011 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8012 if (ST.useRealTrue16Insts()) {
8013 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8014 .add(Inst.getOperand(1));
8015 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8016 .addImm(0) // src0_modifiers
8017 .addReg(TmpReg, 0, AMDGPU::hi16)
8018 .addImm(0) // clamp
8019 .addImm(0) // omod
8020 .addImm(0); // op_sel0
8021 } else {
8022 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8023 .addImm(16)
8024 .add(Inst.getOperand(1));
8025 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8026 .addImm(0) // src0_modifiers
8027 .addReg(TmpReg)
8028 .addImm(0) // clamp
8029 .addImm(0); // omod
8030 }
8031
8032 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8033 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8034 Inst.eraseFromParent();
8035 return;
8036 }
8037 case AMDGPU::S_MINIMUM_F32:
8038 case AMDGPU::S_MAXIMUM_F32: {
8039 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8040 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8041 .addImm(0) // src0_modifiers
8042 .add(Inst.getOperand(1))
8043 .addImm(0) // src1_modifiers
8044 .add(Inst.getOperand(2))
8045 .addImm(0) // clamp
8046 .addImm(0); // omod
8047 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8048
8049 legalizeOperands(*NewInstr, MDT);
8050 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8051 Inst.eraseFromParent();
8052 return;
8053 }
8054 case AMDGPU::S_MINIMUM_F16:
8055 case AMDGPU::S_MAXIMUM_F16: {
8056 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8057 ? &AMDGPU::VGPR_16RegClass
8058 : &AMDGPU::VGPR_32RegClass);
8059 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8060 .addImm(0) // src0_modifiers
8061 .add(Inst.getOperand(1))
8062 .addImm(0) // src1_modifiers
8063 .add(Inst.getOperand(2))
8064 .addImm(0) // clamp
8065 .addImm(0) // omod
8066 .addImm(0); // opsel0
8067 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8068 legalizeOperandsVALUt16(*NewInstr, MRI);
8069 legalizeOperands(*NewInstr, MDT);
8070 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8071 Inst.eraseFromParent();
8072 return;
8073 }
8074 case AMDGPU::V_S_EXP_F16_e64:
8075 case AMDGPU::V_S_LOG_F16_e64:
8076 case AMDGPU::V_S_RCP_F16_e64:
8077 case AMDGPU::V_S_RSQ_F16_e64:
8078 case AMDGPU::V_S_SQRT_F16_e64: {
8079 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8080 ? &AMDGPU::VGPR_16RegClass
8081 : &AMDGPU::VGPR_32RegClass);
8082 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8083 .add(Inst.getOperand(1)) // src0_modifiers
8084 .add(Inst.getOperand(2))
8085 .add(Inst.getOperand(3)) // clamp
8086 .add(Inst.getOperand(4)) // omod
8087 .setMIFlags(Inst.getFlags());
8088 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8089 NewInstr.addImm(0); // opsel0
8090 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8091 legalizeOperandsVALUt16(*NewInstr, MRI);
8092 legalizeOperands(*NewInstr, MDT);
8093 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8094 Inst.eraseFromParent();
8095 return;
8096 }
8097 }
8098
8099 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8100 // We cannot move this instruction to the VALU, so we should try to
8101 // legalize its operands instead.
8102 legalizeOperands(Inst, MDT);
8103 return;
8104 }
8105 // Handle converting generic instructions like COPY-to-SGPR into
8106 // COPY-to-VGPR.
8107 if (NewOpcode == Opcode) {
8108 Register DstReg = Inst.getOperand(0).getReg();
8109 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8110
8111 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8112 // hope for the best.
8113 if (Inst.isCopy() && DstReg.isPhysical() &&
8114 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8115 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8116 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8117 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8118 .add(Inst.getOperand(1));
8119 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8120 DstReg)
8121 .addReg(NewDst);
8122
8123 Inst.eraseFromParent();
8124 return;
8125 }
8126
8127 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
8128 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
8129 // Instead of creating a copy where src and dst are the same register
8130 // class, we just replace all uses of dst with src. These kinds of
8131 // copies interfere with the heuristics MachineSink uses to decide
8132      // whether or not to split a critical edge, since the pass assumes
8133 // that copies will end up as machine instructions and not be
8134 // eliminated.
8135 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8136 Register NewDstReg = Inst.getOperand(1).getReg();
8137 MRI.replaceRegWith(DstReg, NewDstReg);
8138 MRI.clearKillFlags(NewDstReg);
8139 Inst.getOperand(0).setReg(DstReg);
8140 Inst.eraseFromParent();
8141      // Legalize the users' t16 operands; replaceRegWith runs after the users
8142      // were already queued above.
8142 for (MachineOperand &MO :
8143 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8144 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8145 }
8146 return;
8147 }
8148
8149    // If this is a v2s copy between a 16-bit and a 32-bit reg, replace the
8150    // vgpr copy with a reg_sequence/extract_subreg.
8151    // This can be removed after we have sgpr16 in place.
8152 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8153 Inst.getOperand(1).getReg().isVirtual() &&
8154 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8155 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8156 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8157 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8158 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8159 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8160 get(AMDGPU::IMPLICIT_DEF), Undef);
8161 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8162 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8163 .addReg(Inst.getOperand(1).getReg())
8164 .addImm(AMDGPU::lo16)
8165 .addReg(Undef)
8166 .addImm(AMDGPU::hi16);
8167 Inst.eraseFromParent();
8168 MRI.replaceRegWith(DstReg, NewDstReg);
8169 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8170 return;
8171 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8172 AMDGPU::lo16)) {
8173 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8174 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8175 MRI.replaceRegWith(DstReg, NewDstReg);
8176 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8177 return;
8178 }
8179 }
8180
8181 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8182 MRI.replaceRegWith(DstReg, NewDstReg);
8183 legalizeOperands(Inst, MDT);
8184 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8185 return;
8186 }
8187
8188 // Use the new VALU Opcode.
8189 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8190 .setMIFlags(Inst.getFlags());
8191 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8192 // Intersperse VOP3 modifiers among the SALU operands.
8193 NewInstr->addOperand(Inst.getOperand(0));
8194 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8195 AMDGPU::OpName::src0_modifiers) >= 0)
8196 NewInstr.addImm(0);
8197 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8198 const MachineOperand &Src = Inst.getOperand(1);
8199 NewInstr->addOperand(Src);
8200 }
8201
8202 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8203 // We are converting these to a BFE, so we need to add the missing
8204 // operands for the size and offset.
8205 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8206 NewInstr.addImm(0);
8207 NewInstr.addImm(Size);
8208 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8209 // The VALU version adds the second operand to the result, so insert an
8210 // extra 0 operand.
8211 NewInstr.addImm(0);
8212 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8213 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8214 // If we need to move this to VGPRs, we need to unpack the second
8215 // operand back into the 2 separate ones for bit offset and width.
8216 assert(OffsetWidthOp.isImm() &&
8217 "Scalar BFE is only implemented for constant width and offset");
8218 uint32_t Imm = OffsetWidthOp.getImm();
8219
8220 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8221 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8222 NewInstr.addImm(Offset);
8223 NewInstr.addImm(BitWidth);
8224 } else {
8225 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8226 AMDGPU::OpName::src1_modifiers) >= 0)
8227 NewInstr.addImm(0);
8228 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8229 NewInstr->addOperand(Inst.getOperand(2));
8230 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8231 AMDGPU::OpName::src2_modifiers) >= 0)
8232 NewInstr.addImm(0);
8233 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8234 NewInstr->addOperand(Inst.getOperand(3));
8235 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8236 NewInstr.addImm(0);
8237 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8238 NewInstr.addImm(0);
8239 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8240 NewInstr.addImm(0);
8241 }
8242 } else {
8243 // Just copy the SALU operands.
8244 for (const MachineOperand &Op : Inst.explicit_operands())
8245 NewInstr->addOperand(Op);
8246 }
8247
8248 // Remove any references to SCC. Vector instructions can't read from it, and
8249  // we're just about to add the implicit use / defs of VCC, and we don't want
8250 // both.
8251 for (MachineOperand &Op : Inst.implicit_operands()) {
8252 if (Op.getReg() == AMDGPU::SCC) {
8253 // Only propagate through live-def of SCC.
8254 if (Op.isDef() && !Op.isDead())
8255 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8256 if (Op.isUse())
8257 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8258 }
8259 }
8260 Inst.eraseFromParent();
8261 Register NewDstReg;
8262 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8263 Register DstReg = NewInstr->getOperand(0).getReg();
8264 assert(DstReg.isVirtual());
8265 // Update the destination register class.
8266 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8267 assert(NewDstRC);
8268 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8269 MRI.replaceRegWith(DstReg, NewDstReg);
8270 }
8271 fixImplicitOperands(*NewInstr);
8272
8273 legalizeOperandsVALUt16(*NewInstr, MRI);
8274
8275 // Legalize the operands
8276 legalizeOperands(*NewInstr, MDT);
8277 if (NewDstReg)
8278 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8279}
8280
8281// Add/sub require special handling to deal with carry outs.
8282std::pair<bool, MachineBasicBlock *>
8283SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8284 MachineDominatorTree *MDT) const {
8285 if (ST.hasAddNoCarry()) {
8286 // Assume there is no user of scc since we don't select this in that case.
8287 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8288 // is used.
8289
8290 MachineBasicBlock &MBB = *Inst.getParent();
8291 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8292
8293 Register OldDstReg = Inst.getOperand(0).getReg();
8294 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8295
8296 unsigned Opc = Inst.getOpcode();
8297 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8298
8299 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8300 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8301
8302 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8303 Inst.removeOperand(3);
8304
8305 Inst.setDesc(get(NewOpc));
8306 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8307 Inst.addImplicitDefUseOperands(*MBB.getParent());
8308 MRI.replaceRegWith(OldDstReg, ResultReg);
8309 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8310
8311 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8312 return std::pair(true, NewBB);
8313 }
8314
8315 return std::pair(false, nullptr);
8316}
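// Illustratively, on subtargets with add-no-carry this rewrites, in place,
//   %d:sreg_32 = S_ADD_I32 %a, %b        (with a dead SCC def)
// into
//   %d':vgpr_32 = V_ADD_U32_e64 %a, %b, 0   ; trailing 0 is the clamp bit
// dropping the SCC operand and re-queueing the users of the result.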
8317
8318void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8319 MachineDominatorTree *MDT) const {
8320
8321 MachineBasicBlock &MBB = *Inst.getParent();
8322 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8323 MachineBasicBlock::iterator MII = Inst;
8324 DebugLoc DL = Inst.getDebugLoc();
8325
8326 MachineOperand &Dest = Inst.getOperand(0);
8327 MachineOperand &Src0 = Inst.getOperand(1);
8328 MachineOperand &Src1 = Inst.getOperand(2);
8329 MachineOperand &Cond = Inst.getOperand(3);
8330
8331 Register CondReg = Cond.getReg();
8332 bool IsSCC = (CondReg == AMDGPU::SCC);
8333
8334 // If this is a trivial select where the condition is effectively not SCC
8335 // (CondReg is a source of copy to SCC), then the select is semantically
8336 // equivalent to copying CondReg. Hence, there is no need to create
8337  // V_CNDMASK; we can just use CondReg and bail out.
8338 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8339 (Src1.getImm() == 0)) {
8340 MRI.replaceRegWith(Dest.getReg(), CondReg);
8341 return;
8342 }
8343
8344 Register NewCondReg = CondReg;
8345 if (IsSCC) {
8346 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8347 NewCondReg = MRI.createVirtualRegister(TC);
8348
8349 // Now look for the closest SCC def if it is a copy
8350 // replacing the CondReg with the COPY source register
8351 bool CopyFound = false;
8352 for (MachineInstr &CandI :
8354 Inst.getParent()->rend())) {
8355 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8356 -1) {
8357 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8358 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8359 .addReg(CandI.getOperand(1).getReg());
8360 CopyFound = true;
8361 }
8362 break;
8363 }
8364 }
8365 if (!CopyFound) {
8366 // SCC def is not a copy
8367 // Insert a trivial select instead of creating a copy, because a copy from
8368 // SCC would semantically mean just copying a single bit, but we may need
8369 // the result to be a vector condition mask that needs preserving.
8370 unsigned Opcode =
8371 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8372 auto NewSelect =
8373 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8374 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8375 }
8376 }
8377
8378 Register NewDestReg = MRI.createVirtualRegister(
8379 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8380 MachineInstr *NewInst;
8381 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8382 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8383 .addImm(0)
8384 .add(Src1) // False
8385 .addImm(0)
8386 .add(Src0) // True
8387 .addReg(NewCondReg);
8388 } else {
8389 NewInst =
8390 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8391 .add(Src1) // False
8392 .add(Src0) // True
8393 .addReg(NewCondReg);
8394 }
8395 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8396 legalizeOperands(*NewInst, MDT);
8397 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8398}
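// E.g. (illustrative) a divergent 32-bit select
//   %d:sreg_32 = S_CSELECT_B32 %t, %f    (reads $scc)
// is lowered to
//   %m = <wave mask: COPY of the SCC producer, or S_CSELECT -1, 0>
//   %d':vgpr_32 = V_CNDMASK_B32_e64 0, %f, 0, %t, %m
// where the false value comes first, matching V_CNDMASK's operand order.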
8399
8400void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8401 MachineInstr &Inst) const {
8402 MachineBasicBlock &MBB = *Inst.getParent();
8403 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8404 MachineBasicBlock::iterator MII = Inst;
8405 DebugLoc DL = Inst.getDebugLoc();
8406
8407 MachineOperand &Dest = Inst.getOperand(0);
8408 MachineOperand &Src = Inst.getOperand(1);
8409 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8410 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8411
8412 unsigned SubOp = ST.hasAddNoCarry() ?
8413 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8414
8415 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8416 .addImm(0)
8417 .addReg(Src.getReg());
8418
8419 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8420 .addReg(Src.getReg())
8421 .addReg(TmpReg);
8422
8423 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8424 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8425}
8426
8427void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8428 MachineInstr &Inst) const {
8429 MachineBasicBlock &MBB = *Inst.getParent();
8430 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8431 MachineBasicBlock::iterator MII = Inst;
8432 const DebugLoc &DL = Inst.getDebugLoc();
8433
8434 MachineOperand &Dest = Inst.getOperand(0);
8435 MachineOperand &Src0 = Inst.getOperand(1);
8436 MachineOperand &Src1 = Inst.getOperand(2);
8437
8438 if (ST.hasDLInsts()) {
8439 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8440 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8441 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8442
8443 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8444 .add(Src0)
8445 .add(Src1);
8446
8447 MRI.replaceRegWith(Dest.getReg(), NewDest);
8448 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8449 } else {
8450 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8451 // invert either source and then perform the XOR. If either source is a
8452 // scalar register, then we can leave the inversion on the scalar unit to
8453 // achieve a better distribution of scalar and vector instructions.
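 // For example, with x = 0b1100 and y = 0b1010:
 //   !(x ^ y) = !(0b0110) = ...1001
 //   (!x) ^ y = (...0011) ^ 0b1010 = ...1001
 // so inverting either operand before the XOR gives the same result as
 // inverting the XOR afterwards.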
8454 bool Src0IsSGPR = Src0.isReg() &&
8455 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8456 bool Src1IsSGPR = Src1.isReg() &&
8457 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8458 MachineInstr *Xor;
8459 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8460 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8461
8462 // Build a pair of scalar instructions and add them to the work list.
8463 // The next iteration over the work list will lower these to the vector
8464 // unit as necessary.
8465 if (Src0IsSGPR) {
8466 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8467 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8468 .addReg(Temp)
8469 .add(Src1);
8470 } else if (Src1IsSGPR) {
8471 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8472 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8473 .add(Src0)
8474 .addReg(Temp);
8475 } else {
8476 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8477 .add(Src0)
8478 .add(Src1);
8479 MachineInstr *Not =
8480 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8481 Worklist.insert(Not);
8482 }
8483
8484 MRI.replaceRegWith(Dest.getReg(), NewDest);
8485
8486 Worklist.insert(Xor);
8487
8488 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8489 }
8490}
8491
8492void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8493 MachineInstr &Inst,
8494 unsigned Opcode) const {
8495 MachineBasicBlock &MBB = *Inst.getParent();
8496 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8497 MachineBasicBlock::iterator MII = Inst;
8498 const DebugLoc &DL = Inst.getDebugLoc();
8499
8500 MachineOperand &Dest = Inst.getOperand(0);
8501 MachineOperand &Src0 = Inst.getOperand(1);
8502 MachineOperand &Src1 = Inst.getOperand(2);
8503
8504 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8505 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8506
8507 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8508 .add(Src0)
8509 .add(Src1);
8510
8511 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8512 .addReg(Interm);
8513
8514 Worklist.insert(&Op);
8515 Worklist.insert(&Not);
8516
8517 MRI.replaceRegWith(Dest.getReg(), NewDest);
8518 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8519}
8520
8521void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8522 MachineInstr &Inst,
8523 unsigned Opcode) const {
8524 MachineBasicBlock &MBB = *Inst.getParent();
8525 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8526 MachineBasicBlock::iterator MII = Inst;
8527 const DebugLoc &DL = Inst.getDebugLoc();
8528
8529 MachineOperand &Dest = Inst.getOperand(0);
8530 MachineOperand &Src0 = Inst.getOperand(1);
8531 MachineOperand &Src1 = Inst.getOperand(2);
8532
8533 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8534 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8535
8536 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8537 .add(Src1);
8538
8539 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8540 .add(Src0)
8541 .addReg(Interm);
8542
8543 Worklist.insert(&Not);
8544 Worklist.insert(&Op);
8545
8546 MRI.replaceRegWith(Dest.getReg(), NewDest);
8547 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8548}
8549
8550void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8551 MachineInstr &Inst, unsigned Opcode,
8552 bool Swap) const {
8553 MachineBasicBlock &MBB = *Inst.getParent();
8554 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8555
8556 MachineOperand &Dest = Inst.getOperand(0);
8557 MachineOperand &Src0 = Inst.getOperand(1);
8558 DebugLoc DL = Inst.getDebugLoc();
8559
8560 MachineBasicBlock::iterator MII = Inst;
8561
8562 const MCInstrDesc &InstDesc = get(Opcode);
8563 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8564 MRI.getRegClass(Src0.getReg()) :
8565 &AMDGPU::SGPR_32RegClass;
8566
8567 const TargetRegisterClass *Src0SubRC =
8568 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8569
8570 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8571 AMDGPU::sub0, Src0SubRC);
8572
8573 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8574 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8575 const TargetRegisterClass *NewDestSubRC =
8576 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8577
8578 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8579 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8580
8581 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8582 AMDGPU::sub1, Src0SubRC);
8583
8584 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8585 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8586
8587 if (Swap)
8588 std::swap(DestSub0, DestSub1);
8589
8590 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8591 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8592 .addReg(DestSub0)
8593 .addImm(AMDGPU::sub0)
8594 .addReg(DestSub1)
8595 .addImm(AMDGPU::sub1);
8596
8597 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8598
8599 Worklist.insert(&LoHalf);
8600 Worklist.insert(&HiHalf);
8601
8602 // We don't need to legalizeOperands here because for a single operand, src0
8603 // will support any kind of input.
8604
8605 // Move all users of this moved value.
8606 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8607}
8608
8609// There is no vector equivalent of s_mul_u64. For this reason, we need to
8610// split the s_mul_u64 into 32-bit vector multiplications.
8611void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8612 MachineInstr &Inst,
8613 MachineDominatorTree *MDT) const {
8614 MachineBasicBlock &MBB = *Inst.getParent();
8615 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8616
8617 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8618 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8619 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8620
8621 MachineOperand &Dest = Inst.getOperand(0);
8622 MachineOperand &Src0 = Inst.getOperand(1);
8623 MachineOperand &Src1 = Inst.getOperand(2);
8624 const DebugLoc &DL = Inst.getDebugLoc();
8625 MachineBasicBlock::iterator MII = Inst;
8626
8627 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8628 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8629 const TargetRegisterClass *Src0SubRC =
8630 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8631 if (RI.isSGPRClass(Src0SubRC))
8632 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8633 const TargetRegisterClass *Src1SubRC =
8634 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8635 if (RI.isSGPRClass(Src1SubRC))
8636 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8637
8638 // First, we extract the low 32-bit and high 32-bit values from each of the
8639 // operands.
8640 MachineOperand Op0L =
8641 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8642 MachineOperand Op1L =
8643 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8644 MachineOperand Op0H =
8645 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8646 MachineOperand Op1H =
8647 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8648
8649 // The multiplication is done as follows:
8650 //
8651 // Op1H Op1L
8652 // * Op0H Op0L
8653 // --------------------
8654 // Op1H*Op0L Op1L*Op0L
8655 // + Op1H*Op0H Op1L*Op0H
8656 // -----------------------------------------
8657 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8658 //
8659 // We drop Op1H*Op0H because it only contributes to bits 64 and above,
8660 // which do not fit in the 64-bit result.
8661 // The low 32-bit value is Op1L*Op0L.
8662 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
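 // For example, multiplying 0x00000001'00000002 by 0x00000003'00000004:
 //   Op1L*Op0L = 4*2 = 8                        -> DestSub0 = 8, carry = 0
 //   Op1L*Op0H + Op1H*Op0L + carry = 4 + 6 + 0  -> DestSub1 = 0xa
 // giving the 64-bit result 0x0000000a'00000008. The carry term matters when
 // Op1L*Op0L does not fit in 32 bits, e.g. 2 * 0xffffffff yields
 // DestSub0 = 0xfffffffe and carry = 1.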
8663
8664 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8665 MachineInstr *Op1L_Op0H =
8666 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8667 .add(Op1L)
8668 .add(Op0H);
8669
8670 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8671 MachineInstr *Op1H_Op0L =
8672 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8673 .add(Op1H)
8674 .add(Op0L);
8675
8676 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8677 MachineInstr *Carry =
8678 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8679 .add(Op1L)
8680 .add(Op0L);
8681
8682 MachineInstr *LoHalf =
8683 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8684 .add(Op1L)
8685 .add(Op0L);
8686
8687 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8688 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8689 .addReg(Op1L_Op0H_Reg)
8690 .addReg(Op1H_Op0L_Reg);
8691
8692 MachineInstr *HiHalf =
8693 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8694 .addReg(AddReg)
8695 .addReg(CarryReg);
8696
8697 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8698 .addReg(DestSub0)
8699 .addImm(AMDGPU::sub0)
8700 .addReg(DestSub1)
8701 .addImm(AMDGPU::sub1);
8702
8703 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8704
8705 // Try to legalize the operands in case we need to swap the order to keep it
8706 // valid.
8707 legalizeOperands(*Op1L_Op0H, MDT);
8708 legalizeOperands(*Op1H_Op0L, MDT);
8709 legalizeOperands(*Carry, MDT);
8710 legalizeOperands(*LoHalf, MDT);
8711 legalizeOperands(*Add, MDT);
8712 legalizeOperands(*HiHalf, MDT);
8713
8714 // Move all users of this moved value.
8715 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8716}
8717
8718// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8719// multiplications.
8720void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8721 MachineInstr &Inst,
8722 MachineDominatorTree *MDT) const {
8723 MachineBasicBlock &MBB = *Inst.getParent();
8724 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8725
8726 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8727 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8728 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8729
8730 MachineOperand &Dest = Inst.getOperand(0);
8731 MachineOperand &Src0 = Inst.getOperand(1);
8732 MachineOperand &Src1 = Inst.getOperand(2);
8733 const DebugLoc &DL = Inst.getDebugLoc();
8734 MachineBasicBlock::iterator MII = Inst;
8735
8736 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8737 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8738 const TargetRegisterClass *Src0SubRC =
8739 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8740 if (RI.isSGPRClass(Src0SubRC))
8741 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8742 const TargetRegisterClass *Src1SubRC =
8743 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8744 if (RI.isSGPRClass(Src1SubRC))
8745 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8746
8747 // First, we extract the low 32-bit and high 32-bit values from each of the
8748 // operands.
8749 MachineOperand Op0L =
8750 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8751 MachineOperand Op1L =
8752 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8753
8754 unsigned Opc = Inst.getOpcode();
8755 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8756 ? AMDGPU::V_MUL_HI_U32_e64
8757 : AMDGPU::V_MUL_HI_I32_e64;
8758 MachineInstr *HiHalf =
8759 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8760
8761 MachineInstr *LoHalf =
8762 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8763 .add(Op1L)
8764 .add(Op0L);
8765
8766 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8767 .addReg(DestSub0)
8768 .addImm(AMDGPU::sub0)
8769 .addReg(DestSub1)
8770 .addImm(AMDGPU::sub1);
8771
8772 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8773
8774 // Try to legalize the operands in case we need to swap the order to keep it
8775 // valid.
8776 legalizeOperands(*HiHalf, MDT);
8777 legalizeOperands(*LoHalf, MDT);
8778
8779 // Move all users of this moved value.
8780 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8781}
8782
8783void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8784 MachineInstr &Inst, unsigned Opcode,
8785 MachineDominatorTree *MDT) const {
8786 MachineBasicBlock &MBB = *Inst.getParent();
8787 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8788
8789 MachineOperand &Dest = Inst.getOperand(0);
8790 MachineOperand &Src0 = Inst.getOperand(1);
8791 MachineOperand &Src1 = Inst.getOperand(2);
8792 DebugLoc DL = Inst.getDebugLoc();
8793
8794 MachineBasicBlock::iterator MII = Inst;
8795
8796 const MCInstrDesc &InstDesc = get(Opcode);
8797 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8798 MRI.getRegClass(Src0.getReg()) :
8799 &AMDGPU::SGPR_32RegClass;
8800
8801 const TargetRegisterClass *Src0SubRC =
8802 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8803 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8804 MRI.getRegClass(Src1.getReg()) :
8805 &AMDGPU::SGPR_32RegClass;
8806
8807 const TargetRegisterClass *Src1SubRC =
8808 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8809
8810 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8811 AMDGPU::sub0, Src0SubRC);
8812 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8813 AMDGPU::sub0, Src1SubRC);
8814 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8815 AMDGPU::sub1, Src0SubRC);
8816 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8817 AMDGPU::sub1, Src1SubRC);
8818
8819 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8820 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8821 const TargetRegisterClass *NewDestSubRC =
8822 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8823
8824 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8825 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8826 .add(SrcReg0Sub0)
8827 .add(SrcReg1Sub0);
8828
8829 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8830 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8831 .add(SrcReg0Sub1)
8832 .add(SrcReg1Sub1);
8833
8834 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8835 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8836 .addReg(DestSub0)
8837 .addImm(AMDGPU::sub0)
8838 .addReg(DestSub1)
8839 .addImm(AMDGPU::sub1);
8840
8841 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8842
8843 Worklist.insert(&LoHalf);
8844 Worklist.insert(&HiHalf);
8845
8846 // Move all users of this moved value.
8847 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8848}
8849
8850void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8851 MachineInstr &Inst,
8852 MachineDominatorTree *MDT) const {
8853 MachineBasicBlock &MBB = *Inst.getParent();
8854 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8855
8856 MachineOperand &Dest = Inst.getOperand(0);
8857 MachineOperand &Src0 = Inst.getOperand(1);
8858 MachineOperand &Src1 = Inst.getOperand(2);
8859 const DebugLoc &DL = Inst.getDebugLoc();
8860
8861 MachineBasicBlock::iterator MII = Inst;
8862
8863 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8864
8865 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8866
8867 MachineOperand* Op0;
8868 MachineOperand* Op1;
8869
8870 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8871 Op0 = &Src0;
8872 Op1 = &Src1;
8873 } else {
8874 Op0 = &Src1;
8875 Op1 = &Src0;
8876 }
8877
8878 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8879 .add(*Op0);
8880
8881 Register NewDest = MRI.createVirtualRegister(DestRC);
8882
8883 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8884 .addReg(Interm)
8885 .add(*Op1);
8886
8887 MRI.replaceRegWith(Dest.getReg(), NewDest);
8888
8889 Worklist.insert(&Xor);
8890}
8891
8892void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8893 MachineInstr &Inst) const {
8894 MachineBasicBlock &MBB = *Inst.getParent();
8895 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8896
8897 MachineBasicBlock::iterator MII = Inst;
8898 const DebugLoc &DL = Inst.getDebugLoc();
8899
8900 MachineOperand &Dest = Inst.getOperand(0);
8901 MachineOperand &Src = Inst.getOperand(1);
8902
8903 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8904 const TargetRegisterClass *SrcRC = Src.isReg() ?
8905 MRI.getRegClass(Src.getReg()) :
8906 &AMDGPU::SGPR_32RegClass;
8907
8908 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8909 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8910
8911 const TargetRegisterClass *SrcSubRC =
8912 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8913
8914 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8915 AMDGPU::sub0, SrcSubRC);
8916 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8917 AMDGPU::sub1, SrcSubRC);
8918
8919 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8920
8921 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8922
8923 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8924
8925 // We don't need to legalize operands here. src0 for either instruction can be
8926 // an SGPR, and the second input is unused or determined here.
8927 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8928}
8929
8930void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8931 MachineInstr &Inst) const {
8932 MachineBasicBlock &MBB = *Inst.getParent();
8933 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8934 MachineBasicBlock::iterator MII = Inst;
8935 const DebugLoc &DL = Inst.getDebugLoc();
8936
8937 MachineOperand &Dest = Inst.getOperand(0);
8938 uint32_t Imm = Inst.getOperand(2).getImm();
8939 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8940 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8941
8942 (void) Offset;
8943
8944 // Only sext_inreg cases handled.
8945 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8946 Offset == 0 && "Not implemented");
8947
8948 if (BitWidth < 32) {
8949 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8950 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8951 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8952
8953 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8954 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8955 .addImm(0)
8956 .addImm(BitWidth);
8957
8958 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8959 .addImm(31)
8960 .addReg(MidRegLo);
8961
8962 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8963 .addReg(MidRegLo)
8964 .addImm(AMDGPU::sub0)
8965 .addReg(MidRegHi)
8966 .addImm(AMDGPU::sub1);
8967
8968 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8969 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8970 return;
8971 }
8972
8973 MachineOperand &Src = Inst.getOperand(1);
8974 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8975 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8976
8977 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8978 .addImm(31)
8979 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8980
8981 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8982 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8983 .addImm(AMDGPU::sub0)
8984 .addReg(TmpReg)
8985 .addImm(AMDGPU::sub1);
8986
8987 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8988 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8989}
8990
8991void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8992 MachineInstr &Inst, unsigned Opcode,
8993 MachineDominatorTree *MDT) const {
8994 // (S_FLBIT_I32_B64 hi:lo) ->
8995 //   (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8996 // (S_FF1_I32_B64 hi:lo) ->
8997 //   (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
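 // For example, for the ctlz case with input 0x00000001'80000000:
 //   V_FFBH_U32(hi = 0x1)        = 31
 //   V_FFBH_U32(lo = 0x80000000) = 0, clamped add: 0 + 32 = 32
 //   umin(31, 32)                = 31, the ctlz of the full 64-bit value.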
8998
8999 MachineBasicBlock &MBB = *Inst.getParent();
9000 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9001 MachineBasicBlock::iterator MII = Inst;
9002 const DebugLoc &DL = Inst.getDebugLoc();
9003
9004 MachineOperand &Dest = Inst.getOperand(0);
9005 MachineOperand &Src = Inst.getOperand(1);
9006
9007 const MCInstrDesc &InstDesc = get(Opcode);
9008
9009 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9010 unsigned OpcodeAdd =
9011 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
9012
9013 const TargetRegisterClass *SrcRC =
9014 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9015 const TargetRegisterClass *SrcSubRC =
9016 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9017
9018 MachineOperand SrcRegSub0 =
9019 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9020 MachineOperand SrcRegSub1 =
9021 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9022
9023 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9024 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9025 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9026 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9027
9028 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9029
9030 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9031
9032 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9033 .addReg(IsCtlz ? MidReg1 : MidReg2)
9034 .addImm(32)
9035 .addImm(1); // enable clamp
9036
9037 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9038 .addReg(MidReg3)
9039 .addReg(IsCtlz ? MidReg2 : MidReg1);
9040
9041 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9042
9043 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9044}
9045
9046void SIInstrInfo::addUsersToMoveToVALUWorklist(
9047 Register DstReg, MachineRegisterInfo &MRI,
9048 SIInstrWorklist &Worklist) const {
9049 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9050 MachineInstr &UseMI = *MO.getParent();
9051
9052 unsigned OpNo = 0;
9053
9054 switch (UseMI.getOpcode()) {
9055 case AMDGPU::COPY:
9056 case AMDGPU::WQM:
9057 case AMDGPU::SOFT_WQM:
9058 case AMDGPU::STRICT_WWM:
9059 case AMDGPU::STRICT_WQM:
9060 case AMDGPU::REG_SEQUENCE:
9061 case AMDGPU::PHI:
9062 case AMDGPU::INSERT_SUBREG:
9063 break;
9064 default:
9065 OpNo = MO.getOperandNo();
9066 break;
9067 }
9068
9069 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9070 MRI.constrainRegClass(DstReg, OpRC);
9071
9072 if (!RI.hasVectorRegisters(OpRC))
9073 Worklist.insert(&UseMI);
9074 else
9075 // Legalization could change user list.
9077 }
9078}
9079
9080void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9081 MachineRegisterInfo &MRI,
9082 MachineInstr &Inst) const {
9083 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9084 MachineBasicBlock *MBB = Inst.getParent();
9085 MachineOperand &Src0 = Inst.getOperand(1);
9086 MachineOperand &Src1 = Inst.getOperand(2);
9087 const DebugLoc &DL = Inst.getDebugLoc();
9088
9089 if (ST.useRealTrue16Insts()) {
9090 Register SrcReg0, SrcReg1;
9091 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9092 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9093 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
9094 } else {
9095 SrcReg0 = Src0.getReg();
9096 }
9097
9098 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9099 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9100 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
9101 } else {
9102 SrcReg1 = Src1.getReg();
9103 }
9104
9105 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9106 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9107
9108 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9109 switch (Inst.getOpcode()) {
9110 case AMDGPU::S_PACK_LL_B32_B16:
9111 NewMI
9112 .addReg(SrcReg0, 0,
9113 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9114 .addImm(AMDGPU::lo16)
9115 .addReg(SrcReg1, 0,
9116 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9117 .addImm(AMDGPU::hi16);
9118 break;
9119 case AMDGPU::S_PACK_LH_B32_B16:
9120 NewMI
9121 .addReg(SrcReg0, 0,
9122 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9123 .addImm(AMDGPU::lo16)
9124 .addReg(SrcReg1, 0, AMDGPU::hi16)
9125 .addImm(AMDGPU::hi16);
9126 break;
9127 case AMDGPU::S_PACK_HL_B32_B16:
9128 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9129 .addImm(AMDGPU::lo16)
9130 .addReg(SrcReg1, 0,
9131 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9132 .addImm(AMDGPU::hi16);
9133 break;
9134 case AMDGPU::S_PACK_HH_B32_B16:
9135 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9136 .addImm(AMDGPU::lo16)
9137 .addReg(SrcReg1, 0, AMDGPU::hi16)
9138 .addImm(AMDGPU::hi16);
9139 break;
9140 default:
9141 llvm_unreachable("unhandled s_pack_* instruction");
9142 }
9143
9144 MachineOperand &Dest = Inst.getOperand(0);
9145 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9146 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9147 return;
9148 }
9149
9150 switch (Inst.getOpcode()) {
9151 case AMDGPU::S_PACK_LL_B32_B16: {
9152 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9153 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9154
9155 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9156 // 0.
9157 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9158 .addImm(0xffff);
9159
9160 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9161 .addReg(ImmReg, RegState::Kill)
9162 .add(Src0);
9163
9164 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9165 .add(Src1)
9166 .addImm(16)
9167 .addReg(TmpReg, RegState::Kill);
9168 break;
9169 }
9170 case AMDGPU::S_PACK_LH_B32_B16: {
9171 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9172 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9173 .addImm(0xffff);
9174 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9175 .addReg(ImmReg, RegState::Kill)
9176 .add(Src0)
9177 .add(Src1);
9178 break;
9179 }
9180 case AMDGPU::S_PACK_HL_B32_B16: {
9181 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9182 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9183 .addImm(16)
9184 .add(Src0);
9185 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9186 .add(Src1)
9187 .addImm(16)
9188 .addReg(TmpReg, RegState::Kill);
9189 break;
9190 }
9191 case AMDGPU::S_PACK_HH_B32_B16: {
9192 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9193 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9194 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9195 .addImm(16)
9196 .add(Src0);
9197 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9198 .addImm(0xffff0000);
9199 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9200 .add(Src1)
9201 .addReg(ImmReg, RegState::Kill)
9202 .addReg(TmpReg, RegState::Kill);
9203 break;
9204 }
9205 default:
9206 llvm_unreachable("unhandled s_pack_* instruction");
9207 }
9208
9209 MachineOperand &Dest = Inst.getOperand(0);
9210 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9211 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9212}
9213
9214void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9215 MachineInstr &SCCDefInst,
9216 SIInstrWorklist &Worklist,
9217 Register NewCond) const {
9218
9219 // Ensure that def inst defines SCC, which is still live.
9220 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9221 !Op.isDead() && Op.getParent() == &SCCDefInst);
9222 SmallVector<MachineInstr *, 4> CopyToDelete;
9223 // This assumes that all the users of SCC are in the same block
9224 // as the SCC def.
9225 for (MachineInstr &MI : // Skip the def inst itself.
9226 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9227 SCCDefInst.getParent()->end())) {
9228 // Check if SCC is used first.
9229 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9230 if (SCCIdx != -1) {
9231 if (MI.isCopy()) {
9232 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9233 Register DestReg = MI.getOperand(0).getReg();
9234
9235 MRI.replaceRegWith(DestReg, NewCond);
9236 CopyToDelete.push_back(&MI);
9237 } else {
9238
9239 if (NewCond.isValid())
9240 MI.getOperand(SCCIdx).setReg(NewCond);
9241
9242 Worklist.insert(&MI);
9243 }
9244 }
9245 // Exit if we find another SCC def.
9246 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9247 break;
9248 }
9249 for (auto &Copy : CopyToDelete)
9250 Copy->eraseFromParent();
9251}
9252
9253// Instructions that use SCC may be converted to VALU instructions. When that
9254// happens, the SCC register is changed to VCC_LO. The instruction that defines
9255// SCC must be changed to an instruction that defines VCC. This function makes
9256// sure that the instruction that defines SCC is added to the moveToVALU
9257// worklist.
9258void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9259 SIInstrWorklist &Worklist) const {
9260 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9261 // then there is nothing to do because the defining instruction has been
9262 // converted to a VALU already. If SCC then that instruction needs to be
9263 // converted to a VALU.
9264 for (MachineInstr &MI :
9265 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9266 SCCUseInst->getParent()->rend())) {
9267 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9268 break;
9269 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9270 Worklist.insert(&MI);
9271 break;
9272 }
9273 }
9274}
9275
9276const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9277 const MachineInstr &Inst) const {
9278 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9279
9280 switch (Inst.getOpcode()) {
9281 // For target instructions, getOpRegClass just returns the virtual register
9282 // class associated with the operand, so we need to find an equivalent VGPR
9283 // register class in order to move the instruction to the VALU.
9284 case AMDGPU::COPY:
9285 case AMDGPU::PHI:
9286 case AMDGPU::REG_SEQUENCE:
9287 case AMDGPU::INSERT_SUBREG:
9288 case AMDGPU::WQM:
9289 case AMDGPU::SOFT_WQM:
9290 case AMDGPU::STRICT_WWM:
9291 case AMDGPU::STRICT_WQM: {
9292 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9293 if (RI.isAGPRClass(SrcRC)) {
9294 if (RI.isAGPRClass(NewDstRC))
9295 return nullptr;
9296
9297 switch (Inst.getOpcode()) {
9298 case AMDGPU::PHI:
9299 case AMDGPU::REG_SEQUENCE:
9300 case AMDGPU::INSERT_SUBREG:
9301 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9302 break;
9303 default:
9304 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9305 }
9306
9307 if (!NewDstRC)
9308 return nullptr;
9309 } else {
9310 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9311 return nullptr;
9312
9313 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9314 if (!NewDstRC)
9315 return nullptr;
9316 }
9317
9318 return NewDstRC;
9319 }
9320 default:
9321 return NewDstRC;
9322 }
9323}
9324
9325// Find the one SGPR operand we are allowed to use.
9326Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9327 int OpIndices[3]) const {
9328 const MCInstrDesc &Desc = MI.getDesc();
9329
9330 // Find the one SGPR operand we are allowed to use.
9331 //
9332 // First we need to consider the instruction's operand requirements before
9333 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9334 // of VCC, but we are still bound by the constant bus requirement to only use
9335 // one.
9336 //
9337 // If the operand's class is an SGPR, we can never move it.
9338
9339 Register SGPRReg = findImplicitSGPRRead(MI);
9340 if (SGPRReg)
9341 return SGPRReg;
9342
9343 Register UsedSGPRs[3] = {Register()};
9344 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9345
9346 for (unsigned i = 0; i < 3; ++i) {
9347 int Idx = OpIndices[i];
9348 if (Idx == -1)
9349 break;
9350
9351 const MachineOperand &MO = MI.getOperand(Idx);
9352 if (!MO.isReg())
9353 continue;
9354
9355 // Is this operand statically required to be an SGPR based on the operand
9356 // constraints?
9357 const TargetRegisterClass *OpRC =
9358 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9359 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9360 if (IsRequiredSGPR)
9361 return MO.getReg();
9362
9363 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9364 Register Reg = MO.getReg();
9365 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9366 if (RI.isSGPRClass(RegRC))
9367 UsedSGPRs[i] = Reg;
9368 }
9369
9370 // We don't have a required SGPR operand, so we have a bit more freedom in
9371 // selecting operands to move.
9372
9373 // Try to select the most used SGPR. If an SGPR is equal to one of the
9374 // others, we choose that.
9375 //
9376 // e.g.
9377 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9378 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9379
9380 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9381 // prefer those.
9382
9383 if (UsedSGPRs[0]) {
9384 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9385 SGPRReg = UsedSGPRs[0];
9386 }
9387
9388 if (!SGPRReg && UsedSGPRs[1]) {
9389 if (UsedSGPRs[1] == UsedSGPRs[2])
9390 SGPRReg = UsedSGPRs[1];
9391 }
9392
9393 return SGPRReg;
9394}
9395
9396MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9397 AMDGPU::OpName OperandName) const {
9398 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9399 return nullptr;
9400
9401 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9402 if (Idx == -1)
9403 return nullptr;
9404
9405 return &MI.getOperand(Idx);
9406}
9407
9408uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9409 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9410 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9411 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9412 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9413 return (Format << 44) |
9414 (1ULL << 56) | // RESOURCE_LEVEL = 1
9415 (3ULL << 60); // OOB_SELECT = 3
9416 }
9417
9418 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9419 if (ST.isAmdHsaOS()) {
9420 // Set ATC = 1. GFX9 doesn't have this bit.
9421 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9422 RsrcDataFormat |= (1ULL << 56);
9423
9424 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9425 // BTW, it disables TC L2 and therefore decreases performance.
9426 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9427 RsrcDataFormat |= (2ULL << 59);
9428 }
9429
9430 return RsrcDataFormat;
9431}
9432
9433uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9434 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9435 AMDGPU::RSRC_TID_ENABLE |
9436 0xffffffff; // Size;
9437
9438 // GFX9 doesn't have ELEMENT_SIZE.
9439 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9440 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9441 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9442 }
9443
9444 // IndexStride = 64 for wave64, 32 for wave32 (encoded as 3 and 2).
9445 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9446 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9447
9448 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9449 // Clear them unless we want a huge stride.
9450 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9451 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9452 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9453
9454 return Rsrc23;
9455}
9456
9457bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9458 unsigned Opc = MI.getOpcode();
9459
9460 return isSMRD(Opc);
9461}
9462
9463bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9464 return get(Opc).mayLoad() &&
9465 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9466}
9467
9468Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9469 int &FrameIndex) const {
9470 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9471 if (!Addr || !Addr->isFI())
9472 return Register();
9473
9474 assert(!MI.memoperands_empty() &&
9475 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9476
9477 FrameIndex = Addr->getIndex();
9478 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9479}
9480
9481Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9482 int &FrameIndex) const {
9483 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9484 assert(Addr && Addr->isFI());
9485 FrameIndex = Addr->getIndex();
9486 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9487}
9488
9489Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9490 int &FrameIndex) const {
9491 if (!MI.mayLoad())
9492 return Register();
9493
9494 if (isMUBUF(MI) || isVGPRSpill(MI))
9495 return isStackAccess(MI, FrameIndex);
9496
9497 if (isSGPRSpill(MI))
9498 return isSGPRStackAccess(MI, FrameIndex);
9499
9500 return Register();
9501}
9502
9503Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9504 int &FrameIndex) const {
9505 if (!MI.mayStore())
9506 return Register();
9507
9508 if (isMUBUF(MI) || isVGPRSpill(MI))
9509 return isStackAccess(MI, FrameIndex);
9510
9511 if (isSGPRSpill(MI))
9512 return isSGPRStackAccess(MI, FrameIndex);
9513
9514 return Register();
9515}
9516
9517unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9518 unsigned Size = 0;
9519 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9520 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9521 while (++I != E && I->isInsideBundle()) {
9522 assert(!I->isBundle() && "No nested bundle!");
9523 Size += getInstSizeInBytes(*I);
9524 }
9525
9526 return Size;
9527}
9528
9529unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9530 unsigned Opc = MI.getOpcode();
9531 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9532 unsigned DescSize = Desc.getSize();
9533
9534 // If we have a definitive size, we can use it. Otherwise we need to inspect
9535 // the operands to know the size.
9536 if (isFixedSize(MI)) {
9537 unsigned Size = DescSize;
9538
9539 // If we hit the buggy offset, an extra nop will be inserted in MC so
9540 // estimate the worst case.
9541 if (MI.isBranch() && ST.hasOffset3fBug())
9542 Size += 4;
9543
9544 return Size;
9545 }
9546
9547 // Instructions may have a 32-bit literal encoded after them. Check
9548 // operands that could ever be literals.
9549 if (isVALU(MI) || isSALU(MI)) {
9550 if (isDPP(MI))
9551 return DescSize;
9552 bool HasLiteral = false;
9553 unsigned LiteralSize = 4;
9554 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9555 const MachineOperand &Op = MI.getOperand(I);
9556 const MCOperandInfo &OpInfo = Desc.operands()[I];
9557 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9558 HasLiteral = true;
9559 if (ST.has64BitLiterals()) {
9560 switch (OpInfo.OperandType) {
9561 default:
9562 break;
9564 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9565 LiteralSize = 8;
9566 break;
9568 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9569 LiteralSize = 8;
9570 break;
9571 }
9572 }
9573 break;
9574 }
9575 }
9576 return HasLiteral ? DescSize + LiteralSize : DescSize;
9577 }
9578
9579 // Check whether we have extra NSA words.
9580 if (isMIMG(MI)) {
9581 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9582 if (VAddr0Idx < 0)
9583 return 8;
9584
9585 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9586 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9587 }
9588
9589 switch (Opc) {
9590 case TargetOpcode::BUNDLE:
9591 return getInstBundleSize(MI);
9592 case TargetOpcode::INLINEASM:
9593 case TargetOpcode::INLINEASM_BR: {
9594 const MachineFunction *MF = MI.getParent()->getParent();
9595 const char *AsmStr = MI.getOperand(0).getSymbolName();
9596 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9597 }
9598 default:
9599 if (MI.isMetaInstruction())
9600 return 0;
9601
9602 // If D16 Pseudo inst, get correct MC code size
9603 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9604 if (D16Info) {
9605 // Assume the d16_lo/hi variants are always the same size.
9606 unsigned LoInstOpcode = D16Info->LoOp;
9607 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9608 DescSize = Desc.getSize();
9609 }
9610
9611 // If FMA Pseudo inst, get correct MC code size
9612 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9613 // All potential lowerings are the same size; arbitrarily pick one.
9614 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9615 DescSize = Desc.getSize();
9616 }
9617
9618 return DescSize;
9619 }
9620}
9621
9622bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9623 if (!isFLAT(MI))
9624 return false;
9625
9626 if (MI.memoperands_empty())
9627 return true;
9628
9629 for (const MachineMemOperand *MMO : MI.memoperands()) {
9630 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9631 return true;
9632 }
9633 return false;
9634}
9635
9636ArrayRef<std::pair<int, const char *>>
9637SIInstrInfo::getSerializableTargetIndices() const {
9638 static const std::pair<int, const char *> TargetIndices[] = {
9639 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9640 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9641 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9642 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9643 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9644 return ArrayRef(TargetIndices);
9645}
9646
9647/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
9648/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9649ScheduleHazardRecognizer *
9650SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9651 const ScheduleDAG *DAG) const {
9652 return new GCNHazardRecognizer(DAG->MF);
9653}
9654
9655/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9656/// pass.
9657ScheduleHazardRecognizer *SIInstrInfo::CreateTargetPostRAHazardRecognizer(
9658 const MachineFunction &MF) const {
9659 return new GCNHazardRecognizer(MF);
9660}
9661
9662// Called during:
9663// - pre-RA scheduling and post-RA scheduling
9664ScheduleHazardRecognizer *
9665SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9666 const ScheduleDAGMI *DAG) const {
9667 // Borrowed from Arm Target
9668 // We would like to restrict this hazard recognizer to only
9669 // post-RA scheduling; we can tell that we're post-RA because we don't
9670 // track VRegLiveness.
9671 if (!DAG->hasVRegLiveness())
9672 return new GCNHazardRecognizer(DAG->MF);
9673 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9674}
9675
9676std::pair<unsigned, unsigned>
9677SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9678 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9679}
9680
9681ArrayRef<std::pair<unsigned, const char *>>
9682SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9683 static const std::pair<unsigned, const char *> TargetFlags[] = {
9684 {MO_GOTPCREL, "amdgpu-gotprel"},
9685 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9686 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9687 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9688 {MO_REL32_LO, "amdgpu-rel32-lo"},
9689 {MO_REL32_HI, "amdgpu-rel32-hi"},
9690 {MO_REL64, "amdgpu-rel64"},
9691 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9692 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9693 {MO_ABS64, "amdgpu-abs64"},
9694 };
9695
9696 return ArrayRef(TargetFlags);
9697}
9698
9699ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9700SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9701 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9702 {
9703 {MONoClobber, "amdgpu-noclobber"},
9704 {MOLastUse, "amdgpu-last-use"},
9705 {MOCooperative, "amdgpu-cooperative"},
9706 };
9707
9708 return ArrayRef(TargetFlags);
9709}
9710
9711unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9712 const MachineFunction &MF) const {
9713 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9714 assert(SrcReg.isVirtual());
9715 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9716 return AMDGPU::WWM_COPY;
9717
9718 return AMDGPU::COPY;
9719}
9720
9721bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9722 Register Reg) const {
9723 // We need to handle instructions which may be inserted during register
9724 // allocation to handle the prolog. The initial prolog instruction may have
9725 // been separated from the start of the block by spills and copies inserted
9726 // needed by the prolog. However, the insertions for scalar registers can
9727 // always be placed at the BB top as they are independent of the exec mask
9728 // value.
9729 const MachineFunction *MF = MI.getParent()->getParent();
9730 bool IsNullOrVectorRegister = true;
9731 if (Reg) {
9732 const MachineRegisterInfo &MRI = MF->getRegInfo();
9733 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9734 }
9735
9736 uint16_t Opcode = MI.getOpcode();
9737 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9738 return IsNullOrVectorRegister &&
9739 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9740 (Opcode == AMDGPU::IMPLICIT_DEF &&
9741 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9742 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9743 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9744}
9745
9746MachineInstrBuilder
9747SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9748 MachineBasicBlock::iterator I,
9749 const DebugLoc &DL,
9750 Register DestReg) const {
9751 if (ST.hasAddNoCarry())
9752 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9753
9754 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9755 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9756 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9757
9758 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9759 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9760}
9761
9762MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9763 MachineBasicBlock::iterator I,
9764 const DebugLoc &DL,
9765 Register DestReg,
9766 RegScavenger &RS) const {
9767 if (ST.hasAddNoCarry())
9768 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9769
9770 // If available, prefer to use vcc.
9771 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9772 ? Register(RI.getVCC())
9773 : RS.scavengeRegisterBackwards(
9774 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9775 0, /* AllowSpill */ false);
9776
9777 // TODO: Users need to deal with this.
9778 if (!UnusedCarry.isValid())
9779 return MachineInstrBuilder();
9780
9781 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9782 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9783}
9784
9785bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9786 switch (Opcode) {
9787 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9788 case AMDGPU::SI_KILL_I1_TERMINATOR:
9789 return true;
9790 default:
9791 return false;
9792 }
9793}
9794
9795const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
9796 switch (Opcode) {
9797 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9798 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9799 case AMDGPU::SI_KILL_I1_PSEUDO:
9800 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9801 default:
9802 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9803 }
9804}
9805
9806bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9807 return Imm <= getMaxMUBUFImmOffset(ST);
9808}
9808}
9809
9811 // GFX12 field is non-negative 24-bit signed byte offset.
9812 const unsigned OffsetBits =
9813 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9814 return (1 << OffsetBits) - 1;
9815}
9816
9817void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9818 if (!ST.isWave32())
9819 return;
9820
9821 if (MI.isInlineAsm())
9822 return;
9823
9824 for (auto &Op : MI.implicit_operands()) {
9825 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9826 Op.setReg(AMDGPU::VCC_LO);
9827 }
9828}
9829
9830bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9831 if (!isSMRD(MI))
9832 return false;
9833
9834 // Check that it is using a buffer resource.
9835 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9836 if (Idx == -1) // e.g. s_memtime
9837 return false;
9838
9839 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
9840 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9841}
9842
9843// Given Imm, split it into the values to put into the SOffset and ImmOffset
9844// fields in an MUBUF instruction. Return false if it is not possible (due to a
9845// hardware bug needing a workaround).
9846//
9847// The required alignment ensures that individual address components remain
9848// aligned if they are aligned to begin with. It also ensures that additional
9849// offsets within the given alignment can be added to the resulting ImmOffset.
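// For example, on a target whose immediate field holds 12 bits
// (MaxOffset = 4095) and with Alignment = 4 (so MaxImm = 4092):
//   Imm = 4100 -> ImmOffset = 4092, SOffset = 8 (an inline constant)
//   Imm = 8192 -> ImmOffset = 4,    SOffset = 8188 (low bits set, 4-aligned)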
9850bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9851 uint32_t &ImmOffset, Align Alignment) const {
9852 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9853 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9854 uint32_t Overflow = 0;
9855
9856 if (Imm > MaxImm) {
9857 if (Imm <= MaxImm + 64) {
9858 // Use an SOffset inline constant for 4..64
9859 Overflow = Imm - MaxImm;
9860 Imm = MaxImm;
9861 } else {
9862 // Try to keep the same value in SOffset for adjacent loads, so that
9863 // the corresponding register contents can be re-used.
9864 //
9865 // Load values with all low-bits (except for alignment bits) set into
9866 // SOffset, so that a larger range of values can be covered using
9867 // s_movk_i32.
9868 //
9869 // Atomic operations fail to work correctly when individual address
9870 // components are unaligned, even if their sum is aligned.
9871 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9872 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9873 Imm = Low;
9874 Overflow = High - Alignment.value();
9875 }
9876 }
9877
9878 if (Overflow > 0) {
9879 // There is a hardware bug in SI and CI which prevents address clamping in
9880 // MUBUF instructions from working correctly with SOffsets. The immediate
9881 // offset is unaffected.
9882 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9883 return false;
9884
9885 // It is not possible to set immediate in SOffset field on some targets.
9886 if (ST.hasRestrictedSOffset())
9887 return false;
9888 }
9889
9890 ImmOffset = Imm;
9891 SOffset = Overflow;
9892 return true;
9893}
9894
9895// Depending on the used address space and instructions, some immediate offsets
9896// are allowed and some are not.
9897// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9898// scratch instruction offsets can also be negative. On GFX12, offsets can be
9899// negative for all variants.
9900//
9901// There are several bugs related to these offsets:
9902// On gfx10.1, flat instructions that go into the global address space cannot
9903// use an offset.
9904//
9905// For scratch instructions, the address can be either an SGPR or a VGPR.
9906// The following offsets can be used, depending on the architecture (x means
9907// cannot be used):
9908// +----------------------------+------+------+
9909// | Address-Mode | SGPR | VGPR |
9910// +----------------------------+------+------+
9911// | gfx9 | | |
9912// | negative, 4-aligned offset | x | ok |
9913// | negative, unaligned offset | x | ok |
9914// +----------------------------+------+------+
9915// | gfx10 | | |
9916// | negative, 4-aligned offset | ok | ok |
9917// | negative, unaligned offset | ok | x |
9918// +----------------------------+------+------+
9919// | gfx10.3 | | |
9920// | negative, 4-aligned offset | ok | ok |
9921// | negative, unaligned offset | ok | ok |
9922// +----------------------------+------+------+
9923//
9924// This function ignores the addressing mode, so if an offset cannot be used in
9925// one addressing mode, it is considered illegal.
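// For example, pre-GFX12 a negative offset such as -8 is rejected for the
// plain FLAT variant (only non-negative offsets are allowed there), while the
// same -8 can be legal for the global or scratch variants as long as it fits
// in the signed offset field and avoids the bugs listed above.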
9926bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9927 uint64_t FlatVariant) const {
9928 // TODO: Should 0 be special cased?
9929 if (!ST.hasFlatInstOffsets())
9930 return false;
9931
9932 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9933 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9934 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9935 return false;
9936
9937 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9938 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9939 (Offset % 4) != 0) {
9940 return false;
9941 }
9942
9943 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9944 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9945 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9946}
9947
9948// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9949std::pair<int64_t, int64_t>
9950SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9951 uint64_t FlatVariant) const {
9952 int64_t RemainderOffset = COffsetVal;
9953 int64_t ImmField = 0;
9954
9955 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9956 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9957
9958 if (AllowNegative) {
9959 // Use signed division by a power of two to truncate towards 0.
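 // For example, with NumBits = 12 (so D = 4096) and COffsetVal = -5000:
 //   RemainderOffset = (-5000 / 4096) * 4096 = -4096
 //   ImmField        = -5000 - (-4096)       = -904
 // and -4096 + (-904) == -5000, as the asserts below require.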
9960 int64_t D = 1LL << NumBits;
9961 RemainderOffset = (COffsetVal / D) * D;
9962 ImmField = COffsetVal - RemainderOffset;
9963
9964 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9965 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9966 (ImmField % 4) != 0) {
9967 // Make ImmField a multiple of 4
9968 RemainderOffset += ImmField % 4;
9969 ImmField -= ImmField % 4;
9970 }
9971 } else if (COffsetVal >= 0) {
9972 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9973 RemainderOffset = COffsetVal - ImmField;
9974 }
9975
9976 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9977 assert(RemainderOffset + ImmField == COffsetVal);
9978 return {ImmField, RemainderOffset};
9979}
9980
9981bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
9982 if (ST.hasNegativeScratchOffsetBug() &&
9983 FlatVariant == SIInstrFlags::FlatScratch)
9984 return false;
9985
9986 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9987}
9988
9989static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9990 switch (ST.getGeneration()) {
9991 default:
9992 break;
9993 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
9994 case AMDGPUSubtarget::SEA_ISLANDS:
9995 return SIEncodingFamily::SI;
9996 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
9997 case AMDGPUSubtarget::GFX9:
9998 return SIEncodingFamily::VI;
9999 case AMDGPUSubtarget::GFX10:
10000 return SIEncodingFamily::GFX10;
10001 case AMDGPUSubtarget::GFX11:
10002 return SIEncodingFamily::GFX11;
10003 case AMDGPUSubtarget::GFX12:
10004 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10005 : SIEncodingFamily::GFX12;
10006 }
10007 llvm_unreachable("Unknown subtarget generation!");
10008}
10009
10010bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10011 switch(MCOp) {
10012 // These opcodes use indirect register addressing so
10013 // they need special handling by codegen (currently missing).
10014 // Therefore it is too risky to allow these opcodes
10015 // to be selected by dpp combiner or sdwa peepholer.
10016 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10017 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10018 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10019 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10020 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10021 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10022 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10023 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10024 return true;
10025 default:
10026 return false;
10027 }
10028}
10029
10030#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10031 case OPCODE##_dpp: \
10032 case OPCODE##_e32: \
10033 case OPCODE##_e64: \
10034 case OPCODE##_e64_dpp: \
10035 case OPCODE##_sdwa:
10036
10037static bool isRenamedInGFX9(int Opcode) {
10038 switch (Opcode) {
10039 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10040 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10041 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10042 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10043 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10044 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10045 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10046 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10047 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10048 //
10049 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10050 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10051 case AMDGPU::V_FMA_F16_gfx9_e64:
10052 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10053 case AMDGPU::V_INTERP_P2_F16:
10054 case AMDGPU::V_MAD_F16_e64:
10055 case AMDGPU::V_MAD_U16_e64:
10056 case AMDGPU::V_MAD_I16_e64:
10057 return true;
10058 default:
10059 return false;
10060 }
10061}
10062
10063int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10064 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
10065
10066 unsigned Gen = subtargetEncodingFamily(ST);
10067
10068 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10069 Gen = SIEncodingFamily::GFX9;
10070
10071 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10072 // subtarget has UnpackedD16VMem feature.
10073 // TODO: remove this when we discard GFX80 encoding.
10074 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10075 Gen = SIEncodingFamily::GFX80;
10076
10077 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10078 switch (ST.getGeneration()) {
10079 default:
10080 Gen = SIEncodingFamily::SDWA;
10081 break;
10082 case AMDGPUSubtarget::GFX9:
10083 Gen = SIEncodingFamily::SDWA9;
10084 break;
10085 case AMDGPUSubtarget::GFX10:
10086 Gen = SIEncodingFamily::SDWA10;
10087 break;
10088 }
10089 }
10090
10091 if (isMAI(Opcode)) {
10092 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10093 if (MFMAOp != -1)
10094 Opcode = MFMAOp;
10095 }
10096
10097 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10098
10099 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10101
10102 // -1 means that Opcode is already a native instruction.
10103 if (MCOp == -1)
10104 return Opcode;
10105
10106 if (ST.hasGFX90AInsts()) {
10107 uint16_t NMCOp = (uint16_t)-1;
10108 if (ST.hasGFX940Insts())
10110 if (NMCOp == (uint16_t)-1)
10112 if (NMCOp == (uint16_t)-1)
10114 if (NMCOp != (uint16_t)-1)
10115 MCOp = NMCOp;
10116 }
10117
10118 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10119 // no encoding in the given subtarget generation.
10120 if (MCOp == (uint16_t)-1)
10121 return -1;
10122
10123 if (isAsmOnlyOpcode(MCOp))
10124 return -1;
10125
10126 return MCOp;
10127}
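// [Illustrative sketch, not part of SIInstrInfo.cpp] How a caller typically
// interprets the result of pseudoToMCOpcode(): -1 means the pseudo has no
// usable encoding on the current subtarget (or the opcode is asm-only); any
// other value is an MC opcode that can be emitted. The helper name below is
// hypothetical and assumes this file's headers are available.
static bool hasEncodingOnSubtarget(const SIInstrInfo &TII, int Opcode) {
  // pseudoToMCOpcode() returns the original opcode unchanged when it is
  // already a native instruction for this subtarget.
  return TII.pseudoToMCOpcode(Opcode) != -1;
}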
10128
10129 static
10130 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10131 assert(RegOpnd.isReg());
10132 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10133 getRegSubRegPair(RegOpnd);
10134}
10135
10138 assert(MI.isRegSequence());
10139 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10140 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10141 auto &RegOp = MI.getOperand(1 + 2 * I);
10142 return getRegOrUndef(RegOp);
10143 }
10145}
10146
10147// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10148 // Following a subreg of reg:subreg isn't supported
10149 static bool followSubRegDef(MachineInstr &MI,
10150 TargetInstrInfo::RegSubRegPair &RSR) {
10151 if (!RSR.SubReg)
10152 return false;
10153 switch (MI.getOpcode()) {
10154 default: break;
10155 case AMDGPU::REG_SEQUENCE:
10156 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10157 return true;
10158 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
10159 case AMDGPU::INSERT_SUBREG:
10160 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10161 // inserted the subreg we're looking for
10162 RSR = getRegOrUndef(MI.getOperand(2));
10163 else { // the subreg in the rest of the reg
10164 auto R1 = getRegOrUndef(MI.getOperand(1));
10165 if (R1.SubReg) // subreg of subreg isn't supported
10166 return false;
10167 RSR.Reg = R1.Reg;
10168 }
10169 return true;
10170 }
10171 return false;
10172}
10173
10175 const MachineRegisterInfo &MRI) {
10176 assert(MRI.isSSA());
10177 if (!P.Reg.isVirtual())
10178 return nullptr;
10179
10180 auto RSR = P;
10181 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10182 while (auto *MI = DefInst) {
10183 DefInst = nullptr;
10184 switch (MI->getOpcode()) {
10185 case AMDGPU::COPY:
10186 case AMDGPU::V_MOV_B32_e32: {
10187 auto &Op1 = MI->getOperand(1);
10188 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10189 if (Op1.isUndef())
10190 return nullptr;
10191 RSR = getRegSubRegPair(Op1);
10192 DefInst = MRI.getVRegDef(RSR.Reg);
10193 }
10194 break;
10195 }
10196 default:
10197 if (followSubRegDef(*MI, RSR)) {
10198 if (!RSR.Reg)
10199 return nullptr;
10200 DefInst = MRI.getVRegDef(RSR.Reg);
10201 }
10202 }
10203 if (!DefInst)
10204 return MI;
10205 }
10206 return nullptr;
10207}
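// [Illustrative sketch, not part of SIInstrInfo.cpp] Typical use of
// getVRegSubRegDef(): resolve an operand's reg:subreg pair to the instruction
// that ultimately defines it, looking through COPY, REG_SEQUENCE and
// INSERT_SUBREG. Requires SSA form; the helper name is hypothetical.
static bool isDefinedByMoveImm(const MachineOperand &UseMO,
                               const MachineRegisterInfo &MRI) {
  if (!UseMO.isReg() || !UseMO.getReg().isVirtual())
    return false;
  TargetInstrInfo::RegSubRegPair P = getRegSubRegPair(UseMO);
  if (const MachineInstr *Def = getVRegSubRegDef(P, MRI))
    return Def->isMoveImmediate();
  return false;
}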
10208
10210 Register VReg,
10211 const MachineInstr &DefMI,
10212 const MachineInstr &UseMI) {
10213 assert(MRI.isSSA() && "Must be run on SSA");
10214
10215 auto *TRI = MRI.getTargetRegisterInfo();
10216 auto *DefBB = DefMI.getParent();
10217
10218 // Don't bother searching between blocks, although it is possible this block
10219 // doesn't modify exec.
10220 if (UseMI.getParent() != DefBB)
10221 return true;
10222
10223 const int MaxInstScan = 20;
10224 int NumInst = 0;
10225
10226 // Stop scan at the use.
10227 auto E = UseMI.getIterator();
10228 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10229 if (I->isDebugInstr())
10230 continue;
10231
10232 if (++NumInst > MaxInstScan)
10233 return true;
10234
10235 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10236 return true;
10237 }
10238
10239 return false;
10240}
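// [Illustrative sketch, not part of SIInstrInfo.cpp] Intended usage pattern:
// folding a value from DefMI into UseMI is only safe if EXEC cannot change in
// between, because otherwise the value would be recomputed under a different
// exec mask. Note the conservative polarity: the query returns true ("may be
// modified") whenever it cannot prove otherwise, e.g. when the scan limit is
// hit or the two instructions are in different blocks.
static bool safeToFoldAcrossExec(const MachineRegisterInfo &MRI, Register VReg,
                                 const MachineInstr &DefMI,
                                 const MachineInstr &UseMI) {
  return !execMayBeModifiedBeforeUse(MRI, VReg, DefMI, UseMI);
}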
10241
10243 Register VReg,
10244 const MachineInstr &DefMI) {
10245 assert(MRI.isSSA() && "Must be run on SSA");
10246
10247 auto *TRI = MRI.getTargetRegisterInfo();
10248 auto *DefBB = DefMI.getParent();
10249
10250 const int MaxUseScan = 10;
10251 int NumUse = 0;
10252
10253 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10254 auto &UseInst = *Use.getParent();
10255 // Don't bother searching between blocks, although it is possible this block
10256 // doesn't modify exec.
10257 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10258 return true;
10259
10260 if (++NumUse > MaxUseScan)
10261 return true;
10262 }
10263
10264 if (NumUse == 0)
10265 return false;
10266
10267 const int MaxInstScan = 20;
10268 int NumInst = 0;
10269
10270 // Stop scan when we have seen all the uses.
10271 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10272 assert(I != DefBB->end());
10273
10274 if (I->isDebugInstr())
10275 continue;
10276
10277 if (++NumInst > MaxInstScan)
10278 return true;
10279
10280 for (const MachineOperand &Op : I->operands()) {
10281 // We don't check reg masks here as they're used only on calls:
10282 // 1. EXEC is only considered const within one BB
10283 // 2. Call should be a terminator instruction if present in a BB
10284
10285 if (!Op.isReg())
10286 continue;
10287
10288 Register Reg = Op.getReg();
10289 if (Op.isUse()) {
10290 if (Reg == VReg && --NumUse == 0)
10291 return false;
10292 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10293 return true;
10294 }
10295 }
10296}
10297
10300 const DebugLoc &DL, Register Src, Register Dst) const {
10301 auto Cur = MBB.begin();
10302 if (Cur != MBB.end())
10303 do {
10304 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10305 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10306 ++Cur;
10307 } while (Cur != MBB.end() && Cur != LastPHIIt);
10308
10309 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10310 Dst);
10311}
10312
10315 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10316 if (InsPt != MBB.end() &&
10317 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10318 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10319 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10320 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10321 InsPt++;
10322 return BuildMI(MBB, InsPt, DL,
10323 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10324 .addReg(Src, 0, SrcSubReg)
10325 .addReg(AMDGPU::EXEC, RegState::Implicit);
10326 }
10327 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10328 Dst);
10329}
10330
10331bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10332
10335 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10336 VirtRegMap *VRM) const {
10337 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10338 //
10339 // %0:sreg_32 = COPY $m0
10340 //
10341 // We explicitly chose SReg_32 for the virtual register so such a copy might
10342 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10343 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10344 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10345 // TargetInstrInfo::foldMemoryOperand() is going to try.
10346 // A similar issue also exists with spilling and reloading $exec registers.
10347 //
10348 // To prevent that, constrain the %0 register class here.
10349 if (isFullCopyInstr(MI)) {
10350 Register DstReg = MI.getOperand(0).getReg();
10351 Register SrcReg = MI.getOperand(1).getReg();
10352 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10353 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10355 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10356 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10357 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10358 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10359 return nullptr;
10360 }
10361 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10362 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10363 return nullptr;
10364 }
10365 }
10366 }
10367
10368 return nullptr;
10369}
10370
10372 const MachineInstr &MI,
10373 unsigned *PredCost) const {
10374 if (MI.isBundle()) {
10376 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10377 unsigned Lat = 0, Count = 0;
10378 for (++I; I != E && I->isBundledWithPred(); ++I) {
10379 ++Count;
10380 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10381 }
10382 return Lat + Count - 1;
10383 }
10384
10385 return SchedModel.computeInstrLatency(&MI);
10386}
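// [Worked example, not from the source] For a bundle containing three
// instructions with individual latencies {4, 2, 1}: Lat = max(4, 2, 1) = 4 and
// Count = 3, so the reported latency is Lat + Count - 1 = 4 + 3 - 1 = 6, i.e.
// the slowest bundled instruction plus one cycle for each additional member.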
10387
10390 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10391 unsigned Opcode = MI.getOpcode();
10392
10393 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10394 Register Dst = MI.getOperand(0).getReg();
10395 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10396 : MI.getOperand(1).getReg();
10397 LLT DstTy = MRI.getType(Dst);
10398 LLT SrcTy = MRI.getType(Src);
10399 unsigned DstAS = DstTy.getAddressSpace();
10400 unsigned SrcAS = SrcTy.getAddressSpace();
10401 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10402 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10403 ST.hasGloballyAddressableScratch()
10406 };
10407
10408 // If the target supports globally addressable scratch, the mapping from
10409 // scratch memory to the flat aperture changes; therefore an address space
10410 // cast is no longer uniform.
10411 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10412 return HandleAddrSpaceCast(MI);
10413
10414 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10415 auto IID = GI->getIntrinsicID();
10420
10421 switch (IID) {
10422 case Intrinsic::amdgcn_addrspacecast_nonnull:
10423 return HandleAddrSpaceCast(MI);
10424 case Intrinsic::amdgcn_if:
10425 case Intrinsic::amdgcn_else:
10426 // FIXME: Uniform if second result
10427 break;
10428 }
10429
10431 }
10432
10433 // Loads from the private and flat address spaces are divergent, because
10434 // threads can execute the load instruction with the same inputs and get
10435 // different results.
10436 //
10437 // All other loads are not divergent, because if threads issue loads with the
10438 // same arguments, they will always get the same result.
10439 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10440 Opcode == AMDGPU::G_SEXTLOAD) {
10441 if (MI.memoperands_empty())
10442 return InstructionUniformity::NeverUniform; // conservative assumption
10443
10444 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10445 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10446 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10447 })) {
10448 // At least one MMO in a non-global address space.
10450 }
10452 }
10453
10454 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10455 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10456 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10457 AMDGPU::isGenericAtomic(Opcode)) {
10459 }
10461}
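// [Illustrative sketch, not part of SIInstrInfo.cpp] The address-space test
// applied to generic loads above, factored into a standalone helper: a load is
// treated as never-uniform when it has no memory operands or when any memory
// operand may touch private or flat memory. The helper name is hypothetical.
static bool mayLoadPrivateOrFlat(const MachineInstr &MI) {
  return MI.memoperands_empty() ||
         llvm::any_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
           return MMO->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
                  MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
         });
}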
10462
10465
10466 if (isNeverUniform(MI))
10468
10469 unsigned opcode = MI.getOpcode();
10470 if (opcode == AMDGPU::V_READLANE_B32 ||
10471 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10472 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10474
10475 if (isCopyInstr(MI)) {
10476 const MachineOperand &srcOp = MI.getOperand(1);
10477 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10478 const TargetRegisterClass *regClass =
10479 RI.getPhysRegBaseClass(srcOp.getReg());
10480 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10482 }
10484 }
10485
10486 // GMIR handling
10487 if (MI.isPreISelOpcode())
10489
10490 // Atomics are divergent because they are executed sequentially: when an
10491 // atomic operation refers to the same address in each thread, then each
10492 // thread after the first sees the value written by the previous thread as
10493 // the original value.
10494
10495 if (isAtomic(MI))
10497
10498 // Loads from the private and flat address spaces are divergent, because
10499 // threads can execute the load instruction with the same inputs and get
10500 // different results.
10501 if (isFLAT(MI) && MI.mayLoad()) {
10502 if (MI.memoperands_empty())
10503 return InstructionUniformity::NeverUniform; // conservative assumption
10504
10505 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10506 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10507 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10508 })) {
10509 // At least one MMO in a non-global address space.
10511 }
10512
10514 }
10515
10516 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10517 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10518
10519 // FIXME: It's conceptually broken to report this for an instruction, and not
10520 // a specific def operand. For inline asm in particular, there could be mixed
10521 // uniform and divergent results.
10522 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10523 const MachineOperand &SrcOp = MI.getOperand(I);
10524 if (!SrcOp.isReg())
10525 continue;
10526
10527 Register Reg = SrcOp.getReg();
10528 if (!Reg || !SrcOp.readsReg())
10529 continue;
10530
10531 // If RegBank is null, this is unassigned or an unallocatable special
10532 // register, which are all scalars.
10533 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10534 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10536 }
10537
10538 // TODO: The uniformity check conditions above can be rearranged for more
10539 // readability.
10540
10541 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10542 // currently turned into no-op COPYs by SelectionDAG ISel and are
10543 // therefore no longer recognizable.
10544
10546}
10547
10549 switch (MF.getFunction().getCallingConv()) {
10551 return 1;
10553 return 2;
10555 return 3;
10559 const Function &F = MF.getFunction();
10560 F.getContext().diagnose(DiagnosticInfoUnsupported(
10561 F, "ds_ordered_count unsupported for this calling conv"));
10562 [[fallthrough]];
10563 }
10566 case CallingConv::C:
10567 case CallingConv::Fast:
10568 default:
10569 // Assume other calling conventions are various compute callable functions
10570 return 0;
10571 }
10572}
10573
10575 Register &SrcReg2, int64_t &CmpMask,
10576 int64_t &CmpValue) const {
10577 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10578 return false;
10579
10580 switch (MI.getOpcode()) {
10581 default:
10582 break;
10583 case AMDGPU::S_CMP_EQ_U32:
10584 case AMDGPU::S_CMP_EQ_I32:
10585 case AMDGPU::S_CMP_LG_U32:
10586 case AMDGPU::S_CMP_LG_I32:
10587 case AMDGPU::S_CMP_LT_U32:
10588 case AMDGPU::S_CMP_LT_I32:
10589 case AMDGPU::S_CMP_GT_U32:
10590 case AMDGPU::S_CMP_GT_I32:
10591 case AMDGPU::S_CMP_LE_U32:
10592 case AMDGPU::S_CMP_LE_I32:
10593 case AMDGPU::S_CMP_GE_U32:
10594 case AMDGPU::S_CMP_GE_I32:
10595 case AMDGPU::S_CMP_EQ_U64:
10596 case AMDGPU::S_CMP_LG_U64:
10597 SrcReg = MI.getOperand(0).getReg();
10598 if (MI.getOperand(1).isReg()) {
10599 if (MI.getOperand(1).getSubReg())
10600 return false;
10601 SrcReg2 = MI.getOperand(1).getReg();
10602 CmpValue = 0;
10603 } else if (MI.getOperand(1).isImm()) {
10604 SrcReg2 = Register();
10605 CmpValue = MI.getOperand(1).getImm();
10606 } else {
10607 return false;
10608 }
10609 CmpMask = ~0;
10610 return true;
10611 case AMDGPU::S_CMPK_EQ_U32:
10612 case AMDGPU::S_CMPK_EQ_I32:
10613 case AMDGPU::S_CMPK_LG_U32:
10614 case AMDGPU::S_CMPK_LG_I32:
10615 case AMDGPU::S_CMPK_LT_U32:
10616 case AMDGPU::S_CMPK_LT_I32:
10617 case AMDGPU::S_CMPK_GT_U32:
10618 case AMDGPU::S_CMPK_GT_I32:
10619 case AMDGPU::S_CMPK_LE_U32:
10620 case AMDGPU::S_CMPK_LE_I32:
10621 case AMDGPU::S_CMPK_GE_U32:
10622 case AMDGPU::S_CMPK_GE_I32:
10623 SrcReg = MI.getOperand(0).getReg();
10624 SrcReg2 = Register();
10625 CmpValue = MI.getOperand(1).getImm();
10626 CmpMask = ~0;
10627 return true;
10628 }
10629
10630 return false;
10631}
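// [Illustrative example, not from the source] What analyzeCompare() reports
// for the two operand forms of a 32-bit compare:
//
//   S_CMP_LG_U32 %5, 12    -> SrcReg = %5, SrcReg2 = <none>, CmpValue = 12
//   S_CMP_LG_U32 %5, %6    -> SrcReg = %5, SrcReg2 = %6,     CmpValue = 0
//
// In both cases CmpMask is ~0; compares whose first operand is not a plain
// (sub-register free) register are rejected.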
10632
10633// SCC is already valid after SCCValid.
10634// SCCRedefine will redefine SCC to the same value already available after
10635 // SCCValid. If there are no intervening SCC conflicts, delete SCCRedefine and
10636// update kill/dead flags if necessary.
10637static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
10638 const SIRegisterInfo &RI) {
10639 MachineInstr *KillsSCC = nullptr;
10640 if (SCCValid->getParent() != SCCRedefine->getParent())
10641 return false;
10642 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
10643 SCCRedefine->getIterator())) {
10644 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10645 return false;
10646 if (MI.killsRegister(AMDGPU::SCC, &RI))
10647 KillsSCC = &MI;
10648 }
10649 if (MachineOperand *SccDef =
10650 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
10651 SccDef->setIsDead(false);
10652 if (KillsSCC)
10653 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
10654 SCCRedefine->eraseFromParent();
10655 return true;
10656}
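// [Illustrative example, not from the source] optimizeSCC() with
// SCCValid = the S_AND_B32 and SCCRedefine = the S_CMP_LG_U32:
//
//   %1:sreg_32 = S_AND_B32 %0, 4, implicit-def dead $scc   ; SCC = (%1 != 0)
//   S_CMP_LG_U32 %1, 0, implicit-def $scc                  ; same SCC again
//
// becomes
//
//   %1:sreg_32 = S_AND_B32 %0, 4, implicit-def $scc        ; def made live
//
// provided no instruction between the two clobbers SCC.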
10657
10658static bool foldableSelect(const MachineInstr &Def) {
10659 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
10660 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
10661 return false;
10662 bool Op1IsNonZeroImm =
10663 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
10664 bool Op2IsZeroImm =
10665 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
10666 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
10667 return false;
10668 return true;
10669}
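// [Illustrative example, not from the source] foldableSelect() matches only
// the "SCC ? non-zero-imm : 0" shape, e.g.
//
//   %2:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc
//
// Here %2 is non-zero exactly when SCC is set, so a following
// "S_CMP_LG_U32 %2, 0" would merely recompute the SCC value that is already
// available.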
10670
10672 Register SrcReg2, int64_t CmpMask,
10673 int64_t CmpValue,
10674 const MachineRegisterInfo *MRI) const {
10675 if (!SrcReg || SrcReg.isPhysical())
10676 return false;
10677
10678 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10679 return false;
10680
10681 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
10682 this]() -> bool {
10683 if (CmpValue != 0)
10684 return false;
10685
10686 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10687 if (!Def)
10688 return false;
10689
10690 // For an S_OP that sets SCC = (DST != 0), do the transformation
10691 //
10692 // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
10693
10694 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
10695 // for S_CSELECT* already has the same value that will be calculated by
10696 // s_cmp_lg_*
10697 //
10698 // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
10699 // imm), 0)
10700 if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
10701 return false;
10702
10703 if (!optimizeSCC(Def, &CmpInstr, RI))
10704 return false;
10705
10706 // If the s_or_b32 result sY is unused (i.e. the s_or_b32 is effectively a
10707 // 64-bit s_cmp_lg of a register pair) and its inputs are the lo and hi halves
10708 // of a 64-bit foldableSelect, then delete the s_or_b32 in the sequence:
10709 // sX = s_cselect_b64 (non-zero imm), 0
10710 // sLo = copy sX.sub0
10711 // sHi = copy sX.sub1
10712 // sY = s_or_b32 sLo, sHi
10713 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
10714 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
10715 const MachineOperand &OrOpnd1 = Def->getOperand(1);
10716 const MachineOperand &OrOpnd2 = Def->getOperand(2);
10717 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
10718 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
10719 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
10720 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
10721 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
10722 Def2->getOperand(1).isReg() &&
10723 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
10724 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
10725 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
10726 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
10727 if (Select && foldableSelect(*Select))
10728 optimizeSCC(Select, Def, RI);
10729 }
10730 }
10731 }
10732 return true;
10733 };
10734
10735 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10736 this](int64_t ExpectedValue, unsigned SrcSize,
10737 bool IsReversible, bool IsSigned) -> bool {
10738 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10739 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10740 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10741 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10742 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10743 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10744 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10745 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10746 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10747 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10748 //
10749 // Signed ge/gt are not used for the sign bit.
10750 //
10751 // If result of the AND is unused except in the compare:
10752 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10753 //
10754 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10755 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10756 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10757 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10758 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10759 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
10760
10761 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10762 if (!Def)
10763 return false;
10764
10765 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10766 Def->getOpcode() != AMDGPU::S_AND_B64)
10767 return false;
10768
10769 int64_t Mask;
10770 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10771 if (MO->isImm())
10772 Mask = MO->getImm();
10773 else if (!getFoldableImm(MO, Mask))
10774 return false;
10775 Mask &= maxUIntN(SrcSize);
10776 return isPowerOf2_64(Mask);
10777 };
10778
10779 MachineOperand *SrcOp = &Def->getOperand(1);
10780 if (isMask(SrcOp))
10781 SrcOp = &Def->getOperand(2);
10782 else if (isMask(&Def->getOperand(2)))
10783 SrcOp = &Def->getOperand(1);
10784 else
10785 return false;
10786
10787 // A valid Mask is required to have a single bit set, hence a non-zero and
10788 // power-of-two value. This guarantees that the shift below is by fewer than 64 bits.
10789 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10790 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10791 if (IsSigned && BitNo == SrcSize - 1)
10792 return false;
10793
10794 ExpectedValue <<= BitNo;
10795
10796 bool IsReversedCC = false;
10797 if (CmpValue != ExpectedValue) {
10798 if (!IsReversible)
10799 return false;
10800 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10801 if (!IsReversedCC)
10802 return false;
10803 }
10804
10805 Register DefReg = Def->getOperand(0).getReg();
10806 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10807 return false;
10808
10809 if (!optimizeSCC(Def, &CmpInstr, RI))
10810 return false;
10811
10812 if (!MRI->use_nodbg_empty(DefReg)) {
10813 assert(!IsReversedCC);
10814 return true;
10815 }
10816
10817 // Replace the AND, whose result is unused, with an S_BITCMP.
10818 MachineBasicBlock *MBB = Def->getParent();
10819
10820 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10821 : AMDGPU::S_BITCMP1_B32
10822 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10823 : AMDGPU::S_BITCMP1_B64;
10824
10825 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10826 .add(*SrcOp)
10827 .addImm(BitNo);
10828 Def->eraseFromParent();
10829
10830 return true;
10831 };
10832
10833 switch (CmpInstr.getOpcode()) {
10834 default:
10835 break;
10836 case AMDGPU::S_CMP_EQ_U32:
10837 case AMDGPU::S_CMP_EQ_I32:
10838 case AMDGPU::S_CMPK_EQ_U32:
10839 case AMDGPU::S_CMPK_EQ_I32:
10840 return optimizeCmpAnd(1, 32, true, false);
10841 case AMDGPU::S_CMP_GE_U32:
10842 case AMDGPU::S_CMPK_GE_U32:
10843 return optimizeCmpAnd(1, 32, false, false);
10844 case AMDGPU::S_CMP_GE_I32:
10845 case AMDGPU::S_CMPK_GE_I32:
10846 return optimizeCmpAnd(1, 32, false, true);
10847 case AMDGPU::S_CMP_EQ_U64:
10848 return optimizeCmpAnd(1, 64, true, false);
10849 case AMDGPU::S_CMP_LG_U32:
10850 case AMDGPU::S_CMP_LG_I32:
10851 case AMDGPU::S_CMPK_LG_U32:
10852 case AMDGPU::S_CMPK_LG_I32:
10853 return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
10854 case AMDGPU::S_CMP_GT_U32:
10855 case AMDGPU::S_CMPK_GT_U32:
10856 return optimizeCmpAnd(0, 32, false, false);
10857 case AMDGPU::S_CMP_GT_I32:
10858 case AMDGPU::S_CMPK_GT_I32:
10859 return optimizeCmpAnd(0, 32, false, true);
10860 case AMDGPU::S_CMP_LG_U64:
10861 return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
10862 }
10863
10864 return false;
10865}
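// [Illustrative example, not from the source] One of the optimizeCmpAnd()
// rewrites, for the case where the AND result has no other use:
//
//   %1:sreg_32 = S_AND_B32 %0, 16, implicit-def dead $scc
//   S_CMP_LG_U32 %1, 0, implicit-def $scc
//   S_CBRANCH_SCC1 %bb.1
//
// becomes
//
//   S_BITCMP1_B32 %0, 4, implicit-def $scc   ; mask 16 == 1 << 4, so BitNo = 4
//   S_CBRANCH_SCC1 %bb.1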
10866
10868 AMDGPU::OpName OpName) const {
10869 if (!ST.needsAlignedVGPRs())
10870 return;
10871
10872 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10873 if (OpNo < 0)
10874 return;
10875 MachineOperand &Op = MI.getOperand(OpNo);
10876 if (getOpSize(MI, OpNo) > 4)
10877 return;
10878
10879 // Add implicit aligned super-reg to force alignment on the data operand.
10880 const DebugLoc &DL = MI.getDebugLoc();
10881 MachineBasicBlock *BB = MI.getParent();
10883 Register DataReg = Op.getReg();
10884 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10885 Register Undef = MRI.createVirtualRegister(
10886 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10887 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10888 Register NewVR =
10889 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10890 : &AMDGPU::VReg_64_Align2RegClass);
10891 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10892 .addReg(DataReg, 0, Op.getSubReg())
10893 .addImm(AMDGPU::sub0)
10894 .addReg(Undef)
10895 .addImm(AMDGPU::sub1);
10896 Op.setReg(NewVR);
10897 Op.setSubReg(AMDGPU::sub0);
10898 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10899}
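// [Illustrative example, not from the source; the opcode is a placeholder]
// Effect of enforceOperandRCAlignment() on a 32-bit data operand when the
// subtarget requires even-aligned VGPR tuples:
//
//   <inst> ..., %data:vgpr_32, ...
//
// becomes
//
//   %undef:vgpr_32 = IMPLICIT_DEF
//   %pair:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0,
//                                       %undef, %subreg.sub1
//   <inst> ..., %pair.sub0, ..., implicit %pair
//
// forcing the register allocator to place %data at an even VGPR index.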
10900
10902 if (isIGLP(*MI))
10903 return false;
10904
10906}
10907
10909 if (!isWMMA(MI) && !isSWMMAC(MI))
10910 return false;
10911
10912 if (AMDGPU::isGFX1250(ST))
10913 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
10914
10915 return true;
10916}
10917
10919 unsigned Opcode = MI.getOpcode();
10920
10921 if (AMDGPU::isGFX12Plus(ST))
10922 return isDOT(MI) || isXDLWMMA(MI);
10923
10924 if (!isMAI(MI) || isDGEMM(Opcode) ||
10925 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10926 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10927 return false;
10928
10929 if (!ST.hasGFX940Insts())
10930 return true;
10931
10932 return AMDGPU::getMAIIsGFX940XDL(Opcode);
10933}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, const SIRegisterInfo &RI)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:221
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasAddNoCarry() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:87
uint8_t OperandType
Information about the type of the operand.
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:96
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const final
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const override
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to handle 16-bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool setsSCCifResultIsNonZero(const MachineInstr &MI)
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT-encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point values?
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:578
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:580
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:577
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:579
@ TI_CONSTDATA_START
Definition AMDGPU.h:576
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable?
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:72
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:70
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:71
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:62
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:73
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
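As an aside, a minimal usage sketch of drop_begin (illustrative only; the array and variable names are hypothetical, and llvm/ADT/STLExtras.h is assumed to be included):

  int Vals[] = {10, 20, 30};
  int Sum = 0;
  // Visits only 20 and 30; the first element is skipped, so Sum == 50.
  for (int V : llvm::drop_begin(Vals))
    Sum += V;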
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
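Illustrative sketch of the range-based all_of wrapper (hypothetical values, assuming llvm/ADT/STLExtras.h):

  const int Vals[] = {2, 4, 6};
  // True: every element satisfies the predicate.
  bool AllEven = llvm::all_of(Vals, [](int V) { return V % 2 == 0; });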
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:207
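For example (illustrative, assuming llvm/Support/MathExtras.h):

  static_assert(llvm::maxUIntN(8) == 255, "2^8 - 1");
  static_assert(llvm::maxUIntN(16) == 65535, "2^16 - 1");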
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
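For example, with the template form shown here (assuming llvm/Support/MathExtras.h):

  static_assert(llvm::isInt<16>(32767), "fits in a signed 16-bit field");
  static_assert(!llvm::isInt<16>(32768), "one past the signed 16-bit maximum");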
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
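A small usage sketch (hypothetical container, assuming llvm/ADT/STLExtras.h):

  const char Regs[] = {'a', 'b', 'c'};
  for (const auto &En : llvm::enumerate(Regs)) {
    // En.index() yields 0, 1, 2; En.value() yields 'a', 'b', 'c'.
    (void)En.index();
    (void)En.value();
  }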
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
Definition MathExtras.h:546
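For instance (illustrative, assuming llvm/Support/MathExtras.h):

  static_assert(llvm::alignDown(13u, 4u) == 12u, "rounded down to a multiple of 4");
  static_assert(llvm::alignDown(16u, 4u) == 16u, "already aligned");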
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
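For example (illustrative, assuming llvm/Support/MathExtras.h):

  static_assert(llvm::isPowerOf2_64(64), "exactly one bit set");
  static_assert(!llvm::isPowerOf2_64(0), "zero is not a power of two");
  static_assert(!llvm::isPowerOf2_64(96), "more than one bit set");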
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
int countr_zero(T Val)
Count the number of zero bits from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
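Illustrative use of the two bit-counting helpers above (assuming llvm/ADT/bit.h):

  static_assert(llvm::popcount(0xF0u) == 4, "four bits set");
  int TrailingZeros = llvm::countr_zero(0x8u); // == 3: bits 0..2 are clear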
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
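For example (illustrative variable names, assuming llvm/Support/MathExtras.h):

  unsigned A = llvm::Log2_32(32); // == 5
  unsigned B = llvm::Log2_32(33); // == 5 as well, since the result is the floor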
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair, skipping copy-like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
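For example (illustrative, assuming llvm/Support/MathExtras.h):

  static_assert(llvm::isUInt<8>(255), "fits in 8 unsigned bits");
  static_assert(!llvm::isUInt<8>(256), "one past the unsigned 8-bit maximum");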
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
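Together with Hi_32 above, for example (illustrative, assuming llvm/Support/MathExtras.h):

  static_assert(llvm::Hi_32(0x1122334455667788ULL) == 0x11223344u, "upper half");
  static_assert(llvm::Lo_32(0x1122334455667788ULL) == 0x55667788u, "lower half");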
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
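A quick sketch (illustrative, assuming llvm/Support/MathExtras.h):

  static_assert(llvm::divideCeil(7u, 3u) == 3u, "rounds up");
  static_assert(llvm::divideCeil(6u, 3u) == 2u, "exact division is unchanged");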
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
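For example (illustrative, assuming llvm/Support/MathExtras.h):

  static_assert(llvm::isIntN(12, 2047), "fits in a signed 12-bit field");
  static_assert(!llvm::isIntN(12, 2048), "one past the signed 12-bit maximum");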
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
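A small usage sketch (hypothetical array, assuming llvm/ADT/STLExtras.h):

  const unsigned Opcodes[] = {1, 2, 3};
  bool Found = llvm::is_contained(Opcodes, 2u); // true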
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
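For example (illustrative, assuming llvm/Support/MathExtras.h):

  static_assert(llvm::SignExtend64<8>(0xFF) == -1, "all ones in 8 bits is -1");
  static_assert(llvm::SignExtend64<16>(0x8000) == -32768, "the sign bit propagates");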
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
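For example (illustrative, assuming llvm/Support/MathExtras.h):

  static_assert(llvm::maskTrailingOnes<uint32_t>(4) == 0xFu, "low four bits set");
  static_assert(llvm::maskTrailingOnes<uint64_t>(0) == 0, "an empty mask is allowed");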
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is live all the way through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.