1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
63 cl::ReallyHidden);
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
67 AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
86 AMDGPU::OpName OpName) {
87 unsigned Opc0 = N0->getMachineOpcode();
88 unsigned Opc1 = N1->getMachineOpcode();
89
90 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
91 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
92
93 if (Op0Idx == -1 && Op1Idx == -1)
94 return true;
95
96
97 if ((Op0Idx == -1 && Op1Idx != -1) ||
98 (Op1Idx == -1 && Op0Idx != -1))
99 return false;
100
101 // getNamedOperandIdx returns the index for the MachineInstr's operands,
102 // which includes the result as the first operand. We are indexing into the
103 // MachineSDNode's operands, so we need to skip the result operand to get
104 // the real index.
105 --Op0Idx;
106 --Op1Idx;
107
108 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
109}
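// Editor's note (illustrative, not part of the original source): the index
// adjustment above is needed because getNamedOperandIdx reports positions in
// the MachineInstr operand list, which begins with the result, while
// MachineSDNode operands do not include the result. For a load whose
// MachineInstr form is roughly "%dst = OP %base, offset", the offset sits at
// MachineInstr index 2 but at SDNode index 1, hence the "--Op0Idx/--Op1Idx".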
110
111static bool canRemat(const MachineInstr &MI) {
112
113 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
114 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
115 SIInstrInfo::isSALU(MI))
116 return true;
117
118 if (SIInstrInfo::isSMRD(MI)) {
119 return !MI.memoperands_empty() &&
120 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
121 return MMO->isLoad() && MMO->isInvariant();
122 });
123 }
124
125 return false;
126}
127
128bool SIInstrInfo::isReallyTriviallyReMaterializable(
129 const MachineInstr &MI) const {
130
131 if (canRemat(MI)) {
132 // Normally a VALU use of exec would block rematerialization, but an
133 // implicit exec read is OK here, since all VALU instructions have one.
134 // We really want all of the generic logic, with only this exception.
135
136 // Another potential implicit use is mode register. The core logic of
137 // the RA will not attempt rematerialization if mode is set anywhere
138 // in the function, otherwise it is safe since mode is not changed.
139
140 // There is a difference from the generic method, which does not allow
141 // rematerialization if there are virtual register uses. We allow this,
142 // so this method covers SOP instructions as well.
143 if (!MI.hasImplicitDef() &&
144 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
145 !MI.mayRaiseFPException())
146 return true;
147 }
148
149 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
150}
151
152// Returns true if the scalar result of a VALU instruction depends on exec.
153bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
154 // Ignore comparisons which are only used masked with exec.
155 // This allows some hoisting/sinking of VALU comparisons.
156 if (MI.isCompare()) {
157 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
158 if (!Dst)
159 return true;
160
161 Register DstReg = Dst->getReg();
162 if (!DstReg.isVirtual())
163 return true;
164
165 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
166 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
167 switch (Use.getOpcode()) {
168 case AMDGPU::S_AND_SAVEEXEC_B32:
169 case AMDGPU::S_AND_SAVEEXEC_B64:
170 break;
171 case AMDGPU::S_AND_B32:
172 case AMDGPU::S_AND_B64:
173 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
174 return true;
175 break;
176 default:
177 return true;
178 }
179 }
180 return false;
181 }
182
183 switch (MI.getOpcode()) {
184 default:
185 break;
186 case AMDGPU::V_READFIRSTLANE_B32:
187 return true;
188 }
189
190 return false;
191}
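// Editor's note (illustrative, not part of the original source): the compare
// exemption above matches the usual structured control-flow pattern, roughly:
//   %cc:sreg_64_xexec = V_CMP_LT_I32_e64 %a, %b, implicit $exec
//   %saved:sreg_64 = S_AND_SAVEEXEC_B64 %cc, implicit-def $exec, implicit $exec
// Every use of %cc is re-masked with exec, so hoisting or sinking the V_CMP
// across an exec change does not change the masked result that is observed.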
192
193bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
194 // Any implicit use of exec by VALU is not a real register read.
195 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
196 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
197}
198
199bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
200 MachineBasicBlock *SuccToSinkTo,
201 MachineCycleInfo *CI) const {
202 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
203 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
204 return true;
205
206 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
207 // Check if sinking of MI would create temporal divergent use.
208 for (auto Op : MI.uses()) {
209 if (Op.isReg() && Op.getReg().isVirtual() &&
210 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
211 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
212
213 // SgprDef defined inside cycle
214 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
215 if (FromCycle == nullptr)
216 continue;
217
218 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
219 // Check if there is a FromCycle that contains SgprDef's basic block but
220 // does not contain SuccToSinkTo and also has divergent exit condition.
221 while (FromCycle && !FromCycle->contains(ToCycle)) {
222 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
223 FromCycle->getExitingBlocks(ExitingBlocks);
224
225 // FromCycle has divergent exit condition.
226 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
227 if (hasDivergentBranch(ExitingBlock))
228 return false;
229 }
230
231 FromCycle = FromCycle->getParentCycle();
232 }
233 }
234 }
235
236 return true;
237}
238
239bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
240 int64_t &Offset0,
241 int64_t &Offset1) const {
242 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
243 return false;
244
245 unsigned Opc0 = Load0->getMachineOpcode();
246 unsigned Opc1 = Load1->getMachineOpcode();
247
248 // Make sure both are actually loads.
249 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
250 return false;
251
252 // A mayLoad instruction without a def is not a load. Likely a prefetch.
253 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
254 return false;
255
256 if (isDS(Opc0) && isDS(Opc1)) {
257
258 // FIXME: Handle this case:
259 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
260 return false;
261
262 // Check base reg.
263 if (Load0->getOperand(0) != Load1->getOperand(0))
264 return false;
265
266 // Skip read2 / write2 variants for simplicity.
267 // TODO: We should report true if the used offsets are adjacent (excluded
268 // st64 versions).
269 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
270 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
271 if (Offset0Idx == -1 || Offset1Idx == -1)
272 return false;
273
274 // XXX - be careful of dataless loads
275 // getNamedOperandIdx returns the index for MachineInstrs. Since they
276 // include the output in the operand list, but SDNodes don't, we need to
277 // subtract the index by one.
278 Offset0Idx -= get(Opc0).NumDefs;
279 Offset1Idx -= get(Opc1).NumDefs;
280 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
281 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
282 return true;
283 }
284
285 if (isSMRD(Opc0) && isSMRD(Opc1)) {
286 // Skip time and cache invalidation instructions.
287 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
288 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
289 return false;
290
291 unsigned NumOps = getNumOperandsNoGlue(Load0);
292 if (NumOps != getNumOperandsNoGlue(Load1))
293 return false;
294
295 // Check base reg.
296 if (Load0->getOperand(0) != Load1->getOperand(0))
297 return false;
298
299 // Match register offsets, if both register and immediate offsets present.
300 assert(NumOps == 4 || NumOps == 5);
301 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
302 return false;
303
304 const ConstantSDNode *Load0Offset =
305 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
306 const ConstantSDNode *Load1Offset =
307 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
308
309 if (!Load0Offset || !Load1Offset)
310 return false;
311
312 Offset0 = Load0Offset->getZExtValue();
313 Offset1 = Load1Offset->getZExtValue();
314 return true;
315 }
316
317 // MUBUF and MTBUF can access the same addresses.
318 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
319
320 // MUBUF and MTBUF have vaddr at different indices.
321 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
323 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
324 return false;
325
326 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
327 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
328
329 if (OffIdx0 == -1 || OffIdx1 == -1)
330 return false;
331
332 // getNamedOperandIdx returns the index for MachineInstrs. Since they
333 // include the output in the operand list, but SDNodes don't, we need to
334 // subtract the index by one.
335 OffIdx0 -= get(Opc0).NumDefs;
336 OffIdx1 -= get(Opc1).NumDefs;
337
338 SDValue Off0 = Load0->getOperand(OffIdx0);
339 SDValue Off1 = Load1->getOperand(OffIdx1);
340
341 // The offset might be a FrameIndexSDNode.
342 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
343 return false;
344
345 Offset0 = Off0->getAsZExtVal();
346 Offset1 = Off1->getAsZExtVal();
347 return true;
348 }
349
350 return false;
351}
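// Editor's note (illustrative, not part of the original source): two
// DS_READ_B32 nodes that share the same address operand but carry immediate
// offsets of, say, 16 and 32 are reported here as loads from the same base
// pointer with Offset0 = 16 and Offset1 = 32; the scheduler can then feed
// those offsets into shouldScheduleLoadsNear below.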
352
353static bool isStride64(unsigned Opc) {
354 switch (Opc) {
355 case AMDGPU::DS_READ2ST64_B32:
356 case AMDGPU::DS_READ2ST64_B64:
357 case AMDGPU::DS_WRITE2ST64_B32:
358 case AMDGPU::DS_WRITE2ST64_B64:
359 return true;
360 default:
361 return false;
362 }
363}
364
365bool SIInstrInfo::getMemOperandsWithOffsetWidth(
366 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
367 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
368 const TargetRegisterInfo *TRI) const {
369 if (!LdSt.mayLoadOrStore())
370 return false;
371
372 unsigned Opc = LdSt.getOpcode();
373 OffsetIsScalable = false;
374 const MachineOperand *BaseOp, *OffsetOp;
375 int DataOpIdx;
376
377 if (isDS(LdSt)) {
378 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
379 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
380 if (OffsetOp) {
381 // Normal, single offset LDS instruction.
382 if (!BaseOp) {
383 // DS_CONSUME/DS_APPEND use M0 for the base address.
384 // TODO: find the implicit use operand for M0 and use that as BaseOp?
385 return false;
386 }
387 BaseOps.push_back(BaseOp);
388 Offset = OffsetOp->getImm();
389 // Get appropriate operand, and compute width accordingly.
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
391 if (DataOpIdx == -1)
392 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
393 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
394 Width = LocationSize::precise(64);
395 else
396 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
397 } else {
398 // The 2 offset instructions use offset0 and offset1 instead. We can treat
399 // these as a load with a single offset if the 2 offsets are consecutive.
400 // We will use this for some partially aligned loads.
401 const MachineOperand *Offset0Op =
402 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
403 const MachineOperand *Offset1Op =
404 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
405
406 unsigned Offset0 = Offset0Op->getImm() & 0xff;
407 unsigned Offset1 = Offset1Op->getImm() & 0xff;
408 if (Offset0 + 1 != Offset1)
409 return false;
410
411 // Each of these offsets is in element sized units, so we need to convert
412 // to bytes of the individual reads.
413
414 unsigned EltSize;
415 if (LdSt.mayLoad())
416 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
417 else {
418 assert(LdSt.mayStore());
419 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
421 }
422
423 if (isStride64(Opc))
424 EltSize *= 64;
425
426 BaseOps.push_back(BaseOp);
427 Offset = EltSize * Offset0;
428 // Get appropriate operand(s), and compute width accordingly.
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
430 if (DataOpIdx == -1) {
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
432 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
433 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
434 Width = LocationSize::precise(
435 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
436 } else {
437 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
438 }
439 }
440 return true;
441 }
442
443 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
444 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
445 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
446 return false;
447 BaseOps.push_back(RSrc);
448 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
449 if (BaseOp && !BaseOp->isFI())
450 BaseOps.push_back(BaseOp);
451 const MachineOperand *OffsetImm =
452 getNamedOperand(LdSt, AMDGPU::OpName::offset);
453 Offset = OffsetImm->getImm();
454 const MachineOperand *SOffset =
455 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
456 if (SOffset) {
457 if (SOffset->isReg())
458 BaseOps.push_back(SOffset);
459 else
460 Offset += SOffset->getImm();
461 }
462 // Get appropriate operand, and compute width accordingly.
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
464 if (DataOpIdx == -1)
465 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
466 if (DataOpIdx == -1) // LDS DMA
467 return false;
468 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
469 return true;
470 }
471
472 if (isImage(LdSt)) {
473 auto RsrcOpName =
474 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
475 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
476 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
477 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
478 if (VAddr0Idx >= 0) {
479 // GFX10 possible NSA encoding.
480 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
481 BaseOps.push_back(&LdSt.getOperand(I));
482 } else {
483 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
484 }
485 Offset = 0;
486 // Get appropriate operand, and compute width accordingly.
487 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
488 if (DataOpIdx == -1)
489 return false; // no return sampler
490 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
491 return true;
492 }
493
494 if (isSMRD(LdSt)) {
495 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
496 if (!BaseOp) // e.g. S_MEMTIME
497 return false;
498 BaseOps.push_back(BaseOp);
499 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
500 Offset = OffsetOp ? OffsetOp->getImm() : 0;
501 // Get appropriate operand, and compute width accordingly.
502 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
503 if (DataOpIdx == -1)
504 return false;
505 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
506 return true;
507 }
508
509 if (isFLAT(LdSt)) {
510 // Instructions have either vaddr or saddr or both or none.
511 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
512 if (BaseOp)
513 BaseOps.push_back(BaseOp);
514 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
515 if (BaseOp)
516 BaseOps.push_back(BaseOp);
517 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
518 // Get appropriate operand, and compute width accordingly.
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
520 if (DataOpIdx == -1)
521 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
522 if (DataOpIdx == -1) // LDS DMA
523 return false;
524 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
525 return true;
526 }
527
528 return false;
529}
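// Editor's note (worked example, not part of the original source): for a DS
// read2 such as
//   %d:vreg_64 = DS_READ2_B32 %addr, 4, 5, 0, implicit $m0, implicit $exec
// the two 8-bit offsets 4 and 5 are consecutive and EltSize is
// 64 bits / 16 = 4 bytes, so the code above reports a single access based at
// %addr with Offset = 4 * 4 = 16 bytes and Width = 8 bytes (the 64-bit vdst).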
530
531static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
532 ArrayRef<const MachineOperand *> BaseOps1,
533 const MachineInstr &MI2,
534 ArrayRef<const MachineOperand *> BaseOps2) {
535 // Only examine the first "base" operand of each instruction, on the
536 // assumption that it represents the real base address of the memory access.
537 // Other operands are typically offsets or indices from this base address.
538 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
539 return true;
540
541 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
542 return false;
543
544 auto *MO1 = *MI1.memoperands_begin();
545 auto *MO2 = *MI2.memoperands_begin();
546 if (MO1->getAddrSpace() != MO2->getAddrSpace())
547 return false;
548
549 const auto *Base1 = MO1->getValue();
550 const auto *Base2 = MO2->getValue();
551 if (!Base1 || !Base2)
552 return false;
553 Base1 = getUnderlyingObject(Base1);
554 Base2 = getUnderlyingObject(Base2);
555
556 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
557 return false;
558
559 return Base1 == Base2;
560}
561
562bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
563 int64_t Offset1, bool OffsetIsScalable1,
564 ArrayRef<const MachineOperand *> BaseOps2,
565 int64_t Offset2, bool OffsetIsScalable2,
566 unsigned ClusterSize,
567 unsigned NumBytes) const {
568 // If the mem ops (to be clustered) do not have the same base ptr, then they
569 // should not be clustered
570 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
571 if (!BaseOps1.empty() && !BaseOps2.empty()) {
572 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
573 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
574 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
575 return false;
576
577 const SIMachineFunctionInfo *MFI =
578 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
579 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
580 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
581 // If only one base op is empty, they do not have the same base ptr
582 return false;
583 }
584
585 // In order to avoid register pressure, on average, the number of DWORDs
586 // loaded together by all clustered mem ops should not exceed
587 // MaxMemoryClusterDWords. This is an empirical value based on certain
588 // observations and performance related experiments.
589 // The good thing about this heuristic is that it avoids clustering too many
590 // sub-word loads and also avoids clustering wide loads. Below is a
591 // brief summary of how the heuristic behaves for various `LoadSize` when
592 // MaxMemoryClusterDWords is 8.
593 //
594 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
595 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
596 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
597 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
598 // (5) LoadSize >= 17: do not cluster
599 const unsigned LoadSize = NumBytes / ClusterSize;
600 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
601 return NumDWords <= MaxMemoryClusterDWords;
602}
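// Editor's note (worked example, not part of the original source): with the
// default MaxMemoryClusterDWords of 8, clustering four 8-byte loads gives
// NumBytes = 32 and ClusterSize = 4, so LoadSize = 8 and
// NumDWords = ((8 + 3) / 4) * 4 = 8, which is still accepted; a fifth such
// load would raise NumDWords to 10 and stop further clustering.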
603
604// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
605// the first 16 loads will be interleaved with the stores, and the next 16 will
606 // be clustered as expected. It should really split into two batches of 16 stores.
607//
608// Loads are clustered until this returns false, rather than trying to schedule
609// groups of stores. This also means we have to deal with saying different
610// address space loads should be clustered, and ones which might cause bank
611// conflicts.
612//
613// This might be deprecated so it might not be worth that much effort to fix.
614bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
615 int64_t Offset0, int64_t Offset1,
616 unsigned NumLoads) const {
617 assert(Offset1 > Offset0 &&
618 "Second offset should be larger than first offset!");
619 // If we have less than 16 loads in a row, and the offsets are within 64
620 // bytes, then schedule together.
621
622 // A cacheline is 64 bytes (for global memory).
623 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
624}
625
626static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
627 MachineBasicBlock::iterator MI,
628 const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 const char *Msg = "illegal VGPR to SGPR copy") {
631 MachineFunction *MF = MBB.getParent();
632
633 LLVMContext &C = MF->getFunction().getContext();
634 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
635
636 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
637 .addReg(SrcReg, getKillRegState(KillSrc));
638}
639
640/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
641/// possible to have a direct copy in these cases on GFX908, so an intermediate
642/// VGPR copy is required.
643static void indirectCopyToAGPR(const SIInstrInfo &TII,
644 MachineBasicBlock &MBB,
645 MachineBasicBlock::iterator MI,
646 const DebugLoc &DL, MCRegister DestReg,
647 MCRegister SrcReg, bool KillSrc,
648 RegScavenger &RS, bool RegsOverlap,
649 Register ImpDefSuperReg = Register(),
650 Register ImpUseSuperReg = Register()) {
651 assert((TII.getSubtarget().hasMAIInsts() &&
652 !TII.getSubtarget().hasGFX90AInsts()) &&
653 "Expected GFX908 subtarget.");
654
655 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
656 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
657 "Source register of the copy should be either an SGPR or an AGPR.");
658
659 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
660 "Destination register of the copy should be an AGPR.");
661
662 const SIRegisterInfo &RI = TII.getRegisterInfo();
663
664 // First try to find defining accvgpr_write to avoid temporary registers.
665 // In the case of copies of overlapping AGPRs, we conservatively do not
666 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
667 // an accvgpr_write used for this same copy due to implicit-defs
668 if (!RegsOverlap) {
669 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
670 --Def;
671
672 if (!Def->modifiesRegister(SrcReg, &RI))
673 continue;
674
675 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
676 Def->getOperand(0).getReg() != SrcReg)
677 break;
678
679 MachineOperand &DefOp = Def->getOperand(1);
680 assert(DefOp.isReg() || DefOp.isImm());
681
682 if (DefOp.isReg()) {
683 bool SafeToPropagate = true;
684 // Check that register source operand is not clobbered before MI.
685 // Immediate operands are always safe to propagate.
686 for (auto I = Def; I != MI && SafeToPropagate; ++I)
687 if (I->modifiesRegister(DefOp.getReg(), &RI))
688 SafeToPropagate = false;
689
690 if (!SafeToPropagate)
691 break;
692
693 for (auto I = Def; I != MI; ++I)
694 I->clearRegisterKills(DefOp.getReg(), &RI);
695 }
696
697 MachineInstrBuilder Builder =
698 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
699 .add(DefOp);
700 if (ImpDefSuperReg)
701 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
702
703 if (ImpUseSuperReg) {
704 Builder.addReg(ImpUseSuperReg,
705 getKillRegState(KillSrc) | RegState::Implicit);
706 }
707
708 return;
709 }
710 }
711
712 RS.enterBasicBlockEnd(MBB);
713 RS.backward(std::next(MI));
714
715 // Ideally we want to have three registers for a long reg_sequence copy
716 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
717 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
718 *MBB.getParent());
719
720 // Registers in the sequence are allocated contiguously so we can just
721 // use register number to pick one of three round-robin temps.
722 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
723 Register Tmp =
724 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
725 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
726 "VGPR used for an intermediate copy should have been reserved.");
727
728 // Only loop through if there are any free registers left. We don't want to
729 // spill.
730 while (RegNo--) {
731 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
732 /* RestoreAfter */ false, 0,
733 /* AllowSpill */ false);
734 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
735 break;
736 Tmp = Tmp2;
737 RS.setRegUsed(Tmp);
738 }
739
740 // Insert copy to temporary VGPR.
741 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
742 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
743 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
744 } else {
745 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
746 }
747
748 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
749 .addReg(SrcReg, getKillRegState(KillSrc));
750 if (ImpUseSuperReg) {
751 UseBuilder.addReg(ImpUseSuperReg,
752 getKillRegState(KillSrc) | RegState::Implicit);
753 }
754
755 MachineInstrBuilder DefBuilder
756 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
757 .addReg(Tmp, RegState::Kill);
758
759 if (ImpDefSuperReg)
760 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
761}
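// Editor's note (illustrative, not part of the original source): on GFX908 a
// copy such as $agpr1 = COPY $sgpr4 that cannot reuse an earlier
// accvgpr_write is lowered through the reserved intermediate VGPR, roughly:
//   $vgpr32 = V_MOV_B32_e32 $sgpr4, implicit $exec
//   $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr32, implicit $exec
// where $vgpr32 stands for whichever VGPR SIMachineFunctionInfo reserved via
// getVGPRForAGPRCopy (the register name here is only an example).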
762
763static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
764 MachineBasicBlock::iterator I, const DebugLoc &DL,
765 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
766 const TargetRegisterClass *RC, bool Forward) {
767 const SIRegisterInfo &RI = TII.getRegisterInfo();
768 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
770 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
771
772 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
773 int16_t SubIdx = BaseIndices[Idx];
774 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
775 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
776 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
777 unsigned Opcode = AMDGPU::S_MOV_B32;
778
779 // Is SGPR aligned? If so try to combine with next.
780 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
781 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
782 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
783 // Can use SGPR64 copy
784 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
785 SubIdx = RI.getSubRegFromChannel(Channel, 2);
786 DestSubReg = RI.getSubReg(DestReg, SubIdx);
787 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
788 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
789 Opcode = AMDGPU::S_MOV_B64;
790 Idx++;
791 }
792
793 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
794 .addReg(SrcSubReg)
795 .addReg(SrcReg, RegState::Implicit);
796
797 if (!FirstMI)
798 FirstMI = LastMI;
799
800 if (!Forward)
801 I--;
802 }
803
804 assert(FirstMI && LastMI);
805 if (!Forward)
806 std::swap(FirstMI, LastMI);
807
808 FirstMI->addOperand(
809 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
810
811 if (KillSrc)
812 LastMI->addRegisterKilled(SrcReg, &RI);
813}
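// Editor's note (illustrative, not part of the original source): a copy of an
// aligned 128-bit SGPR tuple, e.g. $sgpr4_sgpr5_sgpr6_sgpr7 = COPY
// $sgpr8_sgpr9_sgpr10_sgpr11, is expanded above into two 64-bit moves,
// roughly:
//   $sgpr4_sgpr5 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11,
//                  implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
//   $sgpr6_sgpr7 = S_MOV_B64 $sgpr10_sgpr11, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11
// with the implicit-def of the full destination attached to the first move
// and the kill of the source super-register, if requested, to the last.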
814
815void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
816 MachineBasicBlock::iterator MI,
817 const DebugLoc &DL, Register DestReg,
818 Register SrcReg, bool KillSrc, bool RenamableDest,
819 bool RenamableSrc) const {
820 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
821 unsigned Size = RI.getRegSizeInBits(*RC);
822 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
823 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
824
825 // The rest of copyPhysReg assumes Src and Dst size are the same size.
826 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
827 // we remove Fix16BitCopies and this code block?
828 if (Fix16BitCopies) {
829 if (((Size == 16) != (SrcSize == 16))) {
830 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
831 assert(ST.useRealTrue16Insts());
832 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
833 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
834 RegToFix = SubReg;
835
836 if (DestReg == SrcReg) {
837 // Identity copy. Insert empty bundle since ExpandPostRA expects an
838 // instruction here.
839 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
840 return;
841 }
842 RC = RI.getPhysRegBaseClass(DestReg);
843 Size = RI.getRegSizeInBits(*RC);
844 SrcRC = RI.getPhysRegBaseClass(SrcReg);
845 SrcSize = RI.getRegSizeInBits(*SrcRC);
846 }
847 }
848
849 if (RC == &AMDGPU::VGPR_32RegClass) {
850 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
851 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
852 AMDGPU::AGPR_32RegClass.contains(SrcReg));
853 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
854 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
855 BuildMI(MBB, MI, DL, get(Opc), DestReg)
856 .addReg(SrcReg, getKillRegState(KillSrc));
857 return;
858 }
859
860 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
861 RC == &AMDGPU::SReg_32RegClass) {
862 if (SrcReg == AMDGPU::SCC) {
863 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
864 .addImm(1)
865 .addImm(0);
866 return;
867 }
868
869 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
870 if (DestReg == AMDGPU::VCC_LO) {
871 // FIXME: Hack until VReg_1 removed.
872 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
873 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
874 .addImm(0)
875 .addReg(SrcReg, getKillRegState(KillSrc));
876 return;
877 }
878
879 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
880 return;
881 }
882
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 if (RC == &AMDGPU::SReg_64RegClass) {
889 if (SrcReg == AMDGPU::SCC) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
891 .addImm(1)
892 .addImm(0);
893 return;
894 }
895
896 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
897 if (DestReg == AMDGPU::VCC) {
898 // FIXME: Hack until VReg_1 removed.
899 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
900 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
901 .addImm(0)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
907 return;
908 }
909
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
911 .addReg(SrcReg, getKillRegState(KillSrc));
912 return;
913 }
914
915 if (DestReg == AMDGPU::SCC) {
916 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
917 // but SelectionDAG emits such copies for i1 sources.
918 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
919 // This copy can only be produced by patterns
920 // with explicit SCC, which are known to be enabled
921 // only for subtargets with S_CMP_LG_U64 present.
922 assert(ST.hasScalarCompareEq64());
923 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
924 .addReg(SrcReg, getKillRegState(KillSrc))
925 .addImm(0);
926 } else {
927 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
928 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
929 .addReg(SrcReg, getKillRegState(KillSrc))
930 .addImm(0);
931 }
932
933 return;
934 }
935
936 if (RC == &AMDGPU::AGPR_32RegClass) {
937 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
938 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
939 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
940 .addReg(SrcReg, getKillRegState(KillSrc));
941 return;
942 }
943
944 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
945 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
946 .addReg(SrcReg, getKillRegState(KillSrc));
947 return;
948 }
949
950 // FIXME: Pass should maintain scavenger to avoid scan through the block on
951 // every AGPR spill.
952 RegScavenger RS;
953 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
954 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
955 return;
956 }
957
958 if (Size == 16) {
959 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
960 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
961 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
962
963 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
964 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
965 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
966 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
967 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
968 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
969 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
970 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
971
972 if (IsSGPRDst) {
973 if (!IsSGPRSrc) {
974 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
975 return;
976 }
977
978 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
979 .addReg(NewSrcReg, getKillRegState(KillSrc));
980 return;
981 }
982
983 if (IsAGPRDst || IsAGPRSrc) {
984 if (!DstLow || !SrcLow) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
986 "Cannot use hi16 subreg with an AGPR!");
987 }
988
989 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
990 return;
991 }
992
993 if (ST.useRealTrue16Insts()) {
994 if (IsSGPRSrc) {
995 assert(SrcLow);
996 SrcReg = NewSrcReg;
997 }
998 // Use the smaller instruction encoding if possible.
999 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1000 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1001 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1002 .addReg(SrcReg);
1003 } else {
1004 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1005 .addImm(0) // src0_modifiers
1006 .addReg(SrcReg)
1007 .addImm(0); // op_sel
1008 }
1009 return;
1010 }
1011
1012 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1013 if (!DstLow || !SrcLow) {
1014 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1015 "Cannot use hi16 subreg on VI!");
1016 }
1017
1018 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1019 .addReg(NewSrcReg, getKillRegState(KillSrc));
1020 return;
1021 }
1022
1023 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1024 .addImm(0) // src0_modifiers
1025 .addReg(NewSrcReg)
1026 .addImm(0) // clamp
1027 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1028 : AMDGPU::SDWA::SdwaSel::WORD_1)
1029 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1030 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1031 : AMDGPU::SDWA::SdwaSel::WORD_1)
1032 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1033 // First implicit operand is $exec.
1034 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1035 return;
1036 }
1037
1038 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1039 if (ST.hasMovB64()) {
1040 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1041 .addReg(SrcReg, getKillRegState(KillSrc));
1042 return;
1043 }
1044 if (ST.hasPkMovB32()) {
1045 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1046 .addImm(SISrcMods::OP_SEL_1)
1047 .addReg(SrcReg)
1048 .addImm(SISrcMods::OP_SEL_1)
1049 .addReg(SrcReg)
1050 .addImm(0) // op_sel_lo
1051 .addImm(0) // op_sel_hi
1052 .addImm(0) // neg_lo
1053 .addImm(0) // neg_hi
1054 .addImm(0) // clamp
1055 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1056 return;
1057 }
1058 }
1059
1060 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1061 if (RI.isSGPRClass(RC)) {
1062 if (!RI.isSGPRClass(SrcRC)) {
1063 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1064 return;
1065 }
1066 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1067 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1068 Forward);
1069 return;
1070 }
1071
1072 unsigned EltSize = 4;
1073 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1074 if (RI.isAGPRClass(RC)) {
1075 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1076 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1077 else if (RI.hasVGPRs(SrcRC) ||
1078 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1079 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1080 else
1081 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1082 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1083 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1084 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1085 (RI.isProperlyAlignedRC(*RC) &&
1086 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1087 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1088 if (ST.hasMovB64()) {
1089 Opcode = AMDGPU::V_MOV_B64_e32;
1090 EltSize = 8;
1091 } else if (ST.hasPkMovB32()) {
1092 Opcode = AMDGPU::V_PK_MOV_B32;
1093 EltSize = 8;
1094 }
1095 }
1096
1097 // For the cases where we need an intermediate instruction/temporary register
1098 // (destination is an AGPR), we need a scavenger.
1099 //
1100 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1101 // whole block for every handled copy.
1102 std::unique_ptr<RegScavenger> RS;
1103 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1104 RS = std::make_unique<RegScavenger>();
1105
1106 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1107
1108 // If there is an overlap, we can't kill the super-register on the last
1109 // instruction, since it will also kill the components made live by this def.
1110 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1111 const bool CanKillSuperReg = KillSrc && !Overlap;
1112
1113 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1114 unsigned SubIdx;
1115 if (Forward)
1116 SubIdx = SubIndices[Idx];
1117 else
1118 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1119 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1120 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1121 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1122
1123 bool IsFirstSubreg = Idx == 0;
1124 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1125
1126 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1127 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1128 Register ImpUseSuper = SrcReg;
1129 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1130 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1131 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1132 MachineInstrBuilder MIB =
1133 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1134 .addImm(SISrcMods::OP_SEL_1)
1135 .addReg(SrcSubReg)
1136 .addImm(SISrcMods::OP_SEL_1)
1137 .addReg(SrcSubReg)
1138 .addImm(0) // op_sel_lo
1139 .addImm(0) // op_sel_hi
1140 .addImm(0) // neg_lo
1141 .addImm(0) // neg_hi
1142 .addImm(0) // clamp
1143 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144 if (IsFirstSubreg)
1145 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1146 } else {
1147 MachineInstrBuilder Builder =
1148 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1149 if (IsFirstSubreg)
1150 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1151
1152 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1153 }
1154 }
1155}
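// Editor's note (illustrative, not part of the original source): on a target
// without V_MOV_B64 or V_PK_MOV_B32, a copy $vgpr2_vgpr3 = COPY $vgpr0_vgpr1
// falls through to the per-subregister loop above and becomes, roughly:
//   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec,
//            implicit-def $vgpr2_vgpr3, implicit $vgpr0_vgpr1
//   $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1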
1156
1157int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1158 int NewOpc;
1159
1160 // Try to map original to commuted opcode
1161 NewOpc = AMDGPU::getCommuteRev(Opcode);
1162 if (NewOpc != -1)
1163 // Check if the commuted (REV) opcode exists on the target.
1164 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1165
1166 // Try to map commuted to original opcode
1167 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1168 if (NewOpc != -1)
1169 // Check if the original (non-REV) opcode exists on the target.
1170 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1171
1172 return Opcode;
1173}
1174
1175const TargetRegisterClass *
1176SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1177 return &AMDGPU::VGPR_32RegClass;
1178}
1179
1180void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1181 MachineBasicBlock::iterator I,
1182 const DebugLoc &DL, Register DstReg,
1183 ArrayRef<MachineOperand> Cond,
1184 Register TrueReg,
1185 Register FalseReg) const {
1186 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1187 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1189 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1190 "Not a VGPR32 reg");
1191
1192 if (Cond.size() == 1) {
1193 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1194 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1195 .add(Cond[0]);
1196 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1197 .addImm(0)
1198 .addReg(FalseReg)
1199 .addImm(0)
1200 .addReg(TrueReg)
1201 .addReg(SReg);
1202 } else if (Cond.size() == 2) {
1203 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1204 switch (Cond[0].getImm()) {
1205 case SIInstrInfo::SCC_TRUE: {
1206 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1207 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1208 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1209 .addImm(0)
1210 .addReg(FalseReg)
1211 .addImm(0)
1212 .addReg(TrueReg)
1213 .addReg(SReg);
1214 break;
1215 }
1216 case SIInstrInfo::SCC_FALSE: {
1217 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1218 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1219 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1220 .addImm(0)
1221 .addReg(FalseReg)
1222 .addImm(0)
1223 .addReg(TrueReg)
1224 .addReg(SReg);
1225 break;
1226 }
1227 case SIInstrInfo::VCCNZ: {
1228 MachineOperand RegOp = Cond[1];
1229 RegOp.setImplicit(false);
1230 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1231 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1232 .add(RegOp);
1233 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1234 .addImm(0)
1235 .addReg(FalseReg)
1236 .addImm(0)
1237 .addReg(TrueReg)
1238 .addReg(SReg);
1239 break;
1240 }
1241 case SIInstrInfo::VCCZ: {
1242 MachineOperand RegOp = Cond[1];
1243 RegOp.setImplicit(false);
1244 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1245 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1246 .add(RegOp);
1247 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1248 .addImm(0)
1249 .addReg(TrueReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addReg(SReg);
1253 break;
1254 }
1255 case SIInstrInfo::EXECNZ: {
1256 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1257 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1258 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1259 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1260 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addReg(SReg);
1266 break;
1267 }
1268 case SIInstrInfo::EXECZ: {
1269 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1270 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1271 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1272 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1273 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1274 .addImm(0)
1275 .addReg(FalseReg)
1276 .addImm(0)
1277 .addReg(TrueReg)
1278 .addReg(SReg);
1279 llvm_unreachable("Unhandled branch predicate EXECZ");
1280 break;
1281 }
1282 default:
1283 llvm_unreachable("invalid branch predicate");
1284 }
1285 } else {
1286 llvm_unreachable("Can only handle Cond size 1 or 2");
1287 }
1288}
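// Editor's note (illustrative, not part of the original source): for the
// SCC_TRUE case above, selecting %t over %f into a VGPR in wave64 terms is
// roughly:
//   %mask:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
//   %dst:vgpr_32 = V_CNDMASK_B32_e64 0, %f, 0, %t, %mask, implicit $exec
// i.e. SCC is first broadcast into a lane mask, which then drives the
// per-lane select.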
1289
1290Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1291 MachineBasicBlock::iterator I,
1292 const DebugLoc &DL,
1293 Register SrcReg, int Value) const {
1294 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1295 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1296 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1297 .addImm(Value)
1298 .addReg(SrcReg);
1299
1300 return Reg;
1301}
1302
1303Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1304 MachineBasicBlock::iterator I,
1305 const DebugLoc &DL,
1306 Register SrcReg, int Value) const {
1307 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1308 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1309 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1310 .addImm(Value)
1311 .addReg(SrcReg);
1312
1313 return Reg;
1314}
1315
1316bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1317 const Register Reg,
1318 int64_t &ImmVal) const {
1319 switch (MI.getOpcode()) {
1320 case AMDGPU::V_MOV_B32_e32:
1321 case AMDGPU::S_MOV_B32:
1322 case AMDGPU::S_MOVK_I32:
1323 case AMDGPU::S_MOV_B64:
1324 case AMDGPU::V_MOV_B64_e32:
1325 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1326 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1327 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1328 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1329 case AMDGPU::V_MOV_B64_PSEUDO: {
1330 const MachineOperand &Src0 = MI.getOperand(1);
1331 if (Src0.isImm()) {
1332 ImmVal = Src0.getImm();
1333 return MI.getOperand(0).getReg() == Reg;
1334 }
1335
1336 return false;
1337 }
1338 case AMDGPU::S_BREV_B32:
1339 case AMDGPU::V_BFREV_B32_e32:
1340 case AMDGPU::V_BFREV_B32_e64: {
1341 const MachineOperand &Src0 = MI.getOperand(1);
1342 if (Src0.isImm()) {
1343 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1344 return MI.getOperand(0).getReg() == Reg;
1345 }
1346
1347 return false;
1348 }
1349 case AMDGPU::S_NOT_B32:
1350 case AMDGPU::V_NOT_B32_e32:
1351 case AMDGPU::V_NOT_B32_e64: {
1352 const MachineOperand &Src0 = MI.getOperand(1);
1353 if (Src0.isImm()) {
1354 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1355 return MI.getOperand(0).getReg() == Reg;
1356 }
1357
1358 return false;
1359 }
1360 default:
1361 return false;
1362 }
1363}
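// Editor's note (illustrative, not part of the original source): for
//   $sgpr0 = S_NOT_B32 12, implicit-def $scc
// getConstValDefinedInReg(MI, $sgpr0, ImmVal) folds the operation on the
// immediate and returns true with ImmVal = ~12 = -13; S_BREV_B32 with an
// immediate source is handled the same way via reverseBits.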
1364
1365unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1366
1367 if (RI.isAGPRClass(DstRC))
1368 return AMDGPU::COPY;
1369 if (RI.getRegSizeInBits(*DstRC) == 16) {
1370 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1371 // before RA.
1372 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1373 }
1374 if (RI.getRegSizeInBits(*DstRC) == 32)
1375 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1376 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1377 return AMDGPU::S_MOV_B64;
1378 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1379 return AMDGPU::V_MOV_B64_PSEUDO;
1380 return AMDGPU::COPY;
1381}
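// Editor's note (usage sketch, not part of the original source): callers pick
// a move opcode from the destination class alone, e.g. a 32-bit VGPR class
// yields V_MOV_B32_e32 and a 64-bit SGPR class yields S_MOV_B64, while AGPR
// destinations fall back to COPY so that copy lowering can insert the
// required intermediate VGPR.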
1382
1383const MCInstrDesc &
1384SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1385 bool IsIndirectSrc) const {
1386 if (IsIndirectSrc) {
1387 if (VecSize <= 32) // 4 bytes
1388 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1389 if (VecSize <= 64) // 8 bytes
1390 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1391 if (VecSize <= 96) // 12 bytes
1392 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1393 if (VecSize <= 128) // 16 bytes
1394 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1395 if (VecSize <= 160) // 20 bytes
1396 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1397 if (VecSize <= 256) // 32 bytes
1398 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1399 if (VecSize <= 288) // 36 bytes
1400 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1401 if (VecSize <= 320) // 40 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1403 if (VecSize <= 352) // 44 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1405 if (VecSize <= 384) // 48 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1407 if (VecSize <= 512) // 64 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1409 if (VecSize <= 1024) // 128 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1411
1412 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1413 }
1414
1415 if (VecSize <= 32) // 4 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1417 if (VecSize <= 64) // 8 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1419 if (VecSize <= 96) // 12 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1421 if (VecSize <= 128) // 16 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1423 if (VecSize <= 160) // 20 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1425 if (VecSize <= 256) // 32 bytes
1426 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1427 if (VecSize <= 288) // 36 bytes
1428 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1429 if (VecSize <= 320) // 40 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1431 if (VecSize <= 352) // 44 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1433 if (VecSize <= 384) // 48 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1435 if (VecSize <= 512) // 64 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1437 if (VecSize <= 1024) // 128 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1439
1440 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1441}
1442
1443static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1444 if (VecSize <= 32) // 4 bytes
1445 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1446 if (VecSize <= 64) // 8 bytes
1447 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1448 if (VecSize <= 96) // 12 bytes
1449 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1450 if (VecSize <= 128) // 16 bytes
1451 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1452 if (VecSize <= 160) // 20 bytes
1453 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1454 if (VecSize <= 256) // 32 bytes
1455 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1456 if (VecSize <= 288) // 36 bytes
1457 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1458 if (VecSize <= 320) // 40 bytes
1459 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1460 if (VecSize <= 352) // 44 bytes
1461 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1462 if (VecSize <= 384) // 48 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1464 if (VecSize <= 512) // 64 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1466 if (VecSize <= 1024) // 128 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1468
1469 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1470}
1471
1472static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1473 if (VecSize <= 32) // 4 bytes
1474 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1475 if (VecSize <= 64) // 8 bytes
1476 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1477 if (VecSize <= 96) // 12 bytes
1478 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1479 if (VecSize <= 128) // 16 bytes
1480 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1481 if (VecSize <= 160) // 20 bytes
1482 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1483 if (VecSize <= 256) // 32 bytes
1484 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1485 if (VecSize <= 288) // 36 bytes
1486 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1487 if (VecSize <= 320) // 40 bytes
1488 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1489 if (VecSize <= 352) // 44 bytes
1490 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1491 if (VecSize <= 384) // 48 bytes
1492 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1493 if (VecSize <= 512) // 64 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1495 if (VecSize <= 1024) // 128 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1497
1498 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1499}
1500
1501static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1502 if (VecSize <= 64) // 8 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1504 if (VecSize <= 128) // 16 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1506 if (VecSize <= 256) // 32 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1508 if (VecSize <= 512) // 64 bytes
1509 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1510 if (VecSize <= 1024) // 128 bytes
1511 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1512
1513 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1514}
1515
1516const MCInstrDesc &
1517SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1518 bool IsSGPR) const {
1519 if (IsSGPR) {
1520 switch (EltSize) {
1521 case 32:
1522 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1523 case 64:
1524 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1525 default:
1526 llvm_unreachable("invalid reg indexing elt size");
1527 }
1528 }
1529
1530 assert(EltSize == 32 && "invalid reg indexing elt size");
1531 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1532}
1533
1534static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1535 switch (Size) {
1536 case 4:
1537 return AMDGPU::SI_SPILL_S32_SAVE;
1538 case 8:
1539 return AMDGPU::SI_SPILL_S64_SAVE;
1540 case 12:
1541 return AMDGPU::SI_SPILL_S96_SAVE;
1542 case 16:
1543 return AMDGPU::SI_SPILL_S128_SAVE;
1544 case 20:
1545 return AMDGPU::SI_SPILL_S160_SAVE;
1546 case 24:
1547 return AMDGPU::SI_SPILL_S192_SAVE;
1548 case 28:
1549 return AMDGPU::SI_SPILL_S224_SAVE;
1550 case 32:
1551 return AMDGPU::SI_SPILL_S256_SAVE;
1552 case 36:
1553 return AMDGPU::SI_SPILL_S288_SAVE;
1554 case 40:
1555 return AMDGPU::SI_SPILL_S320_SAVE;
1556 case 44:
1557 return AMDGPU::SI_SPILL_S352_SAVE;
1558 case 48:
1559 return AMDGPU::SI_SPILL_S384_SAVE;
1560 case 64:
1561 return AMDGPU::SI_SPILL_S512_SAVE;
1562 case 128:
1563 return AMDGPU::SI_SPILL_S1024_SAVE;
1564 default:
1565 llvm_unreachable("unknown register size");
1566 }
1567}
1568
1569static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1570 switch (Size) {
1571 case 2:
1572 return AMDGPU::SI_SPILL_V16_SAVE;
1573 case 4:
1574 return AMDGPU::SI_SPILL_V32_SAVE;
1575 case 8:
1576 return AMDGPU::SI_SPILL_V64_SAVE;
1577 case 12:
1578 return AMDGPU::SI_SPILL_V96_SAVE;
1579 case 16:
1580 return AMDGPU::SI_SPILL_V128_SAVE;
1581 case 20:
1582 return AMDGPU::SI_SPILL_V160_SAVE;
1583 case 24:
1584 return AMDGPU::SI_SPILL_V192_SAVE;
1585 case 28:
1586 return AMDGPU::SI_SPILL_V224_SAVE;
1587 case 32:
1588 return AMDGPU::SI_SPILL_V256_SAVE;
1589 case 36:
1590 return AMDGPU::SI_SPILL_V288_SAVE;
1591 case 40:
1592 return AMDGPU::SI_SPILL_V320_SAVE;
1593 case 44:
1594 return AMDGPU::SI_SPILL_V352_SAVE;
1595 case 48:
1596 return AMDGPU::SI_SPILL_V384_SAVE;
1597 case 64:
1598 return AMDGPU::SI_SPILL_V512_SAVE;
1599 case 128:
1600 return AMDGPU::SI_SPILL_V1024_SAVE;
1601 default:
1602 llvm_unreachable("unknown register size");
1603 }
1604}
1605
1606static unsigned getAVSpillSaveOpcode(unsigned Size) {
1607 switch (Size) {
1608 case 4:
1609 return AMDGPU::SI_SPILL_AV32_SAVE;
1610 case 8:
1611 return AMDGPU::SI_SPILL_AV64_SAVE;
1612 case 12:
1613 return AMDGPU::SI_SPILL_AV96_SAVE;
1614 case 16:
1615 return AMDGPU::SI_SPILL_AV128_SAVE;
1616 case 20:
1617 return AMDGPU::SI_SPILL_AV160_SAVE;
1618 case 24:
1619 return AMDGPU::SI_SPILL_AV192_SAVE;
1620 case 28:
1621 return AMDGPU::SI_SPILL_AV224_SAVE;
1622 case 32:
1623 return AMDGPU::SI_SPILL_AV256_SAVE;
1624 case 36:
1625 return AMDGPU::SI_SPILL_AV288_SAVE;
1626 case 40:
1627 return AMDGPU::SI_SPILL_AV320_SAVE;
1628 case 44:
1629 return AMDGPU::SI_SPILL_AV352_SAVE;
1630 case 48:
1631 return AMDGPU::SI_SPILL_AV384_SAVE;
1632 case 64:
1633 return AMDGPU::SI_SPILL_AV512_SAVE;
1634 case 128:
1635 return AMDGPU::SI_SPILL_AV1024_SAVE;
1636 default:
1637 llvm_unreachable("unknown register size");
1638 }
1639}
1640
1641static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1642 bool IsVectorSuperClass) {
1643 // Currently, only 32-bit WWM register spills are needed.
1644 if (Size != 4)
1645 llvm_unreachable("unknown wwm register spill size");
1646
1647 if (IsVectorSuperClass)
1648 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1649
1650 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1651}
1652
1653unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1654 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1655 const SIMachineFunctionInfo &MFI) const {
1656 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1657
1658 // Choose the right opcode if spilling a WWM register.
1659 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1660 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1661
1662 // TODO: Check if AGPRs are available
1663 if (ST.hasMAIInsts())
1664 return getAVSpillSaveOpcode(Size);
1665
1666 return getVGPRSpillSaveOpcode(Size);
1667}
1668
1669void SIInstrInfo::storeRegToStackSlot(
1670 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1671 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1672 MachineInstr::MIFlag Flags) const {
1673 MachineFunction *MF = MBB.getParent();
1674 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1675 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1676 const DebugLoc &DL = MBB.findDebugLoc(MI);
1677
1678 MachinePointerInfo PtrInfo
1679 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1680 MachineMemOperand *MMO = MF->getMachineMemOperand(
1681 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1682 FrameInfo.getObjectAlign(FrameIndex));
1683 unsigned SpillSize = RI.getSpillSize(*RC);
1684
1685 MachineRegisterInfo &MRI = MF->getRegInfo();
1686 if (RI.isSGPRClass(RC)) {
1687 MFI->setHasSpilledSGPRs();
1688 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1689 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1690 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1691
1692 // We are only allowed to create one new instruction when spilling
1693 // registers, so we need to use pseudo instruction for spilling SGPRs.
1694 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1695
1696 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1697 // to make sure we are using the correct register class.
1698 if (SrcReg.isVirtual() && SpillSize == 4) {
1699 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1700 }
1701
1702 BuildMI(MBB, MI, DL, OpDesc)
1703 .addReg(SrcReg, getKillRegState(isKill)) // data
1704 .addFrameIndex(FrameIndex) // addr
1705 .addMemOperand(MMO)
1707
1708 if (RI.spillSGPRToVGPR())
1709 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1710 return;
1711 }
1712
1713 unsigned Opcode =
1714 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1715 MFI->setHasSpilledVGPRs();
1716
1717 BuildMI(MBB, MI, DL, get(Opcode))
1718 .addReg(SrcReg, getKillRegState(isKill)) // data
1719 .addFrameIndex(FrameIndex) // addr
1720 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1721 .addImm(0) // offset
1722 .addMemOperand(MMO);
1723}
1724
1725static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1726 switch (Size) {
1727 case 4:
1728 return AMDGPU::SI_SPILL_S32_RESTORE;
1729 case 8:
1730 return AMDGPU::SI_SPILL_S64_RESTORE;
1731 case 12:
1732 return AMDGPU::SI_SPILL_S96_RESTORE;
1733 case 16:
1734 return AMDGPU::SI_SPILL_S128_RESTORE;
1735 case 20:
1736 return AMDGPU::SI_SPILL_S160_RESTORE;
1737 case 24:
1738 return AMDGPU::SI_SPILL_S192_RESTORE;
1739 case 28:
1740 return AMDGPU::SI_SPILL_S224_RESTORE;
1741 case 32:
1742 return AMDGPU::SI_SPILL_S256_RESTORE;
1743 case 36:
1744 return AMDGPU::SI_SPILL_S288_RESTORE;
1745 case 40:
1746 return AMDGPU::SI_SPILL_S320_RESTORE;
1747 case 44:
1748 return AMDGPU::SI_SPILL_S352_RESTORE;
1749 case 48:
1750 return AMDGPU::SI_SPILL_S384_RESTORE;
1751 case 64:
1752 return AMDGPU::SI_SPILL_S512_RESTORE;
1753 case 128:
1754 return AMDGPU::SI_SPILL_S1024_RESTORE;
1755 default:
1756 llvm_unreachable("unknown register size");
1757 }
1758}
1759
1760static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1761 switch (Size) {
1762 case 2:
1763 return AMDGPU::SI_SPILL_V16_RESTORE;
1764 case 4:
1765 return AMDGPU::SI_SPILL_V32_RESTORE;
1766 case 8:
1767 return AMDGPU::SI_SPILL_V64_RESTORE;
1768 case 12:
1769 return AMDGPU::SI_SPILL_V96_RESTORE;
1770 case 16:
1771 return AMDGPU::SI_SPILL_V128_RESTORE;
1772 case 20:
1773 return AMDGPU::SI_SPILL_V160_RESTORE;
1774 case 24:
1775 return AMDGPU::SI_SPILL_V192_RESTORE;
1776 case 28:
1777 return AMDGPU::SI_SPILL_V224_RESTORE;
1778 case 32:
1779 return AMDGPU::SI_SPILL_V256_RESTORE;
1780 case 36:
1781 return AMDGPU::SI_SPILL_V288_RESTORE;
1782 case 40:
1783 return AMDGPU::SI_SPILL_V320_RESTORE;
1784 case 44:
1785 return AMDGPU::SI_SPILL_V352_RESTORE;
1786 case 48:
1787 return AMDGPU::SI_SPILL_V384_RESTORE;
1788 case 64:
1789 return AMDGPU::SI_SPILL_V512_RESTORE;
1790 case 128:
1791 return AMDGPU::SI_SPILL_V1024_RESTORE;
1792 default:
1793 llvm_unreachable("unknown register size");
1794 }
1795}
1796
1797static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1798 switch (Size) {
1799 case 4:
1800 return AMDGPU::SI_SPILL_AV32_RESTORE;
1801 case 8:
1802 return AMDGPU::SI_SPILL_AV64_RESTORE;
1803 case 12:
1804 return AMDGPU::SI_SPILL_AV96_RESTORE;
1805 case 16:
1806 return AMDGPU::SI_SPILL_AV128_RESTORE;
1807 case 20:
1808 return AMDGPU::SI_SPILL_AV160_RESTORE;
1809 case 24:
1810 return AMDGPU::SI_SPILL_AV192_RESTORE;
1811 case 28:
1812 return AMDGPU::SI_SPILL_AV224_RESTORE;
1813 case 32:
1814 return AMDGPU::SI_SPILL_AV256_RESTORE;
1815 case 36:
1816 return AMDGPU::SI_SPILL_AV288_RESTORE;
1817 case 40:
1818 return AMDGPU::SI_SPILL_AV320_RESTORE;
1819 case 44:
1820 return AMDGPU::SI_SPILL_AV352_RESTORE;
1821 case 48:
1822 return AMDGPU::SI_SPILL_AV384_RESTORE;
1823 case 64:
1824 return AMDGPU::SI_SPILL_AV512_RESTORE;
1825 case 128:
1826 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1827 default:
1828 llvm_unreachable("unknown register size");
1829 }
1830}
1831
1832static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1833 bool IsVectorSuperClass) {
1834 // Currently, only 32-bit WWM register spills are needed.
1835 if (Size != 4)
1836 llvm_unreachable("unknown wwm register spill size");
1837
1838 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1839 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1840
1841 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1842}
1843
1844 unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1845 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1846 const SIMachineFunctionInfo &MFI) const {
1847 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1848
1849 // Choose the right opcode if restoring a WWM register.
1850 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1851 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1852
1853 // TODO: Check if AGPRs are available
1854 if (ST.hasMAIInsts())
1855 return getAVSpillRestoreOpcode(Size);
1856
1857 assert(!RI.isAGPRClass(RC));
1858 return getVGPRSpillRestoreOpcode(Size);
1859}
1860
1861 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1862 MachineBasicBlock::iterator MI,
1863 Register DestReg, int FrameIndex,
1864 const TargetRegisterClass *RC,
1865 Register VReg,
1866 MachineInstr::MIFlag Flags) const {
1867 MachineFunction *MF = MBB.getParent();
1868 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1869 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1870 const DebugLoc &DL = MBB.findDebugLoc(MI);
1871 unsigned SpillSize = RI.getSpillSize(*RC);
1872
1873 MachinePointerInfo PtrInfo
1874 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1875
1876 MachineMemOperand *MMO = MF->getMachineMemOperand(
1877 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1878 FrameInfo.getObjectAlign(FrameIndex));
1879
1880 if (RI.isSGPRClass(RC)) {
1881 MFI->setHasSpilledSGPRs();
1882 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1883 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1884 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1885
1886 // FIXME: Maybe this should not include a memoperand because it will be
1887 // lowered to non-memory instructions.
1888 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1889 if (DestReg.isVirtual() && SpillSize == 4) {
1890 MachineRegisterInfo &MRI = MF->getRegInfo();
1891 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1892 }
1893
1894 if (RI.spillSGPRToVGPR())
1895 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1896 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1897 .addFrameIndex(FrameIndex) // addr
1898 .addMemOperand(MMO)
1899 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1900
1901 return;
1902 }
1903
1904 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1905 SpillSize, *MFI);
1906 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1907 .addFrameIndex(FrameIndex) // vaddr
1908 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1909 .addImm(0) // offset
1910 .addMemOperand(MMO);
1911}
1912
1917
1918 void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1919 MachineBasicBlock::iterator MI,
1920 unsigned Quantity) const {
1921 DebugLoc DL = MBB.findDebugLoc(MI);
1922 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1923 while (Quantity > 0) {
1924 unsigned Arg = std::min(Quantity, MaxSNopCount);
1925 Quantity -= Arg;
1926 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1927 }
1928}
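// Worked example (assuming ST.getSNopBits() returns 3, i.e. at most 8 wait
// states per s_nop): insertNoops(MBB, MI, 10) emits "s_nop 7" then "s_nop 1".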
1929
1930 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1931 auto *MF = MBB.getParent();
1932 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1933
1934 assert(Info->isEntryFunction());
1935
1936 if (MBB.succ_empty()) {
1937 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1938 if (HasNoTerminator) {
1939 if (Info->returnsVoid()) {
1940 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1941 } else {
1942 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1943 }
1944 }
1945 }
1946}
1947
1948 MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1949 MachineBasicBlock &MBB,
1950 MachineInstr &MI,
1951 const DebugLoc &DL) const {
1952 MachineFunction *MF = MBB.getParent();
1953 constexpr unsigned DoorbellIDMask = 0x3ff;
1954 constexpr unsigned ECQueueWaveAbort = 0x400;
1955
1956 MachineBasicBlock *TrapBB = &MBB;
1957 MachineBasicBlock *ContBB = &MBB;
1958 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1959
1960 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1961 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1962 TrapBB = MF->CreateMachineBasicBlock();
1963 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1964 MF->push_back(TrapBB);
1965 MBB.addSuccessor(TrapBB);
1966 }
1967
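// The trap block built below performs the abort handshake: read this queue's
// doorbell ID with s_sendmsg_rtn, set the queue-wave-abort bit, send the
// message back to the runtime, then park the wave in an endless s_sethalt loop.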
1968 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1969 // this will be a nop.
1970 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1971 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1972 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1973 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1974 DoorbellReg)
1975 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
1976 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
1977 .addUse(AMDGPU::M0);
1978 Register DoorbellRegMasked =
1979 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1980 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
1981 .addUse(DoorbellReg)
1982 .addImm(DoorbellIDMask);
1983 Register SetWaveAbortBit =
1984 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1985 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
1986 .addUse(DoorbellRegMasked)
1987 .addImm(ECQueueWaveAbort);
1988 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1989 .addUse(SetWaveAbortBit);
1990 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
1991 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
1992 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1993 .addUse(AMDGPU::TTMP2);
1994 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
1995 TrapBB->addSuccessor(HaltLoopBB);
1996
1997 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
1998 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
1999 .addMBB(HaltLoopBB);
2000 MF->push_back(HaltLoopBB);
2001 HaltLoopBB->addSuccessor(HaltLoopBB);
2002
2003 return ContBB;
2004}
2005
2006 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2007 switch (MI.getOpcode()) {
2008 default:
2009 if (MI.isMetaInstruction())
2010 return 0;
2011 return 1; // FIXME: Do wait states equal cycles?
2012
2013 case AMDGPU::S_NOP:
2014 return MI.getOperand(0).getImm() + 1;
2015 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2016 // hazard, even if one exists, won't really be visible. Should we handle it?
2017 }
2018}
2019
2020 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2021 MachineBasicBlock &MBB = *MI.getParent();
2022 DebugLoc DL = MBB.findDebugLoc(MI);
2024 switch (MI.getOpcode()) {
2025 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2026 case AMDGPU::S_MOV_B64_term:
2027 // This is only a terminator to get the correct spill code placement during
2028 // register allocation.
2029 MI.setDesc(get(AMDGPU::S_MOV_B64));
2030 break;
2031
2032 case AMDGPU::S_MOV_B32_term:
2033 // This is only a terminator to get the correct spill code placement during
2034 // register allocation.
2035 MI.setDesc(get(AMDGPU::S_MOV_B32));
2036 break;
2037
2038 case AMDGPU::S_XOR_B64_term:
2039 // This is only a terminator to get the correct spill code placement during
2040 // register allocation.
2041 MI.setDesc(get(AMDGPU::S_XOR_B64));
2042 break;
2043
2044 case AMDGPU::S_XOR_B32_term:
2045 // This is only a terminator to get the correct spill code placement during
2046 // register allocation.
2047 MI.setDesc(get(AMDGPU::S_XOR_B32));
2048 break;
2049 case AMDGPU::S_OR_B64_term:
2050 // This is only a terminator to get the correct spill code placement during
2051 // register allocation.
2052 MI.setDesc(get(AMDGPU::S_OR_B64));
2053 break;
2054 case AMDGPU::S_OR_B32_term:
2055 // This is only a terminator to get the correct spill code placement during
2056 // register allocation.
2057 MI.setDesc(get(AMDGPU::S_OR_B32));
2058 break;
2059
2060 case AMDGPU::S_ANDN2_B64_term:
2061 // This is only a terminator to get the correct spill code placement during
2062 // register allocation.
2063 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2064 break;
2065
2066 case AMDGPU::S_ANDN2_B32_term:
2067 // This is only a terminator to get the correct spill code placement during
2068 // register allocation.
2069 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2070 break;
2071
2072 case AMDGPU::S_AND_B64_term:
2073 // This is only a terminator to get the correct spill code placement during
2074 // register allocation.
2075 MI.setDesc(get(AMDGPU::S_AND_B64));
2076 break;
2077
2078 case AMDGPU::S_AND_B32_term:
2079 // This is only a terminator to get the correct spill code placement during
2080 // register allocation.
2081 MI.setDesc(get(AMDGPU::S_AND_B32));
2082 break;
2083
2084 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2085 // This is only a terminator to get the correct spill code placement during
2086 // register allocation.
2087 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2088 break;
2089
2090 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2091 // This is only a terminator to get the correct spill code placement during
2092 // register allocation.
2093 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2094 break;
2095
2096 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2097 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2098 break;
2099
2100 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2101 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2102 break;
2103 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2104 Register Dst = MI.getOperand(0).getReg();
2105 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2106 MI.setDesc(
2107 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2108 break;
2109 }
2110 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2111 Register Dst = MI.getOperand(0).getReg();
2112 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2113 int64_t Imm = MI.getOperand(1).getImm();
2114
2115 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2116 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2117 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2118 .addImm(SignExtend64<32>(Imm))
2119 .addReg(Dst, RegState::Implicit | RegState::Define);
2120 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2121 .addImm(SignExtend64<32>(Imm >> 32))
2122 .addReg(Dst, RegState::Implicit | RegState::Define);
2123 MI.eraseFromParent();
2124 break;
2125 }
2126
2127 [[fallthrough]];
2128 }
2129 case AMDGPU::V_MOV_B64_PSEUDO: {
2130 Register Dst = MI.getOperand(0).getReg();
2131 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2132 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2133
2134 const MachineOperand &SrcOp = MI.getOperand(1);
2135 // FIXME: Will this work for 64-bit floating point immediates?
2136 assert(!SrcOp.isFPImm());
2137 if (ST.hasMovB64()) {
2138 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2139 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2140 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2141 break;
2142 }
2143 if (SrcOp.isImm()) {
2144 APInt Imm(64, SrcOp.getImm());
2145 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2146 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2147 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2148 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2150 .addImm(Lo.getSExtValue())
2152 .addImm(Lo.getSExtValue())
2153 .addImm(0) // op_sel_lo
2154 .addImm(0) // op_sel_hi
2155 .addImm(0) // neg_lo
2156 .addImm(0) // neg_hi
2157 .addImm(0); // clamp
2158 } else {
2159 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2160 .addImm(Lo.getSExtValue())
2162 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2163 .addImm(Hi.getSExtValue())
2165 }
2166 } else {
2167 assert(SrcOp.isReg());
2168 if (ST.hasPkMovB32() &&
2169 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2170 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2171 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2172 .addReg(SrcOp.getReg())
2174 .addReg(SrcOp.getReg())
2175 .addImm(0) // op_sel_lo
2176 .addImm(0) // op_sel_hi
2177 .addImm(0) // neg_lo
2178 .addImm(0) // neg_hi
2179 .addImm(0); // clamp
2180 } else {
2181 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2182 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2184 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2185 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2187 }
2188 }
2189 MI.eraseFromParent();
2190 break;
2191 }
2192 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2193 expandMovDPP64(MI);
2194 break;
2195 }
2196 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2197 const MachineOperand &SrcOp = MI.getOperand(1);
2198 assert(!SrcOp.isFPImm());
2199
2200 if (ST.has64BitLiterals()) {
2201 MI.setDesc(get(AMDGPU::S_MOV_B64));
2202 break;
2203 }
2204
2205 APInt Imm(64, SrcOp.getImm());
2206 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2207 MI.setDesc(get(AMDGPU::S_MOV_B64));
2208 break;
2209 }
2210
2211 Register Dst = MI.getOperand(0).getReg();
2212 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2213 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2214
2215 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2216 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2217 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2218 .addImm(Lo.getSExtValue())
2220 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2221 .addImm(Hi.getSExtValue())
2223 MI.eraseFromParent();
2224 break;
2225 }
2226 case AMDGPU::V_SET_INACTIVE_B32: {
2227 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2228 Register DstReg = MI.getOperand(0).getReg();
2229 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2230 .add(MI.getOperand(3))
2231 .add(MI.getOperand(4))
2232 .add(MI.getOperand(1))
2233 .add(MI.getOperand(2))
2234 .add(MI.getOperand(5));
2235 MI.eraseFromParent();
2236 break;
2237 }
2238 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2239 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2240 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2241 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2242 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2243 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2244 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2245 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2246 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2247 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2248 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2249 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2250 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2251 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2252 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2253 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2254 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2255 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2256 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2257 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2258 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2259 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2260 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2261 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2262 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2263 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2264 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2265 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2266 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2267 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2268
2269 unsigned Opc;
2270 if (RI.hasVGPRs(EltRC)) {
2271 Opc = AMDGPU::V_MOVRELD_B32_e32;
2272 } else {
2273 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2274 : AMDGPU::S_MOVRELD_B32;
2275 }
2276
2277 const MCInstrDesc &OpDesc = get(Opc);
2278 Register VecReg = MI.getOperand(0).getReg();
2279 bool IsUndef = MI.getOperand(1).isUndef();
2280 unsigned SubReg = MI.getOperand(3).getImm();
2281 assert(VecReg == MI.getOperand(1).getReg());
2282
2284 BuildMI(MBB, MI, DL, OpDesc)
2285 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2286 .add(MI.getOperand(2))
2288 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2289
2290 const int ImpDefIdx =
2291 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2292 const int ImpUseIdx = ImpDefIdx + 1;
2293 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2294 MI.eraseFromParent();
2295 break;
2296 }
2297 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2298 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2305 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2306 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2307 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2308 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2309 assert(ST.useVGPRIndexMode());
2310 Register VecReg = MI.getOperand(0).getReg();
2311 bool IsUndef = MI.getOperand(1).isUndef();
2312 MachineOperand &Idx = MI.getOperand(3);
2313 Register SubReg = MI.getOperand(4).getImm();
2314
2315 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2316 .add(Idx)
2318 SetOn->getOperand(3).setIsUndef();
2319
2320 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2322 BuildMI(MBB, MI, DL, OpDesc)
2323 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2324 .add(MI.getOperand(2))
2326 .addReg(VecReg,
2327 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2328
2329 const int ImpDefIdx =
2330 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2331 const int ImpUseIdx = ImpDefIdx + 1;
2332 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2333
2334 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2335
2336 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2337
2338 MI.eraseFromParent();
2339 break;
2340 }
2341 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2342 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2343 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2344 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2345 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2346 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2347 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2348 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2349 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2350 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2351 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2352 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2353 assert(ST.useVGPRIndexMode());
2354 Register Dst = MI.getOperand(0).getReg();
2355 Register VecReg = MI.getOperand(1).getReg();
2356 bool IsUndef = MI.getOperand(1).isUndef();
2357 Register Idx = MI.getOperand(2).getReg();
2358 Register SubReg = MI.getOperand(3).getImm();
2359
2360 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2361 .addReg(Idx)
2363 SetOn->getOperand(3).setIsUndef();
2364
2365 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2366 .addDef(Dst)
2367 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2368 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2369
2370 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2371
2372 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2373
2374 MI.eraseFromParent();
2375 break;
2376 }
2377 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2378 MachineFunction &MF = *MBB.getParent();
2379 Register Reg = MI.getOperand(0).getReg();
2380 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2381 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2382 MachineOperand OpLo = MI.getOperand(1);
2383 MachineOperand OpHi = MI.getOperand(2);
2384
2385 // Create a bundle so these instructions won't be re-ordered by the
2386 // post-RA scheduler.
2387 MIBundleBuilder Bundler(MBB, MI);
2388 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2389
2390 // What we want here is an offset from the value returned by s_getpc (which
2391 // is the address of the s_add_u32 instruction) to the global variable, but
2392 // since the encoding of $symbol starts 4 bytes after the start of the
2393 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2394 // small. This requires us to add 4 to the global variable offset in order
2395 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2396 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2397 // instruction.
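  // As a sketch (registers and the symbol name are illustrative only), the
  // resulting bundle looks like:
  //   s_getpc_b64 s[0:1]                 ; address of the following instruction
  //   s_add_u32   s0, s0, sym@rel32@lo+4
  //   s_addc_u32  s1, s1, sym@rel32@hi+12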
2398
2399 int64_t Adjust = 0;
2400 if (ST.hasGetPCZeroExtension()) {
2401 // Fix up hardware that does not sign-extend the 48-bit PC value by
2402 // inserting: s_sext_i32_i16 reghi, reghi
2403 Bundler.append(
2404 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2405 Adjust += 4;
2406 }
2407
2408 if (OpLo.isGlobal())
2409 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2410 Bundler.append(
2411 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2412
2413 if (OpHi.isGlobal())
2414 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2415 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2416 .addReg(RegHi)
2417 .add(OpHi));
2418
2419 finalizeBundle(MBB, Bundler.begin());
2420
2421 MI.eraseFromParent();
2422 break;
2423 }
2424 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2425 MachineFunction &MF = *MBB.getParent();
2426 Register Reg = MI.getOperand(0).getReg();
2427 MachineOperand Op = MI.getOperand(1);
2428
2429 // Create a bundle so these instructions won't be re-ordered by the
2430 // post-RA scheduler.
2431 MIBundleBuilder Bundler(MBB, MI);
2432 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2433 if (Op.isGlobal())
2434 Op.setOffset(Op.getOffset() + 4);
2435 Bundler.append(
2436 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2437
2438 finalizeBundle(MBB, Bundler.begin());
2439
2440 MI.eraseFromParent();
2441 break;
2442 }
2443 case AMDGPU::ENTER_STRICT_WWM: {
2444 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2445 // Whole Wave Mode is entered.
2446 MI.setDesc(get(LMC.OrSaveExecOpc));
2447 break;
2448 }
2449 case AMDGPU::ENTER_STRICT_WQM: {
2450 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2451 // STRICT_WQM is entered.
2452 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2453 .addReg(LMC.ExecReg);
2454 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2455
2456 MI.eraseFromParent();
2457 break;
2458 }
2459 case AMDGPU::EXIT_STRICT_WWM:
2460 case AMDGPU::EXIT_STRICT_WQM: {
2461 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2462 // WWM/STRICT_WQM is exited.
2463 MI.setDesc(get(LMC.MovOpc));
2464 break;
2465 }
2466 case AMDGPU::SI_RETURN: {
2467 const MachineFunction *MF = MBB.getParent();
2468 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2469 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2470 // Hiding the return address use with SI_RETURN may lead to extra kills in
2471 // the function and missing live-ins. We are fine in practice because callee
2472 // saved register handling ensures the register value is restored before
2473 // RET, but we need the undef flag here to appease the MachineVerifier
2474 // liveness checks.
2475 MachineInstrBuilder MIB =
2476 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2477 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2478
2479 MIB.copyImplicitOps(MI);
2480 MI.eraseFromParent();
2481 break;
2482 }
2483
2484 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2485 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2486 MI.setDesc(get(AMDGPU::S_MUL_U64));
2487 break;
2488
2489 case AMDGPU::S_GETPC_B64_pseudo:
2490 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2491 if (ST.hasGetPCZeroExtension()) {
2492 Register Dst = MI.getOperand(0).getReg();
2493 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2494 // Fix up hardware that does not sign-extend the 48-bit PC value by
2495 // inserting: s_sext_i32_i16 dsthi, dsthi
2496 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2497 DstHi)
2498 .addReg(DstHi);
2499 }
2500 break;
2501
2502 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2503 assert(ST.hasBF16PackedInsts());
2504 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2505 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2506 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2507 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2508 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2509 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2510 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2511 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2512 break;
2513 }
2514
2515 return true;
2516}
2517
2520 unsigned SubIdx,
2521 const MachineInstr &Orig) const {
2522
2523 // Try shrinking the instruction to remat only the part needed for current
2524 // context.
2525 // TODO: Handle more cases.
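  // For example, if the single use reads only a 128-bit subregister of an
  // S_LOAD_DWORDX16 result, the clone below becomes an S_LOAD_DWORDX4 whose
  // byte offset is advanced by the subregister's bit offset divided by 8.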
2526 unsigned Opcode = Orig.getOpcode();
2527 switch (Opcode) {
2528 case AMDGPU::S_LOAD_DWORDX16_IMM:
2529 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2530 if (SubIdx != 0)
2531 break;
2532
2533 if (I == MBB.end())
2534 break;
2535
2536 if (I->isBundled())
2537 break;
2538
2539 // Look for a single use of the register that is also a subreg.
2540 Register RegToFind = Orig.getOperand(0).getReg();
2541 MachineOperand *UseMO = nullptr;
2542 for (auto &CandMO : I->operands()) {
2543 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2544 continue;
2545 if (UseMO) {
2546 UseMO = nullptr;
2547 break;
2548 }
2549 UseMO = &CandMO;
2550 }
2551 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2552 break;
2553
2554 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2555 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2556
2557 MachineFunction *MF = MBB.getParent();
2558 MachineRegisterInfo &MRI = MF->getRegInfo();
2559 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2560
2561 unsigned NewOpcode = -1;
2562 if (SubregSize == 256)
2563 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2564 else if (SubregSize == 128)
2565 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2566 else
2567 break;
2568
2569 const MCInstrDesc &TID = get(NewOpcode);
2570 const TargetRegisterClass *NewRC =
2571 RI.getAllocatableClass(getRegClass(TID, 0));
2572 MRI.setRegClass(DestReg, NewRC);
2573
2574 UseMO->setReg(DestReg);
2575 UseMO->setSubReg(AMDGPU::NoSubRegister);
2576
2577 // Use a smaller load with the desired size, possibly with updated offset.
2578 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2579 MI->setDesc(TID);
2580 MI->getOperand(0).setReg(DestReg);
2581 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2582 if (Offset) {
2583 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2584 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2585 OffsetMO->setImm(FinalOffset);
2586 }
2587 SmallVector<MachineMemOperand *> NewMMOs;
2588 for (const MachineMemOperand *MemOp : Orig.memoperands())
2589 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2590 SubregSize / 8));
2591 MI->setMemRefs(*MF, NewMMOs);
2592
2593 MBB.insert(I, MI);
2594 return;
2595 }
2596
2597 default:
2598 break;
2599 }
2600
2601 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
2602}
2603
2604std::pair<MachineInstr*, MachineInstr*>
2606 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2607
2608 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2610 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2611 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2612 return std::pair(&MI, nullptr);
2613 }
2614
2615 MachineBasicBlock &MBB = *MI.getParent();
2616 DebugLoc DL = MBB.findDebugLoc(MI);
2617 MachineFunction *MF = MBB.getParent();
2618 MachineRegisterInfo &MRI = MF->getRegInfo();
2619 Register Dst = MI.getOperand(0).getReg();
2620 unsigned Part = 0;
2621 MachineInstr *Split[2];
2622
2623 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2624 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2625 if (Dst.isPhysical()) {
2626 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2627 } else {
2628 assert(MRI.isSSA());
2629 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2630 MovDPP.addDef(Tmp);
2631 }
2632
2633 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2634 const MachineOperand &SrcOp = MI.getOperand(I);
2635 assert(!SrcOp.isFPImm());
2636 if (SrcOp.isImm()) {
2637 APInt Imm(64, SrcOp.getImm());
2638 Imm.ashrInPlace(Part * 32);
2639 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2640 } else {
2641 assert(SrcOp.isReg());
2642 Register Src = SrcOp.getReg();
2643 if (Src.isPhysical())
2644 MovDPP.addReg(RI.getSubReg(Src, Sub));
2645 else
2646 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2647 }
2648 }
2649
2650 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2651 MovDPP.addImm(MO.getImm());
2652
2653 Split[Part] = MovDPP;
2654 ++Part;
2655 }
2656
2657 if (Dst.isVirtual())
2658 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2659 .addReg(Split[0]->getOperand(0).getReg())
2660 .addImm(AMDGPU::sub0)
2661 .addReg(Split[1]->getOperand(0).getReg())
2662 .addImm(AMDGPU::sub1);
2663
2664 MI.eraseFromParent();
2665 return std::pair(Split[0], Split[1]);
2666}
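// In short: when a single 64-bit DPP mov cannot be used, the pseudo is split
// into two V_MOV_B32_dpp halves, recombined with a REG_SEQUENCE for virtual
// destinations or written directly to the physical sub0/sub1 registers.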
2667
2668std::optional<DestSourcePair>
2670 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2671 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2672
2673 return std::nullopt;
2674}
2675
2677 AMDGPU::OpName Src0OpName,
2678 MachineOperand &Src1,
2679 AMDGPU::OpName Src1OpName) const {
2680 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2681 if (!Src0Mods)
2682 return false;
2683
2684 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2685 assert(Src1Mods &&
2686 "All commutable instructions have both src0 and src1 modifiers");
2687
2688 int Src0ModsVal = Src0Mods->getImm();
2689 int Src1ModsVal = Src1Mods->getImm();
2690
2691 Src1Mods->setImm(Src0ModsVal);
2692 Src0Mods->setImm(Src1ModsVal);
2693 return true;
2694}
2695
2697 MachineOperand &RegOp,
2698 MachineOperand &NonRegOp) {
2699 Register Reg = RegOp.getReg();
2700 unsigned SubReg = RegOp.getSubReg();
2701 bool IsKill = RegOp.isKill();
2702 bool IsDead = RegOp.isDead();
2703 bool IsUndef = RegOp.isUndef();
2704 bool IsDebug = RegOp.isDebug();
2705
2706 if (NonRegOp.isImm())
2707 RegOp.ChangeToImmediate(NonRegOp.getImm());
2708 else if (NonRegOp.isFI())
2709 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2710 else if (NonRegOp.isGlobal()) {
2711 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2712 NonRegOp.getTargetFlags());
2713 } else
2714 return nullptr;
2715
2716 // Make sure we don't reinterpret a subreg index in the target flags.
2717 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2718
2719 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2720 NonRegOp.setSubReg(SubReg);
2721
2722 return &MI;
2723}
2724
2726 MachineOperand &NonRegOp1,
2727 MachineOperand &NonRegOp2) {
2728 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2729 int64_t NonRegVal = NonRegOp1.getImm();
2730
2731 NonRegOp1.setImm(NonRegOp2.getImm());
2732 NonRegOp2.setImm(NonRegVal);
2733 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2734 NonRegOp2.setTargetFlags(TargetFlags);
2735 return &MI;
2736}
2737
2738bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2739 unsigned OpIdx1) const {
2740 const MCInstrDesc &InstDesc = MI.getDesc();
2741 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2742 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2743
2744 unsigned Opc = MI.getOpcode();
2745 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2746
2747 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2748 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2749
2750 // The swap must not breach constant bus or literal limits.
2751 // It may move a literal to a position other than src0, which is not allowed
2752 // pre-gfx10. However, most test cases need literals in Src0 for VOP.
2753 // FIXME: After gfx9, a literal can be placed somewhere other than Src0.
2754 if (isVALU(MI)) {
2755 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2756 !isInlineConstant(MO0, OpInfo1))
2757 return false;
2758 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2759 !isInlineConstant(MO1, OpInfo0))
2760 return false;
2761 }
2762
2763 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2764 if (OpInfo1.RegClass == -1)
2765 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2766 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2767 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2768 }
2769 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2770 if (OpInfo0.RegClass == -1)
2771 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2772 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2773 isLegalRegOperand(MI, OpIdx0, MO1);
2774 }
2775
2776 // No need to check 64-bit literals since swapping does not bring new
2777 // 64-bit literals into the current instruction to fold to 32 bits.
2778
2779 return isImmOperandLegal(MI, OpIdx1, MO0);
2780}
2781
2783 unsigned Src0Idx,
2784 unsigned Src1Idx) const {
2785 assert(!NewMI && "this should never be used");
2786
2787 unsigned Opc = MI.getOpcode();
2788 int CommutedOpcode = commuteOpcode(Opc);
2789 if (CommutedOpcode == -1)
2790 return nullptr;
2791
2792 if (Src0Idx > Src1Idx)
2793 std::swap(Src0Idx, Src1Idx);
2794
2795 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2796 static_cast<int>(Src0Idx) &&
2797 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2798 static_cast<int>(Src1Idx) &&
2799 "inconsistency with findCommutedOpIndices");
2800
2801 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2802 return nullptr;
2803
2804 MachineInstr *CommutedMI = nullptr;
2805 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2806 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2807 if (Src0.isReg() && Src1.isReg()) {
2808 // Be sure to copy the source modifiers to the right place.
2809 CommutedMI =
2810 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2811 } else if (Src0.isReg() && !Src1.isReg()) {
2812 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2813 } else if (!Src0.isReg() && Src1.isReg()) {
2814 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2815 } else if (Src0.isImm() && Src1.isImm()) {
2816 CommutedMI = swapImmOperands(MI, Src0, Src1);
2817 } else {
2818 // FIXME: Found two non registers to commute. This does happen.
2819 return nullptr;
2820 }
2821
2822 if (CommutedMI) {
2823 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2824 Src1, AMDGPU::OpName::src1_modifiers);
2825
2826 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2827 AMDGPU::OpName::src1_sel);
2828
2829 CommutedMI->setDesc(get(CommutedOpcode));
2830 }
2831
2832 return CommutedMI;
2833}
2834
2835// This needs to be implemented because the source modifiers may be inserted
2836// between the true commutable operands, and the base
2837// TargetInstrInfo::commuteInstruction uses it.
2839 unsigned &SrcOpIdx0,
2840 unsigned &SrcOpIdx1) const {
2841 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2842}
2843
2845 unsigned &SrcOpIdx0,
2846 unsigned &SrcOpIdx1) const {
2847 if (!Desc.isCommutable())
2848 return false;
2849
2850 unsigned Opc = Desc.getOpcode();
2851 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2852 if (Src0Idx == -1)
2853 return false;
2854
2855 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2856 if (Src1Idx == -1)
2857 return false;
2858
2859 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2860}
2861
2863 int64_t BrOffset) const {
2864 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2865 // because its dest block is unanalyzable.
2866 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2867
2868 // Convert to dwords.
2869 BrOffset /= 4;
2870
2871 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2872 // from the next instruction.
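 // With the default of 16 branch-offset bits this permits roughly +/-2^15
 // dwords, i.e. about +/-128 KiB, around the instruction after the branch.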
2873 BrOffset -= 1;
2874
2875 return isIntN(BranchOffsetBits, BrOffset);
2876}
2877
2880 return MI.getOperand(0).getMBB();
2881}
2882
2884 for (const MachineInstr &MI : MBB->terminators()) {
2885 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2886 MI.getOpcode() == AMDGPU::SI_LOOP)
2887 return true;
2888 }
2889 return false;
2890}
2891
2893 MachineBasicBlock &DestBB,
2894 MachineBasicBlock &RestoreBB,
2895 const DebugLoc &DL, int64_t BrOffset,
2896 RegScavenger *RS) const {
2897 assert(MBB.empty() &&
2898 "new block should be inserted for expanding unconditional branch");
2899 assert(MBB.pred_size() == 1);
2900 assert(RestoreBB.empty() &&
2901 "restore block should be inserted for restoring clobbered registers");
2902
2903 MachineFunction *MF = MBB.getParent();
2904 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2905 MachineRegisterInfo &MRI = MF->getRegInfo();
2906 auto I = MBB.end();
2907 auto &MCCtx = MF->getContext();
2908
2909 if (ST.hasAddPC64Inst()) {
2910 MCSymbol *Offset =
2911 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2912 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2914 MCSymbol *PostAddPCLabel =
2915 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2916 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2917 auto *OffsetExpr = MCBinaryExpr::createSub(
2918 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2919 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2920 Offset->setVariableValue(OffsetExpr);
2921 return;
2922 }
2923
2924 assert(RS && "RegScavenger required for long branching");
2925
2926 // FIXME: Virtual register workaround for RegScavenger not working with empty
2927 // blocks.
2928 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2929
2930 // Note: as this is used after hazard recognizer we need to apply some hazard
2931 // workarounds directly.
2932 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2933 ST.hasVALUReadSGPRHazard();
2934 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2935 if (FlushSGPRWrites)
2936 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2938 };
2939
2940 // We need to compute the offset relative to the instruction immediately after
2941 // s_getpc_b64. Insert the PC arithmetic code before the last terminator.
2942 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2943 ApplyHazardWorkarounds();
2944
2945 MCSymbol *PostGetPCLabel =
2946 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2947 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2948
2949 MCSymbol *OffsetLo =
2950 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2951 MCSymbol *OffsetHi =
2952 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2953 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2954 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2955 .addReg(PCReg, 0, AMDGPU::sub0)
2956 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2957 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2958 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2959 .addReg(PCReg, 0, AMDGPU::sub1)
2960 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2961 ApplyHazardWorkarounds();
2962
2963 // Insert the indirect branch after the other terminator.
2964 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2965 .addReg(PCReg);
2966
2967 // If a spill is needed for the pc register pair, we need to insert a spill
2968 // restore block right before the destination block, and insert a short branch
2969 // into the old destination block's fallthrough predecessor.
2970 // e.g.:
2971 //
2972 // s_cbranch_scc0 skip_long_branch:
2973 //
2974 // long_branch_bb:
2975 // spill s[8:9]
2976 // s_getpc_b64 s[8:9]
2977 // s_add_u32 s8, s8, restore_bb
2978 // s_addc_u32 s9, s9, 0
2979 // s_setpc_b64 s[8:9]
2980 //
2981 // skip_long_branch:
2982 // foo;
2983 //
2984 // .....
2985 //
2986 // dest_bb_fallthrough_predecessor:
2987 // bar;
2988 // s_branch dest_bb
2989 //
2990 // restore_bb:
2991 // restore s[8:9]
2992 // fallthrough dest_bb
2993 //
2994 // dest_bb:
2995 // buzz;
2996
2997 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2998 Register Scav;
2999
3000 // If we've previously reserved a register for long branches, avoid running
3001 // the scavenger and just use that register.
3002 if (LongBranchReservedReg) {
3003 RS->enterBasicBlock(MBB);
3004 Scav = LongBranchReservedReg;
3005 } else {
3006 RS->enterBasicBlockEnd(MBB);
3007 Scav = RS->scavengeRegisterBackwards(
3008 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3009 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3010 }
3011 if (Scav) {
3012 RS->setRegUsed(Scav);
3013 MRI.replaceRegWith(PCReg, Scav);
3014 MRI.clearVirtRegs();
3015 } else {
3016 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3017 // SGPR spill.
3018 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3019 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3020 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3021 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3022 MRI.clearVirtRegs();
3023 }
3024
3025 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3026 // Now the distance can be defined.
3027 auto *Offset = MCBinaryExpr::createSub(
3028 MCSymbolRefExpr::create(DestLabel, MCCtx),
3029 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3030 // Add offset assignments.
3031 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3032 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3033 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3034 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3035}
3036
3037unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3038 switch (Cond) {
3039 case SIInstrInfo::SCC_TRUE:
3040 return AMDGPU::S_CBRANCH_SCC1;
3041 case SIInstrInfo::SCC_FALSE:
3042 return AMDGPU::S_CBRANCH_SCC0;
3043 case SIInstrInfo::VCCNZ:
3044 return AMDGPU::S_CBRANCH_VCCNZ;
3045 case SIInstrInfo::VCCZ:
3046 return AMDGPU::S_CBRANCH_VCCZ;
3047 case SIInstrInfo::EXECNZ:
3048 return AMDGPU::S_CBRANCH_EXECNZ;
3049 case SIInstrInfo::EXECZ:
3050 return AMDGPU::S_CBRANCH_EXECZ;
3051 default:
3052 llvm_unreachable("invalid branch predicate");
3053 }
3054}
3055
3056SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3057 switch (Opcode) {
3058 case AMDGPU::S_CBRANCH_SCC0:
3059 return SCC_FALSE;
3060 case AMDGPU::S_CBRANCH_SCC1:
3061 return SCC_TRUE;
3062 case AMDGPU::S_CBRANCH_VCCNZ:
3063 return VCCNZ;
3064 case AMDGPU::S_CBRANCH_VCCZ:
3065 return VCCZ;
3066 case AMDGPU::S_CBRANCH_EXECNZ:
3067 return EXECNZ;
3068 case AMDGPU::S_CBRANCH_EXECZ:
3069 return EXECZ;
3070 default:
3071 return INVALID_BR;
3072 }
3073}
3074
3078 MachineBasicBlock *&FBB,
3080 bool AllowModify) const {
3081 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3082 // Unconditional Branch
3083 TBB = I->getOperand(0).getMBB();
3084 return false;
3085 }
3086
3087 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3088 if (Pred == INVALID_BR)
3089 return true;
3090
3091 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3092 Cond.push_back(MachineOperand::CreateImm(Pred));
3093 Cond.push_back(I->getOperand(1)); // Save the branch register.
3094
3095 ++I;
3096
3097 if (I == MBB.end()) {
3098 // Conditional branch followed by fall-through.
3099 TBB = CondBB;
3100 return false;
3101 }
3102
3103 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3104 TBB = CondBB;
3105 FBB = I->getOperand(0).getMBB();
3106 return false;
3107 }
3108
3109 return true;
3110}
3111
3113 MachineBasicBlock *&FBB,
3115 bool AllowModify) const {
3116 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3117 auto E = MBB.end();
3118 if (I == E)
3119 return false;
3120
3121 // Skip over the instructions that are artificially terminators for special
3122 // exec management.
3123 while (I != E && !I->isBranch() && !I->isReturn()) {
3124 switch (I->getOpcode()) {
3125 case AMDGPU::S_MOV_B64_term:
3126 case AMDGPU::S_XOR_B64_term:
3127 case AMDGPU::S_OR_B64_term:
3128 case AMDGPU::S_ANDN2_B64_term:
3129 case AMDGPU::S_AND_B64_term:
3130 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3131 case AMDGPU::S_MOV_B32_term:
3132 case AMDGPU::S_XOR_B32_term:
3133 case AMDGPU::S_OR_B32_term:
3134 case AMDGPU::S_ANDN2_B32_term:
3135 case AMDGPU::S_AND_B32_term:
3136 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3137 break;
3138 case AMDGPU::SI_IF:
3139 case AMDGPU::SI_ELSE:
3140 case AMDGPU::SI_KILL_I1_TERMINATOR:
3141 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3142 // FIXME: It's messy that these need to be considered here at all.
3143 return true;
3144 default:
3145 llvm_unreachable("unexpected non-branch terminator inst");
3146 }
3147
3148 ++I;
3149 }
3150
3151 if (I == E)
3152 return false;
3153
3154 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3155}
3156
3158 int *BytesRemoved) const {
3159 unsigned Count = 0;
3160 unsigned RemovedSize = 0;
3161 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3162 // Skip over artificial terminators when removing instructions.
3163 if (MI.isBranch() || MI.isReturn()) {
3164 RemovedSize += getInstSizeInBytes(MI);
3165 MI.eraseFromParent();
3166 ++Count;
3167 }
3168 }
3169
3170 if (BytesRemoved)
3171 *BytesRemoved = RemovedSize;
3172
3173 return Count;
3174}
3175
3176// Copy the flags onto the implicit condition register operand.
3178 const MachineOperand &OrigCond) {
3179 CondReg.setIsUndef(OrigCond.isUndef());
3180 CondReg.setIsKill(OrigCond.isKill());
3181}
3182
3185 MachineBasicBlock *FBB,
3187 const DebugLoc &DL,
3188 int *BytesAdded) const {
3189 if (!FBB && Cond.empty()) {
3190 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3191 .addMBB(TBB);
3192 if (BytesAdded)
3193 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3194 return 1;
3195 }
3196
3197 assert(TBB && Cond[0].isImm());
3198
3199 unsigned Opcode
3200 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3201
3202 if (!FBB) {
3203 MachineInstr *CondBr =
3204 BuildMI(&MBB, DL, get(Opcode))
3205 .addMBB(TBB);
3206
3207 // Copy the flags onto the implicit condition register operand.
3208 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3209 fixImplicitOperands(*CondBr);
3210
3211 if (BytesAdded)
3212 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3213 return 1;
3214 }
3215
3216 assert(TBB && FBB);
3217
3218 MachineInstr *CondBr =
3219 BuildMI(&MBB, DL, get(Opcode))
3220 .addMBB(TBB);
3221 fixImplicitOperands(*CondBr);
3222 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3223 .addMBB(FBB);
3224
3225 MachineOperand &CondReg = CondBr->getOperand(1);
3226 CondReg.setIsUndef(Cond[1].isUndef());
3227 CondReg.setIsKill(Cond[1].isKill());
3228
3229 if (BytesAdded)
3230 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3231
3232 return 2;
3233}
3234
3237 if (Cond.size() != 2) {
3238 return true;
3239 }
3240
3241 if (Cond[0].isImm()) {
3242 Cond[0].setImm(-Cond[0].getImm());
3243 return false;
3244 }
3245
3246 return true;
3247}
3248
3251 Register DstReg, Register TrueReg,
3252 Register FalseReg, int &CondCycles,
3253 int &TrueCycles, int &FalseCycles) const {
3254 switch (Cond[0].getImm()) {
3255 case VCCNZ:
3256 case VCCZ: {
3257 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3258 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3259 if (MRI.getRegClass(FalseReg) != RC)
3260 return false;
3261
3262 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3263 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3264
3265 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3266 return RI.hasVGPRs(RC) && NumInsts <= 6;
3267 }
3268 case SCC_TRUE:
3269 case SCC_FALSE: {
3270 // FIXME: We could insert for VGPRs if we could replace the original compare
3271 // with a vector one.
3272 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3273 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3274 if (MRI.getRegClass(FalseReg) != RC)
3275 return false;
3276
3277 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3278
3279 // Multiples of 8 can do s_cselect_b64
3280 if (NumInsts % 2 == 0)
3281 NumInsts /= 2;
3282
3283 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3284 return RI.isSGPRClass(RC);
3285 }
3286 default:
3287 return false;
3288 }
3289}
3290
3294 Register TrueReg, Register FalseReg) const {
3295 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3296 if (Pred == VCCZ || Pred == SCC_FALSE) {
3297 Pred = static_cast<BranchPredicate>(-Pred);
3298 std::swap(TrueReg, FalseReg);
3299 }
3300
3301 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3302 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3303 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3304
3305 if (DstSize == 32) {
3307 if (Pred == SCC_TRUE) {
3308 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3309 .addReg(TrueReg)
3310 .addReg(FalseReg);
3311 } else {
3312 // Instruction's operands are backwards from what is expected.
3313 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3314 .addReg(FalseReg)
3315 .addReg(TrueReg);
3316 }
3317
3318 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3319 return;
3320 }
3321
3322 if (DstSize == 64 && Pred == SCC_TRUE) {
3324 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3325 .addReg(TrueReg)
3326 .addReg(FalseReg);
3327
3328 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3329 return;
3330 }
3331
3332 static const int16_t Sub0_15[] = {
3333 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3334 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3335 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3336 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3337 };
3338
3339 static const int16_t Sub0_15_64[] = {
3340 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3341 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3342 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3343 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3344 };
3345
3346 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3347 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3348 const int16_t *SubIndices = Sub0_15;
3349 int NElts = DstSize / 32;
3350
3351 // 64-bit select is only available for SALU.
3352 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3353 if (Pred == SCC_TRUE) {
3354 if (NElts % 2) {
3355 SelOp = AMDGPU::S_CSELECT_B32;
3356 EltRC = &AMDGPU::SGPR_32RegClass;
3357 } else {
3358 SelOp = AMDGPU::S_CSELECT_B64;
3359 EltRC = &AMDGPU::SGPR_64RegClass;
3360 SubIndices = Sub0_15_64;
3361 NElts /= 2;
3362 }
3363 }
3364
3366 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3367
3368 I = MIB->getIterator();
3369
3371 for (int Idx = 0; Idx != NElts; ++Idx) {
3372 Register DstElt = MRI.createVirtualRegister(EltRC);
3373 Regs.push_back(DstElt);
3374
3375 unsigned SubIdx = SubIndices[Idx];
3376
3378 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3379 Select =
3380 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3381 .addReg(FalseReg, 0, SubIdx)
3382 .addReg(TrueReg, 0, SubIdx);
3383 } else {
3384 Select =
3385 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3386 .addReg(TrueReg, 0, SubIdx)
3387 .addReg(FalseReg, 0, SubIdx);
3388 }
3389
3390 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3392
3393 MIB.addReg(DstElt)
3394 .addImm(SubIdx);
3395 }
3396}
3397
3399 switch (MI.getOpcode()) {
3400 case AMDGPU::V_MOV_B16_t16_e32:
3401 case AMDGPU::V_MOV_B16_t16_e64:
3402 case AMDGPU::V_MOV_B32_e32:
3403 case AMDGPU::V_MOV_B32_e64:
3404 case AMDGPU::V_MOV_B64_PSEUDO:
3405 case AMDGPU::V_MOV_B64_e32:
3406 case AMDGPU::V_MOV_B64_e64:
3407 case AMDGPU::S_MOV_B32:
3408 case AMDGPU::S_MOV_B64:
3409 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3410 case AMDGPU::COPY:
3411 case AMDGPU::WWM_COPY:
3412 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3413 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3414 case AMDGPU::V_ACCVGPR_MOV_B32:
3415 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3416 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3417 return true;
3418 default:
3419 return false;
3420 }
3421}
3422
3424 switch (MI.getOpcode()) {
3425 case AMDGPU::V_MOV_B16_t16_e32:
3426 case AMDGPU::V_MOV_B16_t16_e64:
3427 return 2;
3428 case AMDGPU::V_MOV_B32_e32:
3429 case AMDGPU::V_MOV_B32_e64:
3430 case AMDGPU::V_MOV_B64_PSEUDO:
3431 case AMDGPU::V_MOV_B64_e32:
3432 case AMDGPU::V_MOV_B64_e64:
3433 case AMDGPU::S_MOV_B32:
3434 case AMDGPU::S_MOV_B64:
3435 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3436 case AMDGPU::COPY:
3437 case AMDGPU::WWM_COPY:
3438 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3439 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3440 case AMDGPU::V_ACCVGPR_MOV_B32:
3441 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3442 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3443 return 1;
3444 default:
3445 llvm_unreachable("MI is not a foldable copy");
3446 }
3447}
3448
3449static constexpr AMDGPU::OpName ModifierOpNames[] = {
3450 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3451 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3452 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3453
3455 unsigned Opc = MI.getOpcode();
3456 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3457 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3458 if (Idx >= 0)
3459 MI.removeOperand(Idx);
3460 }
3461}
3462
3464 const MCInstrDesc &NewDesc) const {
3465 MI.setDesc(NewDesc);
3466
3467 // Remove any leftover implicit operands from mutating the instruction. e.g.
3468 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3469 // anymore.
3470 const MCInstrDesc &Desc = MI.getDesc();
3471 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3472 Desc.implicit_defs().size();
3473
3474 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3475 MI.removeOperand(I);
3476}
3477
3478std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3479 unsigned SubRegIndex) {
3480 switch (SubRegIndex) {
3481 case AMDGPU::NoSubRegister:
3482 return Imm;
3483 case AMDGPU::sub0:
3484 return SignExtend64<32>(Imm);
3485 case AMDGPU::sub1:
3486 return SignExtend64<32>(Imm >> 32);
3487 case AMDGPU::lo16:
3488 return SignExtend64<16>(Imm);
3489 case AMDGPU::hi16:
3490 return SignExtend64<16>(Imm >> 16);
3491 case AMDGPU::sub1_lo16:
3492 return SignExtend64<16>(Imm >> 32);
3493 case AMDGPU::sub1_hi16:
3494 return SignExtend64<16>(Imm >> 48);
3495 default:
3496 return std::nullopt;
3497 }
3498
3499 llvm_unreachable("covered subregister switch");
3500}
3501
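// Map a MAC/MAD/FMAC/FMA opcode to the corresponding *AK form, where the
// addend is carried as a literal K operand.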
3502static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3503 switch (Opc) {
3504 case AMDGPU::V_MAC_F16_e32:
3505 case AMDGPU::V_MAC_F16_e64:
3506 case AMDGPU::V_MAD_F16_e64:
3507 return AMDGPU::V_MADAK_F16;
3508 case AMDGPU::V_MAC_F32_e32:
3509 case AMDGPU::V_MAC_F32_e64:
3510 case AMDGPU::V_MAD_F32_e64:
3511 return AMDGPU::V_MADAK_F32;
3512 case AMDGPU::V_FMAC_F32_e32:
3513 case AMDGPU::V_FMAC_F32_e64:
3514 case AMDGPU::V_FMA_F32_e64:
3515 return AMDGPU::V_FMAAK_F32;
3516 case AMDGPU::V_FMAC_F16_e32:
3517 case AMDGPU::V_FMAC_F16_e64:
3518 case AMDGPU::V_FMAC_F16_t16_e64:
3519 case AMDGPU::V_FMAC_F16_fake16_e64:
3520 case AMDGPU::V_FMA_F16_e64:
3521 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3522 ? AMDGPU::V_FMAAK_F16_t16
3523 : AMDGPU::V_FMAAK_F16_fake16
3524 : AMDGPU::V_FMAAK_F16;
3525 case AMDGPU::V_FMAC_F64_e32:
3526 case AMDGPU::V_FMAC_F64_e64:
3527 case AMDGPU::V_FMA_F64_e64:
3528 return AMDGPU::V_FMAAK_F64;
3529 default:
3530 llvm_unreachable("invalid instruction");
3531 }
3532}
3533
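// Same mapping as above, but to the *MK form, where the multiplied operand
// is carried as the literal K.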
3534static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3535 switch (Opc) {
3536 case AMDGPU::V_MAC_F16_e32:
3537 case AMDGPU::V_MAC_F16_e64:
3538 case AMDGPU::V_MAD_F16_e64:
3539 return AMDGPU::V_MADMK_F16;
3540 case AMDGPU::V_MAC_F32_e32:
3541 case AMDGPU::V_MAC_F32_e64:
3542 case AMDGPU::V_MAD_F32_e64:
3543 return AMDGPU::V_MADMK_F32;
3544 case AMDGPU::V_FMAC_F32_e32:
3545 case AMDGPU::V_FMAC_F32_e64:
3546 case AMDGPU::V_FMA_F32_e64:
3547 return AMDGPU::V_FMAMK_F32;
3548 case AMDGPU::V_FMAC_F16_e32:
3549 case AMDGPU::V_FMAC_F16_e64:
3550 case AMDGPU::V_FMAC_F16_t16_e64:
3551 case AMDGPU::V_FMAC_F16_fake16_e64:
3552 case AMDGPU::V_FMA_F16_e64:
3553 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3554 ? AMDGPU::V_FMAMK_F16_t16
3555 : AMDGPU::V_FMAMK_F16_fake16
3556 : AMDGPU::V_FMAMK_F16;
3557 case AMDGPU::V_FMAC_F64_e32:
3558 case AMDGPU::V_FMAC_F64_e64:
3559 case AMDGPU::V_FMA_F64_e64:
3560 return AMDGPU::V_FMAMK_F64;
3561 default:
3562 llvm_unreachable("invalid instruction");
3563 }
3564}
3565
3566bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3567 Register Reg, MachineRegisterInfo *MRI) const {
3568 int64_t Imm;
3569 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3570 return false;
3571
3572 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3573
3574 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3575
3576 unsigned Opc = UseMI.getOpcode();
3577 if (Opc == AMDGPU::COPY) {
3578 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3579
3580 Register DstReg = UseMI.getOperand(0).getReg();
3581 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3582
3583 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3584
3585 if (HasMultipleUses) {
3586 // TODO: This should fold in more cases with multiple use, but we need to
3587 // more carefully consider what those uses are.
3588 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3589
3590 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3591 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3592 return false;
3593
3594 // Most of the time folding a 32-bit inline constant is free (though this
3595 // might not be true if we can't later fold it into a real user).
3596 //
3597 // FIXME: This isInlineConstant check is imprecise if
3598 // getConstValDefinedInReg handled the tricky non-mov cases.
3599 if (ImmDefSize == 32 &&
3601 return false;
3602 }
3603
3604 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3605 RI.getSubRegIdxSize(UseSubReg) == 16;
3606
3607 if (Is16Bit) {
3608 if (RI.hasVGPRs(DstRC))
3609 return false; // Do not clobber vgpr_hi16
3610
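 // Only a lo16 extract into a virtual register is handled here.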
3611 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3612 return false;
3613 }
3614
3615 MachineFunction *MF = UseMI.getMF();
3616
3617 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3618 MCRegister MovDstPhysReg =
3619 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3620
3621 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3622
3623 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
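 // Try the candidate mov opcodes in order and take the first one whose
 // destination register class and immediate encoding are both compatible.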
3624 for (unsigned MovOp :
3625 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3626 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3627 const MCInstrDesc &MovDesc = get(MovOp);
3628
3629 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3630 if (Is16Bit) {
3631 // We just need to find a correctly sized register class, so the
3632 // subregister index compatibility doesn't matter since we're statically
3633 // extracting the immediate value.
3634 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3635 if (!MovDstRC)
3636 continue;
3637
3638 if (MovDstPhysReg) {
3639 // FIXME: We probably should not do this. If there is a live value in
3640 // the high half of the register, it will be corrupted.
3641 MovDstPhysReg =
3642 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3643 if (!MovDstPhysReg)
3644 continue;
3645 }
3646 }
3647
3648 // Result class isn't the right size, try the next instruction.
3649 if (MovDstPhysReg) {
3650 if (!MovDstRC->contains(MovDstPhysReg))
3651 return false;
3652 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3653 // TODO: This will be overly conservative in the case of 16-bit virtual
3654 // SGPRs. We could hack up the virtual register uses to use a compatible
3655 // 32-bit class.
3656 continue;
3657 }
3658
3659 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3660
3661 // Ensure the interpreted immediate value is a valid operand in the new
3662 // mov.
3663 //
3664 // FIXME: isImmOperandLegal should have form that doesn't require existing
3665 // MachineInstr or MachineOperand
3666 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3667 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3668 break;
3669
3670 NewOpc = MovOp;
3671 break;
3672 }
3673
3674 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3675 return false;
3676
3677 if (Is16Bit) {
3678 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3679 if (MovDstPhysReg)
3680 UseMI.getOperand(0).setReg(MovDstPhysReg);
3681 assert(UseMI.getOperand(1).getReg().isVirtual());
3682 }
3683
3684 const MCInstrDesc &NewMCID = get(NewOpc);
3685 UseMI.setDesc(NewMCID);
3686 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3687 UseMI.addImplicitDefUseOperands(*MF);
3688 return true;
3689 }
3690
3691 if (HasMultipleUses)
3692 return false;
3693
3694 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3695 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3696 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3697 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3698 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3699 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3700 Opc == AMDGPU::V_FMAC_F64_e64) {
3701 // Don't fold if we are using source or output modifiers. The new VOP2
3702 // instructions don't have them.
3703 if (hasAnyModifiersSet(UseMI))
3704 return false;
3705
3706 // If this is a free constant, there's no reason to do this.
3707 // TODO: We could fold this here instead of letting SIFoldOperands do it
3708 // later.
3709 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3710
3711 // Any src operand can be used for the legality check.
3712 if (isInlineConstant(UseMI, Src0Idx, Imm))
3713 return false;
3714
3715 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3716
3717 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3718 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3719
3720 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3721 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3722 (Src1->isReg() && Src1->getReg() == Reg)) {
3723 MachineOperand *RegSrc =
3724 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3725 if (!RegSrc->isReg())
3726 return false;
3727 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3728 ST.getConstantBusLimit(Opc) < 2)
3729 return false;
3730
3731 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3732 return false;
3733
3734 // If src2 is also a literal constant then we have to choose which one to
3735 // fold. In general it is better to choose madak so that the other literal
3736 // can be materialized in an sgpr instead of a vgpr:
3737 // s_mov_b32 s0, literal
3738 // v_madak_f32 v0, s0, v0, literal
3739 // Instead of:
3740 // v_mov_b32 v1, literal
3741 // v_madmk_f32 v0, v0, literal, v1
3742 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3743 if (Def && Def->isMoveImmediate() &&
3744 !isInlineConstant(Def->getOperand(1)))
3745 return false;
3746
3747 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3748 if (pseudoToMCOpcode(NewOpc) == -1)
3749 return false;
3750
3751 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3752 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3753 // restricting their register classes. For now just bail out.
3754 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3755 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3756 return false;
3757
3758 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3759 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3760
3761 // FIXME: This would be a lot easier if we could return a new instruction
3762 // instead of having to modify in place.
3763
3764 Register SrcReg = RegSrc->getReg();
3765 unsigned SrcSubReg = RegSrc->getSubReg();
3766 Src0->setReg(SrcReg);
3767 Src0->setSubReg(SrcSubReg);
3768 Src0->setIsKill(RegSrc->isKill());
3769
3770 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3771 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3772 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3773 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3774 UseMI.untieRegOperand(
3775 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3776
3777 Src1->ChangeToImmediate(*SubRegImm);
3778
3779 removeModOperands(UseMI);
3780 UseMI.setDesc(get(NewOpc));
3781
3782 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3783 if (DeleteDef)
3784 DefMI.eraseFromParent();
3785
3786 return true;
3787 }
3788
3789 // Added part is the constant: Use v_madak_{f16, f32}.
3790 if (Src2->isReg() && Src2->getReg() == Reg) {
3791 if (ST.getConstantBusLimit(Opc) < 2) {
3792 // Not allowed to use constant bus for another operand.
3793 // We can however allow an inline immediate as src0.
3794 bool Src0Inlined = false;
3795 if (Src0->isReg()) {
3796 // Try to inline the constant if possible.
3797 // If the def is a move-immediate and this is its only use,
3798 // we save a VGPR here.
3799 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3800 if (Def && Def->isMoveImmediate() &&
3801 isInlineConstant(Def->getOperand(1)) &&
3802 MRI->hasOneNonDBGUse(Src0->getReg())) {
3803 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3804 Src0Inlined = true;
3805 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3806 RI.isSGPRReg(*MRI, Src0->getReg())) {
3807 return false;
3808 }
3809 // VGPR is okay as Src0 - fallthrough
3810 }
3811
3812 if (Src1->isReg() && !Src0Inlined) {
3813 // We still have one slot for an inlinable constant - try to fill it.
3814 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3815 if (Def && Def->isMoveImmediate() &&
3816 isInlineConstant(Def->getOperand(1)) &&
3817 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3818 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3819 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3820 return false;
3821 // VGPR is okay as Src1 - fallthrough
3822 }
3823 }
3824
3825 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3826 if (pseudoToMCOpcode(NewOpc) == -1)
3827 return false;
3828
3829 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3830 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3831 // restricting their register classes. For now just bail out.
3832 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3833 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3834 return false;
3835
3836 // FIXME: This would be a lot easier if we could return a new instruction
3837 // instead of having to modify in place.
3838
3839 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3840 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3841 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3842 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3843 UseMI.untieRegOperand(
3844 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3845
3846 const std::optional<int64_t> SubRegImm =
3847 extractSubregFromImm(Imm, Src2->getSubReg());
3848
3849 // Changing Src2 to an immediate adds it back to the instruction.
3850 Src2->ChangeToImmediate(*SubRegImm);
3851
3852 // These come before src2.
3853 removeModOperands(UseMI);
3854 UseMI.setDesc(get(NewOpc));
3855 // It might happen that UseMI was commuted and we now have an SGPR as
3856 // src1. If so, an inline constant together with the SGPR would be
3857 // illegal, so legalize the operands.
3858 legalizeOperands(UseMI);
3859
3860 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3861 if (DeleteDef)
3862 DefMI.eraseFromParent();
3863
3864 return true;
3865 }
3866 }
3867
3868 return false;
3869}
3870
3871static bool
3872memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3873 ArrayRef<const MachineOperand *> BaseOps2) {
3874 if (BaseOps1.size() != BaseOps2.size())
3875 return false;
3876 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3877 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3878 return false;
3879 }
3880 return true;
3881}
3882
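// Return true if the two accesses [OffsetA, OffsetA + WidthA) and
// [OffsetB, OffsetB + WidthB) cannot overlap; requires a known width for the
// lower access.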
3883static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3884 LocationSize WidthB, int OffsetB) {
3885 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3886 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3887 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3888 return LowWidth.hasValue() &&
3889 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3890}
3891
3892bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3893 const MachineInstr &MIb) const {
3894 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3895 int64_t Offset0, Offset1;
3896 LocationSize Dummy0 = LocationSize::precise(0);
3897 LocationSize Dummy1 = LocationSize::precise(0);
3898 bool Offset0IsScalable, Offset1IsScalable;
3899 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3900 Dummy0, &RI) ||
3901 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3902 Dummy1, &RI))
3903 return false;
3904
3905 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3906 return false;
3907
3908 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3909 // FIXME: Handle ds_read2 / ds_write2.
3910 return false;
3911 }
3912 LocationSize Width0 = MIa.memoperands().front()->getSize();
3913 LocationSize Width1 = MIb.memoperands().front()->getSize();
3914 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3915}
3916
3917bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3918 const MachineInstr &MIb) const {
3919 assert(MIa.mayLoadOrStore() &&
3920 "MIa must load from or modify a memory location");
3921 assert(MIb.mayLoadOrStore() &&
3922 "MIb must load from or modify a memory location");
3923
3924 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3925 return false;
3926
3927 // XXX - Can we relax this between address spaces?
3928 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3929 return false;
3930
3931 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3932 return false;
3933
3934 if (MIa.isBundle() || MIb.isBundle())
3935 return false;
3936
3937 // TODO: Should we check the address space from the MachineMemOperand? That
3938 // would allow us to distinguish objects we know don't alias based on the
3939 // underlying address space, even if it was lowered to a different one,
3940 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3941 // buffer.
3942 if (isDS(MIa)) {
3943 if (isDS(MIb))
3944 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3945
3946 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3947 }
3948
3949 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3950 if (isMUBUF(MIb) || isMTBUF(MIb))
3951 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3952
3953 if (isFLAT(MIb))
3954 return isFLATScratch(MIb);
3955
3956 return !isSMRD(MIb);
3957 }
3958
3959 if (isSMRD(MIa)) {
3960 if (isSMRD(MIb))
3961 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3962
3963 if (isFLAT(MIb))
3964 return isFLATScratch(MIb);
3965
3966 return !isMUBUF(MIb) && !isMTBUF(MIb);
3967 }
3968
3969 if (isFLAT(MIa)) {
3970 if (isFLAT(MIb)) {
3971 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3972 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3973 return true;
3974
3975 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3976 }
3977
3978 return false;
3979 }
3980
3981 return false;
3982}
3983
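// If \p Reg is virtual and defined by a foldable immediate-materializing
// instruction, return that immediate (and optionally the defining
// instruction through \p DefMI).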
3984static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3985 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3986 if (Reg.isPhysical())
3987 return false;
3988 auto *Def = MRI.getUniqueVRegDef(Reg);
3989 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3990 Imm = Def->getOperand(1).getImm();
3991 if (DefMI)
3992 *DefMI = Def;
3993 return true;
3994 }
3995 return false;
3996}
3997
3998static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3999 MachineInstr **DefMI = nullptr) {
4000 if (!MO->isReg())
4001 return false;
4002 const MachineFunction *MF = MO->getParent()->getMF();
4003 const MachineRegisterInfo &MRI = MF->getRegInfo();
4004 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4005}
4006
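// Transfer LiveVariables kill information for registers killed by \p MI over
// to \p NewMI.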
4007static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4008 MachineInstr &NewMI) {
4009 if (LV) {
4010 unsigned NumOps = MI.getNumOperands();
4011 for (unsigned I = 1; I < NumOps; ++I) {
4012 MachineOperand &Op = MI.getOperand(I);
4013 if (Op.isReg() && Op.isKill())
4014 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4015 }
4016 }
4017}
4018
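// Map a two-address MAC/FMAC opcode to its three-address MAD/FMA equivalent.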
4019static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4020 switch (Opc) {
4021 case AMDGPU::V_MAC_F16_e32:
4022 case AMDGPU::V_MAC_F16_e64:
4023 return AMDGPU::V_MAD_F16_e64;
4024 case AMDGPU::V_MAC_F32_e32:
4025 case AMDGPU::V_MAC_F32_e64:
4026 return AMDGPU::V_MAD_F32_e64;
4027 case AMDGPU::V_MAC_LEGACY_F32_e32:
4028 case AMDGPU::V_MAC_LEGACY_F32_e64:
4029 return AMDGPU::V_MAD_LEGACY_F32_e64;
4030 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4031 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4032 return AMDGPU::V_FMA_LEGACY_F32_e64;
4033 case AMDGPU::V_FMAC_F16_e32:
4034 case AMDGPU::V_FMAC_F16_e64:
4035 case AMDGPU::V_FMAC_F16_t16_e64:
4036 case AMDGPU::V_FMAC_F16_fake16_e64:
4037 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4038 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4039 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4040 : AMDGPU::V_FMA_F16_gfx9_e64;
4041 case AMDGPU::V_FMAC_F32_e32:
4042 case AMDGPU::V_FMAC_F32_e64:
4043 return AMDGPU::V_FMA_F32_e64;
4044 case AMDGPU::V_FMAC_F64_e32:
4045 case AMDGPU::V_FMAC_F64_e64:
4046 return AMDGPU::V_FMA_F64_e64;
4047 default:
4048 llvm_unreachable("invalid instruction");
4049 }
4050}
4051
4052/// Helper struct for the implementation of 3-address conversion to communicate
4053/// updates made to instruction operands.
4054struct ThreeAddressUpdates {
4055 /// Other instruction whose def is no longer used by the converted
4056 /// instruction.
4057 MachineInstr *RemoveMIUse = nullptr;
4058};
4059
4060MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4061 LiveVariables *LV,
4062 LiveIntervals *LIS) const {
4063 MachineBasicBlock &MBB = *MI.getParent();
4064 MachineInstr *CandidateMI = &MI;
4065
4066 if (MI.isBundle()) {
4067 // This is a temporary placeholder for bundle handling that enables us to
4068 // exercise the relevant code paths in the two-address instruction pass.
4069 if (MI.getBundleSize() != 1)
4070 return nullptr;
4071 CandidateMI = MI.getNextNode();
4072 }
4073
4074 ThreeAddressUpdates U;
4075 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4076 if (!NewMI)
4077 return nullptr;
4078
4079 if (MI.isBundle()) {
4080 CandidateMI->eraseFromBundle();
4081
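 // Drop any def ties on the bundle header now that the bundled instruction
 // has been replaced.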
4082 for (MachineOperand &MO : MI.all_defs()) {
4083 if (MO.isTied())
4084 MI.untieRegOperand(MO.getOperandNo());
4085 }
4086 } else {
4087 updateLiveVariables(LV, MI, *NewMI);
4088 if (LIS) {
4089 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4090 // SlotIndex of defs needs to be updated when converting to early-clobber
4091 MachineOperand &Def = NewMI->getOperand(0);
4092 if (Def.isEarlyClobber() && Def.isReg() &&
4093 LIS->hasInterval(Def.getReg())) {
4094 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4095 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4096 auto &LI = LIS->getInterval(Def.getReg());
4097 auto UpdateDefIndex = [&](LiveRange &LR) {
4098 auto *S = LR.find(OldIndex);
4099 if (S != LR.end() && S->start == OldIndex) {
4100 assert(S->valno && S->valno->def == OldIndex);
4101 S->start = NewIndex;
4102 S->valno->def = NewIndex;
4103 }
4104 };
4105 UpdateDefIndex(LI);
4106 for (auto &SR : LI.subranges())
4107 UpdateDefIndex(SR);
4108 }
4109 }
4110 }
4111
4112 if (U.RemoveMIUse) {
4113 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4114 // The only user is the instruction which will be killed.
4115 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4116
4117 if (MRI.hasOneNonDBGUse(DefReg)) {
4118 // We cannot just remove DefMI here; the calling pass would crash.
4119 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4120 U.RemoveMIUse->getOperand(0).setIsDead(true);
4121 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4122 U.RemoveMIUse->removeOperand(I);
4123 if (LV)
4124 LV->getVarInfo(DefReg).AliveBlocks.clear();
4125 }
4126
4127 if (MI.isBundle()) {
4128 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4129 if (!VRI.Reads && !VRI.Writes) {
4130 for (MachineOperand &MO : MI.all_uses()) {
4131 if (MO.isReg() && MO.getReg() == DefReg) {
4132 assert(MO.getSubReg() == 0 &&
4133 "tied sub-registers in bundles currently not supported");
4134 MI.removeOperand(MO.getOperandNo());
4135 break;
4136 }
4137 }
4138
4139 if (LIS)
4140 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4141 }
4142 } else if (LIS) {
4143 LiveInterval &DefLI = LIS->getInterval(DefReg);
4144
4145 // We cannot delete the original instruction here, so hack out the use
4146 // in the original instruction with a dummy register so we can use
4147 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4148 // not have the complexity of deleting a use to consider here.
4149 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4150 for (MachineOperand &MIOp : MI.uses()) {
4151 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4152 MIOp.setIsUndef(true);
4153 MIOp.setReg(DummyReg);
4154 }
4155 }
4156
4157 if (MI.isBundle()) {
4158 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4159 if (!VRI.Reads && !VRI.Writes) {
4160 for (MachineOperand &MIOp : MI.uses()) {
4161 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4162 MIOp.setIsUndef(true);
4163 MIOp.setReg(DummyReg);
4164 }
4165 }
4166 }
4167
4168 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4169 false, /*isUndef=*/true));
4170 }
4171
4172 LIS->shrinkToUses(&DefLI);
4173 }
4174 }
4175
4176 return MI.isBundle() ? &MI : NewMI;
4177}
4178
4180SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4181 ThreeAddressUpdates &U) const {
4182 MachineBasicBlock &MBB = *MI.getParent();
4183 unsigned Opc = MI.getOpcode();
4184
4185 // Handle MFMA.
4186 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4187 if (NewMFMAOpc != -1) {
4188 MachineInstrBuilder MIB =
4189 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4190 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4191 MIB.add(MI.getOperand(I));
4192 return MIB;
4193 }
4194
4195 if (SIInstrInfo::isWMMA(MI)) {
4196 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4197 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4198 .setMIFlags(MI.getFlags());
4199 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4200 MIB->addOperand(MI.getOperand(I));
4201 return MIB;
4202 }
4203
4204 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4205 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4206 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4207 "present pre-RA");
4208
4209 // Handle MAC/FMAC.
4210 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4211 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4212 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4213 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4214 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4215 bool Src0Literal = false;
4216
4217 switch (Opc) {
4218 default:
4219 return nullptr;
4220 case AMDGPU::V_MAC_F16_e64:
4221 case AMDGPU::V_FMAC_F16_e64:
4222 case AMDGPU::V_FMAC_F16_t16_e64:
4223 case AMDGPU::V_FMAC_F16_fake16_e64:
4224 case AMDGPU::V_MAC_F32_e64:
4225 case AMDGPU::V_MAC_LEGACY_F32_e64:
4226 case AMDGPU::V_FMAC_F32_e64:
4227 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4228 case AMDGPU::V_FMAC_F64_e64:
4229 break;
4230 case AMDGPU::V_MAC_F16_e32:
4231 case AMDGPU::V_FMAC_F16_e32:
4232 case AMDGPU::V_MAC_F32_e32:
4233 case AMDGPU::V_MAC_LEGACY_F32_e32:
4234 case AMDGPU::V_FMAC_F32_e32:
4235 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4236 case AMDGPU::V_FMAC_F64_e32: {
4237 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4238 AMDGPU::OpName::src0);
4239 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4240 if (!Src0->isReg() && !Src0->isImm())
4241 return nullptr;
4242
4243 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4244 Src0Literal = true;
4245
4246 break;
4247 }
4248 }
4249
4250 MachineInstrBuilder MIB;
4251 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4252 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4253 const MachineOperand *Src0Mods =
4254 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4255 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4256 const MachineOperand *Src1Mods =
4257 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4258 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4259 const MachineOperand *Src2Mods =
4260 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4261 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4262 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4263 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4264
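 // First try to fold an immediate defined by another instruction directly
 // into a madak/fmaak or madmk/fmamk form; fall back to the plain VOP3
 // MAD/FMA below.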
4265 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4266 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4267 // If we have an SGPR input, we will violate the constant bus restriction.
4268 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4269 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4270 MachineInstr *DefMI;
4271
4272 int64_t Imm;
4273 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4274 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4275 if (pseudoToMCOpcode(NewOpc) != -1) {
4276 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4277 .add(*Dst)
4278 .add(*Src0)
4279 .add(*Src1)
4280 .addImm(Imm)
4281 .setMIFlags(MI.getFlags());
4282 U.RemoveMIUse = DefMI;
4283 return MIB;
4284 }
4285 }
4286 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4287 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4288 if (pseudoToMCOpcode(NewOpc) != -1) {
4289 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4290 .add(*Dst)
4291 .add(*Src0)
4292 .addImm(Imm)
4293 .add(*Src2)
4294 .setMIFlags(MI.getFlags());
4295 U.RemoveMIUse = DefMI;
4296 return MIB;
4297 }
4298 }
4299 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4300 if (Src0Literal) {
4301 Imm = Src0->getImm();
4302 DefMI = nullptr;
4303 }
4304 if (pseudoToMCOpcode(NewOpc) != -1 &&
4305 isOperandLegal(
4306 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4307 Src1)) {
4308 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4309 .add(*Dst)
4310 .add(*Src1)
4311 .addImm(Imm)
4312 .add(*Src2)
4313 .setMIFlags(MI.getFlags());
4314 U.RemoveMIUse = DefMI;
4315 return MIB;
4316 }
4317 }
4318 }
4319
4320 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4321 // if VOP3 does not allow a literal operand.
4322 if (Src0Literal && !ST.hasVOP3Literal())
4323 return nullptr;
4324
4325 unsigned NewOpc = getNewFMAInst(ST, Opc);
4326
4327 if (pseudoToMCOpcode(NewOpc) == -1)
4328 return nullptr;
4329
4330 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4331 .add(*Dst)
4332 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4333 .add(*Src0)
4334 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4335 .add(*Src1)
4336 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4337 .add(*Src2)
4338 .addImm(Clamp ? Clamp->getImm() : 0)
4339 .addImm(Omod ? Omod->getImm() : 0)
4340 .setMIFlags(MI.getFlags());
4341 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4342 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4343 return MIB;
4344}
4345
4346// It's not generally safe to move VALU instructions across these since it will
4347// start using the register as a base index rather than directly.
4348// XXX - Why isn't hasSideEffects sufficient for these?
4349static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4350 switch (MI.getOpcode()) {
4351 case AMDGPU::S_SET_GPR_IDX_ON:
4352 case AMDGPU::S_SET_GPR_IDX_MODE:
4353 case AMDGPU::S_SET_GPR_IDX_OFF:
4354 return true;
4355 default:
4356 return false;
4357 }
4358}
4359
4360bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4361 const MachineBasicBlock *MBB,
4362 const MachineFunction &MF) const {
4363 // Skipping the check for SP writes in the base implementation. The reason it
4364 // was added was apparently due to compile time concerns.
4365 //
4366 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4367 // but is probably avoidable.
4368
4369 // Copied from base implementation.
4370 // Terminators and labels can't be scheduled around.
4371 if (MI.isTerminator() || MI.isPosition())
4372 return true;
4373
4374 // INLINEASM_BR can jump to another block
4375 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4376 return true;
4377
4378 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4379 return true;
4380
4381 // Target-independent instructions do not have an implicit-use of EXEC, even
4382 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4383 // boundaries prevents incorrect movements of such instructions.
4384 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4385 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4386 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4387 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4388 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4389 changesVGPRIndexingMode(MI);
4390}
4391
4392bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4393 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4394 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4395 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4396}
4397
4398bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4399 if (!isFLAT(MI) || isFLATGlobal(MI))
4400 return false;
4401
4402 // If scratch is not initialized, we can never access it.
4403 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4404 return false;
4405
4406 // SCRATCH instructions always access scratch.
4407 if (isFLATScratch(MI))
4408 return true;
4409
4410 // If there are no memory operands then conservatively assume the flat
4411 // operation may access scratch.
4412 if (MI.memoperands_empty())
4413 return true;
4414
4415 // See if any memory operand specifies an address space that involves scratch.
4416 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4417 unsigned AS = Memop->getAddrSpace();
4418 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4419 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4420 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4421 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4422 }
4423 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4424 });
4425}
4426
4427bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4428 assert(isFLAT(MI));
4429
4430 // All flat instructions use the VMEM counter except prefetch.
4431 if (!usesVM_CNT(MI))
4432 return false;
4433
4434 // If there are no memory operands then conservatively assume the flat
4435 // operation may access VMEM.
4436 if (MI.memoperands_empty())
4437 return true;
4438
4439 // See if any memory operand specifies an address space that involves VMEM.
4440 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
4441 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4442 // (GDS) address space is not supported by flat operations. Therefore, simply
4443 // return true unless only the LDS address space is found.
4444 for (const MachineMemOperand *Memop : MI.memoperands()) {
4445 unsigned AS = Memop->getAddrSpace();
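 // Any address space other than LDS may be backed by VMEM.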
4447 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4448 return true;
4449 }
4450
4451 return false;
4452}
4453
4454bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4455 assert(isFLAT(MI));
4456
4457 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4458 if (!usesLGKM_CNT(MI))
4459 return false;
4460
4461 // If in tgsplit mode then there can be no use of LDS.
4462 if (ST.isTgSplitEnabled())
4463 return false;
4464
4465 // If there are no memory operands then conservatively assume the flat
4466 // operation may access LDS.
4467 if (MI.memoperands_empty())
4468 return true;
4469
4470 // See if any memory operand specifies an address space that involves LDS.
4471 for (const MachineMemOperand *Memop : MI.memoperands()) {
4472 unsigned AS = Memop->getAddrSpace();
4473 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4474 return true;
4475 }
4476
4477 return false;
4478}
4479
4480bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4481 // Skip the full operand and register alias search modifiesRegister
4482 // does. There's only a handful of instructions that touch this, it's only an
4483 // implicit def, and doesn't alias any other registers.
4484 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4485}
4486
4487bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4488 unsigned Opcode = MI.getOpcode();
4489
4490 if (MI.mayStore() && isSMRD(MI))
4491 return true; // scalar store or atomic
4492
4493 // This will terminate the function when other lanes may need to continue.
4494 if (MI.isReturn())
4495 return true;
4496
4497 // These instructions cause shader I/O that may cause hardware lockups
4498 // when executed with an empty EXEC mask.
4499 //
4500 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4501 // EXEC = 0, but checking for that case here seems not worth it
4502 // given the typical code patterns.
4503 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4504 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4505 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4506 return true;
4507
4508 if (MI.isCall() || MI.isInlineAsm())
4509 return true; // conservative assumption
4510
4511 // Assume that barrier interactions are only intended with active lanes.
4512 if (isBarrier(Opcode))
4513 return true;
4514
4515 // A mode change is a scalar operation that influences vector instructions.
4516 if (modifiesModeRegister(MI))
4517 return true;
4518
4519 // These are like SALU instructions in terms of effects, so it's questionable
4520 // whether we should return true for those.
4521 //
4522 // However, executing them with EXEC = 0 causes them to operate on undefined
4523 // data, which we avoid by returning true here.
4524 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4525 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4526 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4527 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4528 return true;
4529
4530 return false;
4531}
4532
4533bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4534 const MachineInstr &MI) const {
4535 if (MI.isMetaInstruction())
4536 return false;
4537
4538 // This won't read exec if this is an SGPR->SGPR copy.
4539 if (MI.isCopyLike()) {
4540 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4541 return true;
4542
4543 // Make sure this isn't copying exec as a normal operand
4544 return MI.readsRegister(AMDGPU::EXEC, &RI);
4545 }
4546
4547 // Make a conservative assumption about the callee.
4548 if (MI.isCall())
4549 return true;
4550
4551 // Be conservative with any unhandled generic opcodes.
4552 if (!isTargetSpecificOpcode(MI.getOpcode()))
4553 return true;
4554
4555 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4556}
4557
4558bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4559 switch (Imm.getBitWidth()) {
4560 case 1: // This likely will be a condition code mask.
4561 return true;
4562
4563 case 32:
4564 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4565 ST.hasInv2PiInlineImm());
4566 case 64:
4567 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4568 ST.hasInv2PiInlineImm());
4569 case 16:
4570 return ST.has16BitInsts() &&
4571 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4572 ST.hasInv2PiInlineImm());
4573 default:
4574 llvm_unreachable("invalid bitwidth");
4575 }
4576}
4577
4578bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4579 APInt IntImm = Imm.bitcastToAPInt();
4580 int64_t IntImmVal = IntImm.getSExtValue();
4581 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4582 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4583 default:
4584 llvm_unreachable("invalid fltSemantics");
4585 case APFloatBase::S_IEEEsingle:
4586 case APFloatBase::S_IEEEdouble:
4587 return isInlineConstant(IntImm);
4588 case APFloatBase::S_BFloat:
4589 return ST.has16BitInsts() &&
4590 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4591 case APFloatBase::S_IEEEhalf:
4592 return ST.has16BitInsts() &&
4593 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4594 }
4595}
4596
4597bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4598 // MachineOperand provides no way to tell the true operand size, since it only
4599 // records a 64-bit value. We need to know the size to determine if a 32-bit
4600 // floating point immediate bit pattern is legal for an integer immediate. It
4601 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4602 switch (OperandType) {
4612 int32_t Trunc = static_cast<int32_t>(Imm);
4613 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4614 }
4620 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4623 // We would expect inline immediates to not be concerned with an integer/fp
4624 // distinction. However, in the case of 16-bit integer operations, the
4625 // "floating point" values appear to not work. It seems read the low 16-bits
4626 // of 32-bit immediates, which happens to always work for the integer
4627 // values.
4628 //
4629 // See llvm bugzilla 46302.
4630 //
4631 // TODO: Theoretically we could use op-sel to use the high bits of the
4632 // 32-bit FP values.
4644 return false;
4647 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4648 // A few special case instructions have 16-bit operands on subtargets
4649 // where 16-bit instructions are not legal.
4650 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4651 // constants in these cases
4652 int16_t Trunc = static_cast<int16_t>(Imm);
4653 return ST.has16BitInsts() &&
4654 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4655 }
4656
4657 return false;
4658 }
4661 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4662 int16_t Trunc = static_cast<int16_t>(Imm);
4663 return ST.has16BitInsts() &&
4664 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4665 }
4666 return false;
4667 }
4671 return false;
4673 return isLegalAV64PseudoImm(Imm);
4676 // Always embedded in the instruction for free.
4677 return true;
4687 // Just ignore anything else.
4688 return true;
4689 default:
4690 llvm_unreachable("invalid operand type");
4691 }
4692}
4693
4694static bool compareMachineOp(const MachineOperand &Op0,
4695 const MachineOperand &Op1) {
4696 if (Op0.getType() != Op1.getType())
4697 return false;
4698
4699 switch (Op0.getType()) {
4701 return Op0.getReg() == Op1.getReg();
4703 return Op0.getImm() == Op1.getImm();
4704 default:
4705 llvm_unreachable("Didn't expect to be comparing these operand types");
4706 }
4707}
4708
4709bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4710 const MCOperandInfo &OpInfo) const {
4711 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4712 return true;
4713
4714 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4715 return false;
4716
4717 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4718 return true;
4719
4720 return ST.hasVOP3Literal();
4721}
4722
4723bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4724 int64_t ImmVal) const {
4725 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4726 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4727 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4728 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4729 AMDGPU::OpName::src2))
4730 return false;
4731 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4732 }
4733
4734 return isLiteralOperandLegal(InstDesc, OpInfo);
4735}
4736
4737bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4738 const MachineOperand &MO) const {
4739 if (MO.isImm())
4740 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4741
4742 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4743 "unexpected imm-like operand kind");
4744 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4745 return isLiteralOperandLegal(InstDesc, OpInfo);
4746}
4747
4748bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4749 // 2 32-bit inline constants packed into one.
4750 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4751 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4752}
4753
4754bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4755 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4756 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4757 return false;
4758
4759 int Op32 = AMDGPU::getVOPe32(Opcode);
4760 if (Op32 == -1)
4761 return false;
4762
4763 return pseudoToMCOpcode(Op32) != -1;
4764}
4765
4766bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4767 // The src0_modifiers operand is present on all instructions
4768 // that have modifiers.
4769
4770 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4771}
4772
4773bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4774 AMDGPU::OpName OpName) const {
4775 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4776 return Mods && Mods->getImm();
4777}
4778
4779bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4780 return any_of(ModifierOpNames,
4781 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4782}
4783
4784bool SIInstrInfo::canShrink(const MachineInstr &MI,
4785 const MachineRegisterInfo &MRI) const {
4786 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4787 // Can't shrink instruction with three operands.
4788 if (Src2) {
4789 switch (MI.getOpcode()) {
4790 default: return false;
4791
4792 case AMDGPU::V_ADDC_U32_e64:
4793 case AMDGPU::V_SUBB_U32_e64:
4794 case AMDGPU::V_SUBBREV_U32_e64: {
4795 const MachineOperand *Src1
4796 = getNamedOperand(MI, AMDGPU::OpName::src1);
4797 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4798 return false;
4799 // Additional verification is needed for sdst/src2.
4800 return true;
4801 }
4802 case AMDGPU::V_MAC_F16_e64:
4803 case AMDGPU::V_MAC_F32_e64:
4804 case AMDGPU::V_MAC_LEGACY_F32_e64:
4805 case AMDGPU::V_FMAC_F16_e64:
4806 case AMDGPU::V_FMAC_F16_t16_e64:
4807 case AMDGPU::V_FMAC_F16_fake16_e64:
4808 case AMDGPU::V_FMAC_F32_e64:
4809 case AMDGPU::V_FMAC_F64_e64:
4810 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4811 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4812 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4813 return false;
4814 break;
4815
4816 case AMDGPU::V_CNDMASK_B32_e64:
4817 break;
4818 }
4819 }
4820
4821 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4822 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4823 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4824 return false;
4825
4826 // We don't need to check src0, all input types are legal, so just make sure
4827 // src0 isn't using any modifiers.
4828 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4829 return false;
4830
4831 // Can it be shrunk to a valid 32 bit opcode?
4832 if (!hasVALU32BitEncoding(MI.getOpcode()))
4833 return false;
4834
4835 // Check output modifiers
4836 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4837 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4838 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4839 // TODO: Can we avoid checking bound_ctrl/fi here?
4840 // They are only used by permlane*_swap special case.
4841 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4842 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4843}
4844
4845// Set VCC operand with all flags from \p Orig, except for setting it as
4846// implicit.
4847static void copyFlagsToImplicitVCC(MachineInstr &MI,
4848 const MachineOperand &Orig) {
4849
4850 for (MachineOperand &Use : MI.implicit_operands()) {
4851 if (Use.isUse() &&
4852 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4853 Use.setIsUndef(Orig.isUndef());
4854 Use.setIsKill(Orig.isKill());
4855 return;
4856 }
4857 }
4858}
4859
4860MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4861 unsigned Op32) const {
4862 MachineBasicBlock *MBB = MI.getParent();
4863
4864 const MCInstrDesc &Op32Desc = get(Op32);
4865 MachineInstrBuilder Inst32 =
4866 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4867 .setMIFlags(MI.getFlags());
4868
4869 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4870 // For VOPC instructions, this is replaced by an implicit def of vcc.
4871
4872 // We assume the defs of the shrunk opcode are in the same order, and the
4873 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4874 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4875 Inst32.add(MI.getOperand(I));
4876
4877 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4878
4879 int Idx = MI.getNumExplicitDefs();
4880 for (const MachineOperand &Use : MI.explicit_uses()) {
4881 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4883 continue;
4884
4885 if (&Use == Src2) {
4886 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4887 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4888 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4889 // of vcc was already added during the initial BuildMI, but we
4890 // 1) may need to change vcc to vcc_lo to preserve the original register
4891 // 2) have to preserve the original flags.
4892 copyFlagsToImplicitVCC(*Inst32, *Src2);
4893 continue;
4894 }
4895 }
4896
4897 Inst32.add(Use);
4898 }
4899
4900 // FIXME: Losing implicit operands
4901 fixImplicitOperands(*Inst32);
4902 return Inst32;
4903}
4904
4905bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4906 // Null is free
4907 Register Reg = RegOp.getReg();
4908 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4909 return false;
4910
4911 // SGPRs use the constant bus
4912
4913 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4914 // physical register operands should also count, except for exec.
4915 if (RegOp.isImplicit())
4916 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4917
4918 // SGPRs use the constant bus
4919 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4920 AMDGPU::SReg_64RegClass.contains(Reg);
4921}
4922
4924 const MachineRegisterInfo &MRI) const {
4925 Register Reg = RegOp.getReg();
4926 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4927 : physRegUsesConstantBus(RegOp);
4928}
4929
4930bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4931 const MachineOperand &MO,
4932 const MCOperandInfo &OpInfo) const {
4933 // Literal constants use the constant bus.
4934 if (!MO.isReg())
4935 return !isInlineConstant(MO, OpInfo);
4936
4937 Register Reg = MO.getReg();
4938 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4939 : physRegUsesConstantBus(MO);
4940}
4941
4942static Register findImplicitSGPRRead(const MachineInstr &MI) {
4943 for (const MachineOperand &MO : MI.implicit_operands()) {
4944 // We only care about reads.
4945 if (MO.isDef())
4946 continue;
4947
4948 switch (MO.getReg()) {
4949 case AMDGPU::VCC:
4950 case AMDGPU::VCC_LO:
4951 case AMDGPU::VCC_HI:
4952 case AMDGPU::M0:
4953 case AMDGPU::FLAT_SCR:
4954 return MO.getReg();
4955
4956 default:
4957 break;
4958 }
4959 }
4960
4961 return Register();
4962}
4963
4964static bool shouldReadExec(const MachineInstr &MI) {
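 // VALU instructions implicitly read EXEC, except for the lane-access and
 // SGPR spill pseudos handled below.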
4965 if (SIInstrInfo::isVALU(MI)) {
4966 switch (MI.getOpcode()) {
4967 case AMDGPU::V_READLANE_B32:
4968 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4969 case AMDGPU::V_WRITELANE_B32:
4970 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4971 return false;
4972 }
4973
4974 return true;
4975 }
4976
4977 if (MI.isPreISelOpcode() ||
4978 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4979 SIInstrInfo::isSALU(MI) ||
4980 SIInstrInfo::isSMRD(MI))
4981 return false;
4982
4983 return true;
4984}
4985
4986static bool isRegOrFI(const MachineOperand &MO) {
4987 return MO.isReg() || MO.isFI();
4988}
4989
4990static bool isSubRegOf(const SIRegisterInfo &TRI,
4991 const MachineOperand &SuperVec,
4992 const MachineOperand &SubReg) {
4993 if (SubReg.getReg().isPhysical())
4994 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4995
4996 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4997 SubReg.getReg() == SuperVec.getReg();
4998}
4999
5000// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5001bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5002 const MachineRegisterInfo &MRI,
5003 StringRef &ErrInfo) const {
5004 Register DstReg = MI.getOperand(0).getReg();
5005 Register SrcReg = MI.getOperand(1).getReg();
5006 // This is a check for copy from vector register to SGPR
5007 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5008 ErrInfo = "illegal copy from vector register to SGPR";
5009 return false;
5010 }
5011 return true;
5012}
5013
5014bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5015 StringRef &ErrInfo) const {
5016 uint16_t Opcode = MI.getOpcode();
5017 const MachineFunction *MF = MI.getMF();
5018 const MachineRegisterInfo &MRI = MF->getRegInfo();
5019
5020 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5021 // Find a better property to recognize the point where instruction selection
5022 // is just done.
5023 // We can only enforce this check after SIFixSGPRCopies pass so that the
5024 // illegal copies are legalized and thereafter we don't expect a pass
5025 // inserting similar copies.
5026 if (!MRI.isSSA() && MI.isCopy())
5027 return verifyCopy(MI, MRI, ErrInfo);
5028
5029 if (SIInstrInfo::isGenericOpcode(Opcode))
5030 return true;
5031
5032 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5033 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5034 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5035 int Src3Idx = -1;
5036 if (Src0Idx == -1) {
5037 // VOPD V_DUAL_* instructions use different operand names.
5038 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5039 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5040 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5041 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5042 }
5043
5044 // Make sure the number of operands is correct.
5045 const MCInstrDesc &Desc = get(Opcode);
5046 if (!Desc.isVariadic() &&
5047 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5048 ErrInfo = "Instruction has wrong number of operands.";
5049 return false;
5050 }
5051
5052 if (MI.isInlineAsm()) {
5053 // Verify register classes for inlineasm constraints.
5054 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5055 I != E; ++I) {
5056 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5057 if (!RC)
5058 continue;
5059
5060 const MachineOperand &Op = MI.getOperand(I);
5061 if (!Op.isReg())
5062 continue;
5063
5064 Register Reg = Op.getReg();
5065 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5066 ErrInfo = "inlineasm operand has incorrect register class.";
5067 return false;
5068 }
5069 }
5070
5071 return true;
5072 }
5073
5074 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5075 ErrInfo = "missing memory operand from image instruction.";
5076 return false;
5077 }
5078
5079 // Make sure the register classes are correct.
5080 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5081 const MachineOperand &MO = MI.getOperand(i);
5082 if (MO.isFPImm()) {
5083 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5084 "all fp values to integers.";
5085 return false;
5086 }
5087
5088 const MCOperandInfo &OpInfo = Desc.operands()[i];
5089 int16_t RegClass = getOpRegClassID(OpInfo);
5090
5091 switch (OpInfo.OperandType) {
5092 case MCOI::OPERAND_REGISTER:
5093 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5094 ErrInfo = "Illegal immediate value for operand.";
5095 return false;
5096 }
5097 break;
5110 break;
5112 break;
5113 break;
5127 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5128 ErrInfo = "Illegal immediate value for operand.";
5129 return false;
5130 }
5131 break;
5132 }
5134 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5135 ErrInfo = "Expected inline constant for operand.";
5136 return false;
5137 }
5138 break;
5142 break;
5147 // Check if this operand is an immediate.
5148 // FrameIndex operands will be replaced by immediates, so they are
5149 // allowed.
5150 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5151 ErrInfo = "Expected immediate, but got non-immediate";
5152 return false;
5153 }
5154 break;
5158 break;
5159 default:
5160 if (OpInfo.isGenericType())
5161 continue;
5162 break;
5163 }
5164
5165 if (!MO.isReg())
5166 continue;
5167 Register Reg = MO.getReg();
5168 if (!Reg)
5169 continue;
5170
5171 // FIXME: Ideally we would have separate instruction definitions with the
5172 // aligned register constraint.
5173 // FIXME: We do not verify inline asm operands, but custom inline asm
5174 // verification is broken anyway
5175 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5176 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5177 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5178 if (const TargetRegisterClass *SubRC =
5179 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5180 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5181 if (RC)
5182 RC = SubRC;
5183 }
5184 }
5185
5186 // Check that this is the aligned version of the class.
5187 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5188 ErrInfo = "Subtarget requires even aligned vector registers";
5189 return false;
5190 }
5191 }
5192
5193 if (RegClass != -1) {
5194 if (Reg.isVirtual())
5195 continue;
5196
5197 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5198 if (!RC->contains(Reg)) {
5199 ErrInfo = "Operand has incorrect register class.";
5200 return false;
5201 }
5202 }
5203 }
5204
5205 // Verify SDWA
5206 if (isSDWA(MI)) {
5207 if (!ST.hasSDWA()) {
5208 ErrInfo = "SDWA is not supported on this target";
5209 return false;
5210 }
5211
5212 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5213 AMDGPU::OpName::dst_sel}) {
5214 const MachineOperand *MO = getNamedOperand(MI, Op);
5215 if (!MO)
5216 continue;
5217 int64_t Imm = MO->getImm();
5218 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5219 ErrInfo = "Invalid SDWA selection";
5220 return false;
5221 }
5222 }
5223
5224 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5225
5226 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5227 if (OpIdx == -1)
5228 continue;
5229 const MachineOperand &MO = MI.getOperand(OpIdx);
5230
5231 if (!ST.hasSDWAScalar()) {
5232 // Only VGPRs on VI
5233 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5234 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5235 return false;
5236 }
5237 } else {
5238 // No immediates on GFX9
5239 if (!MO.isReg()) {
5240 ErrInfo =
5241 "Only reg allowed as operands in SDWA instructions on GFX9+";
5242 return false;
5243 }
5244 }
5245 }
5246
5247 if (!ST.hasSDWAOmod()) {
5248 // No omod allowed on VI
5249 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5250 if (OMod != nullptr &&
5251 (!OMod->isImm() || OMod->getImm() != 0)) {
5252 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5253 return false;
5254 }
5255 }
5256
5257 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5258 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5259 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5260 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5261 const MachineOperand *Src0ModsMO =
5262 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5263 unsigned Mods = Src0ModsMO->getImm();
5264 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5265 Mods & SISrcMods::SEXT) {
5266 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5267 return false;
5268 }
5269 }
5270
5271 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5272 if (isVOPC(BasicOpcode)) {
5273 if (!ST.hasSDWASdst() && DstIdx != -1) {
5274 // Only vcc allowed as dst on VI for VOPC
5275 const MachineOperand &Dst = MI.getOperand(DstIdx);
5276 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5277 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5278 return false;
5279 }
5280 } else if (!ST.hasSDWAOutModsVOPC()) {
5281 // No clamp allowed on GFX9 for VOPC
5282 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5283 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5284 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5285 return false;
5286 }
5287
5288 // No omod allowed on GFX9 for VOPC
5289 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5290 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5291 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5292 return false;
5293 }
5294 }
5295 }
5296
5297 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5298 if (DstUnused && DstUnused->isImm() &&
5299 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5300 const MachineOperand &Dst = MI.getOperand(DstIdx);
5301 if (!Dst.isReg() || !Dst.isTied()) {
5302 ErrInfo = "Dst register should have tied register";
5303 return false;
5304 }
5305
5306 const MachineOperand &TiedMO =
5307 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5308 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5309 ErrInfo =
5310 "Dst register should be tied to implicit use of preserved register";
5311 return false;
5312 }
5313 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5314 ErrInfo = "Dst register should use same physical register as preserved";
5315 return false;
5316 }
5317 }
5318 }
5319
5320 // Verify MIMG / VIMAGE / VSAMPLE
5321 if (isImage(Opcode) && !MI.mayStore()) {
5322 // Ensure that the return type used is large enough for all the options
5323 // being used. TFE/LWE require an extra result register.
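// [Editorial worked example, not part of the original source:] for a
// non-gather4 load with dmask = 0b1011, popcount gives 3 result dwords;
// with packed D16 (and no unpacked-D16 memory) that becomes ceil(3/2) = 2,
// and enabling TFE or LWE adds one more, so the vdata register class must
// cover at least 3 dwords in that configuration.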
5324 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5325 if (DMask) {
5326 uint64_t DMaskImm = DMask->getImm();
5327 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5328 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5329 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5330 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5331
5332 // Adjust for packed 16 bit values
5333 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5334 RegCount = divideCeil(RegCount, 2);
5335
5336 // Adjust if using LWE or TFE
5337 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5338 RegCount += 1;
5339
5340 const uint32_t DstIdx =
5341 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5342 const MachineOperand &Dst = MI.getOperand(DstIdx);
5343 if (Dst.isReg()) {
5344 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5345 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5346 if (RegCount > DstSize) {
5347 ErrInfo = "Image instruction returns too many registers for dst "
5348 "register class";
5349 return false;
5350 }
5351 }
5352 }
5353 }
5354
5355 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5356 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5357 unsigned ConstantBusCount = 0;
5358 bool UsesLiteral = false;
5359 const MachineOperand *LiteralVal = nullptr;
5360
5361 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5362 if (ImmIdx != -1) {
5363 ++ConstantBusCount;
5364 UsesLiteral = true;
5365 LiteralVal = &MI.getOperand(ImmIdx);
5366 }
5367
5368 SmallVector<Register, 2> SGPRsUsed;
5369 Register SGPRUsed;
5370
5371 // Only look at the true operands. Only a real operand can use the constant
5372 // bus, and we don't want to check pseudo-operands like the source modifier
5373 // flags.
5374 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5375 if (OpIdx == -1)
5376 continue;
5377 const MachineOperand &MO = MI.getOperand(OpIdx);
5378 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5379 if (MO.isReg()) {
5380 SGPRUsed = MO.getReg();
5381 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5382 ++ConstantBusCount;
5383 SGPRsUsed.push_back(SGPRUsed);
5384 }
5385 } else if (!MO.isFI()) { // Treat FI like a register.
5386 if (!UsesLiteral) {
5387 ++ConstantBusCount;
5388 UsesLiteral = true;
5389 LiteralVal = &MO;
5390 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5391 assert(isVOP2(MI) || isVOP3(MI));
5392 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5393 return false;
5394 }
5395 }
5396 }
5397 }
5398
5399 SGPRUsed = findImplicitSGPRRead(MI);
5400 if (SGPRUsed) {
5401 // Implicit uses may safely overlap true operands
5402 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5403 return !RI.regsOverlap(SGPRUsed, SGPR);
5404 })) {
5405 ++ConstantBusCount;
5406 SGPRsUsed.push_back(SGPRUsed);
5407 }
5408 }
5409
5410 // v_writelane_b32 is an exception from constant bus restriction:
5411 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
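// [Editorial sketch, not part of the original source:] with a constant bus
// limit of 1 (pre-GFX10), something like
//   V_ADD_F32_e64 %d, %sgpr0, %sgpr1
// counts two distinct SGPR reads and is rejected below, while reusing the
// same SGPR for both sources, or pairing one SGPR with a VGPR or an inline
// constant, stays within the budget.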
5412 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5413 Opcode != AMDGPU::V_WRITELANE_B32) {
5414 ErrInfo = "VOP* instruction violates constant bus restriction";
5415 return false;
5416 }
5417
5418 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5419 ErrInfo = "VOP3 instruction uses literal";
5420 return false;
5421 }
5422 }
5423
5424 // Special case for writelane - this can break the multiple constant bus rule,
5425 // but still can't use more than one SGPR register
5426 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5427 unsigned SGPRCount = 0;
5428 Register SGPRUsed;
5429
5430 for (int OpIdx : {Src0Idx, Src1Idx}) {
5431 if (OpIdx == -1)
5432 break;
5433
5434 const MachineOperand &MO = MI.getOperand(OpIdx);
5435
5436 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5437 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5438 if (MO.getReg() != SGPRUsed)
5439 ++SGPRCount;
5440 SGPRUsed = MO.getReg();
5441 }
5442 }
5443 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5444 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5445 return false;
5446 }
5447 }
5448 }
5449
5450 // Verify misc. restrictions on specific instructions.
5451 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5452 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5453 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5454 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5455 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5456 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5457 if (!compareMachineOp(Src0, Src1) &&
5458 !compareMachineOp(Src0, Src2)) {
5459 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5460 return false;
5461 }
5462 }
5463 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5464 SISrcMods::ABS) ||
5465 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5466 SISrcMods::ABS) ||
5467 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5468 SISrcMods::ABS)) {
5469 ErrInfo = "ABS not allowed in VOP3B instructions";
5470 return false;
5471 }
5472 }
5473
5474 if (isSOP2(MI) || isSOPC(MI)) {
5475 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5476 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5477
5478 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5479 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5480 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5481 !Src0.isIdenticalTo(Src1)) {
5482 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5483 return false;
5484 }
5485 }
5486
5487 if (isSOPK(MI)) {
5488 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5489 if (Desc.isBranch()) {
5490 if (!Op->isMBB()) {
5491 ErrInfo = "invalid branch target for SOPK instruction";
5492 return false;
5493 }
5494 } else {
5495 uint64_t Imm = Op->getImm();
5496 if (sopkIsZext(Opcode)) {
5497 if (!isUInt<16>(Imm)) {
5498 ErrInfo = "invalid immediate for SOPK instruction";
5499 return false;
5500 }
5501 } else {
5502 if (!isInt<16>(Imm)) {
5503 ErrInfo = "invalid immediate for SOPK instruction";
5504 return false;
5505 }
5506 }
5507 }
5508 }
5509
5510 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5511 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5512 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5513 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5514 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5515 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5516
5517 const unsigned StaticNumOps =
5518 Desc.getNumOperands() + Desc.implicit_uses().size();
5519 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5520
5521 // Allow additional implicit operands. This allows a fixup done by the post
5522 // RA scheduler where the main implicit operand is killed and implicit-defs
5523 // are added for sub-registers that remain live after this instruction.
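// [Editorial note, illustrative only:] for the destination form the checks
// below expect the static operands to be followed by an implicit def of the
// indexed register tuple plus an implicit use tied to it (two extra
// operands); the source form only needs the implicit use, which must be a
// super-register of src0 (or of vdst for the destination form).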
5524 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5525 ErrInfo = "missing implicit register operands";
5526 return false;
5527 }
5528
5529 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5530 if (IsDst) {
5531 if (!Dst->isUse()) {
5532 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5533 return false;
5534 }
5535
5536 unsigned UseOpIdx;
5537 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5538 UseOpIdx != StaticNumOps + 1) {
5539 ErrInfo = "movrel implicit operands should be tied";
5540 return false;
5541 }
5542 }
5543
5544 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5545 const MachineOperand &ImpUse
5546 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5547 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5548 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5549 ErrInfo = "src0 should be subreg of implicit vector use";
5550 return false;
5551 }
5552 }
5553
5554 // Make sure we aren't losing exec uses in the td files. This mostly requires
5555 // being careful when using let Uses to try to add other use registers.
5556 if (shouldReadExec(MI)) {
5557 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5558 ErrInfo = "VALU instruction does not implicitly read exec mask";
5559 return false;
5560 }
5561 }
5562
5563 if (isSMRD(MI)) {
5564 if (MI.mayStore() &&
5565 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5566 // The register offset form of scalar stores may only use m0 as the
5567 // soffset register.
5568 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5569 if (Soff && Soff->getReg() != AMDGPU::M0) {
5570 ErrInfo = "scalar stores must use m0 as offset register";
5571 return false;
5572 }
5573 }
5574 }
5575
5576 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5577 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5578 if (Offset->getImm() != 0) {
5579 ErrInfo = "subtarget does not support offsets in flat instructions";
5580 return false;
5581 }
5582 }
5583
5584 if (isDS(MI) && !ST.hasGDS()) {
5585 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5586 if (GDSOp && GDSOp->getImm() != 0) {
5587 ErrInfo = "GDS is not supported on this subtarget";
5588 return false;
5589 }
5590 }
5591
5592 if (isImage(MI)) {
5593 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5594 if (DimOp) {
5595 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5596 AMDGPU::OpName::vaddr0);
5597 AMDGPU::OpName RSrcOpName =
5598 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5599 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5600 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5601 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5602 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5603 const AMDGPU::MIMGDimInfo *Dim =
5604 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5605
5606 if (!Dim) {
5607 ErrInfo = "dim is out of range";
5608 return false;
5609 }
5610
5611 bool IsA16 = false;
5612 if (ST.hasR128A16()) {
5613 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5614 IsA16 = R128A16->getImm() != 0;
5615 } else if (ST.hasA16()) {
5616 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5617 IsA16 = A16->getImm() != 0;
5618 }
5619
5620 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5621
5622 unsigned AddrWords =
5623 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5624
5625 unsigned VAddrWords;
5626 if (IsNSA) {
5627 VAddrWords = RsrcIdx - VAddr0Idx;
5628 if (ST.hasPartialNSAEncoding() &&
5629 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5630 unsigned LastVAddrIdx = RsrcIdx - 1;
5631 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5632 }
5633 } else {
5634 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5635 if (AddrWords > 12)
5636 AddrWords = 16;
5637 }
5638
5639 if (VAddrWords != AddrWords) {
5640 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5641 << " but got " << VAddrWords << "\n");
5642 ErrInfo = "bad vaddr size";
5643 return false;
5644 }
5645 }
5646 }
5647
5648 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5649 if (DppCt) {
5650 using namespace AMDGPU::DPP;
5651
5652 unsigned DC = DppCt->getImm();
5653 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5654 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5655 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5656 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5657 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5658 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5659 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5660 ErrInfo = "Invalid dpp_ctrl value";
5661 return false;
5662 }
5663 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5664 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5665 ErrInfo = "Invalid dpp_ctrl value: "
5666 "wavefront shifts are not supported on GFX10+";
5667 return false;
5668 }
5669 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5670 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5671 ErrInfo = "Invalid dpp_ctrl value: "
5672 "broadcasts are not supported on GFX10+";
5673 return false;
5674 }
5675 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5676 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5677 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5678 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5679 !ST.hasGFX90AInsts()) {
5680 ErrInfo = "Invalid dpp_ctrl value: "
5681 "row_newbroadcast/row_share is not supported before "
5682 "GFX90A/GFX10";
5683 return false;
5684 }
5685 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5686 ErrInfo = "Invalid dpp_ctrl value: "
5687 "row_share and row_xmask are not supported before GFX10";
5688 return false;
5689 }
5690 }
5691
5692 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5693 !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
5694 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5695 ErrInfo = "Invalid dpp_ctrl value: "
5696 "DP ALU dpp only support row_newbcast";
5697 return false;
5698 }
5699 }
5700
5701 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5702 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5703 AMDGPU::OpName DataName =
5704 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5705 const MachineOperand *Data = getNamedOperand(MI, DataName);
5706 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5707 if (Data && !Data->isReg())
5708 Data = nullptr;
5709
5710 if (ST.hasGFX90AInsts()) {
5711 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5712 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5713 ErrInfo = "Invalid register class: "
5714 "vdata and vdst should be both VGPR or AGPR";
5715 return false;
5716 }
5717 if (Data && Data2 &&
5718 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5719 ErrInfo = "Invalid register class: "
5720 "both data operands should be VGPR or AGPR";
5721 return false;
5722 }
5723 } else {
5724 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5725 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5726 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5727 ErrInfo = "Invalid register class: "
5728 "agpr loads and stores not supported on this GPU";
5729 return false;
5730 }
5731 }
5732 }
5733
5734 if (ST.needsAlignedVGPRs()) {
5735 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5736 const MachineOperand *Op = getNamedOperand(MI, OpName);
5737 if (!Op)
5738 return true;
5739 Register Reg = Op->getReg();
5740 if (Reg.isPhysical())
5741 return !(RI.getHWRegIndex(Reg) & 1);
5742 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5743 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5744 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5745 };
5746
5747 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5748 Opcode == AMDGPU::DS_GWS_BARRIER) {
5749
5750 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5751 ErrInfo = "Subtarget requires even aligned vector registers "
5752 "for DS_GWS instructions";
5753 return false;
5754 }
5755 }
5756
5757 if (isMIMG(MI)) {
5758 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5759 ErrInfo = "Subtarget requires even aligned vector registers "
5760 "for vaddr operand of image instructions";
5761 return false;
5762 }
5763 }
5764 }
5765
5766 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5767 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5768 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5769 ErrInfo = "Invalid register class: "
5770 "v_accvgpr_write with an SGPR is not supported on this GPU";
5771 return false;
5772 }
5773 }
5774
5775 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5776 const MachineOperand &SrcOp = MI.getOperand(1);
5777 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5778 ErrInfo = "pseudo expects only physical SGPRs";
5779 return false;
5780 }
5781 }
5782
5783 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5784 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5785 if (!ST.hasScaleOffset()) {
5786 ErrInfo = "Subtarget does not support offset scaling";
5787 return false;
5788 }
5789 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5790 ErrInfo = "Instruction does not support offset scaling";
5791 return false;
5792 }
5793 }
5794 }
5795
5796 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5797 // information.
5798 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5799 for (unsigned I = 0; I < 3; ++I) {
5800 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5801 return false;
5802 }
5803 }
5804
5805 return true;
5806}
5807
5808// It is more readable to list mapped opcodes on the same line.
5809// clang-format off
5810
5811 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5812 switch (MI.getOpcode()) {
5813 default: return AMDGPU::INSTRUCTION_LIST_END;
5814 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5815 case AMDGPU::COPY: return AMDGPU::COPY;
5816 case AMDGPU::PHI: return AMDGPU::PHI;
5817 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5818 case AMDGPU::WQM: return AMDGPU::WQM;
5819 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5820 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5821 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5822 case AMDGPU::S_MOV_B32: {
5823 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5824 return MI.getOperand(1).isReg() ||
5825 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5826 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5827 }
5828 case AMDGPU::S_ADD_I32:
5829 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5830 case AMDGPU::S_ADDC_U32:
5831 return AMDGPU::V_ADDC_U32_e32;
5832 case AMDGPU::S_SUB_I32:
5833 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5834 // FIXME: These are not consistently handled, and selected when the carry is
5835 // used.
5836 case AMDGPU::S_ADD_U32:
5837 return AMDGPU::V_ADD_CO_U32_e32;
5838 case AMDGPU::S_SUB_U32:
5839 return AMDGPU::V_SUB_CO_U32_e32;
5840 case AMDGPU::S_ADD_U64_PSEUDO:
5841 return AMDGPU::V_ADD_U64_PSEUDO;
5842 case AMDGPU::S_SUB_U64_PSEUDO:
5843 return AMDGPU::V_SUB_U64_PSEUDO;
5844 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5845 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5846 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5847 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5848 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5849 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5850 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5851 case AMDGPU::S_XNOR_B32:
5852 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5853 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5854 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5855 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5856 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5857 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5858 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5859 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5860 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5861 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5862 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5863 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5864 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5865 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5866 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5867 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5868 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5869 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5870 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5871 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5872 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5873 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5874 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5875 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5876 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5877 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5878 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5879 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5880 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5881 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5882 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5883 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5884 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5885 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5886 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5887 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5888 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5889 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5890 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5891 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5892 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5893 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5894 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5895 case AMDGPU::S_CVT_F32_F16:
5896 case AMDGPU::S_CVT_HI_F32_F16:
5897 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5898 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5899 case AMDGPU::S_CVT_F16_F32:
5900 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5901 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5902 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5903 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5904 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5905 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5906 case AMDGPU::S_CEIL_F16:
5907 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5908 : AMDGPU::V_CEIL_F16_fake16_e64;
5909 case AMDGPU::S_FLOOR_F16:
5910 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5911 : AMDGPU::V_FLOOR_F16_fake16_e64;
5912 case AMDGPU::S_TRUNC_F16:
5913 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5914 : AMDGPU::V_TRUNC_F16_fake16_e64;
5915 case AMDGPU::S_RNDNE_F16:
5916 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5917 : AMDGPU::V_RNDNE_F16_fake16_e64;
5918 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5919 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5920 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5921 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5922 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5923 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5924 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5925 case AMDGPU::S_ADD_F16:
5926 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5927 : AMDGPU::V_ADD_F16_fake16_e64;
5928 case AMDGPU::S_SUB_F16:
5929 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5930 : AMDGPU::V_SUB_F16_fake16_e64;
5931 case AMDGPU::S_MIN_F16:
5932 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5933 : AMDGPU::V_MIN_F16_fake16_e64;
5934 case AMDGPU::S_MAX_F16:
5935 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5936 : AMDGPU::V_MAX_F16_fake16_e64;
5937 case AMDGPU::S_MINIMUM_F16:
5938 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5939 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5940 case AMDGPU::S_MAXIMUM_F16:
5941 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5942 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5943 case AMDGPU::S_MUL_F16:
5944 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5945 : AMDGPU::V_MUL_F16_fake16_e64;
5946 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5947 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5948 case AMDGPU::S_FMAC_F16:
5949 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5950 : AMDGPU::V_FMAC_F16_fake16_e64;
5951 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5952 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5953 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5954 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5955 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5956 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5957 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5958 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5959 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5960 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5961 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5962 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5963 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5964 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5965 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5966 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5967 case AMDGPU::S_CMP_LT_F16:
5968 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5969 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5970 case AMDGPU::S_CMP_EQ_F16:
5971 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5972 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5973 case AMDGPU::S_CMP_LE_F16:
5974 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5975 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5976 case AMDGPU::S_CMP_GT_F16:
5977 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5978 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5979 case AMDGPU::S_CMP_LG_F16:
5980 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5981 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5982 case AMDGPU::S_CMP_GE_F16:
5983 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5984 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5985 case AMDGPU::S_CMP_O_F16:
5986 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5987 : AMDGPU::V_CMP_O_F16_fake16_e64;
5988 case AMDGPU::S_CMP_U_F16:
5989 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5990 : AMDGPU::V_CMP_U_F16_fake16_e64;
5991 case AMDGPU::S_CMP_NGE_F16:
5992 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5993 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5994 case AMDGPU::S_CMP_NLG_F16:
5995 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5996 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5997 case AMDGPU::S_CMP_NGT_F16:
5998 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5999 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6000 case AMDGPU::S_CMP_NLE_F16:
6001 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6002 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6003 case AMDGPU::S_CMP_NEQ_F16:
6004 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6005 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6006 case AMDGPU::S_CMP_NLT_F16:
6007 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6008 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6009 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6010 case AMDGPU::V_S_EXP_F16_e64:
6011 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6012 : AMDGPU::V_EXP_F16_fake16_e64;
6013 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6014 case AMDGPU::V_S_LOG_F16_e64:
6015 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6016 : AMDGPU::V_LOG_F16_fake16_e64;
6017 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6018 case AMDGPU::V_S_RCP_F16_e64:
6019 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6020 : AMDGPU::V_RCP_F16_fake16_e64;
6021 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6022 case AMDGPU::V_S_RSQ_F16_e64:
6023 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6024 : AMDGPU::V_RSQ_F16_fake16_e64;
6025 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6026 case AMDGPU::V_S_SQRT_F16_e64:
6027 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6028 : AMDGPU::V_SQRT_F16_fake16_e64;
6029 }
6030 llvm_unreachable(
6031 "Unexpected scalar opcode without corresponding vector one!");
6032}
6033
6034// clang-format on
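// [Editorial usage sketch, hypothetical caller that is not part of this
// file:] a SALU-to-VALU rewrite would typically consult the table above and
// bail out on the sentinel value, e.g.
//   unsigned NewOpc = TII->getVALUOp(MI);
//   if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
//     return; // no direct VALU equivalent; handled specially elsewhere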
6035
6036 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6037 MachineBasicBlock &MBB,
6038 MachineBasicBlock::iterator MBBI,
6039 const DebugLoc &DL, Register Reg,
6040 bool IsSCCLive,
6041 SlotIndexes *Indexes) const {
6042 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6043 const SIInstrInfo *TII = ST.getInstrInfo();
6045 if (IsSCCLive) {
6046 // Insert two move instructions, one to save the original value of EXEC and
6047 // the other to turn on all bits in EXEC. This is required as we can't use
6048 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
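// [Editorial illustration, assuming a wave64 target; not from the source:]
// the pair built below corresponds to
//   s_mov_b64 <save-reg>, exec
//   s_mov_b64 exec, -1
// whereas the else branch folds both steps into a single SCC-clobbering
//   s_or_saveexec_b64 <save-reg>, -1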
6049 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6050 .addReg(LMC.ExecReg);
6051 auto FlipExecMI =
6052 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6053 if (Indexes) {
6054 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6055 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6056 }
6057 } else {
6058 auto SaveExec =
6059 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6060 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6061 if (Indexes)
6062 Indexes->insertMachineInstrInMaps(*SaveExec);
6063 }
6064}
6065
6066 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6067 MachineBasicBlock::iterator MBBI,
6068 const DebugLoc &DL, Register Reg,
6069 SlotIndexes *Indexes) const {
6071 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6072 .addReg(Reg, RegState::Kill);
6073 if (Indexes)
6074 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6075}
6076
6080 "Not a whole wave func");
6081 MachineBasicBlock &MBB = *MF.begin();
6082 for (MachineInstr &MI : MBB)
6083 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6084 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6085 return &MI;
6086
6087 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6088}
6089
6090 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6091 unsigned OpNo) const {
6092 const MCInstrDesc &Desc = get(MI.getOpcode());
6093 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6094 Desc.operands()[OpNo].RegClass == -1) {
6095 Register Reg = MI.getOperand(OpNo).getReg();
6096
6097 if (Reg.isVirtual()) {
6098 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6099 return MRI.getRegClass(Reg);
6100 }
6101 return RI.getPhysRegBaseClass(Reg);
6102 }
6103
6104 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6105 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6106}
6107
6108 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6109 MachineBasicBlock::iterator I = MI;
6110 MachineBasicBlock *MBB = MI.getParent();
6111 MachineOperand &MO = MI.getOperand(OpIdx);
6112 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6113 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6114 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6115 unsigned Size = RI.getRegSizeInBits(*RC);
6116 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6117 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6118 : AMDGPU::V_MOV_B32_e32;
6119 if (MO.isReg())
6120 Opcode = AMDGPU::COPY;
6121 else if (RI.isSGPRClass(RC))
6122 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6123
6124 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6125 Register Reg = MRI.createVirtualRegister(VRC);
6126 DebugLoc DL = MBB->findDebugLoc(I);
6127 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6128 MO.ChangeToRegister(Reg, false);
6129}
6130
6131 Register SIInstrInfo::buildExtractSubReg(
6132 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6133 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6134 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6135 if (!SuperReg.getReg().isVirtual())
6136 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6137
6138 MachineBasicBlock *MBB = MI->getParent();
6139 const DebugLoc &DL = MI->getDebugLoc();
6140 Register SubReg = MRI.createVirtualRegister(SubRC);
6141
6142 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6143 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6144 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6145 return SubReg;
6146}
6147
6148 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6149 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6150 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6151 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6152 if (Op.isImm()) {
6153 if (SubIdx == AMDGPU::sub0)
6154 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6155 if (SubIdx == AMDGPU::sub1)
6156 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6157
6158 llvm_unreachable("Unhandled register index for immediate");
6159 }
6160
6161 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6162 SubIdx, SubRC);
6163 return MachineOperand::CreateReg(SubReg, false);
6164}
6165
6166// Change the order of operands from (0, 1, 2) to (0, 2, 1)
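// [Editorial note, illustrative only:] callers use this when switching to a
// reversed opcode, e.g. rewriting S_LSHL_B32 as V_LSHLREV_B32 requires the
// shift amount and the value to trade places, which is exactly the
// (0, 1, 2) -> (0, 2, 1) reordering performed here.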
6167void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6168 assert(Inst.getNumExplicitOperands() == 3);
6169 MachineOperand Op1 = Inst.getOperand(1);
6170 Inst.removeOperand(1);
6171 Inst.addOperand(Op1);
6172}
6173
6174 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6175 const MCOperandInfo &OpInfo,
6176 const MachineOperand &MO) const {
6177 if (!MO.isReg())
6178 return false;
6179
6180 Register Reg = MO.getReg();
6181
6182 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6183 if (Reg.isPhysical())
6184 return DRC->contains(Reg);
6185
6186 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6187
6188 if (MO.getSubReg()) {
6189 const MachineFunction *MF = MO.getParent()->getMF();
6190 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6191 if (!SuperRC)
6192 return false;
6193 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6194 }
6195
6196 return RI.getCommonSubClass(DRC, RC) != nullptr;
6197}
6198
6199 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6200 const MachineOperand &MO) const {
6201 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6202 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6203 unsigned Opc = MI.getOpcode();
6204
6205 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6206 // information.
6207 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6208 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6209 constexpr AMDGPU::OpName OpNames[] = {
6210 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6211
6212 for (auto [I, OpName] : enumerate(OpNames)) {
6213 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6214 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6215 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6216 return false;
6217 }
6218 }
6219
6220 if (!isLegalRegOperand(MRI, OpInfo, MO))
6221 return false;
6222
6223 // check Accumulate GPR operand
6224 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6225 if (IsAGPR && !ST.hasMAIInsts())
6226 return false;
6227 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6228 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6229 return false;
6230 // Atomics should have both vdst and vdata either vgpr or agpr.
6231 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6232 const int DataIdx = AMDGPU::getNamedOperandIdx(
6233 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6234 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6235 MI.getOperand(DataIdx).isReg() &&
6236 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6237 return false;
6238 if ((int)OpIdx == DataIdx) {
6239 if (VDstIdx != -1 &&
6240 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6241 return false;
6242 // DS instructions with 2 src operands also must have tied RC.
6243 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6244 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6245 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6246 return false;
6247 }
6248
6249 // Check V_ACCVGPR_WRITE_B32_e64
6250 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6251 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6252 RI.isSGPRReg(MRI, MO.getReg()))
6253 return false;
6254 return true;
6255}
6256
6257 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6258 const MCOperandInfo &OpInfo,
6259 const MachineOperand &MO) const {
6260 if (MO.isReg())
6261 return isLegalRegOperand(MRI, OpInfo, MO);
6262
6263 // Handle non-register types that are treated like immediates.
6264 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6265 return true;
6266}
6267
6268 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6269 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6270 const MachineOperand *MO) const {
6271 constexpr unsigned NumOps = 3;
6272 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6273 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6274 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6275 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6276
6277 assert(SrcN < NumOps);
6278
6279 if (!MO) {
6280 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6281 if (SrcIdx == -1)
6282 return true;
6283 MO = &MI.getOperand(SrcIdx);
6284 }
6285
6286 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6287 return true;
6288
6289 int ModsIdx =
6290 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6291 if (ModsIdx == -1)
6292 return true;
6293
6294 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6295 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6296 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6297
6298 return !OpSel && !OpSelHi;
6299}
6300
6301 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6302 const MachineOperand *MO) const {
6303 const MachineFunction &MF = *MI.getMF();
6304 const MachineRegisterInfo &MRI = MF.getRegInfo();
6305 const MCInstrDesc &InstDesc = MI.getDesc();
6306 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6307 int64_t RegClass = getOpRegClassID(OpInfo);
6308 const TargetRegisterClass *DefinedRC =
6309 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6310 if (!MO)
6311 MO = &MI.getOperand(OpIdx);
6312
6313 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6314
6315 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6316 const MachineOperand *UsedLiteral = nullptr;
6317
6318 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6319 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6320
6321 // TODO: Be more permissive with frame indexes.
6322 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6323 if (!LiteralLimit--)
6324 return false;
6325
6326 UsedLiteral = MO;
6327 }
6328
6329 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6330 if (MO->isReg())
6331 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6332
6333 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6334 if (i == OpIdx)
6335 continue;
6336 const MachineOperand &Op = MI.getOperand(i);
6337 if (Op.isReg()) {
6338 if (Op.isUse()) {
6339 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6340 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6341 if (--ConstantBusLimit <= 0)
6342 return false;
6343 }
6344 }
6345 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6346 !isInlineConstant(Op, InstDesc.operands()[i])) {
6347 // The same literal may be used multiple times.
6348 if (!UsedLiteral)
6349 UsedLiteral = &Op;
6350 else if (UsedLiteral->isIdenticalTo(Op))
6351 continue;
6352
6353 if (!LiteralLimit--)
6354 return false;
6355 if (--ConstantBusLimit <= 0)
6356 return false;
6357 }
6358 }
6359 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6360 // There can be at most one literal operand, but it can be repeated.
6361 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6362 if (i == OpIdx)
6363 continue;
6364 const MachineOperand &Op = MI.getOperand(i);
6365 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6366 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6367 !Op.isIdenticalTo(*MO))
6368 return false;
6369
6370 // Do not fold a non-inlineable, non-register operand into an
6371 // instruction that already has a frame index. The frame index handling
6372 // code does not cope well with a frame index co-existing with another
6373 // non-register operand, unless that operand is an inlineable immediate.
6374 if (Op.isFI())
6375 return false;
6376 }
6377 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6378 isF16PseudoScalarTrans(MI.getOpcode())) {
6379 return false;
6380 }
6381
6382 if (MO->isReg()) {
6383 if (!DefinedRC)
6384 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6385 return isLegalRegOperand(MI, OpIdx, *MO);
6386 }
6387
6388 if (MO->isImm()) {
6389 uint64_t Imm = MO->getImm();
6390 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6391 bool Is64BitOp = Is64BitFPOp ||
6392 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6393 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6394 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6395 if (Is64BitOp &&
6396 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6397 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6398 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6399 return false;
6400
6401 // FIXME: We can use sign extended 64-bit literals, but only for signed
6402 // operands. At the moment we do not know if an operand is signed.
6403 // Such operand will be encoded as its low 32 bits and then either
6404 // correctly sign extended or incorrectly zero extended by HW.
6405 // If 64-bit literals are supported and the literal will be encoded
6406 // as full 64 bit we still can use it.
6407 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6408 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6409 return false;
6410 }
6411 }
6412
6413 // Handle non-register types that are treated like immediates.
6414 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6415
6416 if (!DefinedRC) {
6417 // This operand expects an immediate.
6418 return true;
6419 }
6420
6421 return isImmOperandLegal(MI, OpIdx, *MO);
6422}
6423
6425 bool IsGFX950Only = ST.hasGFX950Insts();
6426 bool IsGFX940Only = ST.hasGFX940Insts();
6427
6428 if (!IsGFX950Only && !IsGFX940Only)
6429 return false;
6430
6431 if (!isVALU(MI))
6432 return false;
6433
6434 // V_COS, V_EXP, V_RCP, etc.
6435 if (isTRANS(MI))
6436 return true;
6437
6438 // DOT2, DOT2C, DOT4, etc.
6439 if (isDOT(MI))
6440 return true;
6441
6442 // MFMA, SMFMA
6443 if (isMFMA(MI))
6444 return true;
6445
6446 unsigned Opcode = MI.getOpcode();
6447 switch (Opcode) {
6448 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6449 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6450 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6451 case AMDGPU::V_MQSAD_U32_U8_e64:
6452 case AMDGPU::V_PK_ADD_F16:
6453 case AMDGPU::V_PK_ADD_F32:
6454 case AMDGPU::V_PK_ADD_I16:
6455 case AMDGPU::V_PK_ADD_U16:
6456 case AMDGPU::V_PK_ASHRREV_I16:
6457 case AMDGPU::V_PK_FMA_F16:
6458 case AMDGPU::V_PK_FMA_F32:
6459 case AMDGPU::V_PK_FMAC_F16_e32:
6460 case AMDGPU::V_PK_FMAC_F16_e64:
6461 case AMDGPU::V_PK_LSHLREV_B16:
6462 case AMDGPU::V_PK_LSHRREV_B16:
6463 case AMDGPU::V_PK_MAD_I16:
6464 case AMDGPU::V_PK_MAD_U16:
6465 case AMDGPU::V_PK_MAX_F16:
6466 case AMDGPU::V_PK_MAX_I16:
6467 case AMDGPU::V_PK_MAX_U16:
6468 case AMDGPU::V_PK_MIN_F16:
6469 case AMDGPU::V_PK_MIN_I16:
6470 case AMDGPU::V_PK_MIN_U16:
6471 case AMDGPU::V_PK_MOV_B32:
6472 case AMDGPU::V_PK_MUL_F16:
6473 case AMDGPU::V_PK_MUL_F32:
6474 case AMDGPU::V_PK_MUL_LO_U16:
6475 case AMDGPU::V_PK_SUB_I16:
6476 case AMDGPU::V_PK_SUB_U16:
6477 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6478 return true;
6479 default:
6480 return false;
6481 }
6482}
6483
6484 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6485 MachineInstr &MI) const {
6486 unsigned Opc = MI.getOpcode();
6487 const MCInstrDesc &InstrDesc = get(Opc);
6488
6489 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6490 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6491
6492 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6493 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6494
6495 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6496 // we need to only have one constant bus use before GFX10.
6497 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6498 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6499 RI.isSGPRReg(MRI, Src0.getReg()))
6500 legalizeOpWithMove(MI, Src0Idx);
6501
6502 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6503 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6504 // src0/src1 with V_READFIRSTLANE.
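    // [Editorial illustration with hypothetical virtual registers, not from
    // the source:] an input such as
    //   V_WRITELANE_B32 %dst, %vgpr_val, %vgpr_lane, ...
    // is rewritten so each offending VGPR source goes through
    //   %sreg = V_READFIRSTLANE_B32 %vgpr_val
    // leaving only SGPR or immediate inputs on the final instruction.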
6505 if (Opc == AMDGPU::V_WRITELANE_B32) {
6506 const DebugLoc &DL = MI.getDebugLoc();
6507 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6508 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6509 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6510 .add(Src0);
6511 Src0.ChangeToRegister(Reg, false);
6512 }
6513 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6514 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6515 const DebugLoc &DL = MI.getDebugLoc();
6516 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6517 .add(Src1);
6518 Src1.ChangeToRegister(Reg, false);
6519 }
6520 return;
6521 }
6522
6523 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6524 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6525 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6526 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6527 legalizeOpWithMove(MI, Src2Idx);
6528 }
6529
6530 // VOP2 src0 instructions support all operand types, so we don't need to check
6531 // their legality. If src1 is already legal, we don't need to do anything.
6532 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6533 return;
6534
6535 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6536 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6537 // select is uniform.
6538 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6539 RI.isVGPR(MRI, Src1.getReg())) {
6540 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6541 const DebugLoc &DL = MI.getDebugLoc();
6542 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6543 .add(Src1);
6544 Src1.ChangeToRegister(Reg, false);
6545 return;
6546 }
6547
6548 // We do not use commuteInstruction here because it is too aggressive and will
6549 // commute if it is possible. We only want to commute here if it improves
6550 // legality. This can be called a fairly large number of times so don't waste
6551 // compile time pointlessly swapping and checking legality again.
6552 if (HasImplicitSGPR || !MI.isCommutable()) {
6553 legalizeOpWithMove(MI, Src1Idx);
6554 return;
6555 }
6556
6557 // If src0 can be used as src1, commuting will make the operands legal.
6558 // Otherwise we have to give up and insert a move.
6559 //
6560 // TODO: Other immediate-like operand kinds could be commuted if there was a
6561 // MachineOperand::ChangeTo* for them.
6562 if ((!Src1.isImm() && !Src1.isReg()) ||
6563 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6564 legalizeOpWithMove(MI, Src1Idx);
6565 return;
6566 }
6567
6568 int CommutedOpc = commuteOpcode(MI);
6569 if (CommutedOpc == -1) {
6570 legalizeOpWithMove(MI, Src1Idx);
6571 return;
6572 }
6573
6574 MI.setDesc(get(CommutedOpc));
6575
6576 Register Src0Reg = Src0.getReg();
6577 unsigned Src0SubReg = Src0.getSubReg();
6578 bool Src0Kill = Src0.isKill();
6579
6580 if (Src1.isImm())
6581 Src0.ChangeToImmediate(Src1.getImm());
6582 else if (Src1.isReg()) {
6583 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6584 Src0.setSubReg(Src1.getSubReg());
6585 } else
6586 llvm_unreachable("Should only have register or immediate operands");
6587
6588 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6589 Src1.setSubReg(Src0SubReg);
6590 fixImplicitOperands(MI);
6591}
6592
6593// Legalize VOP3 operands. All operand types are supported for any operand
6594// but only one literal constant and only starting from GFX10.
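// [Editorial sketch, not part of the original source:] conceptually the loop
// below walks src0..src2 with a budget: pre-GFX10 that budget is one
// constant-bus slot and no literals, so a second distinct SGPR or any
// literal is rewritten via legalizeOpWithMove(); GFX10+ allows two SGPRs and
// (with VOP3 literals) a single literal before the same fallback triggers.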
6595 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6596 MachineInstr &MI) const {
6597 unsigned Opc = MI.getOpcode();
6598
6599 int VOP3Idx[3] = {
6600 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6601 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6602 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6603 };
6604
6605 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6606 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6607 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6608 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6609 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6610 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6611 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6612 // src1 and src2 must be scalar
6613 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6614 const DebugLoc &DL = MI.getDebugLoc();
6615 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6616 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6617 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6618 .add(Src1);
6619 Src1.ChangeToRegister(Reg, false);
6620 }
6621 if (VOP3Idx[2] != -1) {
6622 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6623 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6624 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6625 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6626 .add(Src2);
6627 Src2.ChangeToRegister(Reg, false);
6628 }
6629 }
6630 }
6631
6632 // Find the one SGPR operand we are allowed to use.
6633 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6634 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6635 SmallDenseSet<unsigned> SGPRsUsed;
6636 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6637 if (SGPRReg) {
6638 SGPRsUsed.insert(SGPRReg);
6639 --ConstantBusLimit;
6640 }
6641
6642 for (int Idx : VOP3Idx) {
6643 if (Idx == -1)
6644 break;
6645 MachineOperand &MO = MI.getOperand(Idx);
6646
6647 if (!MO.isReg()) {
6648 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6649 continue;
6650
6651 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6652 --LiteralLimit;
6653 --ConstantBusLimit;
6654 continue;
6655 }
6656
6657 --LiteralLimit;
6658 --ConstantBusLimit;
6659 legalizeOpWithMove(MI, Idx);
6660 continue;
6661 }
6662
6663 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6664 continue; // VGPRs are legal
6665
6666 // We can use one SGPR in each VOP3 instruction prior to GFX10
6667 // and two starting from GFX10.
6668 if (SGPRsUsed.count(MO.getReg()))
6669 continue;
6670 if (ConstantBusLimit > 0) {
6671 SGPRsUsed.insert(MO.getReg());
6672 --ConstantBusLimit;
6673 continue;
6674 }
6675
6676 // If we make it this far, then the operand is not legal and we must
6677 // legalize it.
6678 legalizeOpWithMove(MI, Idx);
6679 }
6680
6681 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6682 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6683 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6684 legalizeOpWithMove(MI, VOP3Idx[2]);
6685
6686 // Fix the register class of packed FP32 instructions on gfx12+. See
6687 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6688 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6689 for (unsigned I = 0; I < 3; ++I) {
6690 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6691 legalizeOpWithMove(MI, VOP3Idx[I]);
6692 }
6693 }
6694}
6695
6696 Register SIInstrInfo::readlaneVGPRToSGPR(
6697 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6698 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6699 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6700 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6701 if (DstRC)
6702 SRC = RI.getCommonSubClass(SRC, DstRC);
6703
6704 Register DstReg = MRI.createVirtualRegister(SRC);
6705 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6706
6707 if (RI.hasAGPRs(VRC)) {
6708 VRC = RI.getEquivalentVGPRClass(VRC);
6709 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6710 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6711 get(TargetOpcode::COPY), NewSrcReg)
6712 .addReg(SrcReg);
6713 SrcReg = NewSrcReg;
6714 }
6715
6716 if (SubRegs == 1) {
6717 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6718 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6719 .addReg(SrcReg);
6720 return DstReg;
6721 }
6722
6723 SmallVector<Register, 8> SRegs;
6724 for (unsigned i = 0; i < SubRegs; ++i) {
6725 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6726 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6727 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6728 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6729 SRegs.push_back(SGPR);
6730 }
6731
6732 MachineInstrBuilder MIB =
6733 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6734 get(AMDGPU::REG_SEQUENCE), DstReg);
6735 for (unsigned i = 0; i < SubRegs; ++i) {
6736 MIB.addReg(SRegs[i]);
6737 MIB.addImm(RI.getSubRegFromChannel(i));
6738 }
6739 return DstReg;
6740}
6741
6742 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6743 MachineInstr &MI) const {
6744
6745 // If the pointer is stored in VGPRs, then we need to move it to
6746 // SGPRs using v_readfirstlane. This is safe because we only select
6747 // loads with uniform pointers to SMRD instructions, so we know the
6748 // pointer value is uniform.
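  // [Editorial illustration with hypothetical MIR, not from the source:] a
  // 64-bit sbase living in VGPRs is rebuilt in SGPRs roughly as
  //   %lo = V_READFIRSTLANE_B32 %ptr.sub0
  //   %hi = V_READFIRSTLANE_B32 %ptr.sub1
  //   %sbase = REG_SEQUENCE %lo, sub0, %hi, sub1
  // by readlaneVGPRToSGPR() before being reinstalled on the instruction.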
6749 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6750 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6751 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6752 SBase->setReg(SGPR);
6753 }
6754 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6755 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6756 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6757 SOff->setReg(SGPR);
6758 }
6759}
6760
6761 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6762 unsigned Opc = Inst.getOpcode();
6763 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6764 if (OldSAddrIdx < 0)
6765 return false;
6766
6767 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6768
6769 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6770 if (NewOpc < 0)
6771 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6772 if (NewOpc < 0)
6773 return false;
6774
6775 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6776 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6777 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6778 return false;
6779
6780 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6781 if (NewVAddrIdx < 0)
6782 return false;
6783
6784 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6785
6786 // Check vaddr, it shall be zero or absent.
6787 MachineInstr *VAddrDef = nullptr;
6788 if (OldVAddrIdx >= 0) {
6789 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6790 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6791 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6792 !VAddrDef->getOperand(1).isImm() ||
6793 VAddrDef->getOperand(1).getImm() != 0)
6794 return false;
6795 }
6796
6797 const MCInstrDesc &NewDesc = get(NewOpc);
6798 Inst.setDesc(NewDesc);
6799
6800 // Callers expect iterator to be valid after this call, so modify the
6801 // instruction in place.
6802 if (OldVAddrIdx == NewVAddrIdx) {
6803 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6804 // Clear use list from the old vaddr holding a zero register.
6805 MRI.removeRegOperandFromUseList(&NewVAddr);
6806 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6807 Inst.removeOperand(OldSAddrIdx);
6808 // Update the use list with the pointer we have just moved from vaddr to
6809 // saddr position. Otherwise new vaddr will be missing from the use list.
6810 MRI.removeRegOperandFromUseList(&NewVAddr);
6811 MRI.addRegOperandToUseList(&NewVAddr);
6812 } else {
6813 assert(OldSAddrIdx == NewVAddrIdx);
6814
6815 if (OldVAddrIdx >= 0) {
6816 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6817 AMDGPU::OpName::vdst_in);
6818
6819 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6820 // it asserts. Untie the operands for now and retie them afterwards.
6821 if (NewVDstIn != -1) {
6822 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6823 Inst.untieRegOperand(OldVDstIn);
6824 }
6825
6826 Inst.removeOperand(OldVAddrIdx);
6827
6828 if (NewVDstIn != -1) {
6829 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6830 Inst.tieOperands(NewVDst, NewVDstIn);
6831 }
6832 }
6833 }
6834
6835 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6836 VAddrDef->eraseFromParent();
6837
6838 return true;
6839}
6840
6841// FIXME: Remove this when SelectionDAG is obsoleted.
6842void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6843 MachineInstr &MI) const {
6844 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6845 return;
6846
6847 // Fix up SGPR operands that are in VGPRs. We only select these when the DAG
6848 // divergence thinks they are uniform, so a readfirstlane should be valid.
6849 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6850 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6851 return;
6852
6853 if (moveFlatAddrToVGPR(MI))
6854 return;
6855
6856 const TargetRegisterClass *DeclaredRC =
6857 getRegClass(MI.getDesc(), SAddr->getOperandNo());
6858
6859 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6860 SAddr->setReg(ToSGPR);
6861}
6862
6863void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6864 MachineBasicBlock::iterator I,
6865 const TargetRegisterClass *DstRC,
6866 MachineOperand &Op,
6867 MachineRegisterInfo &MRI,
6868 const DebugLoc &DL) const {
6869 Register OpReg = Op.getReg();
6870 unsigned OpSubReg = Op.getSubReg();
6871
6872 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6873 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6874
6875 // Check if operand is already the correct register class.
6876 if (DstRC == OpRC)
6877 return;
6878
6879 Register DstReg = MRI.createVirtualRegister(DstRC);
6880 auto Copy =
6881 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6882 Op.setReg(DstReg);
6883
6884 MachineInstr *Def = MRI.getVRegDef(OpReg);
6885 if (!Def)
6886 return;
6887
6888 // Try to eliminate the copy if it is copying an immediate value.
6889 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6890 foldImmediate(*Copy, *Def, OpReg, &MRI);
6891
6892 bool ImpDef = Def->isImplicitDef();
6893 while (!ImpDef && Def && Def->isCopy()) {
6894 if (Def->getOperand(1).getReg().isPhysical())
6895 break;
6896 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6897 ImpDef = Def && Def->isImplicitDef();
6898 }
6899 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6900 !ImpDef)
6901 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6902}
6903
6904// Emit the actual waterfall loop, executing the wrapped instruction for each
6905// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6906// iteration, in the worst case we execute 64 (once per lane).
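// Schematically (exact opcodes depend on the wave size / lane-mask width),
// each iteration of the loop emitted below does, per ScalarOp:
//   s    = V_READFIRSTLANE_B32 of each 32-bit piece of the VGPR operand
//   cond = V_CMP_EQ_U32/U64(s, operand), ANDed across all ScalarOps
// then saves EXEC, restricts EXEC to the matching lanes, runs the body, and
// clears those lanes from EXEC before branching back via SI_WATERFALL_LOOP.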
6907static void
6908emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
6909 MachineRegisterInfo &MRI,
6910 MachineBasicBlock &LoopBB,
6911 MachineBasicBlock &BodyBB,
6912 const DebugLoc &DL,
6913 ArrayRef<MachineOperand *> ScalarOps) {
6914 MachineFunction &MF = *LoopBB.getParent();
6915 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6916 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6918 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6919
6921 Register CondReg;
6922
6923 for (MachineOperand *ScalarOp : ScalarOps) {
6924 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6925 unsigned NumSubRegs = RegSize / 32;
6926 Register VScalarOp = ScalarOp->getReg();
6927
6928 if (NumSubRegs == 1) {
6929 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6930
6931 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6932 .addReg(VScalarOp);
6933
6934 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6935
6936 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6937 .addReg(CurReg)
6938 .addReg(VScalarOp);
6939
6940 // Combine the comparison results with AND.
6941 if (!CondReg) // First.
6942 CondReg = NewCondReg;
6943 else { // If not the first, we create an AND.
6944 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6945 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6946 .addReg(CondReg)
6947 .addReg(NewCondReg);
6948 CondReg = AndReg;
6949 }
6950
6951 // Update ScalarOp operand to use the SGPR ScalarOp.
6952 ScalarOp->setReg(CurReg);
6953 ScalarOp->setIsKill();
6954 } else {
6955 SmallVector<Register, 8> ReadlanePieces;
6956 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6957 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6958 "Unhandled register size");
6959
6960 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6961 Register CurRegLo =
6962 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6963 Register CurRegHi =
6964 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6965
6966 // Read the next variant <- also loop target.
6967 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6968 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6969
6970 // Read the high half of the next variant.
6971 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6972 .addReg(VScalarOp, VScalarOpUndef,
6973 TRI->getSubRegFromChannel(Idx + 1));
6974
6975 ReadlanePieces.push_back(CurRegLo);
6976 ReadlanePieces.push_back(CurRegHi);
6977
6978 // Comparison is to be done as 64-bit.
6979 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6980 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6981 .addReg(CurRegLo)
6982 .addImm(AMDGPU::sub0)
6983 .addReg(CurRegHi)
6984 .addImm(AMDGPU::sub1);
6985
6986 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6987 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6988 NewCondReg)
6989 .addReg(CurReg);
6990 if (NumSubRegs <= 2)
6991 Cmp.addReg(VScalarOp);
6992 else
6993 Cmp.addReg(VScalarOp, VScalarOpUndef,
6994 TRI->getSubRegFromChannel(Idx, 2));
6995
6996 // Combine the comparison results with AND.
6997 if (!CondReg) // First.
6998 CondReg = NewCondReg;
6999 else { // If not the first, we create an AND.
7000 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7001 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7002 .addReg(CondReg)
7003 .addReg(NewCondReg);
7004 CondReg = AndReg;
7005 }
7006 } // End for loop.
7007
7008 const auto *SScalarOpRC =
7009 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7010 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7011
7012 // Build scalar ScalarOp.
7013 auto Merge =
7014 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7015 unsigned Channel = 0;
7016 for (Register Piece : ReadlanePieces) {
7017 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7018 }
7019
7020 // Update ScalarOp operand to use the SGPR ScalarOp.
7021 ScalarOp->setReg(SScalarOp);
7022 ScalarOp->setIsKill();
7023 }
7024 }
7025
7026 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7027 MRI.setSimpleHint(SaveExec, CondReg);
7028
7029 // Update EXEC to matching lanes, saving original to SaveExec.
7030 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7031 .addReg(CondReg, RegState::Kill);
7032
7033 // The original instruction is here; we insert the terminators after it.
7034 I = BodyBB.end();
7035
7036 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7037 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7038 .addReg(LMC.ExecReg)
7039 .addReg(SaveExec);
7040
7041 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7042}
7043
7044// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7045// with SGPRs by iterating over all unique values across all lanes.
7046// Returns the loop basic block that now contains \p MI.
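// The containing block is split into MBB -> LoopBB -> BodyBB -> RemainderBB,
// with BodyBB also branching back to LoopBB until every lane's value has been
// handled.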
7047static MachineBasicBlock *
7051 MachineBasicBlock::iterator Begin = nullptr,
7052 MachineBasicBlock::iterator End = nullptr) {
7053 MachineBasicBlock &MBB = *MI.getParent();
7054 MachineFunction &MF = *MBB.getParent();
7055 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7056 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7057 MachineRegisterInfo &MRI = MF.getRegInfo();
7058 if (!Begin.isValid())
7059 Begin = &MI;
7060 if (!End.isValid()) {
7061 End = &MI;
7062 ++End;
7063 }
7064 const DebugLoc &DL = MI.getDebugLoc();
7066 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7067
7068 // Save SCC. Waterfall Loop may overwrite SCC.
7069 Register SaveSCCReg;
7070
7071 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7072 // rather than doing an unlimited scan everywhere.
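 // SCC is saved by materializing it into an SGPR with S_CSELECT_B32 1, 0
 // here, and restored later by comparing that SGPR against 0 with
 // S_CMP_LG_U32 in RemainderBB.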
7073 bool SCCNotDead =
7074 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7075 std::numeric_limits<unsigned>::max()) !=
7076 MachineBasicBlock::LQR_Dead;
7077 if (SCCNotDead) {
7078 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7079 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7080 .addImm(1)
7081 .addImm(0);
7082 }
7083
7084 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7085
7086 // Save the EXEC mask
7087 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7088
7089 // Killed uses in the instruction we are waterfalling around will be
7090 // incorrect due to the added control-flow.
7091 MachineBasicBlock::iterator AfterMI = MI;
7092 ++AfterMI;
7093 for (auto I = Begin; I != AfterMI; I++) {
7094 for (auto &MO : I->all_uses())
7095 MRI.clearKillFlags(MO.getReg());
7096 }
7097
7098 // To insert the loop we need to split the block. Move everything after this
7099 // point to a new block, and insert a new empty block between the two.
7100 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7101 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7102 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7103 MachineFunction::iterator MBBI(MBB);
7104 ++MBBI;
7105
7106 MF.insert(MBBI, LoopBB);
7107 MF.insert(MBBI, BodyBB);
7108 MF.insert(MBBI, RemainderBB);
7109
7110 LoopBB->addSuccessor(BodyBB);
7111 BodyBB->addSuccessor(LoopBB);
7112 BodyBB->addSuccessor(RemainderBB);
7113
7114 // Move the instructions in the range [Begin, End) to BodyBB, and the
7115 // remainder of the block to RemainderBB.
7116 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7117 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7118 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7119
7120 MBB.addSuccessor(LoopBB);
7121
7122 // Update dominators. We know that MBB immediately dominates LoopBB, that
7123 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7124 // RemainderBB. RemainderBB immediately dominates all of the successors
7125 // transferred to it from MBB that MBB used to properly dominate.
7126 if (MDT) {
7127 MDT->addNewBlock(LoopBB, &MBB);
7128 MDT->addNewBlock(BodyBB, LoopBB);
7129 MDT->addNewBlock(RemainderBB, BodyBB);
7130 for (auto &Succ : RemainderBB->successors()) {
7131 if (MDT->properlyDominates(&MBB, Succ)) {
7132 MDT->changeImmediateDominator(Succ, RemainderBB);
7133 }
7134 }
7135 }
7136
7137 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7138
7139 MachineBasicBlock::iterator First = RemainderBB->begin();
7140 // Restore SCC
7141 if (SCCNotDead) {
7142 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7143 .addReg(SaveSCCReg, RegState::Kill)
7144 .addImm(0);
7145 }
7146
7147 // Restore the EXEC mask
7148 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7149 .addReg(SaveExec);
7150 return BodyBB;
7151}
7152
7153// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
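// The replacement descriptor is { Zero64, RSRC_DATA_FORMAT[31:0],
// RSRC_DATA_FORMAT[63:32] }, i.e. a null base pointer with the default
// buffer data format, assembled by the REG_SEQUENCE below.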
7154static std::tuple<unsigned, unsigned>
7155extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7156 MachineBasicBlock &MBB = *MI.getParent();
7157 MachineFunction &MF = *MBB.getParent();
7158 MachineRegisterInfo &MRI = MF.getRegInfo();
7159
7160 // Extract the ptr from the resource descriptor.
7161 unsigned RsrcPtr =
7162 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7163 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7164
7165 // Create an empty resource descriptor
7166 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7167 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7168 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7169 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7170 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7171
7172 // Zero64 = 0
7173 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7174 .addImm(0);
7175
7176 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7177 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7178 .addImm(Lo_32(RsrcDataFormat));
7179
7180 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7181 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7182 .addImm(Hi_32(RsrcDataFormat));
7183
7184 // NewSRsrc = {Zero64, SRsrcFormat}
7185 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7186 .addReg(Zero64)
7187 .addImm(AMDGPU::sub0_sub1)
7188 .addReg(SRsrcFormatLo)
7189 .addImm(AMDGPU::sub2)
7190 .addReg(SRsrcFormatHi)
7191 .addImm(AMDGPU::sub3);
7192
7193 return std::tuple(RsrcPtr, NewSRsrc);
7194}
7195
7196MachineBasicBlock *
7197SIInstrInfo::legalizeOperands(MachineInstr &MI,
7198 MachineDominatorTree *MDT) const {
7199 MachineFunction &MF = *MI.getMF();
7200 MachineRegisterInfo &MRI = MF.getRegInfo();
7201 MachineBasicBlock *CreatedBB = nullptr;
7202
7203 // Legalize VOP2
7204 if (isVOP2(MI) || isVOPC(MI)) {
7205 legalizeOperandsVOP2(MRI, MI);
7206 return CreatedBB;
7207 }
7208
7209 // Legalize VOP3
7210 if (isVOP3(MI)) {
7211 legalizeOperandsVOP3(MRI, MI);
7212 return CreatedBB;
7213 }
7214
7215 // Legalize SMRD
7216 if (isSMRD(MI)) {
7217 legalizeOperandsSMRD(MRI, MI);
7218 return CreatedBB;
7219 }
7220
7221 // Legalize FLAT
7222 if (isFLAT(MI)) {
7223 legalizeOperandsFLAT(MRI, MI);
7224 return CreatedBB;
7225 }
7226
7227 // Legalize REG_SEQUENCE and PHI
7228 // The register class of the operands must be the same type as the register
7229 // class of the output.
7230 if (MI.getOpcode() == AMDGPU::PHI) {
7231 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7232 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7233 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7234 continue;
7235 const TargetRegisterClass *OpRC =
7236 MRI.getRegClass(MI.getOperand(i).getReg());
7237 if (RI.hasVectorRegisters(OpRC)) {
7238 VRC = OpRC;
7239 } else {
7240 SRC = OpRC;
7241 }
7242 }
7243
7244 // If any of the operands are VGPR registers, then they all must be VGPRs;
7245 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7246 // them.
7247 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7248 if (!VRC) {
7249 assert(SRC);
7250 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7251 VRC = &AMDGPU::VReg_1RegClass;
7252 } else
7253 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7254 ? RI.getEquivalentAGPRClass(SRC)
7255 : RI.getEquivalentVGPRClass(SRC);
7256 } else {
7257 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7258 ? RI.getEquivalentAGPRClass(VRC)
7259 : RI.getEquivalentVGPRClass(VRC);
7260 }
7261 RC = VRC;
7262 } else {
7263 RC = SRC;
7264 }
7265
7266 // Update all the operands so they have the same type.
7267 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7268 MachineOperand &Op = MI.getOperand(I);
7269 if (!Op.isReg() || !Op.getReg().isVirtual())
7270 continue;
7271
7272 // MI is a PHI instruction.
7273 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7274 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7275
7276 // Avoid creating no-op copies with the same src and dst reg class. These
7277 // confuse some of the machine passes.
7278 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7279 }
7280 }
7281
7282 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7283 // VGPR dest type and SGPR sources, insert copies so all operands are
7284 // VGPRs. This seems to help operand folding / the register coalescer.
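 // For example, a REG_SEQUENCE defining a VGPR tuple from SGPR inputs gets a
 // SGPR->VGPR COPY inserted in front of it for each such source operand.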
7285 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7286 MachineBasicBlock *MBB = MI.getParent();
7287 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7288 if (RI.hasVGPRs(DstRC)) {
7289 // Update all the operands so they are VGPR register classes. These may
7290 // not be the same register class because REG_SEQUENCE supports mixing
7291 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7292 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7293 MachineOperand &Op = MI.getOperand(I);
7294 if (!Op.isReg() || !Op.getReg().isVirtual())
7295 continue;
7296
7297 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7298 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7299 if (VRC == OpRC)
7300 continue;
7301
7302 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7303 Op.setIsKill();
7304 }
7305 }
7306
7307 return CreatedBB;
7308 }
7309
7310 // Legalize INSERT_SUBREG
7311 // src0 must have the same register class as dst
7312 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7313 Register Dst = MI.getOperand(0).getReg();
7314 Register Src0 = MI.getOperand(1).getReg();
7315 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7316 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7317 if (DstRC != Src0RC) {
7318 MachineBasicBlock *MBB = MI.getParent();
7319 MachineOperand &Op = MI.getOperand(1);
7320 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7321 }
7322 return CreatedBB;
7323 }
7324
7325 // Legalize SI_INIT_M0
7326 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7327 MachineOperand &Src = MI.getOperand(0);
7328 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7329 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7330 return CreatedBB;
7331 }
7332
7333 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7334 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7335 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7336 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7337 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7338 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7339 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7340 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7341 MachineOperand &Src = MI.getOperand(1);
7342 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7343 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7344 return CreatedBB;
7345 }
7346
7347 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7348 //
7349 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7350 // scratch memory access. In both cases, the legalization never involves
7351 // conversion to the addr64 form.
7352 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7353 (isMUBUF(MI) || isMTBUF(MI)))) {
7354 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7355 ? AMDGPU::OpName::rsrc
7356 : AMDGPU::OpName::srsrc;
7357 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7358 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7359 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7360
7361 AMDGPU::OpName SampOpName =
7362 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7363 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7364 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7365 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7366
7367 return CreatedBB;
7368 }
7369
7370 // Legalize SI_CALL
7371 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7372 MachineOperand *Dest = &MI.getOperand(0);
7373 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7374 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, plus the
7375 // following copies; copies from and to physical registers also need to be
7376 // moved into the loop block.
7377 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7378 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7379
7380 // Also move the copies to physical registers into the loop block
7381 MachineBasicBlock &MBB = *MI.getParent();
7382 MachineBasicBlock::iterator Start(&MI);
7383 while (Start->getOpcode() != FrameSetupOpcode)
7384 --Start;
7385 MachineBasicBlock::iterator End(&MI);
7386 while (End->getOpcode() != FrameDestroyOpcode)
7387 ++End;
7388 // Also include following copies of the return value
7389 ++End;
7390 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7391 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7392 ++End;
7393 CreatedBB =
7394 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7395 }
7396 }
7397
7398 // Legalize s_sleep_var.
7399 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7400 const DebugLoc &DL = MI.getDebugLoc();
7401 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7402 int Src0Idx =
7403 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7404 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7405 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7406 .add(Src0);
7407 Src0.ChangeToRegister(Reg, false);
7408 return nullptr;
7409 }
7410
7411 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7412 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7413 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7414 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7415 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7416 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7417 for (MachineOperand &Src : MI.explicit_operands()) {
7418 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7419 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7420 }
7421 return CreatedBB;
7422 }
7423
7424 // Legalize MUBUF instructions.
7425 bool isSoffsetLegal = true;
7426 int SoffsetIdx =
7427 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7428 if (SoffsetIdx != -1) {
7429 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7430 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7431 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7432 isSoffsetLegal = false;
7433 }
7434 }
7435
7436 bool isRsrcLegal = true;
7437 int RsrcIdx =
7438 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7439 if (RsrcIdx != -1) {
7440 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7441 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7442 isRsrcLegal = false;
7443 }
7444
7445 // The operands are legal.
7446 if (isRsrcLegal && isSoffsetLegal)
7447 return CreatedBB;
7448
7449 if (!isRsrcLegal) {
7450 // Legalize a VGPR Rsrc
7451 //
7452 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7453 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7454 // a zero-value SRsrc.
7455 //
7456 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7457 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7458 // above.
7459 //
7460 // Otherwise we are on non-ADDR64 hardware, and/or we have
7461 // idxen/offen/bothen and we fall back to a waterfall loop.
7462
7463 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7464 MachineBasicBlock &MBB = *MI.getParent();
7465
7466 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7467 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7468 // This is already an ADDR64 instruction so we need to add the pointer
7469 // extracted from the resource descriptor to the current value of VAddr.
7470 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7471 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7472 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7473
7474 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7475 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7476 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7477
7478 unsigned RsrcPtr, NewSRsrc;
7479 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7480
7481 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7482 const DebugLoc &DL = MI.getDebugLoc();
7483 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7484 .addDef(CondReg0)
7485 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7486 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7487 .addImm(0);
7488
7489 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7490 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7491 .addDef(CondReg1, RegState::Dead)
7492 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7493 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7494 .addReg(CondReg0, RegState::Kill)
7495 .addImm(0);
7496
7497 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7498 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7499 .addReg(NewVAddrLo)
7500 .addImm(AMDGPU::sub0)
7501 .addReg(NewVAddrHi)
7502 .addImm(AMDGPU::sub1);
7503
7504 VAddr->setReg(NewVAddr);
7505 Rsrc->setReg(NewSRsrc);
7506 } else if (!VAddr && ST.hasAddr64()) {
7507 // This instruction is the _OFFSET variant, so we need to convert it to
7508 // ADDR64.
7509 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7510 "FIXME: Need to emit flat atomics here");
7511
7512 unsigned RsrcPtr, NewSRsrc;
7513 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7514
7515 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7516 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7517 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7518 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7519 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7520
7521 // Atomics with return have an additional tied operand and are
7522 // missing some of the special bits.
7523 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7524 MachineInstr *Addr64;
7525
7526 if (!VDataIn) {
7527 // Regular buffer load / store.
7528 MachineInstrBuilder MIB =
7529 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7530 .add(*VData)
7531 .addReg(NewVAddr)
7532 .addReg(NewSRsrc)
7533 .add(*SOffset)
7534 .add(*Offset);
7535
7536 if (const MachineOperand *CPol =
7537 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7538 MIB.addImm(CPol->getImm());
7539 }
7540
7541 if (const MachineOperand *TFE =
7542 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7543 MIB.addImm(TFE->getImm());
7544 }
7545
7546 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7547
7548 MIB.cloneMemRefs(MI);
7549 Addr64 = MIB;
7550 } else {
7551 // Atomics with return.
7552 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7553 .add(*VData)
7554 .add(*VDataIn)
7555 .addReg(NewVAddr)
7556 .addReg(NewSRsrc)
7557 .add(*SOffset)
7558 .add(*Offset)
7559 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7560 .cloneMemRefs(MI);
7561 }
7562
7563 MI.removeFromParent();
7564
7565 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7566 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7567 NewVAddr)
7568 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7569 .addImm(AMDGPU::sub0)
7570 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7571 .addImm(AMDGPU::sub1);
7572 } else {
7573 // Legalize a VGPR Rsrc and soffset together.
7574 if (!isSoffsetLegal) {
7575 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7576 CreatedBB =
7577 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7578 return CreatedBB;
7579 }
7580 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7581 return CreatedBB;
7582 }
7583 }
7584
7585 // Legalize a VGPR soffset.
7586 if (!isSoffsetLegal) {
7587 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7588 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7589 return CreatedBB;
7590 }
7591 return CreatedBB;
7592}
7593
7594void SIInstrWorklist::insert(MachineInstr *MI) {
7595 InstrList.insert(MI);
7596 // Add MBUF instructions to the deferred list.
7597 int RsrcIdx =
7598 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7599 if (RsrcIdx != -1) {
7600 DeferredList.insert(MI);
7601 }
7602}
7603
7604bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7605 return DeferredList.contains(MI);
7606}
7607
7608// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7609// lowering (change sgpr to vgpr).
7610// This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7611// different sizes. We need to legalize the size of the operands during the vgpr lowering
7612// chain. This can be removed after we have sgpr16 in place
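// In effect: a 32-bit VGPR feeding a 16-bit operand is accessed through its
// lo16 subregister, while a 16-bit VGPR feeding a 32-bit operand is widened
// with a REG_SEQUENCE whose hi16 half is an IMPLICIT_DEF.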
7613void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7614 MachineRegisterInfo &MRI) const {
7615 if (!ST.useRealTrue16Insts())
7616 return;
7617
7618 unsigned Opcode = MI.getOpcode();
7619 MachineBasicBlock *MBB = MI.getParent();
7620 // Legalize operands and check for size mismatch
7621 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7622 OpIdx >= get(Opcode).getNumOperands() ||
7623 get(Opcode).operands()[OpIdx].RegClass == -1)
7624 return;
7625
7626 MachineOperand &Op = MI.getOperand(OpIdx);
7627 if (!Op.isReg() || !Op.getReg().isVirtual())
7628 return;
7629
7630 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7631 if (!RI.isVGPRClass(CurrRC))
7632 return;
7633
7634 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7635 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7636 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7637 Op.setSubReg(AMDGPU::lo16);
7638 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7639 const DebugLoc &DL = MI.getDebugLoc();
7640 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7641 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7642 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7643 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7644 .addReg(Op.getReg())
7645 .addImm(AMDGPU::lo16)
7646 .addReg(Undef)
7647 .addImm(AMDGPU::hi16);
7648 Op.setReg(NewDstReg);
7649 }
7650}
7651void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7652 MachineRegisterInfo &MRI) const {
7653 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7654 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7655}
7656
7657void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7658 MachineDominatorTree *MDT) const {
7659
7660 while (!Worklist.empty()) {
7661 MachineInstr &Inst = *Worklist.top();
7662 Worklist.erase_top();
7663 // Skip MachineInstr in the deferred list.
7664 if (Worklist.isDeferred(&Inst))
7665 continue;
7666 moveToVALUImpl(Worklist, MDT, Inst);
7667 }
7668
7669 // Deferred list of instructions will be processed once
7670 // all the MachineInstr in the worklist are done.
7671 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7672 moveToVALUImpl(Worklist, MDT, *Inst);
7673 assert(Worklist.empty() &&
7674 "Deferred MachineInstr are not supposed to re-populate worklist");
7675 }
7676}
7677
7678void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7679 MachineDominatorTree *MDT,
7680 MachineInstr &Inst) const {
7681
7682 MachineBasicBlock *MBB = Inst.getParent();
7683 if (!MBB)
7684 return;
7685 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7686 unsigned Opcode = Inst.getOpcode();
7687 unsigned NewOpcode = getVALUOp(Inst);
7688 const DebugLoc &DL = Inst.getDebugLoc();
7689
7690 // Handle some special cases
7691 switch (Opcode) {
7692 default:
7693 break;
7694 case AMDGPU::S_ADD_I32:
7695 case AMDGPU::S_SUB_I32: {
7696 // FIXME: The u32 versions currently selected use the carry.
7697 bool Changed;
7698 MachineBasicBlock *CreatedBBTmp = nullptr;
7699 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7700 if (Changed)
7701 return;
7702
7703 // Default handling
7704 break;
7705 }
7706
7707 case AMDGPU::S_MUL_U64:
7708 if (ST.hasVectorMulU64()) {
7709 NewOpcode = AMDGPU::V_MUL_U64_e64;
7710 break;
7711 }
7712 // Split s_mul_u64 into 32-bit vector multiplications.
7713 splitScalarSMulU64(Worklist, Inst, MDT);
7714 Inst.eraseFromParent();
7715 return;
7716
7717 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7718 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7719 // This is a special case of s_mul_u64 where all the operands are either
7720 // zero extended or sign extended.
7721 splitScalarSMulPseudo(Worklist, Inst, MDT);
7722 Inst.eraseFromParent();
7723 return;
7724
7725 case AMDGPU::S_AND_B64:
7726 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7727 Inst.eraseFromParent();
7728 return;
7729
7730 case AMDGPU::S_OR_B64:
7731 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7732 Inst.eraseFromParent();
7733 return;
7734
7735 case AMDGPU::S_XOR_B64:
7736 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7737 Inst.eraseFromParent();
7738 return;
7739
7740 case AMDGPU::S_NAND_B64:
7741 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7742 Inst.eraseFromParent();
7743 return;
7744
7745 case AMDGPU::S_NOR_B64:
7746 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7747 Inst.eraseFromParent();
7748 return;
7749
7750 case AMDGPU::S_XNOR_B64:
7751 if (ST.hasDLInsts())
7752 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7753 else
7754 splitScalar64BitXnor(Worklist, Inst, MDT);
7755 Inst.eraseFromParent();
7756 return;
7757
7758 case AMDGPU::S_ANDN2_B64:
7759 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7760 Inst.eraseFromParent();
7761 return;
7762
7763 case AMDGPU::S_ORN2_B64:
7764 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7765 Inst.eraseFromParent();
7766 return;
7767
7768 case AMDGPU::S_BREV_B64:
7769 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7770 Inst.eraseFromParent();
7771 return;
7772
7773 case AMDGPU::S_NOT_B64:
7774 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7775 Inst.eraseFromParent();
7776 return;
7777
7778 case AMDGPU::S_BCNT1_I32_B64:
7779 splitScalar64BitBCNT(Worklist, Inst);
7780 Inst.eraseFromParent();
7781 return;
7782
7783 case AMDGPU::S_BFE_I64:
7784 splitScalar64BitBFE(Worklist, Inst);
7785 Inst.eraseFromParent();
7786 return;
7787
7788 case AMDGPU::S_FLBIT_I32_B64:
7789 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7790 Inst.eraseFromParent();
7791 return;
7792 case AMDGPU::S_FF1_I32_B64:
7793 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7794 Inst.eraseFromParent();
7795 return;
7796
7797 case AMDGPU::S_LSHL_B32:
7798 if (ST.hasOnlyRevVALUShifts()) {
7799 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7800 swapOperands(Inst);
7801 }
7802 break;
7803 case AMDGPU::S_ASHR_I32:
7804 if (ST.hasOnlyRevVALUShifts()) {
7805 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7806 swapOperands(Inst);
7807 }
7808 break;
7809 case AMDGPU::S_LSHR_B32:
7810 if (ST.hasOnlyRevVALUShifts()) {
7811 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7812 swapOperands(Inst);
7813 }
7814 break;
7815 case AMDGPU::S_LSHL_B64:
7816 if (ST.hasOnlyRevVALUShifts()) {
7817 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7818 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7819 : AMDGPU::V_LSHLREV_B64_e64;
7820 swapOperands(Inst);
7821 }
7822 break;
7823 case AMDGPU::S_ASHR_I64:
7824 if (ST.hasOnlyRevVALUShifts()) {
7825 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7826 swapOperands(Inst);
7827 }
7828 break;
7829 case AMDGPU::S_LSHR_B64:
7830 if (ST.hasOnlyRevVALUShifts()) {
7831 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7832 swapOperands(Inst);
7833 }
7834 break;
7835
7836 case AMDGPU::S_ABS_I32:
7837 lowerScalarAbs(Worklist, Inst);
7838 Inst.eraseFromParent();
7839 return;
7840
7841 case AMDGPU::S_ABSDIFF_I32:
7842 lowerScalarAbsDiff(Worklist, Inst);
7843 Inst.eraseFromParent();
7844 return;
7845
7846 case AMDGPU::S_CBRANCH_SCC0:
7847 case AMDGPU::S_CBRANCH_SCC1: {
7848 // Clear unused bits of vcc
7849 Register CondReg = Inst.getOperand(1).getReg();
7850 bool IsSCC = CondReg == AMDGPU::SCC;
7852 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7853 .addReg(LMC.ExecReg)
7854 .addReg(IsSCC ? LMC.VccReg : CondReg);
7855 Inst.removeOperand(1);
7856 } break;
7857
7858 case AMDGPU::S_BFE_U64:
7859 case AMDGPU::S_BFM_B64:
7860 llvm_unreachable("Moving this op to VALU not implemented");
7861
7862 case AMDGPU::S_PACK_LL_B32_B16:
7863 case AMDGPU::S_PACK_LH_B32_B16:
7864 case AMDGPU::S_PACK_HL_B32_B16:
7865 case AMDGPU::S_PACK_HH_B32_B16:
7866 movePackToVALU(Worklist, MRI, Inst);
7867 Inst.eraseFromParent();
7868 return;
7869
7870 case AMDGPU::S_XNOR_B32:
7871 lowerScalarXnor(Worklist, Inst);
7872 Inst.eraseFromParent();
7873 return;
7874
7875 case AMDGPU::S_NAND_B32:
7876 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7877 Inst.eraseFromParent();
7878 return;
7879
7880 case AMDGPU::S_NOR_B32:
7881 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7882 Inst.eraseFromParent();
7883 return;
7884
7885 case AMDGPU::S_ANDN2_B32:
7886 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7887 Inst.eraseFromParent();
7888 return;
7889
7890 case AMDGPU::S_ORN2_B32:
7891 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7892 Inst.eraseFromParent();
7893 return;
7894
7895 // TODO: remove as soon as everything is ready
7896 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7897 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7898 // can only be selected from the uniform SDNode.
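 // These lower to V_ADDC_U32_e64 / V_SUBB_U32_e64, which consume the carry-in
 // from a lane-mask register and define the carry-out alongside the result.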
7899 case AMDGPU::S_ADD_CO_PSEUDO:
7900 case AMDGPU::S_SUB_CO_PSEUDO: {
7901 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7902 ? AMDGPU::V_ADDC_U32_e64
7903 : AMDGPU::V_SUBB_U32_e64;
7904 const auto *CarryRC = RI.getWaveMaskRegClass();
7905
7906 Register CarryInReg = Inst.getOperand(4).getReg();
7907 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7908 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7909 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7910 .addReg(CarryInReg);
7911 }
7912
7913 Register CarryOutReg = Inst.getOperand(1).getReg();
7914
7915 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7916 MRI.getRegClass(Inst.getOperand(0).getReg())));
7917 MachineInstr *CarryOp =
7918 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7919 .addReg(CarryOutReg, RegState::Define)
7920 .add(Inst.getOperand(2))
7921 .add(Inst.getOperand(3))
7922 .addReg(CarryInReg)
7923 .addImm(0);
7924 legalizeOperands(*CarryOp);
7925 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7926 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7927 Inst.eraseFromParent();
7928 }
7929 return;
7930 case AMDGPU::S_UADDO_PSEUDO:
7931 case AMDGPU::S_USUBO_PSEUDO: {
7932 MachineOperand &Dest0 = Inst.getOperand(0);
7933 MachineOperand &Dest1 = Inst.getOperand(1);
7934 MachineOperand &Src0 = Inst.getOperand(2);
7935 MachineOperand &Src1 = Inst.getOperand(3);
7936
7937 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7938 ? AMDGPU::V_ADD_CO_U32_e64
7939 : AMDGPU::V_SUB_CO_U32_e64;
7940 const TargetRegisterClass *NewRC =
7941 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7942 Register DestReg = MRI.createVirtualRegister(NewRC);
7943 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7944 .addReg(Dest1.getReg(), RegState::Define)
7945 .add(Src0)
7946 .add(Src1)
7947 .addImm(0); // clamp bit
7948
7949 legalizeOperands(*NewInstr, MDT);
7950 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7951 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7952 Inst.eraseFromParent();
7953 }
7954 return;
7955 case AMDGPU::S_LSHL1_ADD_U32:
7956 case AMDGPU::S_LSHL2_ADD_U32:
7957 case AMDGPU::S_LSHL3_ADD_U32:
7958 case AMDGPU::S_LSHL4_ADD_U32: {
7959 MachineOperand &Dest = Inst.getOperand(0);
7960 MachineOperand &Src0 = Inst.getOperand(1);
7961 MachineOperand &Src1 = Inst.getOperand(2);
7962 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
7963 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
7964 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
7965 : 4);
7966
7967 const TargetRegisterClass *NewRC =
7968 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
7969 Register DestReg = MRI.createVirtualRegister(NewRC);
7970 MachineInstr *NewInstr =
7971 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
7972 .add(Src0)
7973 .addImm(ShiftAmt)
7974 .add(Src1);
7975
7976 legalizeOperands(*NewInstr, MDT);
7977 MRI.replaceRegWith(Dest.getReg(), DestReg);
7978 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7979 Inst.eraseFromParent();
7980 }
7981 return;
7982 case AMDGPU::S_CSELECT_B32:
7983 case AMDGPU::S_CSELECT_B64:
7984 lowerSelect(Worklist, Inst, MDT);
7985 Inst.eraseFromParent();
7986 return;
7987 case AMDGPU::S_CMP_EQ_I32:
7988 case AMDGPU::S_CMP_LG_I32:
7989 case AMDGPU::S_CMP_GT_I32:
7990 case AMDGPU::S_CMP_GE_I32:
7991 case AMDGPU::S_CMP_LT_I32:
7992 case AMDGPU::S_CMP_LE_I32:
7993 case AMDGPU::S_CMP_EQ_U32:
7994 case AMDGPU::S_CMP_LG_U32:
7995 case AMDGPU::S_CMP_GT_U32:
7996 case AMDGPU::S_CMP_GE_U32:
7997 case AMDGPU::S_CMP_LT_U32:
7998 case AMDGPU::S_CMP_LE_U32:
7999 case AMDGPU::S_CMP_EQ_U64:
8000 case AMDGPU::S_CMP_LG_U64:
8001 case AMDGPU::S_CMP_LT_F32:
8002 case AMDGPU::S_CMP_EQ_F32:
8003 case AMDGPU::S_CMP_LE_F32:
8004 case AMDGPU::S_CMP_GT_F32:
8005 case AMDGPU::S_CMP_LG_F32:
8006 case AMDGPU::S_CMP_GE_F32:
8007 case AMDGPU::S_CMP_O_F32:
8008 case AMDGPU::S_CMP_U_F32:
8009 case AMDGPU::S_CMP_NGE_F32:
8010 case AMDGPU::S_CMP_NLG_F32:
8011 case AMDGPU::S_CMP_NGT_F32:
8012 case AMDGPU::S_CMP_NLE_F32:
8013 case AMDGPU::S_CMP_NEQ_F32:
8014 case AMDGPU::S_CMP_NLT_F32: {
8015 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8016 auto NewInstr =
8017 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8018 .setMIFlags(Inst.getFlags());
8019 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8020 0) {
8021 NewInstr
8022 .addImm(0) // src0_modifiers
8023 .add(Inst.getOperand(0)) // src0
8024 .addImm(0) // src1_modifiers
8025 .add(Inst.getOperand(1)) // src1
8026 .addImm(0); // clamp
8027 } else {
8028 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8029 }
8030 legalizeOperands(*NewInstr, MDT);
8031 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8032 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8033 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8034 Inst.eraseFromParent();
8035 return;
8036 }
8037 case AMDGPU::S_CMP_LT_F16:
8038 case AMDGPU::S_CMP_EQ_F16:
8039 case AMDGPU::S_CMP_LE_F16:
8040 case AMDGPU::S_CMP_GT_F16:
8041 case AMDGPU::S_CMP_LG_F16:
8042 case AMDGPU::S_CMP_GE_F16:
8043 case AMDGPU::S_CMP_O_F16:
8044 case AMDGPU::S_CMP_U_F16:
8045 case AMDGPU::S_CMP_NGE_F16:
8046 case AMDGPU::S_CMP_NLG_F16:
8047 case AMDGPU::S_CMP_NGT_F16:
8048 case AMDGPU::S_CMP_NLE_F16:
8049 case AMDGPU::S_CMP_NEQ_F16:
8050 case AMDGPU::S_CMP_NLT_F16: {
8051 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8052 auto NewInstr =
8053 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8054 .setMIFlags(Inst.getFlags());
8055 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8056 NewInstr
8057 .addImm(0) // src0_modifiers
8058 .add(Inst.getOperand(0)) // src0
8059 .addImm(0) // src1_modifiers
8060 .add(Inst.getOperand(1)) // src1
8061 .addImm(0); // clamp
8062 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8063 NewInstr.addImm(0); // op_sel0
8064 } else {
8065 NewInstr
8066 .add(Inst.getOperand(0))
8067 .add(Inst.getOperand(1));
8068 }
8069 legalizeOperandsVALUt16(*NewInstr, MRI);
8070 legalizeOperands(*NewInstr, MDT);
8071 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8072 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8073 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8074 Inst.eraseFromParent();
8075 return;
8076 }
8077 case AMDGPU::S_CVT_HI_F32_F16: {
8078 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8079 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8080 if (ST.useRealTrue16Insts()) {
8081 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8082 .add(Inst.getOperand(1));
8083 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8084 .addImm(0) // src0_modifiers
8085 .addReg(TmpReg, 0, AMDGPU::hi16)
8086 .addImm(0) // clamp
8087 .addImm(0) // omod
8088 .addImm(0); // op_sel0
8089 } else {
8090 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8091 .addImm(16)
8092 .add(Inst.getOperand(1));
8093 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8094 .addImm(0) // src0_modifiers
8095 .addReg(TmpReg)
8096 .addImm(0) // clamp
8097 .addImm(0); // omod
8098 }
8099
8100 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8101 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8102 Inst.eraseFromParent();
8103 return;
8104 }
8105 case AMDGPU::S_MINIMUM_F32:
8106 case AMDGPU::S_MAXIMUM_F32: {
8107 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8108 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8109 .addImm(0) // src0_modifiers
8110 .add(Inst.getOperand(1))
8111 .addImm(0) // src1_modifiers
8112 .add(Inst.getOperand(2))
8113 .addImm(0) // clamp
8114 .addImm(0); // omod
8115 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8116
8117 legalizeOperands(*NewInstr, MDT);
8118 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8119 Inst.eraseFromParent();
8120 return;
8121 }
8122 case AMDGPU::S_MINIMUM_F16:
8123 case AMDGPU::S_MAXIMUM_F16: {
8124 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8125 ? &AMDGPU::VGPR_16RegClass
8126 : &AMDGPU::VGPR_32RegClass);
8127 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8128 .addImm(0) // src0_modifiers
8129 .add(Inst.getOperand(1))
8130 .addImm(0) // src1_modifiers
8131 .add(Inst.getOperand(2))
8132 .addImm(0) // clamp
8133 .addImm(0) // omod
8134 .addImm(0); // opsel0
8135 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8136 legalizeOperandsVALUt16(*NewInstr, MRI);
8137 legalizeOperands(*NewInstr, MDT);
8138 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8139 Inst.eraseFromParent();
8140 return;
8141 }
8142 case AMDGPU::V_S_EXP_F16_e64:
8143 case AMDGPU::V_S_LOG_F16_e64:
8144 case AMDGPU::V_S_RCP_F16_e64:
8145 case AMDGPU::V_S_RSQ_F16_e64:
8146 case AMDGPU::V_S_SQRT_F16_e64: {
8147 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8148 ? &AMDGPU::VGPR_16RegClass
8149 : &AMDGPU::VGPR_32RegClass);
8150 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8151 .add(Inst.getOperand(1)) // src0_modifiers
8152 .add(Inst.getOperand(2))
8153 .add(Inst.getOperand(3)) // clamp
8154 .add(Inst.getOperand(4)) // omod
8155 .setMIFlags(Inst.getFlags());
8156 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8157 NewInstr.addImm(0); // opsel0
8158 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8159 legalizeOperandsVALUt16(*NewInstr, MRI);
8160 legalizeOperands(*NewInstr, MDT);
8161 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8162 Inst.eraseFromParent();
8163 return;
8164 }
8165 }
8166
8167 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8168 // We cannot move this instruction to the VALU, so we should try to
8169 // legalize its operands instead.
8170 legalizeOperands(Inst, MDT);
8171 return;
8172 }
8173 // Handle converting generic instructions like COPY-to-SGPR into
8174 // COPY-to-VGPR.
8175 if (NewOpcode == Opcode) {
8176 Register DstReg = Inst.getOperand(0).getReg();
8177 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8178
8179 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8180 // hope for the best.
8181 if (Inst.isCopy() && DstReg.isPhysical() &&
8182 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8183 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8184 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8185 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8186 .add(Inst.getOperand(1));
8187 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8188 DstReg)
8189 .addReg(NewDst);
8190
8191 Inst.eraseFromParent();
8192 return;
8193 }
8194
8195 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8196 Register NewDstReg = Inst.getOperand(1).getReg();
8197 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8198 if (const TargetRegisterClass *CommonRC =
8199 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8200 // Instead of creating a copy where src and dst are the same register
8201 // class, we just replace all uses of dst with src. These kinds of
8202 // copies interfere with the heuristics MachineSink uses to decide
8203 // whether or not to split a critical edge, since the pass assumes
8204 // that copies will end up as machine instructions and not be
8205 // eliminated.
8206 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8207 MRI.replaceRegWith(DstReg, NewDstReg);
8208 MRI.clearKillFlags(NewDstReg);
8209 Inst.getOperand(0).setReg(DstReg);
8210
8211 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8212 llvm_unreachable("failed to constrain register");
8213
8214 Inst.eraseFromParent();
8215 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8216 for (MachineOperand &MO :
8217 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8218 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8219 }
8220
8221 return;
8222 }
8223 }
8224
8225 // If this is a v2s copy between a 16-bit and a 32-bit reg,
8226 // replace the vgpr copy with a reg_sequence/extract_subreg.
8227 // This can be removed after we have sgpr16 in place.
8228 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8229 Inst.getOperand(1).getReg().isVirtual() &&
8230 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8231 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8232 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8233 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8234 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8235 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8236 get(AMDGPU::IMPLICIT_DEF), Undef);
8237 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8238 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8239 .addReg(Inst.getOperand(1).getReg())
8240 .addImm(AMDGPU::lo16)
8241 .addReg(Undef)
8242 .addImm(AMDGPU::hi16);
8243 Inst.eraseFromParent();
8244 MRI.replaceRegWith(DstReg, NewDstReg);
8245 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8246 return;
8247 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8248 AMDGPU::lo16)) {
8249 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8250 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8251 MRI.replaceRegWith(DstReg, NewDstReg);
8252 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8253 return;
8254 }
8255 }
8256
8257 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8258 MRI.replaceRegWith(DstReg, NewDstReg);
8259 legalizeOperands(Inst, MDT);
8260 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8261 return;
8262 }
8263
8264 // Use the new VALU Opcode.
8265 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8266 .setMIFlags(Inst.getFlags());
8267 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8268 // Intersperse VOP3 modifiers among the SALU operands.
8269 NewInstr->addOperand(Inst.getOperand(0));
8270 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8271 AMDGPU::OpName::src0_modifiers) >= 0)
8272 NewInstr.addImm(0);
8273 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8274 const MachineOperand &Src = Inst.getOperand(1);
8275 NewInstr->addOperand(Src);
8276 }
8277
8278 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8279 // We are converting these to a BFE, so we need to add the missing
8280 // operands for the size and offset.
8281 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8282 NewInstr.addImm(0);
8283 NewInstr.addImm(Size);
8284 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8285 // The VALU version adds the second operand to the result, so insert an
8286 // extra 0 operand.
8287 NewInstr.addImm(0);
8288 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8289 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8290 // If we need to move this to VGPRs, we need to unpack the second
8291 // operand back into the 2 separate ones for bit offset and width.
8292 assert(OffsetWidthOp.isImm() &&
8293 "Scalar BFE is only implemented for constant width and offset");
8294 uint32_t Imm = OffsetWidthOp.getImm();
8295
8296 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8297 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8298 NewInstr.addImm(Offset);
8299 NewInstr.addImm(BitWidth);
8300 } else {
8301 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8302 AMDGPU::OpName::src1_modifiers) >= 0)
8303 NewInstr.addImm(0);
8304 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8305 NewInstr->addOperand(Inst.getOperand(2));
8306 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8307 AMDGPU::OpName::src2_modifiers) >= 0)
8308 NewInstr.addImm(0);
8309 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8310 NewInstr->addOperand(Inst.getOperand(3));
8311 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8312 NewInstr.addImm(0);
8313 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8314 NewInstr.addImm(0);
8315 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8316 NewInstr.addImm(0);
8317 }
8318 } else {
8319 // Just copy the SALU operands.
8320 for (const MachineOperand &Op : Inst.explicit_operands())
8321 NewInstr->addOperand(Op);
8322 }
8323
8324 // Remove any references to SCC. Vector instructions can't read from it, and
8325 // we're just about to add the implicit use / defs of VCC, and we don't want
8326 // both.
8327 for (MachineOperand &Op : Inst.implicit_operands()) {
8328 if (Op.getReg() == AMDGPU::SCC) {
8329 // Only propagate through live-def of SCC.
8330 if (Op.isDef() && !Op.isDead())
8331 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8332 if (Op.isUse())
8333 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8334 }
8335 }
8336 Inst.eraseFromParent();
8337 Register NewDstReg;
8338 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8339 Register DstReg = NewInstr->getOperand(0).getReg();
8340 assert(DstReg.isVirtual());
8341 // Update the destination register class.
8342 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8343 assert(NewDstRC);
8344 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8345 MRI.replaceRegWith(DstReg, NewDstReg);
8346 }
8347 fixImplicitOperands(*NewInstr);
8348
8349 legalizeOperandsVALUt16(*NewInstr, MRI);
8350
8351 // Legalize the operands
8352 legalizeOperands(*NewInstr, MDT);
8353 if (NewDstReg)
8354 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8355}
8356
8357// Add/sub require special handling to deal with carry outs.
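// On subtargets with add-no-carry the instruction is rewritten in place to
// V_ADD_U32_e64 / V_SUB_U32_e64; otherwise this returns false and the caller
// falls back to the default (carry-writing) VALU lowering.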
8358std::pair<bool, MachineBasicBlock *>
8359SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8360 MachineDominatorTree *MDT) const {
8361 if (ST.hasAddNoCarry()) {
8362 // Assume there is no user of scc since we don't select this in that case.
8363 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8364 // is used.
8365
8366 MachineBasicBlock &MBB = *Inst.getParent();
8367 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8368
8369 Register OldDstReg = Inst.getOperand(0).getReg();
8370 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8371
8372 unsigned Opc = Inst.getOpcode();
8373 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8374
8375 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8376 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8377
8378 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8379 Inst.removeOperand(3);
8380
8381 Inst.setDesc(get(NewOpc));
8382 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8383 Inst.addImplicitDefUseOperands(*MBB.getParent());
8384 MRI.replaceRegWith(OldDstReg, ResultReg);
8385 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8386
8387 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8388 return std::pair(true, NewBB);
8389 }
8390
8391 return std::pair(false, nullptr);
8392}
8393
8394void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8395 MachineDominatorTree *MDT) const {
8396
8397 MachineBasicBlock &MBB = *Inst.getParent();
8398 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8399 MachineBasicBlock::iterator MII = Inst;
8400 DebugLoc DL = Inst.getDebugLoc();
8401
8402 MachineOperand &Dest = Inst.getOperand(0);
8403 MachineOperand &Src0 = Inst.getOperand(1);
8404 MachineOperand &Src1 = Inst.getOperand(2);
8405 MachineOperand &Cond = Inst.getOperand(3);
8406
8407 Register CondReg = Cond.getReg();
8408 bool IsSCC = (CondReg == AMDGPU::SCC);
8409
8410 // If this is a trivial select where the condition is effectively not SCC
8411 // (CondReg is a source of copy to SCC), then the select is semantically
8412 // equivalent to copying CondReg. Hence, there is no need to create
8413 // V_CNDMASK, we can just use that and bail out.
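 // i.e. "dst = cond ? -1 : 0" where cond is already a lane mask just
 // reproduces the mask, so dst can simply be replaced with CondReg.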
8414 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8415 (Src1.getImm() == 0)) {
8416 MRI.replaceRegWith(Dest.getReg(), CondReg);
8417 return;
8418 }
8419
8420 Register NewCondReg = CondReg;
8421 if (IsSCC) {
8422 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8423 NewCondReg = MRI.createVirtualRegister(TC);
8424
8425 // Now look for the closest SCC def; if it is a copy,
8426 // replace CondReg with the COPY's source register.
8427 bool CopyFound = false;
8428 for (MachineInstr &CandI :
8429 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8430 Inst.getParent()->rend())) {
8431 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8432 -1) {
8433 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8434 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8435 .addReg(CandI.getOperand(1).getReg());
8436 CopyFound = true;
8437 }
8438 break;
8439 }
8440 }
8441 if (!CopyFound) {
8442 // SCC def is not a copy
8443 // Insert a trivial select instead of creating a copy, because a copy from
8444 // SCC would semantically mean just copying a single bit, but we may need
8445 // the result to be a vector condition mask that needs preserving.
8446 unsigned Opcode =
8447 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8448 auto NewSelect =
8449 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8450 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8451 }
8452 }
8453
8454 Register NewDestReg = MRI.createVirtualRegister(
8455 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8456 MachineInstr *NewInst;
8457 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8458 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8459 .addImm(0)
8460 .add(Src1) // False
8461 .addImm(0)
8462 .add(Src0) // True
8463 .addReg(NewCondReg);
8464 } else {
8465 NewInst =
8466 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8467 .add(Src1) // False
8468 .add(Src0) // True
8469 .addReg(NewCondReg);
8470 }
8471 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8472 legalizeOperands(*NewInst, MDT);
8473 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8474}
8475
8476void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8477 MachineInstr &Inst) const {
8478 MachineBasicBlock &MBB = *Inst.getParent();
8479 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8480 MachineBasicBlock::iterator MII = Inst;
8481 DebugLoc DL = Inst.getDebugLoc();
8482
8483 MachineOperand &Dest = Inst.getOperand(0);
8484 MachineOperand &Src = Inst.getOperand(1);
8485 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8486 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8487
8488 unsigned SubOp = ST.hasAddNoCarry() ?
8489 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
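// Expand abs(x) as max(x, 0 - x) on the VALU (illustrative):
//   %tmp = V_SUB_{CO_}U32 0, %src
//   %res = V_MAX_I32_e64 %src, %tmp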
8490
8491 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8492 .addImm(0)
8493 .addReg(Src.getReg());
8494
8495 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8496 .addReg(Src.getReg())
8497 .addReg(TmpReg);
8498
8499 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8500 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8501}
8502
8503void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8504 MachineInstr &Inst) const {
8505 MachineBasicBlock &MBB = *Inst.getParent();
8506 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8507 MachineBasicBlock::iterator MII = Inst;
8508 const DebugLoc &DL = Inst.getDebugLoc();
8509
8510 MachineOperand &Dest = Inst.getOperand(0);
8511 MachineOperand &Src1 = Inst.getOperand(1);
8512 MachineOperand &Src2 = Inst.getOperand(2);
8513 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8514 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8515 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8516
8517 unsigned SubOp =
8518 ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
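// Expand absdiff(a, b) as max(a - b, -(a - b)) on the VALU (illustrative):
//   %sub = V_SUB_{CO_}U32 %a, %b
//   %neg = V_SUB_{CO_}U32 0, %sub
//   %res = V_MAX_I32_e64 %sub, %neg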
8519
8520 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8521 .addReg(Src1.getReg())
8522 .addReg(Src2.getReg());
8523
8524 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8525
8526 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8527 .addReg(SubResultReg)
8528 .addReg(TmpReg);
8529
8530 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8531 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8532}
8533
8534void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8535 MachineInstr &Inst) const {
8536 MachineBasicBlock &MBB = *Inst.getParent();
8537 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8538 MachineBasicBlock::iterator MII = Inst;
8539 const DebugLoc &DL = Inst.getDebugLoc();
8540
8541 MachineOperand &Dest = Inst.getOperand(0);
8542 MachineOperand &Src0 = Inst.getOperand(1);
8543 MachineOperand &Src1 = Inst.getOperand(2);
8544
8545 if (ST.hasDLInsts()) {
8546 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8547 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8548 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8549
8550 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8551 .add(Src0)
8552 .add(Src1);
8553
8554 MRI.replaceRegWith(Dest.getReg(), NewDest);
8555 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8556 } else {
8557 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8558 // invert either source and then perform the XOR. If either source is a
8559 // scalar register, then we can leave the inversion on the scalar unit to
8560 // achieve a better distribution of scalar and vector instructions.
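// For example (illustrative), when Src0 lives in an SGPR:
//   %t = S_NOT_B32 %src0
//   %d = S_XOR_B32 %t, %src1        ; ~src0 ^ src1 == ~(src0 ^ src1)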
8561 bool Src0IsSGPR = Src0.isReg() &&
8562 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8563 bool Src1IsSGPR = Src1.isReg() &&
8564 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8565 MachineInstr *Xor;
8566 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8567 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8568
8569 // Build a pair of scalar instructions and add them to the work list.
8570 // The next iteration over the work list will lower these to the vector
8571 // unit as necessary.
8572 if (Src0IsSGPR) {
8573 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8574 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8575 .addReg(Temp)
8576 .add(Src1);
8577 } else if (Src1IsSGPR) {
8578 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8579 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8580 .add(Src0)
8581 .addReg(Temp);
8582 } else {
8583 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8584 .add(Src0)
8585 .add(Src1);
8586 MachineInstr *Not =
8587 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8588 Worklist.insert(Not);
8589 }
8590
8591 MRI.replaceRegWith(Dest.getReg(), NewDest);
8592
8593 Worklist.insert(Xor);
8594
8595 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8596 }
8597}
8598
8599void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8600 MachineInstr &Inst,
8601 unsigned Opcode) const {
8602 MachineBasicBlock &MBB = *Inst.getParent();
8603 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8604 MachineBasicBlock::iterator MII = Inst;
8605 const DebugLoc &DL = Inst.getDebugLoc();
8606
8607 MachineOperand &Dest = Inst.getOperand(0);
8608 MachineOperand &Src0 = Inst.getOperand(1);
8609 MachineOperand &Src1 = Inst.getOperand(2);
8610
8611 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8612 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8613
8614 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8615 .add(Src0)
8616 .add(Src1);
8617
8618 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8619 .addReg(Interm);
8620
8621 Worklist.insert(&Op);
8622 Worklist.insert(&Not);
8623
8624 MRI.replaceRegWith(Dest.getReg(), NewDest);
8625 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8626}
8627
8628void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8629 MachineInstr &Inst,
8630 unsigned Opcode) const {
8631 MachineBasicBlock &MBB = *Inst.getParent();
8632 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8633 MachineBasicBlock::iterator MII = Inst;
8634 const DebugLoc &DL = Inst.getDebugLoc();
8635
8636 MachineOperand &Dest = Inst.getOperand(0);
8637 MachineOperand &Src0 = Inst.getOperand(1);
8638 MachineOperand &Src1 = Inst.getOperand(2);
8639
8640 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8641 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8642
8643 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8644 .add(Src1);
8645
8646 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8647 .add(Src0)
8648 .addReg(Interm);
8649
8650 Worklist.insert(&Not);
8651 Worklist.insert(&Op);
8652
8653 MRI.replaceRegWith(Dest.getReg(), NewDest);
8654 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8655}
8656
8657void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8658 MachineInstr &Inst, unsigned Opcode,
8659 bool Swap) const {
8660 MachineBasicBlock &MBB = *Inst.getParent();
8661 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8662
8663 MachineOperand &Dest = Inst.getOperand(0);
8664 MachineOperand &Src0 = Inst.getOperand(1);
8665 DebugLoc DL = Inst.getDebugLoc();
8666
8667 MachineBasicBlock::iterator MII = Inst;
8668
8669 const MCInstrDesc &InstDesc = get(Opcode);
8670 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8671 MRI.getRegClass(Src0.getReg()) :
8672 &AMDGPU::SGPR_32RegClass;
8673
8674 const TargetRegisterClass *Src0SubRC =
8675 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8676
8677 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8678 AMDGPU::sub0, Src0SubRC);
8679
8680 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8681 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8682 const TargetRegisterClass *NewDestSubRC =
8683 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8684
8685 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8686 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8687
8688 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8689 AMDGPU::sub1, Src0SubRC);
8690
8691 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8692 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8693
8694 if (Swap)
8695 std::swap(DestSub0, DestSub1);
8696
8697 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8698 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8699 .addReg(DestSub0)
8700 .addImm(AMDGPU::sub0)
8701 .addReg(DestSub1)
8702 .addImm(AMDGPU::sub1);
8703
8704 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8705
8706 Worklist.insert(&LoHalf);
8707 Worklist.insert(&HiHalf);
8708
8709 // We don't need to call legalizeOperands here because, for a single-operand
8710 // instruction, src0 will accept any kind of input.
8711
8712 // Move all users of this moved value.
8713 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8714}
8715
8716// There is no vector equivalent of s_mul_u64. For this reason, we need to
8717// split the s_mul_u64 into 32-bit vector multiplications.
8718void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8719 MachineInstr &Inst,
8720 MachineDominatorTree *MDT) const {
8721 MachineBasicBlock &MBB = *Inst.getParent();
8722 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8723
8724 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8725 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8726 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8727
8728 MachineOperand &Dest = Inst.getOperand(0);
8729 MachineOperand &Src0 = Inst.getOperand(1);
8730 MachineOperand &Src1 = Inst.getOperand(2);
8731 const DebugLoc &DL = Inst.getDebugLoc();
8732 MachineBasicBlock::iterator MII = Inst;
8733
8734 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8735 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8736 const TargetRegisterClass *Src0SubRC =
8737 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8738 if (RI.isSGPRClass(Src0SubRC))
8739 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8740 const TargetRegisterClass *Src1SubRC =
8741 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8742 if (RI.isSGPRClass(Src1SubRC))
8743 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8744
8745 // First, we extract the low 32-bit and high 32-bit values from each of the
8746 // operands.
8747 MachineOperand Op0L =
8748 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8749 MachineOperand Op1L =
8750 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8751 MachineOperand Op0H =
8752 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8753 MachineOperand Op1H =
8754 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8755
8756 // The multiplication is done as follows:
8757 //
8758 // Op1H Op1L
8759 // * Op0H Op0L
8760 // --------------------
8761 // Op1H*Op0L Op1L*Op0L
8762 // + Op1H*Op0H Op1L*Op0H
8763 // -----------------------------------------
8764 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8765 //
8766 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8767 // value and that would overflow.
8768 // The low 32-bit value is Op1L*Op0L.
8769 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
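// Quick sanity check (illustrative values): for Op0 = 0x0000000200000003 and
// Op1 = 0x0000000400000005, lo = 3*5 = 15 and
// hi = 4*3 + 5*2 + mulhi32(5, 3) = 12 + 10 + 0 = 22, which matches the low
// 64 bits of the full product, 0x000000160000000F.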
8770
8771 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8772 MachineInstr *Op1L_Op0H =
8773 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8774 .add(Op1L)
8775 .add(Op0H);
8776
8777 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8778 MachineInstr *Op1H_Op0L =
8779 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8780 .add(Op1H)
8781 .add(Op0L);
8782
8783 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8784 MachineInstr *Carry =
8785 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8786 .add(Op1L)
8787 .add(Op0L);
8788
8789 MachineInstr *LoHalf =
8790 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8791 .add(Op1L)
8792 .add(Op0L);
8793
8794 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8795 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8796 .addReg(Op1L_Op0H_Reg)
8797 .addReg(Op1H_Op0L_Reg);
8798
8799 MachineInstr *HiHalf =
8800 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8801 .addReg(AddReg)
8802 .addReg(CarryReg);
8803
8804 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8805 .addReg(DestSub0)
8806 .addImm(AMDGPU::sub0)
8807 .addReg(DestSub1)
8808 .addImm(AMDGPU::sub1);
8809
8810 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8811
8812 // Try to legalize the operands in case we need to swap the order to keep it
8813 // valid.
8814 legalizeOperands(*Op1L_Op0H, MDT);
8815 legalizeOperands(*Op1H_Op0L, MDT);
8816 legalizeOperands(*Carry, MDT);
8817 legalizeOperands(*LoHalf, MDT);
8818 legalizeOperands(*Add, MDT);
8819 legalizeOperands(*HiHalf, MDT);
8820
8821 // Move all users of this moved value.
8822 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8823}
8824
8825// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8826// multiplications.
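// Illustrative sketch of the expansion (virtual register names are made up):
//   %lo = V_MUL_LO_U32_e64 %a.lo, %b.lo
//   %hi = V_MUL_HI_U32_e64 %a.lo, %b.lo   ; V_MUL_HI_I32 for the signed pseudo
//   %d  = REG_SEQUENCE %lo, sub0, %hi, sub1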
8827void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8828 MachineInstr &Inst,
8829 MachineDominatorTree *MDT) const {
8830 MachineBasicBlock &MBB = *Inst.getParent();
8831 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8832
8833 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8834 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8835 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8836
8837 MachineOperand &Dest = Inst.getOperand(0);
8838 MachineOperand &Src0 = Inst.getOperand(1);
8839 MachineOperand &Src1 = Inst.getOperand(2);
8840 const DebugLoc &DL = Inst.getDebugLoc();
8841 MachineBasicBlock::iterator MII = Inst;
8842
8843 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8844 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8845 const TargetRegisterClass *Src0SubRC =
8846 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8847 if (RI.isSGPRClass(Src0SubRC))
8848 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8849 const TargetRegisterClass *Src1SubRC =
8850 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8851 if (RI.isSGPRClass(Src1SubRC))
8852 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8853
8854 // First, we extract the low 32-bit and high 32-bit values from each of the
8855 // operands.
8856 MachineOperand Op0L =
8857 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8858 MachineOperand Op1L =
8859 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8860
8861 unsigned Opc = Inst.getOpcode();
8862 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8863 ? AMDGPU::V_MUL_HI_U32_e64
8864 : AMDGPU::V_MUL_HI_I32_e64;
8865 MachineInstr *HiHalf =
8866 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8867
8868 MachineInstr *LoHalf =
8869 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8870 .add(Op1L)
8871 .add(Op0L);
8872
8873 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8874 .addReg(DestSub0)
8875 .addImm(AMDGPU::sub0)
8876 .addReg(DestSub1)
8877 .addImm(AMDGPU::sub1);
8878
8879 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8880
8881 // Try to legalize the operands in case we need to swap the order to keep it
8882 // valid.
8883 legalizeOperands(*HiHalf, MDT);
8884 legalizeOperands(*LoHalf, MDT);
8885
8886 // Move all users of this moved value.
8887 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8888}
8889
8890void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8891 MachineInstr &Inst, unsigned Opcode,
8892 MachineDominatorTree *MDT) const {
8893 MachineBasicBlock &MBB = *Inst.getParent();
8894 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8895
8896 MachineOperand &Dest = Inst.getOperand(0);
8897 MachineOperand &Src0 = Inst.getOperand(1);
8898 MachineOperand &Src1 = Inst.getOperand(2);
8899 DebugLoc DL = Inst.getDebugLoc();
8900
8901 MachineBasicBlock::iterator MII = Inst;
8902
8903 const MCInstrDesc &InstDesc = get(Opcode);
8904 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8905 MRI.getRegClass(Src0.getReg()) :
8906 &AMDGPU::SGPR_32RegClass;
8907
8908 const TargetRegisterClass *Src0SubRC =
8909 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8910 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8911 MRI.getRegClass(Src1.getReg()) :
8912 &AMDGPU::SGPR_32RegClass;
8913
8914 const TargetRegisterClass *Src1SubRC =
8915 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8916
8917 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8918 AMDGPU::sub0, Src0SubRC);
8919 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8920 AMDGPU::sub0, Src1SubRC);
8921 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8922 AMDGPU::sub1, Src0SubRC);
8923 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8924 AMDGPU::sub1, Src1SubRC);
8925
8926 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8927 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8928 const TargetRegisterClass *NewDestSubRC =
8929 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8930
8931 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8932 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8933 .add(SrcReg0Sub0)
8934 .add(SrcReg1Sub0);
8935
8936 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8937 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8938 .add(SrcReg0Sub1)
8939 .add(SrcReg1Sub1);
8940
8941 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8942 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8943 .addReg(DestSub0)
8944 .addImm(AMDGPU::sub0)
8945 .addReg(DestSub1)
8946 .addImm(AMDGPU::sub1);
8947
8948 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8949
8950 Worklist.insert(&LoHalf);
8951 Worklist.insert(&HiHalf);
8952
8953 // Move all users of this moved value.
8954 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8955}
8956
8957void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8958 MachineInstr &Inst,
8959 MachineDominatorTree *MDT) const {
8960 MachineBasicBlock &MBB = *Inst.getParent();
8961 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8962
8963 MachineOperand &Dest = Inst.getOperand(0);
8964 MachineOperand &Src0 = Inst.getOperand(1);
8965 MachineOperand &Src1 = Inst.getOperand(2);
8966 const DebugLoc &DL = Inst.getDebugLoc();
8967
8968 MachineBasicBlock::iterator MII = Inst;
8969
8970 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8971
8972 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8973
8974 MachineOperand* Op0;
8975 MachineOperand* Op1;
8976
8977 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8978 Op0 = &Src0;
8979 Op1 = &Src1;
8980 } else {
8981 Op0 = &Src1;
8982 Op1 = &Src0;
8983 }
8984
8985 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8986 .add(*Op0);
8987
8988 Register NewDest = MRI.createVirtualRegister(DestRC);
8989
8990 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8991 .addReg(Interm)
8992 .add(*Op1);
8993
8994 MRI.replaceRegWith(Dest.getReg(), NewDest);
8995
8996 Worklist.insert(&Xor);
8997}
8998
8999void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9000 MachineInstr &Inst) const {
9001 MachineBasicBlock &MBB = *Inst.getParent();
9002 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9003
9004 MachineBasicBlock::iterator MII = Inst;
9005 const DebugLoc &DL = Inst.getDebugLoc();
9006
9007 MachineOperand &Dest = Inst.getOperand(0);
9008 MachineOperand &Src = Inst.getOperand(1);
9009
9010 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9011 const TargetRegisterClass *SrcRC = Src.isReg() ?
9012 MRI.getRegClass(Src.getReg()) :
9013 &AMDGPU::SGPR_32RegClass;
9014
9015 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9016 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9017
9018 const TargetRegisterClass *SrcSubRC =
9019 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9020
9021 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9022 AMDGPU::sub0, SrcSubRC);
9023 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9024 AMDGPU::sub1, SrcSubRC);
9025
9026 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9027
9028 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9029
9030 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9031
9032 // We don't need to legalize operands here. src0 for either instruction can be
9033 // an SGPR, and the second input is unused or determined here.
9034 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9035}
9036
9037void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9038 MachineInstr &Inst) const {
9039 MachineBasicBlock &MBB = *Inst.getParent();
9040 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9041 MachineBasicBlock::iterator MII = Inst;
9042 const DebugLoc &DL = Inst.getDebugLoc();
9043
9044 MachineOperand &Dest = Inst.getOperand(0);
9045 uint32_t Imm = Inst.getOperand(2).getImm();
9046 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9047 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9048
9049 (void) Offset;
9050
9051 // Only sext_inreg cases handled.
9052 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9053 Offset == 0 && "Not implemented");
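// For example (illustrative), a 16-bit sign extension (BitWidth == 16) becomes:
//   %lo = V_BFE_I32 %src.sub0, 0, 16
//   %hi = V_ASHRREV_I32 31, %lo
//   %d  = REG_SEQUENCE %lo, sub0, %hi, sub1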
9054
9055 if (BitWidth < 32) {
9056 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9057 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9058 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9059
9060 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9061 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
9062 .addImm(0)
9063 .addImm(BitWidth);
9064
9065 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9066 .addImm(31)
9067 .addReg(MidRegLo);
9068
9069 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9070 .addReg(MidRegLo)
9071 .addImm(AMDGPU::sub0)
9072 .addReg(MidRegHi)
9073 .addImm(AMDGPU::sub1);
9074
9075 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9076 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9077 return;
9078 }
9079
9080 MachineOperand &Src = Inst.getOperand(1);
9081 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9082 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9083
9084 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9085 .addImm(31)
9086 .addReg(Src.getReg(), 0, AMDGPU::sub0);
9087
9088 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9089 .addReg(Src.getReg(), 0, AMDGPU::sub0)
9090 .addImm(AMDGPU::sub0)
9091 .addReg(TmpReg)
9092 .addImm(AMDGPU::sub1);
9093
9094 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9095 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9096}
9097
9098void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9099 MachineInstr &Inst, unsigned Opcode,
9100 MachineDominatorTree *MDT) const {
9101 // (S_FLBIT_I32_B64 hi:lo) ->
9102 //   (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9103 // (S_FF1_I32_B64 hi:lo) ->
9104 //   (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
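// Worked example (illustrative): for ctlz of 0x0000000000000001,
// V_FFBH_U32(hi = 0) = 0xffffffff and uaddsat(V_FFBH_U32(lo = 1), 32) =
// 31 + 32 = 63, so the umin yields 63, the expected 64-bit result.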
9105
9106 MachineBasicBlock &MBB = *Inst.getParent();
9107 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9108 MachineBasicBlock::iterator MII = Inst;
9109 const DebugLoc &DL = Inst.getDebugLoc();
9110
9111 MachineOperand &Dest = Inst.getOperand(0);
9112 MachineOperand &Src = Inst.getOperand(1);
9113
9114 const MCInstrDesc &InstDesc = get(Opcode);
9115
9116 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9117 unsigned OpcodeAdd =
9118 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
9119
9120 const TargetRegisterClass *SrcRC =
9121 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9122 const TargetRegisterClass *SrcSubRC =
9123 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9124
9125 MachineOperand SrcRegSub0 =
9126 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9127 MachineOperand SrcRegSub1 =
9128 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9129
9130 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9131 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9132 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9133 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9134
9135 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9136
9137 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9138
9139 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9140 .addReg(IsCtlz ? MidReg1 : MidReg2)
9141 .addImm(32)
9142 .addImm(1); // enable clamp
9143
9144 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9145 .addReg(MidReg3)
9146 .addReg(IsCtlz ? MidReg2 : MidReg1);
9147
9148 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9149
9150 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9151}
9152
9153void SIInstrInfo::addUsersToMoveToVALUWorklist(
9154 Register DstReg, MachineRegisterInfo &MRI,
9155 SIInstrWorklist &Worklist) const {
9156 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9157 MachineInstr &UseMI = *MO.getParent();
9158
9159 unsigned OpNo = 0;
9160
9161 switch (UseMI.getOpcode()) {
9162 case AMDGPU::COPY:
9163 case AMDGPU::WQM:
9164 case AMDGPU::SOFT_WQM:
9165 case AMDGPU::STRICT_WWM:
9166 case AMDGPU::STRICT_WQM:
9167 case AMDGPU::REG_SEQUENCE:
9168 case AMDGPU::PHI:
9169 case AMDGPU::INSERT_SUBREG:
9170 break;
9171 default:
9172 OpNo = MO.getOperandNo();
9173 break;
9174 }
9175
9176 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9177 MRI.constrainRegClass(DstReg, OpRC);
9178
9179 if (!RI.hasVectorRegisters(OpRC))
9180 Worklist.insert(&UseMI);
9181 else
9182 // Legalization could change user list.
9183 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9184 }
9185}
9186
9187void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9188 MachineRegisterInfo &MRI,
9189 MachineInstr &Inst) const {
9190 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9191 MachineBasicBlock *MBB = Inst.getParent();
9192 MachineOperand &Src0 = Inst.getOperand(1);
9193 MachineOperand &Src1 = Inst.getOperand(2);
9194 const DebugLoc &DL = Inst.getDebugLoc();
9195
9196 if (ST.useRealTrue16Insts()) {
9197 Register SrcReg0, SrcReg1;
9198 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9199 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9200 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
9201 } else {
9202 SrcReg0 = Src0.getReg();
9203 }
9204
9205 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9206 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9207 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
9208 } else {
9209 SrcReg1 = Src1.getReg();
9210 }
9211
9212 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9213 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9214
9215 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9216 switch (Inst.getOpcode()) {
9217 case AMDGPU::S_PACK_LL_B32_B16:
9218 NewMI
9219 .addReg(SrcReg0, 0,
9220 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9221 .addImm(AMDGPU::lo16)
9222 .addReg(SrcReg1, 0,
9223 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9224 .addImm(AMDGPU::hi16);
9225 break;
9226 case AMDGPU::S_PACK_LH_B32_B16:
9227 NewMI
9228 .addReg(SrcReg0, 0,
9229 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9230 .addImm(AMDGPU::lo16)
9231 .addReg(SrcReg1, 0, AMDGPU::hi16)
9232 .addImm(AMDGPU::hi16);
9233 break;
9234 case AMDGPU::S_PACK_HL_B32_B16:
9235 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9236 .addImm(AMDGPU::lo16)
9237 .addReg(SrcReg1, 0,
9238 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9239 .addImm(AMDGPU::hi16);
9240 break;
9241 case AMDGPU::S_PACK_HH_B32_B16:
9242 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9243 .addImm(AMDGPU::lo16)
9244 .addReg(SrcReg1, 0, AMDGPU::hi16)
9245 .addImm(AMDGPU::hi16);
9246 break;
9247 default:
9248 llvm_unreachable("unhandled s_pack_* instruction");
9249 }
9250
9251 MachineOperand &Dest = Inst.getOperand(0);
9252 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9253 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9254 return;
9255 }
9256
9257 switch (Inst.getOpcode()) {
9258 case AMDGPU::S_PACK_LL_B32_B16: {
9259 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9260 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9261
9262 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9263 // 0.
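// Illustrative expansion for S_PACK_LL: result = (src1 << 16) | (src0 & 0xffff),
// built from V_AND_B32 followed by V_LSHL_OR_B32.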
9264 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9265 .addImm(0xffff);
9266
9267 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9268 .addReg(ImmReg, RegState::Kill)
9269 .add(Src0);
9270
9271 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9272 .add(Src1)
9273 .addImm(16)
9274 .addReg(TmpReg, RegState::Kill);
9275 break;
9276 }
9277 case AMDGPU::S_PACK_LH_B32_B16: {
9278 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9279 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9280 .addImm(0xffff);
9281 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9282 .addReg(ImmReg, RegState::Kill)
9283 .add(Src0)
9284 .add(Src1);
9285 break;
9286 }
9287 case AMDGPU::S_PACK_HL_B32_B16: {
9288 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9289 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9290 .addImm(16)
9291 .add(Src0);
9292 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9293 .add(Src1)
9294 .addImm(16)
9295 .addReg(TmpReg, RegState::Kill);
9296 break;
9297 }
9298 case AMDGPU::S_PACK_HH_B32_B16: {
9299 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9300 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9301 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9302 .addImm(16)
9303 .add(Src0);
9304 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9305 .addImm(0xffff0000);
9306 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9307 .add(Src1)
9308 .addReg(ImmReg, RegState::Kill)
9309 .addReg(TmpReg, RegState::Kill);
9310 break;
9311 }
9312 default:
9313 llvm_unreachable("unhandled s_pack_* instruction");
9314 }
9315
9316 MachineOperand &Dest = Inst.getOperand(0);
9317 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9318 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9319}
9320
9321void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9322 MachineInstr &SCCDefInst,
9323 SIInstrWorklist &Worklist,
9324 Register NewCond) const {
9325
9326 // Ensure that def inst defines SCC, which is still live.
9327 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9328 !Op.isDead() && Op.getParent() == &SCCDefInst);
9329 SmallVector<MachineInstr *, 4> CopyToDelete;
9330 // This assumes that all the users of SCC are in the same block
9331 // as the SCC def.
9332 for (MachineInstr &MI : // Skip the def inst itself.
9333 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9334 SCCDefInst.getParent()->end())) {
9335 // Check if SCC is used first.
9336 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9337 if (SCCIdx != -1) {
9338 if (MI.isCopy()) {
9339 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9340 Register DestReg = MI.getOperand(0).getReg();
9341
9342 MRI.replaceRegWith(DestReg, NewCond);
9343 CopyToDelete.push_back(&MI);
9344 } else {
9345
9346 if (NewCond.isValid())
9347 MI.getOperand(SCCIdx).setReg(NewCond);
9348
9349 Worklist.insert(&MI);
9350 }
9351 }
9352 // Exit if we find another SCC def.
9353 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9354 break;
9355 }
9356 for (auto &Copy : CopyToDelete)
9357 Copy->eraseFromParent();
9358}
9359
9360// Instructions that use SCC may be converted to VALU instructions. When that
9361// happens, the SCC register is changed to VCC_LO. The instruction that defines
9362// SCC must be changed to an instruction that defines VCC. This function makes
9363// sure that the instruction that defines SCC is added to the moveToVALU
9364// worklist.
9365void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9366 SIInstrWorklist &Worklist) const {
9367 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9368 // then there is nothing to do because the defining instruction has been
9369 // converted to a VALU already. If SCC then that instruction needs to be
9370 // converted to a VALU.
9371 for (MachineInstr &MI :
9372 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9373 SCCUseInst->getParent()->rend())) {
9374 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9375 break;
9376 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9377 Worklist.insert(&MI);
9378 break;
9379 }
9380 }
9381}
9382
9383const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9384 const MachineInstr &Inst) const {
9385 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9386
9387 switch (Inst.getOpcode()) {
9388 // For target instructions, getOpRegClass just returns the virtual register
9389 // class associated with the operand, so we need to find an equivalent VGPR
9390 // register class in order to move the instruction to the VALU.
9391 case AMDGPU::COPY:
9392 case AMDGPU::PHI:
9393 case AMDGPU::REG_SEQUENCE:
9394 case AMDGPU::INSERT_SUBREG:
9395 case AMDGPU::WQM:
9396 case AMDGPU::SOFT_WQM:
9397 case AMDGPU::STRICT_WWM:
9398 case AMDGPU::STRICT_WQM: {
9399 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9400 if (RI.isAGPRClass(SrcRC)) {
9401 if (RI.isAGPRClass(NewDstRC))
9402 return nullptr;
9403
9404 switch (Inst.getOpcode()) {
9405 case AMDGPU::PHI:
9406 case AMDGPU::REG_SEQUENCE:
9407 case AMDGPU::INSERT_SUBREG:
9408 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9409 break;
9410 default:
9411 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9412 }
9413
9414 if (!NewDstRC)
9415 return nullptr;
9416 } else {
9417 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9418 return nullptr;
9419
9420 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9421 if (!NewDstRC)
9422 return nullptr;
9423 }
9424
9425 return NewDstRC;
9426 }
9427 default:
9428 return NewDstRC;
9429 }
9430}
9431
9432// Find the one SGPR operand we are allowed to use.
9433Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9434 int OpIndices[3]) const {
9435 const MCInstrDesc &Desc = MI.getDesc();
9436
9437 // Find the one SGPR operand we are allowed to use.
9438 //
9439 // First we need to consider the instruction's operand requirements before
9440 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9441 // of VCC, but we are still bound by the constant bus requirement to only use
9442 // one.
9443 //
9444 // If the operand's class is an SGPR, we can never move it.
9445
9446 Register SGPRReg = findImplicitSGPRRead(MI);
9447 if (SGPRReg)
9448 return SGPRReg;
9449
9450 Register UsedSGPRs[3] = {Register()};
9451 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9452
9453 for (unsigned i = 0; i < 3; ++i) {
9454 int Idx = OpIndices[i];
9455 if (Idx == -1)
9456 break;
9457
9458 const MachineOperand &MO = MI.getOperand(Idx);
9459 if (!MO.isReg())
9460 continue;
9461
9462 // Is this operand statically required to be an SGPR based on the operand
9463 // constraints?
9464 const TargetRegisterClass *OpRC =
9465 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9466 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9467 if (IsRequiredSGPR)
9468 return MO.getReg();
9469
9470 // If this could be a VGPR or an SGPR, check the dynamic register class.
9471 Register Reg = MO.getReg();
9472 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9473 if (RI.isSGPRClass(RegRC))
9474 UsedSGPRs[i] = Reg;
9475 }
9476
9477 // We don't have a required SGPR operand, so we have a bit more freedom in
9478 // selecting operands to move.
9479
9480 // Try to select the most used SGPR. If an SGPR is equal to one of the
9481 // others, we choose that.
9482 //
9483 // e.g.
9484 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9485 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9486
9487 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9488 // prefer those.
9489
9490 if (UsedSGPRs[0]) {
9491 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9492 SGPRReg = UsedSGPRs[0];
9493 }
9494
9495 if (!SGPRReg && UsedSGPRs[1]) {
9496 if (UsedSGPRs[1] == UsedSGPRs[2])
9497 SGPRReg = UsedSGPRs[1];
9498 }
9499
9500 return SGPRReg;
9501}
9502
9503 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9504 AMDGPU::OpName OperandName) const {
9505 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9506 return nullptr;
9507
9508 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9509 if (Idx == -1)
9510 return nullptr;
9511
9512 return &MI.getOperand(Idx);
9513}
9514
9515 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9516 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9517 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9518 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9519 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9520 return (Format << 44) |
9521 (1ULL << 56) | // RESOURCE_LEVEL = 1
9522 (3ULL << 60); // OOB_SELECT = 3
9523 }
9524
9525 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9526 if (ST.isAmdHsaOS()) {
9527 // Set ATC = 1. GFX9 doesn't have this bit.
9528 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9529 RsrcDataFormat |= (1ULL << 56);
9530
9531 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9532 // BTW, it disables TC L2 and therefore decreases performance.
9533 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9534 RsrcDataFormat |= (2ULL << 59);
9535 }
9536
9537 return RsrcDataFormat;
9538}
9539
9540 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9541 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9542 AMDGPU::RSRC_TID_ENABLE |
9543 0xffffffff; // Size;
9544
9545 // GFX9 doesn't have ELEMENT_SIZE.
9546 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9547 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9548 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9549 }
9550
9551 // IndexStride = 64 (encoded as 3) for wave64, 32 (encoded as 2) for wave32.
9552 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9553 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9554
9555 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9556 // Clear them unless we want a huge stride.
9557 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9558 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9559 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9560
9561 return Rsrc23;
9562}
9563
9564 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9565 unsigned Opc = MI.getOpcode();
9566
9567 return isSMRD(Opc);
9568}
9569
9570 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9571 return get(Opc).mayLoad() &&
9572 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9573}
9574
9575 Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9576 int &FrameIndex) const {
9577 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9578 if (!Addr || !Addr->isFI())
9579 return Register();
9580
9581 assert(!MI.memoperands_empty() &&
9582 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9583
9584 FrameIndex = Addr->getIndex();
9585 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9586}
9587
9588 Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9589 int &FrameIndex) const {
9590 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9591 assert(Addr && Addr->isFI());
9592 FrameIndex = Addr->getIndex();
9593 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9594}
9595
9596 Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9597 int &FrameIndex) const {
9598 if (!MI.mayLoad())
9599 return Register();
9600
9601 if (isMUBUF(MI) || isVGPRSpill(MI))
9602 return isStackAccess(MI, FrameIndex);
9603
9604 if (isSGPRSpill(MI))
9605 return isSGPRStackAccess(MI, FrameIndex);
9606
9607 return Register();
9608}
9609
9610 Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9611 int &FrameIndex) const {
9612 if (!MI.mayStore())
9613 return Register();
9614
9615 if (isMUBUF(MI) || isVGPRSpill(MI))
9616 return isStackAccess(MI, FrameIndex);
9617
9618 if (isSGPRSpill(MI))
9619 return isSGPRStackAccess(MI, FrameIndex);
9620
9621 return Register();
9622}
9623
9624 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9625 unsigned Size = 0;
9626 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9627 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9628 while (++I != E && I->isInsideBundle()) {
9629 assert(!I->isBundle() && "No nested bundle!");
9630 Size += getInstSizeInBytes(*I);
9631 }
9632
9633 return Size;
9634}
9635
9636 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9637 unsigned Opc = MI.getOpcode();
9638 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9639 unsigned DescSize = Desc.getSize();
9640
9641 // If we have a definitive size, we can use it. Otherwise we need to inspect
9642 // the operands to know the size.
9643 if (isFixedSize(MI)) {
9644 unsigned Size = DescSize;
9645
9646 // If we hit the buggy offset, an extra nop will be inserted in MC so
9647 // estimate the worst case.
9648 if (MI.isBranch() && ST.hasOffset3fBug())
9649 Size += 4;
9650
9651 return Size;
9652 }
9653
9654 // Instructions may have a 32-bit literal encoded after them. Check
9655 // operands that could ever be literals.
9656 if (isVALU(MI) || isSALU(MI)) {
9657 if (isDPP(MI))
9658 return DescSize;
9659 bool HasLiteral = false;
9660 unsigned LiteralSize = 4;
9661 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9662 const MachineOperand &Op = MI.getOperand(I);
9663 const MCOperandInfo &OpInfo = Desc.operands()[I];
9664 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9665 HasLiteral = true;
9666 if (ST.has64BitLiterals()) {
9667 switch (OpInfo.OperandType) {
9668 default:
9669 break;
9670 case AMDGPU::OPERAND_REG_IMM_FP64:
9671 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9672 LiteralSize = 8;
9673 break;
9674 case AMDGPU::OPERAND_REG_IMM_INT64:
9675 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9676 LiteralSize = 8;
9677 break;
9678 }
9679 }
9680 break;
9681 }
9682 }
9683 return HasLiteral ? DescSize + LiteralSize : DescSize;
9684 }
9685
9686 // Check whether we have extra NSA words.
9687 if (isMIMG(MI)) {
9688 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9689 if (VAddr0Idx < 0)
9690 return 8;
9691
9692 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9693 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9694 }
9695
9696 switch (Opc) {
9697 case TargetOpcode::BUNDLE:
9698 return getInstBundleSize(MI);
9699 case TargetOpcode::INLINEASM:
9700 case TargetOpcode::INLINEASM_BR: {
9701 const MachineFunction *MF = MI.getMF();
9702 const char *AsmStr = MI.getOperand(0).getSymbolName();
9703 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9704 }
9705 default:
9706 if (MI.isMetaInstruction())
9707 return 0;
9708
9709 // If D16 Pseudo inst, get correct MC code size
9710 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9711 if (D16Info) {
9712 // Assume the d16_lo/hi variants always have the same size.
9713 unsigned LoInstOpcode = D16Info->LoOp;
9714 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9715 DescSize = Desc.getSize();
9716 }
9717
9718 // If FMA Pseudo inst, get correct MC code size
9719 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9720 // All potential lowerings are the same size; arbitrarily pick one.
9721 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9722 DescSize = Desc.getSize();
9723 }
9724
9725 return DescSize;
9726 }
9727}
9728
9729 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9730 if (!isFLAT(MI))
9731 return false;
9732
9733 if (MI.memoperands_empty())
9734 return true;
9735
9736 for (const MachineMemOperand *MMO : MI.memoperands()) {
9737 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9738 return true;
9739 }
9740 return false;
9741}
9742
9743 ArrayRef<std::pair<int, const char *>>
9744 SIInstrInfo::getSerializableTargetIndices() const {
9745 static const std::pair<int, const char *> TargetIndices[] = {
9746 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9747 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9748 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9749 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9750 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9751 return ArrayRef(TargetIndices);
9752}
9753
9754/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
9755/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9756 ScheduleHazardRecognizer *
9757 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9758 const ScheduleDAG *DAG) const {
9759 return new GCNHazardRecognizer(DAG->MF);
9760 }
9761
9762/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9763/// pass.
9764 ScheduleHazardRecognizer *
9765 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
9766 return new GCNHazardRecognizer(MF);
9767 }
9768
9769// Called during:
9770// - pre-RA scheduling and post-RA scheduling
9771 ScheduleHazardRecognizer *
9772 SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9773 const ScheduleDAGMI *DAG) const {
9774 // Borrowed from Arm Target
9775 // We would like to restrict this hazard recognizer to only
9776 // post-RA scheduling; we can tell that we're post-RA because we don't
9777 // track VRegLiveness.
9778 if (!DAG->hasVRegLiveness())
9779 return new GCNHazardRecognizer(DAG->MF);
9780 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9781}
9782
9783std::pair<unsigned, unsigned>
9784 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9785 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9786}
9787
9788 ArrayRef<std::pair<unsigned, const char *>>
9789 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9790 static const std::pair<unsigned, const char *> TargetFlags[] = {
9791 {MO_GOTPCREL, "amdgpu-gotprel"},
9792 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9793 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9794 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9795 {MO_REL32_LO, "amdgpu-rel32-lo"},
9796 {MO_REL32_HI, "amdgpu-rel32-hi"},
9797 {MO_REL64, "amdgpu-rel64"},
9798 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9799 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9800 {MO_ABS64, "amdgpu-abs64"},
9801 };
9802
9803 return ArrayRef(TargetFlags);
9804}
9805
9806 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9807 SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9808 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9809 {
9810 {MONoClobber, "amdgpu-noclobber"},
9811 {MOLastUse, "amdgpu-last-use"},
9812 {MOCooperative, "amdgpu-cooperative"},
9813 };
9814
9815 return ArrayRef(TargetFlags);
9816}
9817
9818 unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9819 const MachineFunction &MF) const {
9820 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9821 assert(SrcReg.isVirtual());
9822 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9823 return AMDGPU::WWM_COPY;
9824
9825 return AMDGPU::COPY;
9826}
9827
9828 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9829 Register Reg) const {
9830 // We need to handle instructions which may be inserted during register
9831 // allocation to handle the prolog. The initial prolog instruction may have
9832 // been separated from the start of the block by spills and copies
9833 // needed by the prolog. However, the insertions for scalar registers can
9834 // always be placed at the BB top as they are independent of the exec mask
9835 // value.
9836 const MachineFunction *MF = MI.getMF();
9837 bool IsNullOrVectorRegister = true;
9838 if (Reg) {
9839 const MachineRegisterInfo &MRI = MF->getRegInfo();
9840 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9841 }
9842
9843 uint16_t Opcode = MI.getOpcode();
9844 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9845 return IsNullOrVectorRegister &&
9846 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9847 (Opcode == AMDGPU::IMPLICIT_DEF &&
9848 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9849 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9850 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9851}
9852
9853 MachineInstrBuilder
9854 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9855 MachineBasicBlock::iterator I,
9856 const DebugLoc &DL,
9857 Register DestReg) const {
9858 if (ST.hasAddNoCarry())
9859 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9860
9861 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9862 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9863 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9864
9865 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9866 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9867}
9868
9869 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9870 MachineBasicBlock::iterator I,
9871 const DebugLoc &DL,
9872 Register DestReg,
9873 RegScavenger &RS) const {
9874 if (ST.hasAddNoCarry())
9875 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9876
9877 // If available, prefer to use vcc.
9878 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9879 ? Register(RI.getVCC())
9880 : RS.scavengeRegisterBackwards(
9881 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9882 0, /* AllowSpill */ false);
9883
9884 // TODO: Users need to deal with this.
9885 if (!UnusedCarry.isValid())
9886 return MachineInstrBuilder();
9887
9888 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9889 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9890}
9891
9892bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9893 switch (Opcode) {
9894 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9895 case AMDGPU::SI_KILL_I1_TERMINATOR:
9896 return true;
9897 default:
9898 return false;
9899 }
9900}
9901
9902 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
9903 switch (Opcode) {
9904 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9905 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9906 case AMDGPU::SI_KILL_I1_PSEUDO:
9907 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9908 default:
9909 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9910 }
9911}
9912
9913bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9914 return Imm <= getMaxMUBUFImmOffset(ST);
9915}
9916
9917 unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9918 // GFX12 field is non-negative 24-bit signed byte offset.
9919 const unsigned OffsetBits =
9920 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9921 return (1 << OffsetBits) - 1;
9922}
9923
9924 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9925 if (!ST.isWave32())
9926 return;
9927
9928 if (MI.isInlineAsm())
9929 return;
9930
9931 for (auto &Op : MI.implicit_operands()) {
9932 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9933 Op.setReg(AMDGPU::VCC_LO);
9934 }
9935}
9936
9937 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9938 if (!isSMRD(MI))
9939 return false;
9940
9941 // Check that it is using a buffer resource.
9942 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9943 if (Idx == -1) // e.g. s_memtime
9944 return false;
9945
9946 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
9947 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9948}
9949
9950// Given Imm, split it into the values to put into the SOffset and ImmOffset
9951// fields in an MUBUF instruction. Return false if it is not possible (due to a
9952// hardware bug needing a workaround).
9953//
9954// The required alignment ensures that individual address components remain
9955// aligned if they are aligned to begin with. It also ensures that additional
9956// offsets within the given alignment can be added to the resulting ImmOffset.
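// Worked example (illustrative, assuming a 12-bit max immediate of 4095 and
// Align(1)): Imm = 4100 splits into ImmOffset = 4095 and SOffset = 5 (an
// inline constant), while Imm = 8192 splits into ImmOffset = 1 and
// SOffset = 8191 so that nearby offsets can share the same SOffset register.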
9957 bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9958 uint32_t &ImmOffset, Align Alignment) const {
9959 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9960 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9961 uint32_t Overflow = 0;
9962
9963 if (Imm > MaxImm) {
9964 if (Imm <= MaxImm + 64) {
9965 // Use an SOffset inline constant for 4..64
9966 Overflow = Imm - MaxImm;
9967 Imm = MaxImm;
9968 } else {
9969 // Try to keep the same value in SOffset for adjacent loads, so that
9970 // the corresponding register contents can be re-used.
9971 //
9972 // Load values with all low-bits (except for alignment bits) set into
9973 // SOffset, so that a larger range of values can be covered using
9974 // s_movk_i32.
9975 //
9976 // Atomic operations fail to work correctly when individual address
9977 // components are unaligned, even if their sum is aligned.
9978 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9979 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9980 Imm = Low;
9981 Overflow = High - Alignment.value();
9982 }
9983 }
9984
9985 if (Overflow > 0) {
9986 // There is a hardware bug in SI and CI which prevents address clamping in
9987 // MUBUF instructions from working correctly with SOffsets. The immediate
9988 // offset is unaffected.
9989 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9990 return false;
9991
9992 // It is not possible to set immediate in SOffset field on some targets.
9993 if (ST.hasRestrictedSOffset())
9994 return false;
9995 }
9996
9997 ImmOffset = Imm;
9998 SOffset = Overflow;
9999 return true;
10000}
10001
10002// Depending on the used address space and instructions, some immediate offsets
10003// are allowed and some are not.
10004// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10005// scratch instruction offsets can also be negative. On GFX12, offsets can be
10006// negative for all variants.
10007//
10008// There are several bugs related to these offsets:
10009// On gfx10.1, flat instructions that go into the global address space cannot
10010// use an offset.
10011//
10012// For scratch instructions, the address can be either an SGPR or a VGPR.
10013// The following offsets can be used, depending on the architecture (x means
10014// cannot be used):
10015// +----------------------------+------+------+
10016// | Address-Mode | SGPR | VGPR |
10017// +----------------------------+------+------+
10018// | gfx9 | | |
10019// | negative, 4-aligned offset | x | ok |
10020// | negative, unaligned offset | x | ok |
10021// +----------------------------+------+------+
10022// | gfx10 | | |
10023// | negative, 4-aligned offset | ok | ok |
10024// | negative, unaligned offset | ok | x |
10025// +----------------------------+------+------+
10026// | gfx10.3 | | |
10027// | negative, 4-aligned offset | ok | ok |
10028// | negative, unaligned offset | ok | ok |
10029// +----------------------------+------+------+
10030//
10031// This function ignores the addressing mode, so if an offset cannot be used in
10032// one addressing mode, it is considered illegal.
10033bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10034 uint64_t FlatVariant) const {
10035 // TODO: Should 0 be special cased?
10036 if (!ST.hasFlatInstOffsets())
10037 return false;
10038
10039 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10040 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10041 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10042 return false;
10043
10044 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10045 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10046 (Offset % 4) != 0) {
10047 return false;
10048 }
10049
10050 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10051 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10052 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10053}
10054
10055// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
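// Worked example (illustrative, assuming a 13-bit signed immediate field, i.e.
// NumBits == 12): COffsetVal = -5000 splits into RemainderOffset = -4096 and
// ImmField = -904, since -5000 / 4096 truncates towards zero to -1.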
10056std::pair<int64_t, int64_t>
10057SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10058 uint64_t FlatVariant) const {
10059 int64_t RemainderOffset = COffsetVal;
10060 int64_t ImmField = 0;
10061
10062 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10063 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10064
10065 if (AllowNegative) {
10066 // Use signed division by a power of two to truncate towards 0.
10067 int64_t D = 1LL << NumBits;
10068 RemainderOffset = (COffsetVal / D) * D;
10069 ImmField = COffsetVal - RemainderOffset;
10070
10071 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10072 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10073 (ImmField % 4) != 0) {
10074 // Make ImmField a multiple of 4
10075 RemainderOffset += ImmField % 4;
10076 ImmField -= ImmField % 4;
10077 }
10078 } else if (COffsetVal >= 0) {
10079 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10080 RemainderOffset = COffsetVal - ImmField;
10081 }
10082
10083 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10084 assert(RemainderOffset + ImmField == COffsetVal);
10085 return {ImmField, RemainderOffset};
10086}
10087
10088bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
10089 if (ST.hasNegativeScratchOffsetBug() &&
10090 FlatVariant == SIInstrFlags::FlatScratch)
10091 return false;
10092
10093 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10094}
10095
10096static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10097 switch (ST.getGeneration()) {
10098 default:
10099 break;
10100 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
10101 case AMDGPUSubtarget::SEA_ISLANDS:
10102 return SIEncodingFamily::SI;
10103 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
10104 case AMDGPUSubtarget::GFX9:
10105 return SIEncodingFamily::VI;
10106 case AMDGPUSubtarget::GFX10:
10107 return SIEncodingFamily::GFX10;
10108 case AMDGPUSubtarget::GFX11:
10109 return SIEncodingFamily::GFX11;
10110 case AMDGPUSubtarget::GFX12:
10111 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10112 : SIEncodingFamily::GFX12;
10113 }
10114 llvm_unreachable("Unknown subtarget generation!");
10115}
10116
10117bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10118 switch(MCOp) {
10119 // These opcodes use indirect register addressing so
10120 // they need special handling by codegen (currently missing).
10121 // Therefore it is too risky to allow these opcodes
10122 // to be selected by dpp combiner or sdwa peepholer.
10123 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10124 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10125 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10126 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10127 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10128 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10129 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10130 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10131 return true;
10132 default:
10133 return false;
10134 }
10135}
10136
10137#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10138 case OPCODE##_dpp: \
10139 case OPCODE##_e32: \
10140 case OPCODE##_e64: \
10141 case OPCODE##_e64_dpp: \
10142 case OPCODE##_sdwa:
10143
10144static bool isRenamedInGFX9(int Opcode) {
10145 switch (Opcode) {
10146 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10147 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10148 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10149 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10150 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10151 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10152 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10153 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10154 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10155 //
10156 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10157 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10158 case AMDGPU::V_FMA_F16_gfx9_e64:
10159 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10160 case AMDGPU::V_INTERP_P2_F16:
10161 case AMDGPU::V_MAD_F16_e64:
10162 case AMDGPU::V_MAD_U16_e64:
10163 case AMDGPU::V_MAD_I16_e64:
10164 return true;
10165 default:
10166 return false;
10167 }
10168}
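// For reference, each GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32) use above
// expands to the five case labels
//   case AMDGPU::V_ADD_U32_dpp:
//   case AMDGPU::V_ADD_U32_e32:
//   case AMDGPU::V_ADD_U32_e64:
//   case AMDGPU::V_ADD_U32_e64_dpp:
//   case AMDGPU::V_ADD_U32_sdwa:
// so isRenamedInGFX9 covers every encoding variant of each renamed opcode.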
10169
10170int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10171 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
10172
10173 unsigned Gen = subtargetEncodingFamily(ST);
10174
10175 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10176 Gen = SIEncodingFamily::GFX9;
10177
10178 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10179 // subtarget has UnpackedD16VMem feature.
10180 // TODO: remove this when we discard GFX80 encoding.
10181 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10182 Gen = SIEncodingFamily::GFX80;
10183
10184 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10185 switch (ST.getGeneration()) {
10186 default:
10187 Gen = SIEncodingFamily::SDWA;
10188 break;
10189 case AMDGPUSubtarget::GFX9:
10190 Gen = SIEncodingFamily::SDWA9;
10191 break;
10192 case AMDGPUSubtarget::GFX10:
10193 Gen = SIEncodingFamily::SDWA10;
10194 break;
10195 }
10196 }
10197
10198 if (isMAI(Opcode)) {
10199 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10200 if (MFMAOp != -1)
10201 Opcode = MFMAOp;
10202 }
10203
10204 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10205
10206 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10208
10209 // -1 means that Opcode is already a native instruction.
10210 if (MCOp == -1)
10211 return Opcode;
10212
10213 if (ST.hasGFX90AInsts()) {
10214 uint16_t NMCOp = (uint16_t)-1;
10215 if (ST.hasGFX940Insts())
10216 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
10217 if (NMCOp == (uint16_t)-1)
10218 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
10219 if (NMCOp == (uint16_t)-1)
10220 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10221 if (NMCOp != (uint16_t)-1)
10222 MCOp = NMCOp;
10223 }
10224
10225 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10226 // no encoding in the given subtarget generation.
10227 if (MCOp == (uint16_t)-1)
10228 return -1;
10229
10230 if (isAsmOnlyOpcode(MCOp))
10231 return -1;
10232
10233 return MCOp;
10234}
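// A hypothetical caller-side sketch (not code from this file) of how the -1
// result is typically handled when lowering a MachineInstr to MC: a pseudo
// with no encoding on the current subtarget must not reach the streamer.
//
//   int MCOp = TII.pseudoToMCOpcode(MI->getOpcode());
//   if (MCOp == -1)
//     llvm_unreachable("pseudo instruction has no MC encoding"); // sketch only
//   OutInst.setOpcode(MCOp);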
10235
10236 static
10237TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10238 assert(RegOpnd.isReg());
10239 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10240 getRegSubRegPair(RegOpnd);
10241}
10242
10243TargetInstrInfo::RegSubRegPair llvm::getRegSequenceSubReg(MachineInstr &MI,
10244 unsigned SubReg) {
10245 assert(MI.isRegSequence());
10246 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10247 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10248 auto &RegOp = MI.getOperand(1 + 2 * I);
10249 return getRegOrUndef(RegOp);
10250 }
10251 return TargetInstrInfo::RegSubRegPair();
10252}
10253
10254// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10255// Following a subreg of reg:subreg isn't supported
10256static bool followSubRegDef(MachineInstr &MI,
10257 TargetInstrInfo::RegSubRegPair &RSR) {
10258 if (!RSR.SubReg)
10259 return false;
10260 switch (MI.getOpcode()) {
10261 default: break;
10262 case AMDGPU::REG_SEQUENCE:
10263 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10264 return true;
10265 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10266 case AMDGPU::INSERT_SUBREG:
10267 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10268 // inserted the subreg we're looking for
10269 RSR = getRegOrUndef(MI.getOperand(2));
10270 else { // the subreg in the rest of the reg
10271 auto R1 = getRegOrUndef(MI.getOperand(1));
10272 if (R1.SubReg) // subreg of subreg isn't supported
10273 return false;
10274 RSR.Reg = R1.Reg;
10275 }
10276 return true;
10277 }
10278 return false;
10279}
10280
10281MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10282 const MachineRegisterInfo &MRI) {
10283 assert(MRI.isSSA());
10284 if (!P.Reg.isVirtual())
10285 return nullptr;
10286
10287 auto RSR = P;
10288 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10289 while (auto *MI = DefInst) {
10290 DefInst = nullptr;
10291 switch (MI->getOpcode()) {
10292 case AMDGPU::COPY:
10293 case AMDGPU::V_MOV_B32_e32: {
10294 auto &Op1 = MI->getOperand(1);
10295 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10296 if (Op1.isUndef())
10297 return nullptr;
10298 RSR = getRegSubRegPair(Op1);
10299 DefInst = MRI.getVRegDef(RSR.Reg);
10300 }
10301 break;
10302 }
10303 default:
10304 if (followSubRegDef(*MI, RSR)) {
10305 if (!RSR.Reg)
10306 return nullptr;
10307 DefInst = MRI.getVRegDef(RSR.Reg);
10308 }
10309 }
10310 if (!DefInst)
10311 return MI;
10312 }
10313 return nullptr;
10314}
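// A hypothetical MIR example of what the walk above resolves (assuming SSA):
// asking for the definition of %3.sub1 follows the REG_SEQUENCE operand pair
// (%2, %subreg.sub1) and returns the V_MOV_B32 that defines %2.
//
//   %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
//   %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
//   %3:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1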
10315
10316bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10317 Register VReg,
10318 const MachineInstr &DefMI,
10319 const MachineInstr &UseMI) {
10320 assert(MRI.isSSA() && "Must be run on SSA");
10321
10322 auto *TRI = MRI.getTargetRegisterInfo();
10323 auto *DefBB = DefMI.getParent();
10324
10325 // Don't bother searching between blocks, although it is possible this block
10326 // doesn't modify exec.
10327 if (UseMI.getParent() != DefBB)
10328 return true;
10329
10330 const int MaxInstScan = 20;
10331 int NumInst = 0;
10332
10333 // Stop scan at the use.
10334 auto E = UseMI.getIterator();
10335 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10336 if (I->isDebugInstr())
10337 continue;
10338
10339 if (++NumInst > MaxInstScan)
10340 return true;
10341
10342 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10343 return true;
10344 }
10345
10346 return false;
10347}
10348
10349bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10350 Register VReg,
10351 const MachineInstr &DefMI) {
10352 assert(MRI.isSSA() && "Must be run on SSA");
10353
10354 auto *TRI = MRI.getTargetRegisterInfo();
10355 auto *DefBB = DefMI.getParent();
10356
10357 const int MaxUseScan = 10;
10358 int NumUse = 0;
10359
10360 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10361 auto &UseInst = *Use.getParent();
10362 // Don't bother searching between blocks, although it is possible this block
10363 // doesn't modify exec.
10364 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10365 return true;
10366
10367 if (++NumUse > MaxUseScan)
10368 return true;
10369 }
10370
10371 if (NumUse == 0)
10372 return false;
10373
10374 const int MaxInstScan = 20;
10375 int NumInst = 0;
10376
10377 // Stop scan when we have seen all the uses.
10378 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10379 assert(I != DefBB->end());
10380
10381 if (I->isDebugInstr())
10382 continue;
10383
10384 if (++NumInst > MaxInstScan)
10385 return true;
10386
10387 for (const MachineOperand &Op : I->operands()) {
10388 // We don't check reg masks here as they're used only on calls:
10389 // 1. EXEC is only considered const within one BB
10390 // 2. Call should be a terminator instruction if present in a BB
10391
10392 if (!Op.isReg())
10393 continue;
10394
10395 Register Reg = Op.getReg();
10396 if (Op.isUse()) {
10397 if (Reg == VReg && --NumUse == 0)
10398 return false;
10399 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10400 return true;
10401 }
10402 }
10403}
10404
10405MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10406 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10407 const DebugLoc &DL, Register Src, Register Dst) const {
10408 auto Cur = MBB.begin();
10409 if (Cur != MBB.end())
10410 do {
10411 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10412 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10413 ++Cur;
10414 } while (Cur != MBB.end() && Cur != LastPHIIt);
10415
10416 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10417 Dst);
10418}
10419
10420MachineInstr *SIInstrInfo::createPHISourceCopy(
10421 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10422 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10423 if (InsPt != MBB.end() &&
10424 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10425 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10426 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10427 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10428 InsPt++;
10429 return BuildMI(MBB, InsPt, DL,
10430 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10431 .addReg(Src, 0, SrcSubReg)
10432 .addReg(AMDGPU::EXEC, RegState::Implicit);
10433 }
10434 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10435 Dst);
10436}
10437
10438bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10439
10442 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10443 VirtRegMap *VRM) const {
10444 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10445 //
10446 // %0:sreg_32 = COPY $m0
10447 //
10448 // We explicitly chose SReg_32 for the virtual register so such a copy might
10449 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10450 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10451 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10452 // TargetInstrInfo::foldMemoryOperand() is going to try.
10453 // A similar issue also exists with spilling and reloading $exec registers.
10454 //
10455 // To prevent that, constrain the %0 register class here.
10456 if (isFullCopyInstr(MI)) {
10457 Register DstReg = MI.getOperand(0).getReg();
10458 Register SrcReg = MI.getOperand(1).getReg();
10459 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10460 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10461 MachineRegisterInfo &MRI = MF.getRegInfo();
10462 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10463 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10464 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10465 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10466 return nullptr;
10467 }
10468 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10469 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10470 return nullptr;
10471 }
10472 }
10473 }
10474
10475 return nullptr;
10476}
10477
10478unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10479 const MachineInstr &MI,
10480 unsigned *PredCost) const {
10481 if (MI.isBundle()) {
10482 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10483 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10484 unsigned Lat = 0, Count = 0;
10485 for (++I; I != E && I->isBundledWithPred(); ++I) {
10486 ++Count;
10487 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10488 }
10489 return Lat + Count - 1;
10490 }
10491
10492 return SchedModel.computeInstrLatency(&MI);
10493}
10494
10495InstructionUniformity
10496SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10497 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10498 unsigned Opcode = MI.getOpcode();
10499
10500 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10501 Register Dst = MI.getOperand(0).getReg();
10502 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10503 : MI.getOperand(1).getReg();
10504 LLT DstTy = MRI.getType(Dst);
10505 LLT SrcTy = MRI.getType(Src);
10506 unsigned DstAS = DstTy.getAddressSpace();
10507 unsigned SrcAS = SrcTy.getAddressSpace();
10508 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10509 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10510 ST.hasGloballyAddressableScratch()
10511 ? InstructionUniformity::NeverUniform
10512 : InstructionUniformity::Default;
10513 };
10514
10515 // If the target supports globally addressable scratch, the mapping from
10516 // scratch memory to the flat aperture changes therefore an address space cast
10517 // is no longer uniform.
10518 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10519 return HandleAddrSpaceCast(MI);
10520
10521 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10522 auto IID = GI->getIntrinsicID();
10523 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10524 return InstructionUniformity::NeverUniform;
10525 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10526 return InstructionUniformity::AlwaysUniform;
10527
10528 switch (IID) {
10529 case Intrinsic::amdgcn_addrspacecast_nonnull:
10530 return HandleAddrSpaceCast(MI);
10531 case Intrinsic::amdgcn_if:
10532 case Intrinsic::amdgcn_else:
10533 // FIXME: Uniform if second result
10534 break;
10535 }
10536
10537 return InstructionUniformity::Default;
10538 }
10539
10540 // Loads from the private and flat address spaces are divergent, because
10541 // threads can execute the load instruction with the same inputs and get
10542 // different results.
10543 //
10544 // All other loads are not divergent, because if threads issue loads with the
10545 // same arguments, they will always get the same result.
10546 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10547 Opcode == AMDGPU::G_SEXTLOAD) {
10548 if (MI.memoperands_empty())
10549 return InstructionUniformity::NeverUniform; // conservative assumption
10550
10551 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10552 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10553 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10554 })) {
10555 // At least one MMO in a non-global address space.
10556 return InstructionUniformity::NeverUniform;
10557 }
10558 return InstructionUniformity::Default;
10559 }
10560
10561 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10562 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10563 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10564 AMDGPU::isGenericAtomic(Opcode)) {
10565 return InstructionUniformity::NeverUniform;
10566 }
10567 return InstructionUniformity::Default;
10568}
10569
10570InstructionUniformity
10571SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10572
10573 if (isNeverUniform(MI))
10574 return InstructionUniformity::NeverUniform;
10575
10576 unsigned opcode = MI.getOpcode();
10577 if (opcode == AMDGPU::V_READLANE_B32 ||
10578 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10579 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10580 return InstructionUniformity::AlwaysUniform;
10581
10582 if (isCopyInstr(MI)) {
10583 const MachineOperand &srcOp = MI.getOperand(1);
10584 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10585 const TargetRegisterClass *regClass =
10586 RI.getPhysRegBaseClass(srcOp.getReg());
10587 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10588 : InstructionUniformity::NeverUniform;
10589 }
10590 return InstructionUniformity::Default;
10591 }
10592
10593 // GMIR handling
10594 if (MI.isPreISelOpcode())
10595 return getGenericInstructionUniformity(MI);
10596
10597 // Atomics are divergent because they are executed sequentially: when an
10598 // atomic operation refers to the same address in each thread, then each
10599 // thread after the first sees the value written by the previous thread as
10600 // original value.
10601
10602 if (isAtomic(MI))
10603 return InstructionUniformity::NeverUniform;
10604
10605 // Loads from the private and flat address spaces are divergent, because
10606 // threads can execute the load instruction with the same inputs and get
10607 // different results.
10608 if (isFLAT(MI) && MI.mayLoad()) {
10609 if (MI.memoperands_empty())
10610 return InstructionUniformity::NeverUniform; // conservative assumption
10611
10612 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10613 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10614 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10615 })) {
10616 // At least one MMO in a non-global address space.
10617 return InstructionUniformity::NeverUniform;
10618 }
10619
10620 return InstructionUniformity::Default;
10621 }
10622
10623 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10624 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10625
10626 // FIXME: It's conceptually broken to report this for an instruction, and not
10627 // a specific def operand. For inline asm in particular, there could be mixed
10628 // uniform and divergent results.
10629 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10630 const MachineOperand &SrcOp = MI.getOperand(I);
10631 if (!SrcOp.isReg())
10632 continue;
10633
10634 Register Reg = SrcOp.getReg();
10635 if (!Reg || !SrcOp.readsReg())
10636 continue;
10637
10638 // If RegBank is null, this is unassigned or an unallocatable special
10639 // register, which are all scalars.
10640 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10641 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10642 return InstructionUniformity::NeverUniform;
10643
10644
10645 // TODO: Uniformity check conditions above can be rearranged for more
10646 // readability
10647
10648 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10649 // currently turned into no-op COPYs by SelectionDAG ISel and are
10650 // therefore no longer recognizable.
10651
10652 return InstructionUniformity::AlwaysUniform;
10653}
10654
10655unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10656 switch (MF.getFunction().getCallingConv()) {
10657 case CallingConv::AMDGPU_PS:
10658 return 1;
10659 case CallingConv::AMDGPU_VS:
10660 return 2;
10661 case CallingConv::AMDGPU_GS:
10662 return 3;
10663 case CallingConv::AMDGPU_HS:
10664 case CallingConv::AMDGPU_LS:
10665 case CallingConv::AMDGPU_ES: {
10666 const Function &F = MF.getFunction();
10667 F.getContext().diagnose(DiagnosticInfoUnsupported(
10668 F, "ds_ordered_count unsupported for this calling conv"));
10669 [[fallthrough]];
10670 }
10671 case CallingConv::AMDGPU_CS:
10672 case CallingConv::AMDGPU_KERNEL:
10673 case CallingConv::C:
10674 case CallingConv::Fast:
10675 default:
10676 // Assume other calling conventions are various compute callable functions
10677 return 0;
10678 }
10679}
10680
10681bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10682 Register &SrcReg2, int64_t &CmpMask,
10683 int64_t &CmpValue) const {
10684 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10685 return false;
10686
10687 switch (MI.getOpcode()) {
10688 default:
10689 break;
10690 case AMDGPU::S_CMP_EQ_U32:
10691 case AMDGPU::S_CMP_EQ_I32:
10692 case AMDGPU::S_CMP_LG_U32:
10693 case AMDGPU::S_CMP_LG_I32:
10694 case AMDGPU::S_CMP_LT_U32:
10695 case AMDGPU::S_CMP_LT_I32:
10696 case AMDGPU::S_CMP_GT_U32:
10697 case AMDGPU::S_CMP_GT_I32:
10698 case AMDGPU::S_CMP_LE_U32:
10699 case AMDGPU::S_CMP_LE_I32:
10700 case AMDGPU::S_CMP_GE_U32:
10701 case AMDGPU::S_CMP_GE_I32:
10702 case AMDGPU::S_CMP_EQ_U64:
10703 case AMDGPU::S_CMP_LG_U64:
10704 SrcReg = MI.getOperand(0).getReg();
10705 if (MI.getOperand(1).isReg()) {
10706 if (MI.getOperand(1).getSubReg())
10707 return false;
10708 SrcReg2 = MI.getOperand(1).getReg();
10709 CmpValue = 0;
10710 } else if (MI.getOperand(1).isImm()) {
10711 SrcReg2 = Register();
10712 CmpValue = MI.getOperand(1).getImm();
10713 } else {
10714 return false;
10715 }
10716 CmpMask = ~0;
10717 return true;
10718 case AMDGPU::S_CMPK_EQ_U32:
10719 case AMDGPU::S_CMPK_EQ_I32:
10720 case AMDGPU::S_CMPK_LG_U32:
10721 case AMDGPU::S_CMPK_LG_I32:
10722 case AMDGPU::S_CMPK_LT_U32:
10723 case AMDGPU::S_CMPK_LT_I32:
10724 case AMDGPU::S_CMPK_GT_U32:
10725 case AMDGPU::S_CMPK_GT_I32:
10726 case AMDGPU::S_CMPK_LE_U32:
10727 case AMDGPU::S_CMPK_LE_I32:
10728 case AMDGPU::S_CMPK_GE_U32:
10729 case AMDGPU::S_CMPK_GE_I32:
10730 SrcReg = MI.getOperand(0).getReg();
10731 SrcReg2 = Register();
10732 CmpValue = MI.getOperand(1).getImm();
10733 CmpMask = ~0;
10734 return true;
10735 }
10736
10737 return false;
10738}
10739
10740// SCC is already valid after SCCValid.
10741// SCCRedefine will redefine SCC to the same value already available after
10742// SCCValid. If there are no intervening SCC conflicts, delete SCCRedefine and
10743// update kill/dead flags if necessary.
10744static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
10745 const SIRegisterInfo &RI) {
10746 MachineInstr *KillsSCC = nullptr;
10747 if (SCCValid->getParent() != SCCRedefine->getParent())
10748 return false;
10749 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
10750 SCCRedefine->getIterator())) {
10751 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10752 return false;
10753 if (MI.killsRegister(AMDGPU::SCC, &RI))
10754 KillsSCC = &MI;
10755 }
10756 if (MachineOperand *SccDef =
10757 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
10758 SccDef->setIsDead(false);
10759 if (KillsSCC)
10760 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
10761 SCCRedefine->eraseFromParent();
10762 return true;
10763}
10764
10765static bool foldableSelect(const MachineInstr &Def) {
10766 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
10767 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
10768 return false;
10769 bool Op1IsNonZeroImm =
10770 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
10771 bool Op2IsZeroImm =
10772 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
10773 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
10774 return false;
10775 return true;
10776}
10777
10778bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10779 Register SrcReg2, int64_t CmpMask,
10780 int64_t CmpValue,
10781 const MachineRegisterInfo *MRI) const {
10782 if (!SrcReg || SrcReg.isPhysical())
10783 return false;
10784
10785 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10786 return false;
10787
10788 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
10789 this]() -> bool {
10790 if (CmpValue != 0)
10791 return false;
10792
10793 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10794 if (!Def)
10795 return false;
10796
10797 // For S_OP that set SCC = DST!=0, do the transformation
10798 //
10799 // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
10800
10801 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
10802 // for S_CSELECT* already has the same value that will be calculated by
10803 // s_cmp_lg_*
10804 //
10805 // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
10806 // imm), 0)
10807 if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
10808 return false;
10809
10810 if (!optimizeSCC(Def, &CmpInstr, RI))
10811 return false;
10812
10813 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
10814 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
10815 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
10816 // sX = s_cselect_b64 (non-zero imm), 0
10817 // sLo = copy sX.sub0
10818 // sHi = copy sX.sub1
10819 // sY = s_or_b32 sLo, sHi
10820 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
10821 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
10822 const MachineOperand &OrOpnd1 = Def->getOperand(1);
10823 const MachineOperand &OrOpnd2 = Def->getOperand(2);
10824 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
10825 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
10826 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
10827 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
10828 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
10829 Def2->getOperand(1).isReg() &&
10830 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
10831 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
10832 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
10833 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
10834 if (Select && foldableSelect(*Select))
10835 optimizeSCC(Select, Def, RI);
10836 }
10837 }
10838 }
10839 return true;
10840 };
10841
10842 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10843 this](int64_t ExpectedValue, unsigned SrcSize,
10844 bool IsReversible, bool IsSigned) -> bool {
10845 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10846 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10847 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10848 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10849 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10850 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10851 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10852 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10853 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10854 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10855 //
10856 // Signed ge/gt are not used for the sign bit.
10857 //
10858 // If result of the AND is unused except in the compare:
10859 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10860 //
10861 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10862 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10863 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10864 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10865 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10866 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
10867
10868 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10869 if (!Def)
10870 return false;
10871
10872 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10873 Def->getOpcode() != AMDGPU::S_AND_B64)
10874 return false;
10875
10876 int64_t Mask;
10877 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10878 if (MO->isImm())
10879 Mask = MO->getImm();
10880 else if (!getFoldableImm(MO, Mask))
10881 return false;
10882 Mask &= maxUIntN(SrcSize);
10883 return isPowerOf2_64(Mask);
10884 };
10885
10886 MachineOperand *SrcOp = &Def->getOperand(1);
10887 if (isMask(SrcOp))
10888 SrcOp = &Def->getOperand(2);
10889 else if (isMask(&Def->getOperand(2)))
10890 SrcOp = &Def->getOperand(1);
10891 else
10892 return false;
10893
10894 // A valid Mask is required to have a single bit set, hence a non-zero and
10895 // power-of-two value. This verifies that we will not do 64-bit shift below.
10896 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10897 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10898 if (IsSigned && BitNo == SrcSize - 1)
10899 return false;
10900
10901 ExpectedValue <<= BitNo;
10902
10903 bool IsReversedCC = false;
10904 if (CmpValue != ExpectedValue) {
10905 if (!IsReversible)
10906 return false;
10907 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10908 if (!IsReversedCC)
10909 return false;
10910 }
10911
10912 Register DefReg = Def->getOperand(0).getReg();
10913 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10914 return false;
10915
10916 if (!optimizeSCC(Def, &CmpInstr, RI))
10917 return false;
10918
10919 if (!MRI->use_nodbg_empty(DefReg)) {
10920 assert(!IsReversedCC);
10921 return true;
10922 }
10923
10924 // Replace AND with unused result with a S_BITCMP.
10925 MachineBasicBlock *MBB = Def->getParent();
10926
10927 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10928 : AMDGPU::S_BITCMP1_B32
10929 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10930 : AMDGPU::S_BITCMP1_B64;
10931
10932 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10933 .add(*SrcOp)
10934 .addImm(BitNo);
10935 Def->eraseFromParent();
10936
10937 return true;
10938 };
10939
10940 switch (CmpInstr.getOpcode()) {
10941 default:
10942 break;
10943 case AMDGPU::S_CMP_EQ_U32:
10944 case AMDGPU::S_CMP_EQ_I32:
10945 case AMDGPU::S_CMPK_EQ_U32:
10946 case AMDGPU::S_CMPK_EQ_I32:
10947 return optimizeCmpAnd(1, 32, true, false);
10948 case AMDGPU::S_CMP_GE_U32:
10949 case AMDGPU::S_CMPK_GE_U32:
10950 return optimizeCmpAnd(1, 32, false, false);
10951 case AMDGPU::S_CMP_GE_I32:
10952 case AMDGPU::S_CMPK_GE_I32:
10953 return optimizeCmpAnd(1, 32, false, true);
10954 case AMDGPU::S_CMP_EQ_U64:
10955 return optimizeCmpAnd(1, 64, true, false);
10956 case AMDGPU::S_CMP_LG_U32:
10957 case AMDGPU::S_CMP_LG_I32:
10958 case AMDGPU::S_CMPK_LG_U32:
10959 case AMDGPU::S_CMPK_LG_I32:
10960 return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
10961 case AMDGPU::S_CMP_GT_U32:
10962 case AMDGPU::S_CMPK_GT_U32:
10963 return optimizeCmpAnd(0, 32, false, false);
10964 case AMDGPU::S_CMP_GT_I32:
10965 case AMDGPU::S_CMPK_GT_I32:
10966 return optimizeCmpAnd(0, 32, false, true);
10967 case AMDGPU::S_CMP_LG_U64:
10968 return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
10969 }
10970
10971 return false;
10972}
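// A standalone sketch of the mask arithmetic optimizeCmpAnd relies on: given a
// single-bit AND mask and the compare constant, recover the bit number and
// whether the comparison is the reversed ("bit clear") form. The helper name
// and the use of C++20 <bit> are illustrative only.
//
// #include <bit>
// #include <cstdint>
// #include <optional>
//
// struct BitTest { unsigned BitNo; bool Reversed; };
//
// // ExpectedValue is 0 for the s_cmp_lg/gt-style entry points and 1 for the
// // s_cmp_eq/ge-style ones, as passed to optimizeCmpAnd above.
// static std::optional<BitTest> matchBitTest(uint64_t Mask, int64_t CmpValue,
//                                            int64_t ExpectedValue,
//                                            bool IsReversible) {
//   if (!std::has_single_bit(Mask))
//     return std::nullopt;                  // AND operand must be 1 << n
//   const unsigned BitNo = std::countr_zero(Mask);
//   ExpectedValue <<= BitNo;
//   if (CmpValue == ExpectedValue)
//     return BitTest{BitNo, false};
//   if (IsReversible && CmpValue == (ExpectedValue ^ int64_t(Mask)))
//     return BitTest{BitNo, true};          // reversed condition code
//   return std::nullopt;
// }
//
// // e.g. matchBitTest(1u << 5, 32, 1, true) -> {BitNo = 5, Reversed = false}
// //      matchBitTest(1u << 5, 0, 1, true)  -> {BitNo = 5, Reversed = true}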
10973
10974void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
10975 AMDGPU::OpName OpName) const {
10976 if (!ST.needsAlignedVGPRs())
10977 return;
10978
10979 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10980 if (OpNo < 0)
10981 return;
10982 MachineOperand &Op = MI.getOperand(OpNo);
10983 if (getOpSize(MI, OpNo) > 4)
10984 return;
10985
10986 // Add implicit aligned super-reg to force alignment on the data operand.
10987 const DebugLoc &DL = MI.getDebugLoc();
10988 MachineBasicBlock *BB = MI.getParent();
10989 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10990 Register DataReg = Op.getReg();
10991 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10992 Register Undef = MRI.createVirtualRegister(
10993 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10994 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10995 Register NewVR =
10996 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10997 : &AMDGPU::VReg_64_Align2RegClass);
10998 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10999 .addReg(DataReg, 0, Op.getSubReg())
11000 .addImm(AMDGPU::sub0)
11001 .addReg(Undef)
11002 .addImm(AMDGPU::sub1);
11003 Op.setReg(NewVR);
11004 Op.setSubReg(AMDGPU::sub0);
11005 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11006}
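// A hypothetical before/after illustration of the rewrite above for a 32-bit
// data operand that must sit at the even half of an aligned register pair
// (the DS opcode and operand order are illustrative only):
//
//   ; before
//   DS_WRITE_B32_gfx9 %vaddr, %data:vgpr_32, 0, 0, implicit $exec
//   ; after
//   %undef:vgpr_32 = IMPLICIT_DEF
//   %pair:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0, %undef, %subreg.sub1
//   DS_WRITE_B32_gfx9 %vaddr, %pair.sub0, 0, 0, implicit $exec, implicit %pair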
11007
11009 if (isIGLP(*MI))
11010 return false;
11011
11013}
11014
11015bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
11016 if (!isWMMA(MI) && !isSWMMAC(MI))
11017 return false;
11018
11019 if (AMDGPU::isGFX1250(ST))
11020 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11021
11022 return true;
11023}
11024
11025bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
11026 unsigned Opcode = MI.getOpcode();
11027
11028 if (AMDGPU::isGFX12Plus(ST))
11029 return isDOT(MI) || isXDLWMMA(MI);
11030
11031 if (!isMAI(MI) || isDGEMM(Opcode) ||
11032 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11033 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11034 return false;
11035
11036 if (!ST.hasGFX940Insts())
11037 return true;
11038
11039 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11040}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, const SIRegisterInfo &RI)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:221
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasAddNoCarry() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:87
uint8_t OperandType
Information about the type of the operand.
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:96
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const final
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
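A hedged sketch of one plausible use; the address space and FLAT variant flag below are illustrative assumptions, not requirements of the API:
  auto [EncodableImm, Remainder] =
      TII->splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
                           SIInstrFlags::FlatGlobal);
  // Remainder must be folded into the address computation; EncodableImm can be
  // placed directly in the instruction's offset field.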
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const override
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst when lowering 16-bit SALU instructions to VALU.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool setsSCCifResultIsNonZero(const MachineInstr &MI)
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
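A hedged sketch of the usual check-then-split pattern; the address space and FLAT variant are assumptions chosen for illustration:
  if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
                              SIInstrFlags::FlatGlobal)) {
    // Offset does not fit the encoding; fall back to splitFlatOffset and add
    // the remainder to the base address.
  }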
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
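A minimal hedged sketch; MI and TII are assumed to be in scope, and the offset operand is only an example of a name that may be absent, hence the null check:
  if (const MachineOperand *Off =
          TII->getNamedOperand(MI, AMDGPU::OpName::offset))
    ImmOffset = Off->getImm(); // ImmOffset: a caller-provided int64_t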
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:587
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:589
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:586
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:588
@ TI_CONSTDATA_START
Definition AMDGPU.h:585
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable?
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:72
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:70
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:71
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:62
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:73
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
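A small self-contained illustration of the range-based wrapper:
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  static bool allNonZero(const llvm::SmallVector<int, 4> &Vals) {
    // True if every element satisfies the predicate; an empty range yields true.
    return llvm::all_of(Vals, [](int V) { return V != 0; });
  }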
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
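A compile-time illustration (the 12-bit width is an arbitrary example):
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::maxUIntN(12) == 4095,
                "largest value representable in a 12-bit unsigned field");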
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
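For example, checking a 16-bit signed range at compile time:
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::isInt<16>(32767) && !llvm::isInt<16>(32768),
                "32767 fits a signed 16-bit field, 32768 does not");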
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
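A short example with arbitrary values:
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::alignDown(13u, 4u) == 12u,
                "13 rounded down to a multiple of 4 is 12");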
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
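A quick compile-time check:
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::isPowerOf2_64(4096) && !llvm::isPowerOf2_64(48),
                "4096 is a power of two, 48 is not");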
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
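A compile-time illustration of Lo_32 together with Hi_32 listed above, using an arbitrary 64-bit value:
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::Hi_32(0x1234567800000042ULL) == 0x12345678u &&
                llvm::Lo_32(0x1234567800000042ULL) == 0x42u,
                "split a 64-bit value into its 32-bit halves");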
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
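For instance, with arbitrary operands:
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::divideCeil(96u, 32u) == 3u,
                "96 divided by 32, rounded up, is 3");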
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
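An example using a 21-bit field width chosen arbitrarily:
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::SignExtend64<21>(0x1FFFFF) == -1,
                "all ones in a 21-bit field sign-extends to -1");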
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
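A brief compile-time check (the 12-bit width is again an arbitrary example):
  #include "llvm/Support/MathExtras.h"
  static_assert(llvm::maskTrailingOnes<uint32_t>(12) == 0xFFFu,
                "mask covering the low 12 bits");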
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.