1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over the minimum unconditional
53// branch code. This exists only to make it possible to write reasonably small
54// tests for long branches.
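// For example, a long-branch LIT test can shrink the reachable branch range
// with an invocation along these lines (illustrative only, not taken from an
// existing test):
//   llc -mtriple=amdgcn -amdgpu-s-branch-bits=5 -o - %s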
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
63 cl::ReallyHidden);
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
67 AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
86 AMDGPU::OpName OpName) {
87 unsigned Opc0 = N0->getMachineOpcode();
88 unsigned Opc1 = N1->getMachineOpcode();
89
90 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
91 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
92
93 if (Op0Idx == -1 && Op1Idx == -1)
94 return true;
95
96
97 if ((Op0Idx == -1 && Op1Idx != -1) ||
98 (Op1Idx == -1 && Op0Idx != -1))
99 return false;
100
101 // getNamedOperandIdx returns the index for the MachineInstr's operands,
102 // which includes the result as the first operand. We are indexing into the
103 // MachineSDNode's operands, so we need to skip the result operand to get
104 // the real index.
105 --Op0Idx;
106 --Op1Idx;
107
108 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
109}
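// Illustrative sketch (not part of the upstream file): converting a named
// operand index from MachineInstr numbering, which counts defs first, to
// MachineSDNode numbering, which lists inputs only, is just a subtraction of
// the def count, assuming the operand was found at all:
//   int MIIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
//   int SDIdx = MIIdx - TII.get(Opc).getNumDefs(); // only valid if MIIdx != -1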
110
111static bool canRemat(const MachineInstr &MI) {
112
113 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
114 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
115 SIInstrInfo::isSALU(MI))
116 return true;
117
118 if (SIInstrInfo::isSMRD(MI)) {
119 return !MI.memoperands_empty() &&
120 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
121 return MMO->isLoad() && MMO->isInvariant();
122 });
123 }
124
125 return false;
126}
127
128bool SIInstrInfo::isReallyTriviallyReMaterializable(
129 const MachineInstr &MI) const {
130
131 if (canRemat(MI)) {
132 // Normally a VALU use of exec would block the rematerialization, but an
133 // implicit exec read is OK in this case, as all VALU instructions have one.
134 // We really want all of the generic logic for this except for that check.
135
136 // Another potential implicit use is mode register. The core logic of
137 // the RA will not attempt rematerialization if mode is set anywhere
138 // in the function, otherwise it is safe since mode is not changed.
139
140 // This differs from the generic method, which does not allow
141 // rematerialization if there are virtual register uses. We allow this,
142 // which is why this method also covers SOP instructions.
143 if (!MI.hasImplicitDef() &&
144 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
145 !MI.mayRaiseFPException())
146 return true;
147 }
148
149 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
150}
151
152// Returns true if the scalar result of a VALU instruction depends on exec.
153bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
154 // Ignore comparisons which are only used masked with exec.
155 // This allows some hoisting/sinking of VALU comparisons.
156 if (MI.isCompare()) {
157 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
158 if (!Dst)
159 return true;
160
161 Register DstReg = Dst->getReg();
162 if (!DstReg.isVirtual())
163 return true;
164
165 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
166 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
167 switch (Use.getOpcode()) {
168 case AMDGPU::S_AND_SAVEEXEC_B32:
169 case AMDGPU::S_AND_SAVEEXEC_B64:
170 break;
171 case AMDGPU::S_AND_B32:
172 case AMDGPU::S_AND_B64:
173 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
174 return true;
175 break;
176 default:
177 return true;
178 }
179 }
180 return false;
181 }
182
183 switch (MI.getOpcode()) {
184 default:
185 break;
186 case AMDGPU::V_READFIRSTLANE_B32:
187 return true;
188 }
189
190 return false;
191}
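// Illustrative MIR shape (hypothetical virtual register numbers) of a compare
// that the check above treats as *not* depending on exec, because its only
// use is already masked with exec:
//   %5:sreg_64 = V_CMP_LT_I32_e64 %1, %2, implicit $exec
//   %6:sreg_64 = S_AND_B64 killed %5, $exec, implicit-def $scc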
192
193bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
194 // Any implicit use of exec by VALU is not a real register read.
195 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
196 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
197}
198
199bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
200 MachineBasicBlock *SuccToSinkTo,
201 MachineCycleInfo *CI) const {
202 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
203 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
204 return true;
205
206 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
207 // Check if sinking of MI would create temporal divergent use.
208 for (auto Op : MI.uses()) {
209 if (Op.isReg() && Op.getReg().isVirtual() &&
210 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
211 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
212
213 // SgprDef defined inside cycle
214 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
215 if (FromCycle == nullptr)
216 continue;
217
218 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
219 // Check if there is a FromCycle that contains SgprDef's basic block but
220 // does not contain SuccToSinkTo and also has divergent exit condition.
221 while (FromCycle && !FromCycle->contains(ToCycle)) {
222 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
223 FromCycle->getExitingBlocks(ExitingBlocks);
224
225 // FromCycle has divergent exit condition.
226 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
227 if (hasDivergentBranch(ExitingBlock))
228 return false;
229 }
230
231 FromCycle = FromCycle->getParentCycle();
232 }
233 }
234 }
235
236 return true;
237}
238
239bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
240 int64_t &Offset0,
241 int64_t &Offset1) const {
242 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
243 return false;
244
245 unsigned Opc0 = Load0->getMachineOpcode();
246 unsigned Opc1 = Load1->getMachineOpcode();
247
248 // Make sure both are actually loads.
249 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
250 return false;
251
252 // A mayLoad instruction without a def is not a load. Likely a prefetch.
253 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
254 return false;
255
256 if (isDS(Opc0) && isDS(Opc1)) {
257
258 // FIXME: Handle this case:
259 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
260 return false;
261
262 // Check base reg.
263 if (Load0->getOperand(0) != Load1->getOperand(0))
264 return false;
265
266 // Skip read2 / write2 variants for simplicity.
267 // TODO: We should report true if the used offsets are adjacent (excluding
268 // the st64 variants).
269 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
270 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
271 if (Offset0Idx == -1 || Offset1Idx == -1)
272 return false;
273
274 // XXX - be careful of dataless loads
275 // getNamedOperandIdx returns the index for MachineInstrs. Since they
276 // include the output in the operand list, but SDNodes don't, we need to
277 // subtract the index by one.
278 Offset0Idx -= get(Opc0).NumDefs;
279 Offset1Idx -= get(Opc1).NumDefs;
280 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
281 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
282 return true;
283 }
284
285 if (isSMRD(Opc0) && isSMRD(Opc1)) {
286 // Skip time and cache invalidation instructions.
287 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
288 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
289 return false;
290
291 unsigned NumOps = getNumOperandsNoGlue(Load0);
292 if (NumOps != getNumOperandsNoGlue(Load1))
293 return false;
294
295 // Check base reg.
296 if (Load0->getOperand(0) != Load1->getOperand(0))
297 return false;
298
299 // Match register offsets, if both register and immediate offsets present.
300 assert(NumOps == 4 || NumOps == 5);
301 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
302 return false;
303
304 const ConstantSDNode *Load0Offset =
305 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
306 const ConstantSDNode *Load1Offset =
307 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
308
309 if (!Load0Offset || !Load1Offset)
310 return false;
311
312 Offset0 = Load0Offset->getZExtValue();
313 Offset1 = Load1Offset->getZExtValue();
314 return true;
315 }
316
317 // MUBUF and MTBUF can access the same addresses.
318 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
319
320 // MUBUF and MTBUF have vaddr at different indices.
321 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
323 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
324 return false;
325
326 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
327 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
328
329 if (OffIdx0 == -1 || OffIdx1 == -1)
330 return false;
331
332 // getNamedOperandIdx returns the index for MachineInstrs. Since they
333 // include the output in the operand list, but SDNodes don't, we need to
334 // subtract the index by one.
335 OffIdx0 -= get(Opc0).NumDefs;
336 OffIdx1 -= get(Opc1).NumDefs;
337
338 SDValue Off0 = Load0->getOperand(OffIdx0);
339 SDValue Off1 = Load1->getOperand(OffIdx1);
340
341 // The offset might be a FrameIndexSDNode.
342 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
343 return false;
344
345 Offset0 = Off0->getAsZExtVal();
346 Offset1 = Off1->getAsZExtVal();
347 return true;
348 }
349
350 return false;
351}
352
353static bool isStride64(unsigned Opc) {
354 switch (Opc) {
355 case AMDGPU::DS_READ2ST64_B32:
356 case AMDGPU::DS_READ2ST64_B64:
357 case AMDGPU::DS_WRITE2ST64_B32:
358 case AMDGPU::DS_WRITE2ST64_B64:
359 return true;
360 default:
361 return false;
362 }
363}
364
364
365bool SIInstrInfo::getMemOperandsWithOffsetWidth(
366 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
367 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
368 const TargetRegisterInfo *TRI) const {
369 if (!LdSt.mayLoadOrStore())
370 return false;
371
372 unsigned Opc = LdSt.getOpcode();
373 OffsetIsScalable = false;
374 const MachineOperand *BaseOp, *OffsetOp;
375 int DataOpIdx;
376
377 if (isDS(LdSt)) {
378 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
379 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
380 if (OffsetOp) {
381 // Normal, single offset LDS instruction.
382 if (!BaseOp) {
383 // DS_CONSUME/DS_APPEND use M0 for the base address.
384 // TODO: find the implicit use operand for M0 and use that as BaseOp?
385 return false;
386 }
387 BaseOps.push_back(BaseOp);
388 Offset = OffsetOp->getImm();
389 // Get appropriate operand, and compute width accordingly.
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
391 if (DataOpIdx == -1)
392 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
393 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
394 Width = LocationSize::precise(64);
395 else
396 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
397 } else {
398 // The 2 offset instructions use offset0 and offset1 instead. We can treat
399 // these as a load with a single offset if the 2 offsets are consecutive.
400 // We will use this for some partially aligned loads.
401 const MachineOperand *Offset0Op =
402 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
403 const MachineOperand *Offset1Op =
404 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
405
406 unsigned Offset0 = Offset0Op->getImm() & 0xff;
407 unsigned Offset1 = Offset1Op->getImm() & 0xff;
408 if (Offset0 + 1 != Offset1)
409 return false;
410
411 // Each of these offsets is in element sized units, so we need to convert
412 // to bytes of the individual reads.
413
414 unsigned EltSize;
415 if (LdSt.mayLoad())
416 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
417 else {
418 assert(LdSt.mayStore());
419 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
421 }
422
423 if (isStride64(Opc))
424 EltSize *= 64;
425
426 BaseOps.push_back(BaseOp);
427 Offset = EltSize * Offset0;
428 // Get appropriate operand(s), and compute width accordingly.
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
430 if (DataOpIdx == -1) {
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
432 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
433 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
434 Width = LocationSize::precise(
435 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
436 } else {
437 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
438 }
439 }
440 return true;
441 }
442
443 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
444 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
445 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
446 return false;
447 BaseOps.push_back(RSrc);
448 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
449 if (BaseOp && !BaseOp->isFI())
450 BaseOps.push_back(BaseOp);
451 const MachineOperand *OffsetImm =
452 getNamedOperand(LdSt, AMDGPU::OpName::offset);
453 Offset = OffsetImm->getImm();
454 const MachineOperand *SOffset =
455 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
456 if (SOffset) {
457 if (SOffset->isReg())
458 BaseOps.push_back(SOffset);
459 else
460 Offset += SOffset->getImm();
461 }
462 // Get appropriate operand, and compute width accordingly.
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
464 if (DataOpIdx == -1)
465 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
466 if (DataOpIdx == -1) // LDS DMA
467 return false;
468 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
469 return true;
470 }
471
472 if (isImage(LdSt)) {
473 auto RsrcOpName =
474 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
475 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
476 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
477 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
478 if (VAddr0Idx >= 0) {
479 // GFX10 possible NSA encoding.
480 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
481 BaseOps.push_back(&LdSt.getOperand(I));
482 } else {
483 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
484 }
485 Offset = 0;
486 // Get appropriate operand, and compute width accordingly.
487 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
488 if (DataOpIdx == -1)
489 return false; // no return sampler
490 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
491 return true;
492 }
493
494 if (isSMRD(LdSt)) {
495 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
496 if (!BaseOp) // e.g. S_MEMTIME
497 return false;
498 BaseOps.push_back(BaseOp);
499 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
500 Offset = OffsetOp ? OffsetOp->getImm() : 0;
501 // Get appropriate operand, and compute width accordingly.
502 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
503 if (DataOpIdx == -1)
504 return false;
505 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
506 return true;
507 }
508
509 if (isFLAT(LdSt)) {
510 // Instructions have either vaddr or saddr or both or none.
511 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
512 if (BaseOp)
513 BaseOps.push_back(BaseOp);
514 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
515 if (BaseOp)
516 BaseOps.push_back(BaseOp);
517 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
518 // Get appropriate operand, and compute width accordingly.
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
520 if (DataOpIdx == -1)
521 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
522 if (DataOpIdx == -1) // LDS DMA
523 return false;
524 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
525 return true;
526 }
527
528 return false;
529}
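// Worked example for the two-offset DS path above (illustrative numbers): a
// ds_read2_b32 with offset0 = 4 and offset1 = 5 has consecutive element
// offsets, so with a 4-byte element size it is reported as one access with
// Offset = 16 bytes and a precise 8-byte Width (both halves of vdst).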
530
531static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
532 ArrayRef<const MachineOperand *> BaseOps1,
533 const MachineInstr &MI2,
534 ArrayRef<const MachineOperand *> BaseOps2) {
535 // Only examine the first "base" operand of each instruction, on the
536 // assumption that it represents the real base address of the memory access.
537 // Other operands are typically offsets or indices from this base address.
538 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
539 return true;
540
541 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
542 return false;
543
544 auto *MO1 = *MI1.memoperands_begin();
545 auto *MO2 = *MI2.memoperands_begin();
546 if (MO1->getAddrSpace() != MO2->getAddrSpace())
547 return false;
548
549 const auto *Base1 = MO1->getValue();
550 const auto *Base2 = MO2->getValue();
551 if (!Base1 || !Base2)
552 return false;
553 Base1 = getUnderlyingObject(Base1);
554 Base2 = getUnderlyingObject(Base2);
555
556 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
557 return false;
558
559 return Base1 == Base2;
560}
561
562bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
563 int64_t Offset1, bool OffsetIsScalable1,
564 ArrayRef<const MachineOperand *> BaseOps2,
565 int64_t Offset2, bool OffsetIsScalable2,
566 unsigned ClusterSize,
567 unsigned NumBytes) const {
568 // If the mem ops (to be clustered) do not have the same base ptr, then they
569 // should not be clustered
570 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
571 if (!BaseOps1.empty() && !BaseOps2.empty()) {
572 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
573 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
574 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
575 return false;
576
577 const SIMachineFunctionInfo *MFI =
578 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
579 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
580 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
581 // If only one base op is empty, they do not have the same base ptr
582 return false;
583 }
584
585 // In order to avoid register pressure, on average, the number of DWORDS
586 // loaded together by all clustered mem ops should not exceed
587 // MaxMemoryClusterDWords. This is an empirical value based on certain
588 // observations and performance related experiments.
589 // The good thing about this heuristic is - it avoids clustering of too many
590 // sub-word loads, and also avoids clustering of wide loads. Below is the
591 // brief summary of how the heuristic behaves for various `LoadSize` when
592 // MaxMemoryClusterDWords is 8.
593 //
594 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
595 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
596 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
597 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
598 // (5) LoadSize >= 17: do not cluster
599 const unsigned LoadSize = NumBytes / ClusterSize;
600 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
601 return NumDWords <= MaxMemoryClusterDWords;
602}
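// Worked example of the heuristic above, assuming the default limit of
// 8 DWORDs:
//   4 x dwordx2 loads: NumBytes = 32, ClusterSize = 4 -> LoadSize = 8,
//                      NumDWords = ((8 + 3) / 4) * 4 = 8   -> cluster.
//   4 x dwordx4 loads: NumBytes = 64, ClusterSize = 4 -> LoadSize = 16,
//                      NumDWords = ((16 + 3) / 4) * 4 = 16 -> do not cluster.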
603
604// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
605// the first 16 loads will be interleaved with the stores, and the next 16 will
606// be clustered as expected. It should really split them into 2 batches of 16 stores.
607//
608// Loads are clustered until this returns false, rather than trying to schedule
609// groups of stores. This also means we have to deal with saying different
610// address space loads should be clustered, and ones which might cause bank
611// conflicts.
612//
613// This might be deprecated so it might not be worth that much effort to fix.
614bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
615 int64_t Offset0, int64_t Offset1,
616 unsigned NumLoads) const {
617 assert(Offset1 > Offset0 &&
618 "Second offset should be larger than first offset!");
619 // If we have less than 16 loads in a row, and the offsets are within 64
620 // bytes, then schedule together.
621
622 // A cacheline is 64 bytes (for global memory).
623 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
624}
625
625
626static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
627 MachineBasicBlock::iterator MI,
628 const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 const char *Msg = "illegal VGPR to SGPR copy") {
631 MachineFunction *MF = MBB.getParent();
632
633 LLVMContext &C = MF->getFunction().getContext();
634 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
635
636 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
637 .addReg(SrcReg, getKillRegState(KillSrc));
638}
639
640/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
641/// possible to have a direct copy in these cases on GFX908, so an intermediate
642/// VGPR copy is required.
643static void indirectCopyToAGPR(const SIInstrInfo &TII,
644 MachineBasicBlock &MBB,
645 MachineBasicBlock::iterator MI,
646 const DebugLoc &DL, MCRegister DestReg,
647 MCRegister SrcReg, bool KillSrc,
648 RegScavenger &RS, bool RegsOverlap,
649 Register ImpDefSuperReg = Register(),
650 Register ImpUseSuperReg = Register()) {
651 assert((TII.getSubtarget().hasMAIInsts() &&
652 !TII.getSubtarget().hasGFX90AInsts()) &&
653 "Expected GFX908 subtarget.");
654
655 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
656 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
657 "Source register of the copy should be either an SGPR or an AGPR.");
658
659 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
660 "Destination register of the copy should be an AGPR.");
661
662 const SIRegisterInfo &RI = TII.getRegisterInfo();
663
664 // First try to find defining accvgpr_write to avoid temporary registers.
665 // In the case of copies of overlapping AGPRs, we conservatively do not
666 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
667 // an accvgpr_write used for this same copy due to implicit-defs
668 if (!RegsOverlap) {
669 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
670 --Def;
671
672 if (!Def->modifiesRegister(SrcReg, &RI))
673 continue;
674
675 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
676 Def->getOperand(0).getReg() != SrcReg)
677 break;
678
679 MachineOperand &DefOp = Def->getOperand(1);
680 assert(DefOp.isReg() || DefOp.isImm());
681
682 if (DefOp.isReg()) {
683 bool SafeToPropagate = true;
684 // Check that register source operand is not clobbered before MI.
685 // Immediate operands are always safe to propagate.
686 for (auto I = Def; I != MI && SafeToPropagate; ++I)
687 if (I->modifiesRegister(DefOp.getReg(), &RI))
688 SafeToPropagate = false;
689
690 if (!SafeToPropagate)
691 break;
692
693 for (auto I = Def; I != MI; ++I)
694 I->clearRegisterKills(DefOp.getReg(), &RI);
695 }
696
697 MachineInstrBuilder Builder =
698 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
699 .add(DefOp);
700 if (ImpDefSuperReg)
701 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
702
703 if (ImpUseSuperReg) {
704 Builder.addReg(ImpUseSuperReg,
705 getKillRegState(KillSrc) | RegState::Implicit);
706 }
707
708 return;
709 }
710 }
711
712 RS.enterBasicBlockEnd(MBB);
713 RS.backward(std::next(MI));
714
715 // Ideally we want to have three registers for a long reg_sequence copy
716 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
717 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
718 *MBB.getParent());
719
720 // Registers in the sequence are allocated contiguously so we can just
721 // use register number to pick one of three round-robin temps.
722 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
723 Register Tmp =
724 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
725 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
726 "VGPR used for an intermediate copy should have been reserved.");
727
728 // Only loop through if there are any free registers left. We don't want to
729 // spill.
730 while (RegNo--) {
731 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
732 /* RestoreAfter */ false, 0,
733 /* AllowSpill */ false);
734 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
735 break;
736 Tmp = Tmp2;
737 RS.setRegUsed(Tmp);
738 }
739
740 // Insert copy to temporary VGPR.
741 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
742 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
743 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
744 } else {
745 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
746 }
747
748 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
749 .addReg(SrcReg, getKillRegState(KillSrc));
750 if (ImpUseSuperReg) {
751 UseBuilder.addReg(ImpUseSuperReg,
752 getKillRegState(KillSrc) | RegState::Implicit);
753 }
754
755 MachineInstrBuilder DefBuilder
756 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
757 .addReg(Tmp, RegState::Kill);
758
759 if (ImpDefSuperReg)
760 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
761}
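// Illustrative note (not upstream): the round-robin temporary selection above
// means a long reg_sequence copy such as agpr0_agpr1_agpr2 computes RegNo 0, 1
// and 2 for its three sub-copies, so up to three distinct temporary VGPRs can
// be scavenged, which is enough to hide the two wait states between
// v_mov_b32 and v_accvgpr_write.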
762
763static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
764 MachineBasicBlock::iterator I, const DebugLoc &DL,
765 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
766 const TargetRegisterClass *RC, bool Forward) {
767 const SIRegisterInfo &RI = TII.getRegisterInfo();
768 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
770 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
771
772 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
773 int16_t SubIdx = BaseIndices[Idx];
774 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
775 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
776 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
777 unsigned Opcode = AMDGPU::S_MOV_B32;
778
779 // Is SGPR aligned? If so try to combine with next.
780 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
781 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
782 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
783 // Can use SGPR64 copy
784 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
785 SubIdx = RI.getSubRegFromChannel(Channel, 2);
786 DestSubReg = RI.getSubReg(DestReg, SubIdx);
787 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
788 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
789 Opcode = AMDGPU::S_MOV_B64;
790 Idx++;
791 }
792
793 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
794 .addReg(SrcSubReg)
795 .addReg(SrcReg, RegState::Implicit);
796
797 if (!FirstMI)
798 FirstMI = LastMI;
799
800 if (!Forward)
801 I--;
802 }
803
804 assert(FirstMI && LastMI);
805 if (!Forward)
806 std::swap(FirstMI, LastMI);
807
808 FirstMI->addOperand(
809 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
810
811 if (KillSrc)
812 LastMI->addRegisterKilled(SrcReg, &RI);
813}
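// Illustrative example (not upstream): copying sgpr4_sgpr5_sgpr6_sgpr7 to
// sgpr8_sgpr9_sgpr10_sgpr11 passes the even-alignment check for each pair of
// lanes, so the loop above emits two s_mov_b64 instructions rather than four
// s_mov_b32 instructions.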
814
817 const DebugLoc &DL, Register DestReg,
818 Register SrcReg, bool KillSrc, bool RenamableDest,
819 bool RenamableSrc) const {
820 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
821 unsigned Size = RI.getRegSizeInBits(*RC);
822 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
823 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
824
825 // The rest of copyPhysReg assumes Src and Dst size are the same size.
826 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
827 // we remove Fix16BitCopies and this code block?
828 if (Fix16BitCopies) {
829 if (((Size == 16) != (SrcSize == 16))) {
830 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
831 assert(ST.useRealTrue16Insts());
832 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
833 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
834 RegToFix = SubReg;
835
836 if (DestReg == SrcReg) {
837 // Identity copy. Insert empty bundle since ExpandPostRA expects an
838 // instruction here.
839 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
840 return;
841 }
842 RC = RI.getPhysRegBaseClass(DestReg);
843 Size = RI.getRegSizeInBits(*RC);
844 SrcRC = RI.getPhysRegBaseClass(SrcReg);
845 SrcSize = RI.getRegSizeInBits(*SrcRC);
846 }
847 }
848
849 if (RC == &AMDGPU::VGPR_32RegClass) {
850 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
851 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
852 AMDGPU::AGPR_32RegClass.contains(SrcReg));
853 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
854 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
855 BuildMI(MBB, MI, DL, get(Opc), DestReg)
856 .addReg(SrcReg, getKillRegState(KillSrc));
857 return;
858 }
859
860 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
861 RC == &AMDGPU::SReg_32RegClass) {
862 if (SrcReg == AMDGPU::SCC) {
863 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
864 .addImm(1)
865 .addImm(0);
866 return;
867 }
868
869 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
870 if (DestReg == AMDGPU::VCC_LO) {
871 // FIXME: Hack until VReg_1 removed.
872 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
873 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
874 .addImm(0)
875 .addReg(SrcReg, getKillRegState(KillSrc));
876 return;
877 }
878
879 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
880 return;
881 }
882
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 if (RC == &AMDGPU::SReg_64RegClass) {
889 if (SrcReg == AMDGPU::SCC) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
891 .addImm(1)
892 .addImm(0);
893 return;
894 }
895
896 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
897 if (DestReg == AMDGPU::VCC) {
898 // FIXME: Hack until VReg_1 removed.
899 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
900 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
901 .addImm(0)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
907 return;
908 }
909
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
911 .addReg(SrcReg, getKillRegState(KillSrc));
912 return;
913 }
914
915 if (DestReg == AMDGPU::SCC) {
916 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
917 // but SelectionDAG emits such copies for i1 sources.
918 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
919 // This copy can only be produced by patterns
920 // with explicit SCC, which are known to be enabled
921 // only for subtargets with S_CMP_LG_U64 present.
922 assert(ST.hasScalarCompareEq64());
923 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
924 .addReg(SrcReg, getKillRegState(KillSrc))
925 .addImm(0);
926 } else {
927 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
928 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
929 .addReg(SrcReg, getKillRegState(KillSrc))
930 .addImm(0);
931 }
932
933 return;
934 }
935
936 if (RC == &AMDGPU::AGPR_32RegClass) {
937 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
938 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
939 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
940 .addReg(SrcReg, getKillRegState(KillSrc));
941 return;
942 }
943
944 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
945 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
946 .addReg(SrcReg, getKillRegState(KillSrc));
947 return;
948 }
949
950 // FIXME: Pass should maintain scavenger to avoid scan through the block on
951 // every AGPR spill.
952 RegScavenger RS;
953 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
954 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
955 return;
956 }
957
958 if (Size == 16) {
959 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
960 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
961 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
962
963 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
964 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
965 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
966 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
967 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
968 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
969 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
970 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
971
972 if (IsSGPRDst) {
973 if (!IsSGPRSrc) {
974 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
975 return;
976 }
977
978 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
979 .addReg(NewSrcReg, getKillRegState(KillSrc));
980 return;
981 }
982
983 if (IsAGPRDst || IsAGPRSrc) {
984 if (!DstLow || !SrcLow) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
986 "Cannot use hi16 subreg with an AGPR!");
987 }
988
989 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
990 return;
991 }
992
993 if (ST.useRealTrue16Insts()) {
994 if (IsSGPRSrc) {
995 assert(SrcLow);
996 SrcReg = NewSrcReg;
997 }
998 // Use the smaller instruction encoding if possible.
999 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1000 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1001 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1002 .addReg(SrcReg);
1003 } else {
1004 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1005 .addImm(0) // src0_modifiers
1006 .addReg(SrcReg)
1007 .addImm(0); // op_sel
1008 }
1009 return;
1010 }
1011
1012 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1013 if (!DstLow || !SrcLow) {
1014 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1015 "Cannot use hi16 subreg on VI!");
1016 }
1017
1018 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1019 .addReg(NewSrcReg, getKillRegState(KillSrc));
1020 return;
1021 }
1022
1023 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1024 .addImm(0) // src0_modifiers
1025 .addReg(NewSrcReg)
1026 .addImm(0) // clamp
1033 // First implicit operand is $exec.
1034 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1035 return;
1036 }
1037
1038 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1039 if (ST.hasMovB64()) {
1040 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1041 .addReg(SrcReg, getKillRegState(KillSrc));
1042 return;
1043 }
1044 if (ST.hasPkMovB32()) {
1045 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1046 .addImm(SISrcMods::OP_SEL_1)
1047 .addReg(SrcReg)
1048 .addImm(SISrcMods::OP_SEL_1)
1049 .addReg(SrcReg)
1050 .addImm(0) // op_sel_lo
1051 .addImm(0) // op_sel_hi
1052 .addImm(0) // neg_lo
1053 .addImm(0) // neg_hi
1054 .addImm(0) // clamp
1055 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1056 return;
1057 }
1058 }
1059
1060 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1061 if (RI.isSGPRClass(RC)) {
1062 if (!RI.isSGPRClass(SrcRC)) {
1063 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1064 return;
1065 }
1066 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1067 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1068 Forward);
1069 return;
1070 }
1071
1072 unsigned EltSize = 4;
1073 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1074 if (RI.isAGPRClass(RC)) {
1075 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1076 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1077 else if (RI.hasVGPRs(SrcRC) ||
1078 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1079 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1080 else
1081 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1082 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1083 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1084 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1085 (RI.isProperlyAlignedRC(*RC) &&
1086 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1087 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1088 if (ST.hasMovB64()) {
1089 Opcode = AMDGPU::V_MOV_B64_e32;
1090 EltSize = 8;
1091 } else if (ST.hasPkMovB32()) {
1092 Opcode = AMDGPU::V_PK_MOV_B32;
1093 EltSize = 8;
1094 }
1095 }
1096
1097 // For the cases where we need an intermediate instruction/temporary register
1098 // (destination is an AGPR), we need a scavenger.
1099 //
1100 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1101 // whole block for every handled copy.
1102 std::unique_ptr<RegScavenger> RS;
1103 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1104 RS = std::make_unique<RegScavenger>();
1105
1106 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1107
1108 // If there is an overlap, we can't kill the super-register on the last
1109 // instruction, since it will also kill the components made live by this def.
1110 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1111 const bool CanKillSuperReg = KillSrc && !Overlap;
1112
1113 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1114 unsigned SubIdx;
1115 if (Forward)
1116 SubIdx = SubIndices[Idx];
1117 else
1118 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1119 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1120 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1121 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1122
1123 bool IsFirstSubreg = Idx == 0;
1124 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1125
1126 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1127 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1128 Register ImpUseSuper = SrcReg;
1129 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1130 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1131 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1132 MachineInstrBuilder MIB =
1133 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1134 .addImm(SISrcMods::OP_SEL_1)
1135 .addReg(SrcSubReg)
1136 .addImm(SISrcMods::OP_SEL_1)
1137 .addReg(SrcSubReg)
1138 .addImm(0) // op_sel_lo
1139 .addImm(0) // op_sel_hi
1140 .addImm(0) // neg_lo
1141 .addImm(0) // neg_hi
1142 .addImm(0) // clamp
1143 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144 if (IsFirstSubreg)
1145 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1146 } else {
1147 MachineInstrBuilder Builder =
1148 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1149 if (IsFirstSubreg)
1150 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1151
1152 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1153 }
1154 }
1155}
1156
1157int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1158 int NewOpc;
1159
1160 // Try to map original to commuted opcode
1161 NewOpc = AMDGPU::getCommuteRev(Opcode);
1162 if (NewOpc != -1)
1163 // Check if the commuted (REV) opcode exists on the target.
1164 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1165
1166 // Try to map commuted to original opcode
1167 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1168 if (NewOpc != -1)
1169 // Check if the original (non-REV) opcode exists on the target.
1170 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1171
1172 return Opcode;
1173}
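// Illustrative example (not upstream): for a REV pair such as V_SUB_F32 and
// V_SUBREV_F32, commuteOpcode maps each opcode to the other, provided the
// mapped opcode is actually available on the subtarget (pseudoToMCOpcode()
// != -1); opcodes with no commuted form are returned unchanged.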
1174
1175const TargetRegisterClass *
1176SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1177 return &AMDGPU::VGPR_32RegClass;
1178}
1179
1180void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1181 MachineBasicBlock::iterator I,
1182 const DebugLoc &DL, Register DstReg,
1183 ArrayRef<MachineOperand> Cond,
1184 Register TrueReg,
1185 Register FalseReg) const {
1186 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1187 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1189 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1190 "Not a VGPR32 reg");
1191
1192 if (Cond.size() == 1) {
1193 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1194 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1195 .add(Cond[0]);
1196 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1197 .addImm(0)
1198 .addReg(FalseReg)
1199 .addImm(0)
1200 .addReg(TrueReg)
1201 .addReg(SReg);
1202 } else if (Cond.size() == 2) {
1203 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1204 switch (Cond[0].getImm()) {
1205 case SIInstrInfo::SCC_TRUE: {
1206 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1207 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1208 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1209 .addImm(0)
1210 .addReg(FalseReg)
1211 .addImm(0)
1212 .addReg(TrueReg)
1213 .addReg(SReg);
1214 break;
1215 }
1216 case SIInstrInfo::SCC_FALSE: {
1217 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1218 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1219 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1220 .addImm(0)
1221 .addReg(FalseReg)
1222 .addImm(0)
1223 .addReg(TrueReg)
1224 .addReg(SReg);
1225 break;
1226 }
1227 case SIInstrInfo::VCCNZ: {
1228 MachineOperand RegOp = Cond[1];
1229 RegOp.setImplicit(false);
1230 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1231 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1232 .add(RegOp);
1233 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1234 .addImm(0)
1235 .addReg(FalseReg)
1236 .addImm(0)
1237 .addReg(TrueReg)
1238 .addReg(SReg);
1239 break;
1240 }
1241 case SIInstrInfo::VCCZ: {
1242 MachineOperand RegOp = Cond[1];
1243 RegOp.setImplicit(false);
1244 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1245 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1246 .add(RegOp);
1247 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1248 .addImm(0)
1249 .addReg(TrueReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addReg(SReg);
1253 break;
1254 }
1255 case SIInstrInfo::EXECNZ: {
1256 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1257 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1258 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1259 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1260 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addReg(SReg);
1266 break;
1267 }
1268 case SIInstrInfo::EXECZ: {
1269 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1270 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1271 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1272 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1273 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1274 .addImm(0)
1275 .addReg(FalseReg)
1276 .addImm(0)
1277 .addReg(TrueReg)
1278 .addReg(SReg);
1279 llvm_unreachable("Unhandled branch predicate EXECZ");
1280 break;
1281 }
1282 default:
1283 llvm_unreachable("invalid branch predicate");
1284 }
1285 } else {
1286 llvm_unreachable("Can only handle Cond size 1 or 2");
1287 }
1288}
1289
1292 const DebugLoc &DL,
1293 Register SrcReg, int Value) const {
1294 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1295 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1296 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1297 .addImm(Value)
1298 .addReg(SrcReg);
1299
1300 return Reg;
1301}
1302
1305 const DebugLoc &DL,
1306 Register SrcReg, int Value) const {
1307 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1308 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1309 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1310 .addImm(Value)
1311 .addReg(SrcReg);
1312
1313 return Reg;
1314}
1315
1316bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1317 const Register Reg,
1318 int64_t &ImmVal) const {
1319 switch (MI.getOpcode()) {
1320 case AMDGPU::V_MOV_B32_e32:
1321 case AMDGPU::S_MOV_B32:
1322 case AMDGPU::S_MOVK_I32:
1323 case AMDGPU::S_MOV_B64:
1324 case AMDGPU::V_MOV_B64_e32:
1325 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1326 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1327 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1328 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1329 case AMDGPU::V_MOV_B64_PSEUDO: {
1330 const MachineOperand &Src0 = MI.getOperand(1);
1331 if (Src0.isImm()) {
1332 ImmVal = Src0.getImm();
1333 return MI.getOperand(0).getReg() == Reg;
1334 }
1335
1336 return false;
1337 }
1338 case AMDGPU::S_BREV_B32:
1339 case AMDGPU::V_BFREV_B32_e32:
1340 case AMDGPU::V_BFREV_B32_e64: {
1341 const MachineOperand &Src0 = MI.getOperand(1);
1342 if (Src0.isImm()) {
1343 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1344 return MI.getOperand(0).getReg() == Reg;
1345 }
1346
1347 return false;
1348 }
1349 case AMDGPU::S_NOT_B32:
1350 case AMDGPU::V_NOT_B32_e32:
1351 case AMDGPU::V_NOT_B32_e64: {
1352 const MachineOperand &Src0 = MI.getOperand(1);
1353 if (Src0.isImm()) {
1354 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1355 return MI.getOperand(0).getReg() == Reg;
1356 }
1357
1358 return false;
1359 }
1360 default:
1361 return false;
1362 }
1363}
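// Illustrative example (not upstream): for
//   $sgpr0 = S_BREV_B32 1
// this helper reports that $sgpr0 holds the 32-bit value 0x80000000 (the bit
// reversal of 1), sign-extended into the returned int64_t.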
1364
1365unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1366
1367 if (RI.isAGPRClass(DstRC))
1368 return AMDGPU::COPY;
1369 if (RI.getRegSizeInBits(*DstRC) == 16) {
1370 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1371 // before RA.
1372 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1373 }
1374 if (RI.getRegSizeInBits(*DstRC) == 32)
1375 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1376 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1377 return AMDGPU::S_MOV_B64;
1378 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1379 return AMDGPU::V_MOV_B64_PSEUDO;
1380 return AMDGPU::COPY;
1381}
1382
1383const MCInstrDesc &
1384SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1385 bool IsIndirectSrc) const {
1386 if (IsIndirectSrc) {
1387 if (VecSize <= 32) // 4 bytes
1388 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1389 if (VecSize <= 64) // 8 bytes
1390 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1391 if (VecSize <= 96) // 12 bytes
1392 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1393 if (VecSize <= 128) // 16 bytes
1394 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1395 if (VecSize <= 160) // 20 bytes
1396 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1397 if (VecSize <= 256) // 32 bytes
1398 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1399 if (VecSize <= 288) // 36 bytes
1400 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1401 if (VecSize <= 320) // 40 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1403 if (VecSize <= 352) // 44 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1405 if (VecSize <= 384) // 48 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1407 if (VecSize <= 512) // 64 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1409 if (VecSize <= 1024) // 128 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1411
1412 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1413 }
1414
1415 if (VecSize <= 32) // 4 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1417 if (VecSize <= 64) // 8 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1419 if (VecSize <= 96) // 12 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1421 if (VecSize <= 128) // 16 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1423 if (VecSize <= 160) // 20 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1425 if (VecSize <= 256) // 32 bytes
1426 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1427 if (VecSize <= 288) // 36 bytes
1428 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1429 if (VecSize <= 320) // 40 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1431 if (VecSize <= 352) // 44 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1433 if (VecSize <= 384) // 48 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1435 if (VecSize <= 512) // 64 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1437 if (VecSize <= 1024) // 128 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1439
1440 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1441}
1442
1443static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1444 if (VecSize <= 32) // 4 bytes
1445 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1446 if (VecSize <= 64) // 8 bytes
1447 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1448 if (VecSize <= 96) // 12 bytes
1449 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1450 if (VecSize <= 128) // 16 bytes
1451 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1452 if (VecSize <= 160) // 20 bytes
1453 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1454 if (VecSize <= 256) // 32 bytes
1455 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1456 if (VecSize <= 288) // 36 bytes
1457 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1458 if (VecSize <= 320) // 40 bytes
1459 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1460 if (VecSize <= 352) // 44 bytes
1461 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1462 if (VecSize <= 384) // 48 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1464 if (VecSize <= 512) // 64 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1466 if (VecSize <= 1024) // 128 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1468
1469 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1470}
1471
1472static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1473 if (VecSize <= 32) // 4 bytes
1474 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1475 if (VecSize <= 64) // 8 bytes
1476 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1477 if (VecSize <= 96) // 12 bytes
1478 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1479 if (VecSize <= 128) // 16 bytes
1480 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1481 if (VecSize <= 160) // 20 bytes
1482 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1483 if (VecSize <= 256) // 32 bytes
1484 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1485 if (VecSize <= 288) // 36 bytes
1486 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1487 if (VecSize <= 320) // 40 bytes
1488 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1489 if (VecSize <= 352) // 44 bytes
1490 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1491 if (VecSize <= 384) // 48 bytes
1492 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1493 if (VecSize <= 512) // 64 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1495 if (VecSize <= 1024) // 128 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1497
1498 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1499}
1500
1501static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1502 if (VecSize <= 64) // 8 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1504 if (VecSize <= 128) // 16 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1506 if (VecSize <= 256) // 32 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1508 if (VecSize <= 512) // 64 bytes
1509 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1510 if (VecSize <= 1024) // 128 bytes
1511 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1512
1513 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1514}
1515
1516const MCInstrDesc &
1517SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1518 bool IsSGPR) const {
1519 if (IsSGPR) {
1520 switch (EltSize) {
1521 case 32:
1522 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1523 case 64:
1524 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1525 default:
1526 llvm_unreachable("invalid reg indexing elt size");
1527 }
1528 }
1529
1530 assert(EltSize == 32 && "invalid reg indexing elt size");
1531 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1532}
1533
1534static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1535 switch (Size) {
1536 case 4:
1537 return AMDGPU::SI_SPILL_S32_SAVE;
1538 case 8:
1539 return AMDGPU::SI_SPILL_S64_SAVE;
1540 case 12:
1541 return AMDGPU::SI_SPILL_S96_SAVE;
1542 case 16:
1543 return AMDGPU::SI_SPILL_S128_SAVE;
1544 case 20:
1545 return AMDGPU::SI_SPILL_S160_SAVE;
1546 case 24:
1547 return AMDGPU::SI_SPILL_S192_SAVE;
1548 case 28:
1549 return AMDGPU::SI_SPILL_S224_SAVE;
1550 case 32:
1551 return AMDGPU::SI_SPILL_S256_SAVE;
1552 case 36:
1553 return AMDGPU::SI_SPILL_S288_SAVE;
1554 case 40:
1555 return AMDGPU::SI_SPILL_S320_SAVE;
1556 case 44:
1557 return AMDGPU::SI_SPILL_S352_SAVE;
1558 case 48:
1559 return AMDGPU::SI_SPILL_S384_SAVE;
1560 case 64:
1561 return AMDGPU::SI_SPILL_S512_SAVE;
1562 case 128:
1563 return AMDGPU::SI_SPILL_S1024_SAVE;
1564 default:
1565 llvm_unreachable("unknown register size");
1566 }
1567}
1568
1569static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1570 switch (Size) {
1571 case 2:
1572 return AMDGPU::SI_SPILL_V16_SAVE;
1573 case 4:
1574 return AMDGPU::SI_SPILL_V32_SAVE;
1575 case 8:
1576 return AMDGPU::SI_SPILL_V64_SAVE;
1577 case 12:
1578 return AMDGPU::SI_SPILL_V96_SAVE;
1579 case 16:
1580 return AMDGPU::SI_SPILL_V128_SAVE;
1581 case 20:
1582 return AMDGPU::SI_SPILL_V160_SAVE;
1583 case 24:
1584 return AMDGPU::SI_SPILL_V192_SAVE;
1585 case 28:
1586 return AMDGPU::SI_SPILL_V224_SAVE;
1587 case 32:
1588 return AMDGPU::SI_SPILL_V256_SAVE;
1589 case 36:
1590 return AMDGPU::SI_SPILL_V288_SAVE;
1591 case 40:
1592 return AMDGPU::SI_SPILL_V320_SAVE;
1593 case 44:
1594 return AMDGPU::SI_SPILL_V352_SAVE;
1595 case 48:
1596 return AMDGPU::SI_SPILL_V384_SAVE;
1597 case 64:
1598 return AMDGPU::SI_SPILL_V512_SAVE;
1599 case 128:
1600 return AMDGPU::SI_SPILL_V1024_SAVE;
1601 default:
1602 llvm_unreachable("unknown register size");
1603 }
1604}
1605
1606static unsigned getAVSpillSaveOpcode(unsigned Size) {
1607 switch (Size) {
1608 case 4:
1609 return AMDGPU::SI_SPILL_AV32_SAVE;
1610 case 8:
1611 return AMDGPU::SI_SPILL_AV64_SAVE;
1612 case 12:
1613 return AMDGPU::SI_SPILL_AV96_SAVE;
1614 case 16:
1615 return AMDGPU::SI_SPILL_AV128_SAVE;
1616 case 20:
1617 return AMDGPU::SI_SPILL_AV160_SAVE;
1618 case 24:
1619 return AMDGPU::SI_SPILL_AV192_SAVE;
1620 case 28:
1621 return AMDGPU::SI_SPILL_AV224_SAVE;
1622 case 32:
1623 return AMDGPU::SI_SPILL_AV256_SAVE;
1624 case 36:
1625 return AMDGPU::SI_SPILL_AV288_SAVE;
1626 case 40:
1627 return AMDGPU::SI_SPILL_AV320_SAVE;
1628 case 44:
1629 return AMDGPU::SI_SPILL_AV352_SAVE;
1630 case 48:
1631 return AMDGPU::SI_SPILL_AV384_SAVE;
1632 case 64:
1633 return AMDGPU::SI_SPILL_AV512_SAVE;
1634 case 128:
1635 return AMDGPU::SI_SPILL_AV1024_SAVE;
1636 default:
1637 llvm_unreachable("unknown register size");
1638 }
1639}
1640
1641static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1642 bool IsVectorSuperClass) {
1643 // Currently, only 32-bit WWM register spills are needed.
1644 if (Size != 4)
1645 llvm_unreachable("unknown wwm register spill size");
1646
1647 if (IsVectorSuperClass)
1648 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1649
1650 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1651}
1652
1653unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1654 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1655 const SIMachineFunctionInfo &MFI) const {
1656 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1657
1658 // Choose the right opcode if spilling a WWM register.
1659 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1660 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1661
1662 // TODO: Check if AGPRs are available
1663 if (ST.hasMAIInsts())
1664 return getAVSpillSaveOpcode(Size);
1665
1665
1666 return getVGPRSpillSaveOpcode(Size);
1667}
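// Illustrative example (not upstream): spilling a 64-bit vector register on a
// subtarget with MAI instructions (and no WWM flag on the register) selects
// SI_SPILL_AV64_SAVE, while the same spill without MAI support falls back to
// SI_SPILL_V64_SAVE.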
1668
1669void SIInstrInfo::storeRegToStackSlot(
1670 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1671 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1672 MachineInstr::MIFlag Flags) const {
1673 MachineFunction *MF = MBB.getParent();
1674 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1675 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1676 const DebugLoc &DL = MBB.findDebugLoc(MI);
1677
1678 MachinePointerInfo PtrInfo
1679 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1680 MachineMemOperand *MMO = MF->getMachineMemOperand(
1681 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1682 FrameInfo.getObjectAlign(FrameIndex));
1683 unsigned SpillSize = RI.getSpillSize(*RC);
1684
1685 MachineRegisterInfo &MRI = MF->getRegInfo();
1686 if (RI.isSGPRClass(RC)) {
1687 MFI->setHasSpilledSGPRs();
1688 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1689 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1690 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1691
1692 // We are only allowed to create one new instruction when spilling
1693 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1694 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1695
1696 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1697 // need to make sure we are using the correct register class.
1698 if (SrcReg.isVirtual() && SpillSize == 4) {
1699 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1700 }
1701
1702 BuildMI(MBB, MI, DL, OpDesc)
1703 .addReg(SrcReg, getKillRegState(isKill)) // data
1704 .addFrameIndex(FrameIndex) // addr
1705 .addMemOperand(MMO)
1706 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1707
1708 if (RI.spillSGPRToVGPR())
1709 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1710 return;
1711 }
1712
1713 unsigned Opcode =
1714 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1715 MFI->setHasSpilledVGPRs();
1716
1717 BuildMI(MBB, MI, DL, get(Opcode))
1718 .addReg(SrcReg, getKillRegState(isKill)) // data
1719 .addFrameIndex(FrameIndex) // addr
1720 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1721 .addImm(0) // offset
1722 .addMemOperand(MMO);
1723}
1724
1725static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1726 switch (Size) {
1727 case 4:
1728 return AMDGPU::SI_SPILL_S32_RESTORE;
1729 case 8:
1730 return AMDGPU::SI_SPILL_S64_RESTORE;
1731 case 12:
1732 return AMDGPU::SI_SPILL_S96_RESTORE;
1733 case 16:
1734 return AMDGPU::SI_SPILL_S128_RESTORE;
1735 case 20:
1736 return AMDGPU::SI_SPILL_S160_RESTORE;
1737 case 24:
1738 return AMDGPU::SI_SPILL_S192_RESTORE;
1739 case 28:
1740 return AMDGPU::SI_SPILL_S224_RESTORE;
1741 case 32:
1742 return AMDGPU::SI_SPILL_S256_RESTORE;
1743 case 36:
1744 return AMDGPU::SI_SPILL_S288_RESTORE;
1745 case 40:
1746 return AMDGPU::SI_SPILL_S320_RESTORE;
1747 case 44:
1748 return AMDGPU::SI_SPILL_S352_RESTORE;
1749 case 48:
1750 return AMDGPU::SI_SPILL_S384_RESTORE;
1751 case 64:
1752 return AMDGPU::SI_SPILL_S512_RESTORE;
1753 case 128:
1754 return AMDGPU::SI_SPILL_S1024_RESTORE;
1755 default:
1756 llvm_unreachable("unknown register size");
1757 }
1758}
1759
1760static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1761 switch (Size) {
1762 case 2:
1763 return AMDGPU::SI_SPILL_V16_RESTORE;
1764 case 4:
1765 return AMDGPU::SI_SPILL_V32_RESTORE;
1766 case 8:
1767 return AMDGPU::SI_SPILL_V64_RESTORE;
1768 case 12:
1769 return AMDGPU::SI_SPILL_V96_RESTORE;
1770 case 16:
1771 return AMDGPU::SI_SPILL_V128_RESTORE;
1772 case 20:
1773 return AMDGPU::SI_SPILL_V160_RESTORE;
1774 case 24:
1775 return AMDGPU::SI_SPILL_V192_RESTORE;
1776 case 28:
1777 return AMDGPU::SI_SPILL_V224_RESTORE;
1778 case 32:
1779 return AMDGPU::SI_SPILL_V256_RESTORE;
1780 case 36:
1781 return AMDGPU::SI_SPILL_V288_RESTORE;
1782 case 40:
1783 return AMDGPU::SI_SPILL_V320_RESTORE;
1784 case 44:
1785 return AMDGPU::SI_SPILL_V352_RESTORE;
1786 case 48:
1787 return AMDGPU::SI_SPILL_V384_RESTORE;
1788 case 64:
1789 return AMDGPU::SI_SPILL_V512_RESTORE;
1790 case 128:
1791 return AMDGPU::SI_SPILL_V1024_RESTORE;
1792 default:
1793 llvm_unreachable("unknown register size");
1794 }
1795}
1796
1797static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1798 switch (Size) {
1799 case 4:
1800 return AMDGPU::SI_SPILL_AV32_RESTORE;
1801 case 8:
1802 return AMDGPU::SI_SPILL_AV64_RESTORE;
1803 case 12:
1804 return AMDGPU::SI_SPILL_AV96_RESTORE;
1805 case 16:
1806 return AMDGPU::SI_SPILL_AV128_RESTORE;
1807 case 20:
1808 return AMDGPU::SI_SPILL_AV160_RESTORE;
1809 case 24:
1810 return AMDGPU::SI_SPILL_AV192_RESTORE;
1811 case 28:
1812 return AMDGPU::SI_SPILL_AV224_RESTORE;
1813 case 32:
1814 return AMDGPU::SI_SPILL_AV256_RESTORE;
1815 case 36:
1816 return AMDGPU::SI_SPILL_AV288_RESTORE;
1817 case 40:
1818 return AMDGPU::SI_SPILL_AV320_RESTORE;
1819 case 44:
1820 return AMDGPU::SI_SPILL_AV352_RESTORE;
1821 case 48:
1822 return AMDGPU::SI_SPILL_AV384_RESTORE;
1823 case 64:
1824 return AMDGPU::SI_SPILL_AV512_RESTORE;
1825 case 128:
1826 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1827 default:
1828 llvm_unreachable("unknown register size");
1829 }
1830}
1831
1832static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1833 bool IsVectorSuperClass) {
1834 // Currently, only 32-bit WWM register spills are needed.
1835 if (Size != 4)
1836 llvm_unreachable("unknown wwm register spill size");
1837
1838 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1839 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1840
1841 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1842}
1843
1845 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1846 const SIMachineFunctionInfo &MFI) const {
1847 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1848
1849 // Choose the right opcode if restoring a WWM register.
1850 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1851 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1852
1853 // TODO: Check if AGPRs are available
1854 if (ST.hasMAIInsts())
1855 return getAVSpillRestoreOpcode(Size);
1856
1857 assert(!RI.isAGPRClass(RC));
1858 return getVGPRSpillRestoreOpcode(Size);
1859}
1860
1863 Register DestReg, int FrameIndex,
1864 const TargetRegisterClass *RC,
1865 Register VReg,
1866 MachineInstr::MIFlag Flags) const {
1867 MachineFunction *MF = MBB.getParent();
1868 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1869 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1870 const DebugLoc &DL = MBB.findDebugLoc(MI);
1871 unsigned SpillSize = RI.getSpillSize(*RC);
1872
1873 MachinePointerInfo PtrInfo
1874 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1875
1876 MachineMemOperand *MMO = MF->getMachineMemOperand(
1877 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1878 FrameInfo.getObjectAlign(FrameIndex));
1879
1880 if (RI.isSGPRClass(RC)) {
1881 MFI->setHasSpilledSGPRs();
1882 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1883 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1884 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1885
1886 // FIXME: Maybe this should not include a memoperand because it will be
1887 // lowered to non-memory instructions.
1888 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1889 if (DestReg.isVirtual() && SpillSize == 4) {
1890 MachineRegisterInfo &MRI = MF->getRegInfo();
1891 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1892 }
1893
1894 if (RI.spillSGPRToVGPR())
1895 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1896 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1897 .addFrameIndex(FrameIndex) // addr
1898 .addMemOperand(MMO)
1900
1901 return;
1902 }
1903
1904 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1905 SpillSize, *MFI);
1906 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1907 .addFrameIndex(FrameIndex) // vaddr
1908 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1909 .addImm(0) // offset
1910 .addMemOperand(MMO);
1911}
1912
1917
1920 unsigned Quantity) const {
1921 DebugLoc DL = MBB.findDebugLoc(MI);
1922 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1923 while (Quantity > 0) {
1924 unsigned Arg = std::min(Quantity, MaxSNopCount);
1925 Quantity -= Arg;
1926 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1927 }
1928}
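// A quick worked example of the loop above, assuming MaxSNopCount == 8:
// S_NOP's immediate encodes "imm + 1" no-ops, which is why `Arg - 1` is
// emitted, so
//   insertNoops(MBB, MI, 10)
// produces
//   s_nop 7    ; 8 no-ops
//   s_nop 1    ; 2 no-ops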
1929
1931 auto *MF = MBB.getParent();
1932 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1933
1934 assert(Info->isEntryFunction());
1935
1936 if (MBB.succ_empty()) {
1937 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1938 if (HasNoTerminator) {
1939 if (Info->returnsVoid()) {
1940 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1941 } else {
1942 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1943 }
1944 }
1945 }
1946}
1947
1951 const DebugLoc &DL) const {
1952 MachineFunction *MF = MBB.getParent();
1953 constexpr unsigned DoorbellIDMask = 0x3ff;
1954 constexpr unsigned ECQueueWaveAbort = 0x400;
1955
1956 MachineBasicBlock *TrapBB = &MBB;
1957 MachineBasicBlock *ContBB = &MBB;
1958 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1959
1960 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1961 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1962 TrapBB = MF->CreateMachineBasicBlock();
1963 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1964 MF->push_back(TrapBB);
1965 MBB.addSuccessor(TrapBB);
1966 } else {
1967 // Since we're adding HaltLoopBB and modifying the CFG, we must return a
1968 // different block to signal the change.
1969 ContBB = HaltLoopBB;
1970 }
1971
1972 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1973 // this will be a nop.
1974 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1975 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1976 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1977 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1978 DoorbellReg)
1980 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
1981 .addUse(AMDGPU::M0);
1982 Register DoorbellRegMasked =
1983 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1984 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
1985 .addUse(DoorbellReg)
1986 .addImm(DoorbellIDMask);
1987 Register SetWaveAbortBit =
1988 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1989 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
1990 .addUse(DoorbellRegMasked)
1991 .addImm(ECQueueWaveAbort);
1992 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1993 .addUse(SetWaveAbortBit);
1994 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
1996 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1997 .addUse(AMDGPU::TTMP2);
1998 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
1999 TrapBB->addSuccessor(HaltLoopBB);
2000
2001 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2002 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2003 .addMBB(HaltLoopBB);
2004 MF->push_back(HaltLoopBB);
2005 HaltLoopBB->addSuccessor(HaltLoopBB);
2006
2007 return ContBB;
2008}
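// Roughly, the blocks built above expand to something like the following
// (pseudo-assembly sketch; register choices and sendmsg immediates are
// illustrative only):
//   trap_bb:
//     s_trap 2
//     s_sendmsg_rtn_b32 sN, sendmsg(...)   ; query the doorbell ID
//     s_mov_b32 ttmp2, m0
//     s_and_b32 sN, sN, 0x3ff              ; DoorbellIDMask
//     s_or_b32  sN, sN, 0x400              ; ECQueueWaveAbort
//     s_mov_b32 m0, sN
//     s_sendmsg sendmsg(...)               ; report the aborting wave
//     s_mov_b32 m0, ttmp2
//     s_branch halt_bb
//   halt_bb:
//     s_sethalt 5
//     s_branch halt_bb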
2009
2011 switch (MI.getOpcode()) {
2012 default:
2013 if (MI.isMetaInstruction())
2014 return 0;
2015 return 1; // FIXME: Do wait states equal cycles?
2016
2017 case AMDGPU::S_NOP:
2018 return MI.getOperand(0).getImm() + 1;
2019 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2020 // hazard, even if one exists, won't really be visible. Should we handle it?
2021 }
2022}
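// For example, `s_nop 3` reports 4 wait states here, while meta-instructions
// report 0 and any other instruction is conservatively counted as 1.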
2023
2025 MachineBasicBlock &MBB = *MI.getParent();
2026 DebugLoc DL = MBB.findDebugLoc(MI);
2028 switch (MI.getOpcode()) {
2029 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2030 case AMDGPU::S_MOV_B64_term:
2031 // This is only a terminator to get the correct spill code placement during
2032 // register allocation.
2033 MI.setDesc(get(AMDGPU::S_MOV_B64));
2034 break;
2035
2036 case AMDGPU::S_MOV_B32_term:
2037 // This is only a terminator to get the correct spill code placement during
2038 // register allocation.
2039 MI.setDesc(get(AMDGPU::S_MOV_B32));
2040 break;
2041
2042 case AMDGPU::S_XOR_B64_term:
2043 // This is only a terminator to get the correct spill code placement during
2044 // register allocation.
2045 MI.setDesc(get(AMDGPU::S_XOR_B64));
2046 break;
2047
2048 case AMDGPU::S_XOR_B32_term:
2049 // This is only a terminator to get the correct spill code placement during
2050 // register allocation.
2051 MI.setDesc(get(AMDGPU::S_XOR_B32));
2052 break;
2053 case AMDGPU::S_OR_B64_term:
2054 // This is only a terminator to get the correct spill code placement during
2055 // register allocation.
2056 MI.setDesc(get(AMDGPU::S_OR_B64));
2057 break;
2058 case AMDGPU::S_OR_B32_term:
2059 // This is only a terminator to get the correct spill code placement during
2060 // register allocation.
2061 MI.setDesc(get(AMDGPU::S_OR_B32));
2062 break;
2063
2064 case AMDGPU::S_ANDN2_B64_term:
2065 // This is only a terminator to get the correct spill code placement during
2066 // register allocation.
2067 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2068 break;
2069
2070 case AMDGPU::S_ANDN2_B32_term:
2071 // This is only a terminator to get the correct spill code placement during
2072 // register allocation.
2073 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2074 break;
2075
2076 case AMDGPU::S_AND_B64_term:
2077 // This is only a terminator to get the correct spill code placement during
2078 // register allocation.
2079 MI.setDesc(get(AMDGPU::S_AND_B64));
2080 break;
2081
2082 case AMDGPU::S_AND_B32_term:
2083 // This is only a terminator to get the correct spill code placement during
2084 // register allocation.
2085 MI.setDesc(get(AMDGPU::S_AND_B32));
2086 break;
2087
2088 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2089 // This is only a terminator to get the correct spill code placement during
2090 // register allocation.
2091 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2092 break;
2093
2094 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2095 // This is only a terminator to get the correct spill code placement during
2096 // register allocation.
2097 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2098 break;
2099
2100 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2101 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2102 break;
2103
2104 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2105 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2106 break;
2107 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2108 Register Dst = MI.getOperand(0).getReg();
2109 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2110 MI.setDesc(
2111 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2112 break;
2113 }
2114 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2115 Register Dst = MI.getOperand(0).getReg();
2116 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2117 int64_t Imm = MI.getOperand(1).getImm();
2118
2119 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2120 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2121 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2124 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2125 .addImm(SignExtend64<32>(Imm >> 32))
2127 MI.eraseFromParent();
2128 break;
2129 }
2130
2131 [[fallthrough]];
2132 }
2133 case AMDGPU::V_MOV_B64_PSEUDO: {
2134 Register Dst = MI.getOperand(0).getReg();
2135 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2136 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2137
2138 const MachineOperand &SrcOp = MI.getOperand(1);
2139 // FIXME: Will this work for 64-bit floating point immediates?
2140 assert(!SrcOp.isFPImm());
2141 if (ST.hasMovB64()) {
2142 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2143 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2144 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2145 break;
2146 }
2147 if (SrcOp.isImm()) {
2148 APInt Imm(64, SrcOp.getImm());
2149 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2150 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2151 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2152 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2154 .addImm(Lo.getSExtValue())
2156 .addImm(Lo.getSExtValue())
2157 .addImm(0) // op_sel_lo
2158 .addImm(0) // op_sel_hi
2159 .addImm(0) // neg_lo
2160 .addImm(0) // neg_hi
2161 .addImm(0); // clamp
2162 } else {
2163 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2164 .addImm(Lo.getSExtValue())
2166 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2167 .addImm(Hi.getSExtValue())
2169 }
2170 } else {
2171 assert(SrcOp.isReg());
2172 if (ST.hasPkMovB32() &&
2173 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2174 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2175 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2176 .addReg(SrcOp.getReg())
2178 .addReg(SrcOp.getReg())
2179 .addImm(0) // op_sel_lo
2180 .addImm(0) // op_sel_hi
2181 .addImm(0) // neg_lo
2182 .addImm(0) // neg_hi
2183 .addImm(0); // clamp
2184 } else {
2185 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2186 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2188 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2189 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2191 }
2192 }
2193 MI.eraseFromParent();
2194 break;
2195 }
2196 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2198 break;
2199 }
2200 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2201 const MachineOperand &SrcOp = MI.getOperand(1);
2202 assert(!SrcOp.isFPImm());
2203
2204 if (ST.has64BitLiterals()) {
2205 MI.setDesc(get(AMDGPU::S_MOV_B64));
2206 break;
2207 }
2208
2209 APInt Imm(64, SrcOp.getImm());
2210 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2211 MI.setDesc(get(AMDGPU::S_MOV_B64));
2212 break;
2213 }
2214
2215 Register Dst = MI.getOperand(0).getReg();
2216 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2217 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2218
2219 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2220 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2221 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2222 .addImm(Lo.getSExtValue())
2224 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2225 .addImm(Hi.getSExtValue())
2227 MI.eraseFromParent();
2228 break;
2229 }
2230 case AMDGPU::V_SET_INACTIVE_B32: {
2231 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2232 Register DstReg = MI.getOperand(0).getReg();
2233 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2234 .add(MI.getOperand(3))
2235 .add(MI.getOperand(4))
2236 .add(MI.getOperand(1))
2237 .add(MI.getOperand(2))
2238 .add(MI.getOperand(5));
2239 MI.eraseFromParent();
2240 break;
2241 }
2242 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2243 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2244 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2245 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2246 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2247 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2248 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2249 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2250 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2251 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2252 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2253 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2254 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2255 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2256 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2257 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2258 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2259 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2260 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2261 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2262 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2263 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2264 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2265 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2266 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2267 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2268 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2269 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2270 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2271 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2272
2273 unsigned Opc;
2274 if (RI.hasVGPRs(EltRC)) {
2275 Opc = AMDGPU::V_MOVRELD_B32_e32;
2276 } else {
2277 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2278 : AMDGPU::S_MOVRELD_B32;
2279 }
2280
2281 const MCInstrDesc &OpDesc = get(Opc);
2282 Register VecReg = MI.getOperand(0).getReg();
2283 bool IsUndef = MI.getOperand(1).isUndef();
2284 unsigned SubReg = MI.getOperand(3).getImm();
2285 assert(VecReg == MI.getOperand(1).getReg());
2286
2288 BuildMI(MBB, MI, DL, OpDesc)
2289 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2290 .add(MI.getOperand(2))
2292 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2293
2294 const int ImpDefIdx =
2295 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2296 const int ImpUseIdx = ImpDefIdx + 1;
2297 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2298 MI.eraseFromParent();
2299 break;
2300 }
2301 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2305 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2306 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2307 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2308 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2309 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2310 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2313 assert(ST.useVGPRIndexMode());
2314 Register VecReg = MI.getOperand(0).getReg();
2315 bool IsUndef = MI.getOperand(1).isUndef();
2316 MachineOperand &Idx = MI.getOperand(3);
2317 Register SubReg = MI.getOperand(4).getImm();
2318
2319 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2320 .add(Idx)
2322 SetOn->getOperand(3).setIsUndef();
2323
2324 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2326 BuildMI(MBB, MI, DL, OpDesc)
2327 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2328 .add(MI.getOperand(2))
2330 .addReg(VecReg,
2331 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2332
2333 const int ImpDefIdx =
2334 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2335 const int ImpUseIdx = ImpDefIdx + 1;
2336 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2337
2338 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2339
2340 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2341
2342 MI.eraseFromParent();
2343 break;
2344 }
2345 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2346 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2347 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2348 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2349 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2350 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2351 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2352 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2353 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2354 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2355 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2356 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2357 assert(ST.useVGPRIndexMode());
2358 Register Dst = MI.getOperand(0).getReg();
2359 Register VecReg = MI.getOperand(1).getReg();
2360 bool IsUndef = MI.getOperand(1).isUndef();
2361 Register Idx = MI.getOperand(2).getReg();
2362 Register SubReg = MI.getOperand(3).getImm();
2363
2364 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2365 .addReg(Idx)
2367 SetOn->getOperand(3).setIsUndef();
2368
2369 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2370 .addDef(Dst)
2371 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2372 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2373
2374 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2375
2376 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2377
2378 MI.eraseFromParent();
2379 break;
2380 }
2381 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2382 MachineFunction &MF = *MBB.getParent();
2383 Register Reg = MI.getOperand(0).getReg();
2384 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2385 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2386 MachineOperand OpLo = MI.getOperand(1);
2387 MachineOperand OpHi = MI.getOperand(2);
2388
2389 // Create a bundle so these instructions won't be re-ordered by the
2390 // post-RA scheduler.
2391 MIBundleBuilder Bundler(MBB, MI);
2392 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2393
2394 // What we want here is an offset from the value returned by s_getpc (which
2395 // is the address of the s_add_u32 instruction) to the global variable, but
2396 // since the encoding of $symbol starts 4 bytes after the start of the
2397 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2398 // small. This requires us to add 4 to the global variable offset in order
2399 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2400 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2401 // instruction.
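// As a concrete sketch (relocation spelling is illustrative only), the bundle
// built below for a global @sym is roughly:
//   s_getpc_b64  s[N:N+1]
//   s_add_u32    s[N],   s[N],   sym@rel32@lo+4
//   s_addc_u32   s[N+1], s[N+1], sym@rel32@hi+12
// plus another +4 on both addends when the s_sext_i32_i16 fixup is inserted
// for targets that zero-extend the PC.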
2402
2403 int64_t Adjust = 0;
2404 if (ST.hasGetPCZeroExtension()) {
2405 // Fix up hardware that does not sign-extend the 48-bit PC value by
2406 // inserting: s_sext_i32_i16 reghi, reghi
2407 Bundler.append(
2408 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2409 Adjust += 4;
2410 }
2411
2412 if (OpLo.isGlobal())
2413 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2414 Bundler.append(
2415 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2416
2417 if (OpHi.isGlobal())
2418 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2419 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2420 .addReg(RegHi)
2421 .add(OpHi));
2422
2423 finalizeBundle(MBB, Bundler.begin());
2424
2425 MI.eraseFromParent();
2426 break;
2427 }
2428 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2429 MachineFunction &MF = *MBB.getParent();
2430 Register Reg = MI.getOperand(0).getReg();
2431 MachineOperand Op = MI.getOperand(1);
2432
2433 // Create a bundle so these instructions won't be re-ordered by the
2434 // post-RA scheduler.
2435 MIBundleBuilder Bundler(MBB, MI);
2436 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2437 if (Op.isGlobal())
2438 Op.setOffset(Op.getOffset() + 4);
2439 Bundler.append(
2440 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2441
2442 finalizeBundle(MBB, Bundler.begin());
2443
2444 MI.eraseFromParent();
2445 break;
2446 }
2447 case AMDGPU::ENTER_STRICT_WWM: {
2448 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2449 // Whole Wave Mode is entered.
2450 MI.setDesc(get(LMC.OrSaveExecOpc));
2451 break;
2452 }
2453 case AMDGPU::ENTER_STRICT_WQM: {
2454 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2455 // STRICT_WQM is entered.
2456 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2457 .addReg(LMC.ExecReg);
2458 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2459
2460 MI.eraseFromParent();
2461 break;
2462 }
2463 case AMDGPU::EXIT_STRICT_WWM:
2464 case AMDGPU::EXIT_STRICT_WQM: {
2465 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2466 // WWM/STRICT_WQM is exited.
2467 MI.setDesc(get(LMC.MovOpc));
2468 break;
2469 }
2470 case AMDGPU::SI_RETURN: {
2471 const MachineFunction *MF = MBB.getParent();
2472 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2473 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2474 // Hiding the return address use with SI_RETURN may lead to extra kills in
2475 // the function and missing live-ins. We are fine in practice because callee
2476 // saved register handling ensures the register value is restored before
2477 // RET, but we need the undef flag here to appease the MachineVerifier
2478 // liveness checks.
2480 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2481 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2482
2483 MIB.copyImplicitOps(MI);
2484 MI.eraseFromParent();
2485 break;
2486 }
2487
2488 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2489 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2490 MI.setDesc(get(AMDGPU::S_MUL_U64));
2491 break;
2492
2493 case AMDGPU::S_GETPC_B64_pseudo:
2494 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2495 if (ST.hasGetPCZeroExtension()) {
2496 Register Dst = MI.getOperand(0).getReg();
2497 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2498 // Fix up hardware that does not sign-extend the 48-bit PC value by
2499 // inserting: s_sext_i32_i16 dsthi, dsthi
2500 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2501 DstHi)
2502 .addReg(DstHi);
2503 }
2504 break;
2505
2506 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2507 assert(ST.hasBF16PackedInsts());
2508 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2509 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2510 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2511 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2512 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2513 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2514 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2515 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2516 break;
2517 }
2518
2519 return true;
2520}
2521
2524 unsigned SubIdx,
2525 const MachineInstr &Orig) const {
2526
2527 // Try shrinking the instruction to remat only the part needed for the
2528 // current context.
2529 // TODO: Handle more cases.
2530 unsigned Opcode = Orig.getOpcode();
2531 switch (Opcode) {
2532 case AMDGPU::S_LOAD_DWORDX16_IMM:
2533 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2534 if (SubIdx != 0)
2535 break;
2536
2537 if (I == MBB.end())
2538 break;
2539
2540 if (I->isBundled())
2541 break;
2542
2543 // Look for a single use of the register that is also a subreg.
2544 Register RegToFind = Orig.getOperand(0).getReg();
2545 MachineOperand *UseMO = nullptr;
2546 for (auto &CandMO : I->operands()) {
2547 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2548 continue;
2549 if (UseMO) {
2550 UseMO = nullptr;
2551 break;
2552 }
2553 UseMO = &CandMO;
2554 }
2555 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2556 break;
2557
2558 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2559 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2560
2561 MachineFunction *MF = MBB.getParent();
2563 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2564
2565 unsigned NewOpcode = -1;
2566 if (SubregSize == 256)
2567 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2568 else if (SubregSize == 128)
2569 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2570 else
2571 break;
2572
2573 const MCInstrDesc &TID = get(NewOpcode);
2574 const TargetRegisterClass *NewRC =
2575 RI.getAllocatableClass(getRegClass(TID, 0));
2576 MRI.setRegClass(DestReg, NewRC);
2577
2578 UseMO->setReg(DestReg);
2579 UseMO->setSubReg(AMDGPU::NoSubRegister);
2580
2581 // Use a smaller load with the desired size, possibly with updated offset.
2582 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2583 MI->setDesc(TID);
2584 MI->getOperand(0).setReg(DestReg);
2585 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2586 if (Offset) {
2587 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2588 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2589 OffsetMO->setImm(FinalOffset);
2590 }
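// For instance (using the usual AMDGPU subregister numbering): if the only
// use of an S_LOAD_DWORDX16_IMM result reads sub4_sub5_sub6_sub7, then
// SubregSize is 128 and Offset is 128 bits, so the clone below becomes an
// S_LOAD_DWORDX4_IMM with 128 / 8 = 16 bytes added to its immediate offset.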
2591 SmallVector<MachineMemOperand *> NewMMOs;
2592 for (const MachineMemOperand *MemOp : Orig.memoperands())
2593 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2594 SubregSize / 8));
2595 MI->setMemRefs(*MF, NewMMOs);
2596
2597 MBB.insert(I, MI);
2598 return;
2599 }
2600
2601 default:
2602 break;
2603 }
2604
2605 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
2606}
2607
2608std::pair<MachineInstr*, MachineInstr*>
2610 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2611
2612 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2614 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2615 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2616 return std::pair(&MI, nullptr);
2617 }
2618
2619 MachineBasicBlock &MBB = *MI.getParent();
2620 DebugLoc DL = MBB.findDebugLoc(MI);
2621 MachineFunction *MF = MBB.getParent();
2623 Register Dst = MI.getOperand(0).getReg();
2624 unsigned Part = 0;
2625 MachineInstr *Split[2];
2626
2627 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2628 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2629 if (Dst.isPhysical()) {
2630 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2631 } else {
2632 assert(MRI.isSSA());
2633 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2634 MovDPP.addDef(Tmp);
2635 }
2636
2637 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2638 const MachineOperand &SrcOp = MI.getOperand(I);
2639 assert(!SrcOp.isFPImm());
2640 if (SrcOp.isImm()) {
2641 APInt Imm(64, SrcOp.getImm());
2642 Imm.ashrInPlace(Part * 32);
2643 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2644 } else {
2645 assert(SrcOp.isReg());
2646 Register Src = SrcOp.getReg();
2647 if (Src.isPhysical())
2648 MovDPP.addReg(RI.getSubReg(Src, Sub));
2649 else
2650 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2651 }
2652 }
2653
2654 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2655 MovDPP.addImm(MO.getImm());
2656
2657 Split[Part] = MovDPP;
2658 ++Part;
2659 }
2660
2661 if (Dst.isVirtual())
2662 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2663 .addReg(Split[0]->getOperand(0).getReg())
2664 .addImm(AMDGPU::sub0)
2665 .addReg(Split[1]->getOperand(0).getReg())
2666 .addImm(AMDGPU::sub1);
2667
2668 MI.eraseFromParent();
2669 return std::pair(Split[0], Split[1]);
2670}
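// In short, unless a single v_mov_b64_dpp can be used, the 64-bit DPP move is
// split per 32-bit half (sketch; DPP control operands elided):
//   v_mov_b32_dpp dst.sub0, src.sub0 ...
//   v_mov_b32_dpp dst.sub1, src.sub1 ...
// with a REG_SEQUENCE rejoining the halves when the destination is virtual.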
2671
2672std::optional<DestSourcePair>
2674 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2675 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2676
2677 return std::nullopt;
2678}
2679
2681 AMDGPU::OpName Src0OpName,
2682 MachineOperand &Src1,
2683 AMDGPU::OpName Src1OpName) const {
2684 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2685 if (!Src0Mods)
2686 return false;
2687
2688 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2689 assert(Src1Mods &&
2690 "All commutable instructions have both src0 and src1 modifiers");
2691
2692 int Src0ModsVal = Src0Mods->getImm();
2693 int Src1ModsVal = Src1Mods->getImm();
2694
2695 Src1Mods->setImm(Src0ModsVal);
2696 Src0Mods->setImm(Src1ModsVal);
2697 return true;
2698}
2699
2701 MachineOperand &RegOp,
2702 MachineOperand &NonRegOp) {
2703 Register Reg = RegOp.getReg();
2704 unsigned SubReg = RegOp.getSubReg();
2705 bool IsKill = RegOp.isKill();
2706 bool IsDead = RegOp.isDead();
2707 bool IsUndef = RegOp.isUndef();
2708 bool IsDebug = RegOp.isDebug();
2709
2710 if (NonRegOp.isImm())
2711 RegOp.ChangeToImmediate(NonRegOp.getImm());
2712 else if (NonRegOp.isFI())
2713 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2714 else if (NonRegOp.isGlobal()) {
2715 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2716 NonRegOp.getTargetFlags());
2717 } else
2718 return nullptr;
2719
2720 // Make sure we don't reinterpret a subreg index in the target flags.
2721 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2722
2723 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2724 NonRegOp.setSubReg(SubReg);
2725
2726 return &MI;
2727}
2728
2730 MachineOperand &NonRegOp1,
2731 MachineOperand &NonRegOp2) {
2732 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2733 int64_t NonRegVal = NonRegOp1.getImm();
2734
2735 NonRegOp1.setImm(NonRegOp2.getImm());
2736 NonRegOp2.setImm(NonRegVal);
2737 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2738 NonRegOp2.setTargetFlags(TargetFlags);
2739 return &MI;
2740}
2741
2742bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2743 unsigned OpIdx1) const {
2744 const MCInstrDesc &InstDesc = MI.getDesc();
2745 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2746 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2747
2748 unsigned Opc = MI.getOpcode();
2749 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2750
2751 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2752 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2753
2754 // Check that the swap doesn't breach the constant bus or literal limits.
2755 // It may move a literal to a position other than src0, which is not allowed
2756 // pre-gfx10. However, most test cases need literals in src0 for VOP.
2757 // FIXME: After gfx9, a literal can be placed somewhere other than src0.
2758 if (isVALU(MI)) {
2759 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2760 !isInlineConstant(MO0, OpInfo1))
2761 return false;
2762 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2763 !isInlineConstant(MO1, OpInfo0))
2764 return false;
2765 }
2766
2767 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2768 if (OpInfo1.RegClass == -1)
2769 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2770 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2771 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2772 }
2773 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2774 if (OpInfo0.RegClass == -1)
2775 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2776 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2777 isLegalRegOperand(MI, OpIdx0, MO1);
2778 }
2779
2780 // No need to check 64-bit literals, since swapping does not bring new
2781 // 64-bit literals into the current instruction to fold to 32-bit.
2782
2783 return isImmOperandLegal(MI, OpIdx1, MO0);
2784}
2785
2787 unsigned Src0Idx,
2788 unsigned Src1Idx) const {
2789 assert(!NewMI && "this should never be used");
2790
2791 unsigned Opc = MI.getOpcode();
2792 int CommutedOpcode = commuteOpcode(Opc);
2793 if (CommutedOpcode == -1)
2794 return nullptr;
2795
2796 if (Src0Idx > Src1Idx)
2797 std::swap(Src0Idx, Src1Idx);
2798
2799 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2800 static_cast<int>(Src0Idx) &&
2801 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2802 static_cast<int>(Src1Idx) &&
2803 "inconsistency with findCommutedOpIndices");
2804
2805 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2806 return nullptr;
2807
2808 MachineInstr *CommutedMI = nullptr;
2809 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2810 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2811 if (Src0.isReg() && Src1.isReg()) {
2812 // Be sure to copy the source modifiers to the right place.
2813 CommutedMI =
2814 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2815 } else if (Src0.isReg() && !Src1.isReg()) {
2816 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2817 } else if (!Src0.isReg() && Src1.isReg()) {
2818 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2819 } else if (Src0.isImm() && Src1.isImm()) {
2820 CommutedMI = swapImmOperands(MI, Src0, Src1);
2821 } else {
2822 // FIXME: Found two non-register operands to commute. This does happen.
2823 return nullptr;
2824 }
2825
2826 if (CommutedMI) {
2827 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2828 Src1, AMDGPU::OpName::src1_modifiers);
2829
2830 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2831 AMDGPU::OpName::src1_sel);
2832
2833 CommutedMI->setDesc(get(CommutedOpcode));
2834 }
2835
2836 return CommutedMI;
2837}
2838
2839// This needs to be implemented because the source modifiers may be inserted
2840// between the true commutable operands, and the base
2841// TargetInstrInfo::commuteInstruction uses it.
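// For example, a VOP3 instruction such as V_ADD_F32_e64 is laid out roughly as
// (vdst, src0_modifiers, src0, src1_modifiers, src1, clamp, omod), so the two
// commutable sources are not adjacent operands.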
2843 unsigned &SrcOpIdx0,
2844 unsigned &SrcOpIdx1) const {
2845 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2846}
2847
2849 unsigned &SrcOpIdx0,
2850 unsigned &SrcOpIdx1) const {
2851 if (!Desc.isCommutable())
2852 return false;
2853
2854 unsigned Opc = Desc.getOpcode();
2855 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2856 if (Src0Idx == -1)
2857 return false;
2858
2859 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2860 if (Src1Idx == -1)
2861 return false;
2862
2863 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2864}
2865
2867 int64_t BrOffset) const {
2868 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2869 // because its dest block is unanalyzable.
2870 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2871
2872 // Convert to dwords.
2873 BrOffset /= 4;
2874
2875 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2876 // from the next instruction.
2877 BrOffset -= 1;
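// For example, with the default 16-bit offset field: a forward BrOffset of
// 131072 bytes becomes 131072 / 4 - 1 = 32767 dwords, which still fits in a
// signed 16-bit immediate, while 131076 bytes would not.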
2878
2879 return isIntN(BranchOffsetBits, BrOffset);
2880}
2881
2884 return MI.getOperand(0).getMBB();
2885}
2886
2888 for (const MachineInstr &MI : MBB->terminators()) {
2889 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2890 MI.getOpcode() == AMDGPU::SI_LOOP)
2891 return true;
2892 }
2893 return false;
2894}
2895
2897 MachineBasicBlock &DestBB,
2898 MachineBasicBlock &RestoreBB,
2899 const DebugLoc &DL, int64_t BrOffset,
2900 RegScavenger *RS) const {
2901 assert(MBB.empty() &&
2902 "new block should be inserted for expanding unconditional branch");
2903 assert(MBB.pred_size() == 1);
2904 assert(RestoreBB.empty() &&
2905 "restore block should be inserted for restoring clobbered registers");
2906
2907 MachineFunction *MF = MBB.getParent();
2910 auto I = MBB.end();
2911 auto &MCCtx = MF->getContext();
2912
2913 if (ST.hasAddPC64Inst()) {
2914 MCSymbol *Offset =
2915 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2916 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2918 MCSymbol *PostAddPCLabel =
2919 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2920 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2921 auto *OffsetExpr = MCBinaryExpr::createSub(
2922 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2923 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2924 Offset->setVariableValue(OffsetExpr);
2925 return;
2926 }
2927
2928 assert(RS && "RegScavenger required for long branching");
2929
2930 // FIXME: Virtual register workaround for RegScavenger not working with empty
2931 // blocks.
2932 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2933
2934 // Note: as this is used after the hazard recognizer, we need to apply some
2935 // hazard workarounds directly.
2936 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2937 ST.hasVALUReadSGPRHazard();
2938 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2939 if (FlushSGPRWrites)
2940 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2942 };
2943
2944 // We need to compute the offset relative to the instruction immediately after
2945 // s_getpc_b64. Insert the pc arithmetic code before the last terminator.
2946 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2947 ApplyHazardWorkarounds();
2948
2949 MCSymbol *PostGetPCLabel =
2950 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2951 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2952
2953 MCSymbol *OffsetLo =
2954 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2955 MCSymbol *OffsetHi =
2956 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2957 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2958 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2959 .addReg(PCReg, 0, AMDGPU::sub0)
2960 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2961 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2962 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2963 .addReg(PCReg, 0, AMDGPU::sub1)
2964 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2965 ApplyHazardWorkarounds();
2966
2967 // Insert the indirect branch after the other terminator.
2968 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2969 .addReg(PCReg);
2970
2971 // If a spill is needed for the pc register pair, we need to insert a spill
2972 // restore block right before the destination block, and insert a short branch
2973 // into the old destination block's fallthrough predecessor.
2974 // e.g.:
2975 //
2976 // s_cbranch_scc0 skip_long_branch:
2977 //
2978 // long_branch_bb:
2979 // spill s[8:9]
2980 // s_getpc_b64 s[8:9]
2981 // s_add_u32 s8, s8, restore_bb
2982 // s_addc_u32 s9, s9, 0
2983 // s_setpc_b64 s[8:9]
2984 //
2985 // skip_long_branch:
2986 // foo;
2987 //
2988 // .....
2989 //
2990 // dest_bb_fallthrough_predecessor:
2991 // bar;
2992 // s_branch dest_bb
2993 //
2994 // restore_bb:
2995 // restore s[8:9]
2996 // fallthrough dest_bb
2997 //
2998 // dest_bb:
2999 // buzz;
3000
3001 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3002 Register Scav;
3003
3004 // If we've previously reserved a register for long branches,
3005 // avoid running the scavenger and just use that register.
3006 if (LongBranchReservedReg) {
3007 RS->enterBasicBlock(MBB);
3008 Scav = LongBranchReservedReg;
3009 } else {
3010 RS->enterBasicBlockEnd(MBB);
3011 Scav = RS->scavengeRegisterBackwards(
3012 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3013 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3014 }
3015 if (Scav) {
3016 RS->setRegUsed(Scav);
3017 MRI.replaceRegWith(PCReg, Scav);
3018 MRI.clearVirtRegs();
3019 } else {
3020 // As spilling an SGPR needs a VGPR, we reuse the slot of the temporary VGPR
3021 // for the SGPR spill.
3022 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3023 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3024 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3025 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3026 MRI.clearVirtRegs();
3027 }
3028
3029 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3030 // Now the distance can be defined.
3032 MCSymbolRefExpr::create(DestLabel, MCCtx),
3033 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3034 // Add offset assignments.
3035 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3036 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3037 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3038 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3039}
3040
3041unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3042 switch (Cond) {
3043 case SIInstrInfo::SCC_TRUE:
3044 return AMDGPU::S_CBRANCH_SCC1;
3045 case SIInstrInfo::SCC_FALSE:
3046 return AMDGPU::S_CBRANCH_SCC0;
3047 case SIInstrInfo::VCCNZ:
3048 return AMDGPU::S_CBRANCH_VCCNZ;
3049 case SIInstrInfo::VCCZ:
3050 return AMDGPU::S_CBRANCH_VCCZ;
3051 case SIInstrInfo::EXECNZ:
3052 return AMDGPU::S_CBRANCH_EXECNZ;
3053 case SIInstrInfo::EXECZ:
3054 return AMDGPU::S_CBRANCH_EXECZ;
3055 default:
3056 llvm_unreachable("invalid branch predicate");
3057 }
3058}
3059
3060SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3061 switch (Opcode) {
3062 case AMDGPU::S_CBRANCH_SCC0:
3063 return SCC_FALSE;
3064 case AMDGPU::S_CBRANCH_SCC1:
3065 return SCC_TRUE;
3066 case AMDGPU::S_CBRANCH_VCCNZ:
3067 return VCCNZ;
3068 case AMDGPU::S_CBRANCH_VCCZ:
3069 return VCCZ;
3070 case AMDGPU::S_CBRANCH_EXECNZ:
3071 return EXECNZ;
3072 case AMDGPU::S_CBRANCH_EXECZ:
3073 return EXECZ;
3074 default:
3075 return INVALID_BR;
3076 }
3077}
3078
3082 MachineBasicBlock *&FBB,
3084 bool AllowModify) const {
3085 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3086 // Unconditional Branch
3087 TBB = I->getOperand(0).getMBB();
3088 return false;
3089 }
3090
3091 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3092 if (Pred == INVALID_BR)
3093 return true;
3094
3095 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3096 Cond.push_back(MachineOperand::CreateImm(Pred));
3097 Cond.push_back(I->getOperand(1)); // Save the branch register.
3098
3099 ++I;
3100
3101 if (I == MBB.end()) {
3102 // Conditional branch followed by fall-through.
3103 TBB = CondBB;
3104 return false;
3105 }
3106
3107 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3108 TBB = CondBB;
3109 FBB = I->getOperand(0).getMBB();
3110 return false;
3111 }
3112
3113 return true;
3114}
3115
3117 MachineBasicBlock *&FBB,
3119 bool AllowModify) const {
3120 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3121 auto E = MBB.end();
3122 if (I == E)
3123 return false;
3124
3125 // Skip over the instructions that are artificial terminators for special
3126 // exec management.
3127 while (I != E && !I->isBranch() && !I->isReturn()) {
3128 switch (I->getOpcode()) {
3129 case AMDGPU::S_MOV_B64_term:
3130 case AMDGPU::S_XOR_B64_term:
3131 case AMDGPU::S_OR_B64_term:
3132 case AMDGPU::S_ANDN2_B64_term:
3133 case AMDGPU::S_AND_B64_term:
3134 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3135 case AMDGPU::S_MOV_B32_term:
3136 case AMDGPU::S_XOR_B32_term:
3137 case AMDGPU::S_OR_B32_term:
3138 case AMDGPU::S_ANDN2_B32_term:
3139 case AMDGPU::S_AND_B32_term:
3140 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3141 break;
3142 case AMDGPU::SI_IF:
3143 case AMDGPU::SI_ELSE:
3144 case AMDGPU::SI_KILL_I1_TERMINATOR:
3145 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3146 // FIXME: It's messy that these need to be considered here at all.
3147 return true;
3148 default:
3149 llvm_unreachable("unexpected non-branch terminator inst");
3150 }
3151
3152 ++I;
3153 }
3154
3155 if (I == E)
3156 return false;
3157
3158 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3159}
3160
3162 int *BytesRemoved) const {
3163 unsigned Count = 0;
3164 unsigned RemovedSize = 0;
3165 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3166 // Skip over artificial terminators when removing instructions.
3167 if (MI.isBranch() || MI.isReturn()) {
3168 RemovedSize += getInstSizeInBytes(MI);
3169 MI.eraseFromParent();
3170 ++Count;
3171 }
3172 }
3173
3174 if (BytesRemoved)
3175 *BytesRemoved = RemovedSize;
3176
3177 return Count;
3178}
3179
3180// Copy the flags onto the implicit condition register operand.
3182 const MachineOperand &OrigCond) {
3183 CondReg.setIsUndef(OrigCond.isUndef());
3184 CondReg.setIsKill(OrigCond.isKill());
3185}
3186
3189 MachineBasicBlock *FBB,
3191 const DebugLoc &DL,
3192 int *BytesAdded) const {
3193 if (!FBB && Cond.empty()) {
3194 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3195 .addMBB(TBB);
3196 if (BytesAdded)
3197 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3198 return 1;
3199 }
3200
3201 assert(TBB && Cond[0].isImm());
3202
3203 unsigned Opcode
3204 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3205
3206 if (!FBB) {
3207 MachineInstr *CondBr =
3208 BuildMI(&MBB, DL, get(Opcode))
3209 .addMBB(TBB);
3210
3211 // Copy the flags onto the implicit condition register operand.
3212 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3213 fixImplicitOperands(*CondBr);
3214
3215 if (BytesAdded)
3216 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3217 return 1;
3218 }
3219
3220 assert(TBB && FBB);
3221
3222 MachineInstr *CondBr =
3223 BuildMI(&MBB, DL, get(Opcode))
3224 .addMBB(TBB);
3225 fixImplicitOperands(*CondBr);
3226 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3227 .addMBB(FBB);
3228
3229 MachineOperand &CondReg = CondBr->getOperand(1);
3230 CondReg.setIsUndef(Cond[1].isUndef());
3231 CondReg.setIsKill(Cond[1].isKill());
3232
3233 if (BytesAdded)
3234 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3235
3236 return 2;
3237}
3238
3241 if (Cond.size() != 2) {
3242 return true;
3243 }
3244
3245 if (Cond[0].isImm()) {
3246 Cond[0].setImm(-Cond[0].getImm());
3247 return false;
3248 }
3249
3250 return true;
3251}
3252
3255 Register DstReg, Register TrueReg,
3256 Register FalseReg, int &CondCycles,
3257 int &TrueCycles, int &FalseCycles) const {
3258 switch (Cond[0].getImm()) {
3259 case VCCNZ:
3260 case VCCZ: {
3261 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3262 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3263 if (MRI.getRegClass(FalseReg) != RC)
3264 return false;
3265
3266 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3267 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3268
3269 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3270 return RI.hasVGPRs(RC) && NumInsts <= 6;
3271 }
3272 case SCC_TRUE:
3273 case SCC_FALSE: {
3274 // FIXME: We could insert for VGPRs if we could replace the original compare
3275 // with a vector one.
3276 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3277 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3278 if (MRI.getRegClass(FalseReg) != RC)
3279 return false;
3280
3281 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3282
3283 // Multiples of 8 can do s_cselect_b64
3284 if (NumInsts % 2 == 0)
3285 NumInsts /= 2;
3286
3287 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3288 return RI.isSGPRClass(RC);
3289 }
3290 default:
3291 return false;
3292 }
3293}
3294
3298 Register TrueReg, Register FalseReg) const {
3299 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3300 if (Pred == VCCZ || Pred == SCC_FALSE) {
3301 Pred = static_cast<BranchPredicate>(-Pred);
3302 std::swap(TrueReg, FalseReg);
3303 }
3304
3305 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3306 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3307 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3308
3309 if (DstSize == 32) {
3311 if (Pred == SCC_TRUE) {
3312 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3313 .addReg(TrueReg)
3314 .addReg(FalseReg);
3315 } else {
3316 // Instruction's operands are backwards from what is expected.
3317 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3318 .addReg(FalseReg)
3319 .addReg(TrueReg);
3320 }
3321
3322 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3323 return;
3324 }
3325
3326 if (DstSize == 64 && Pred == SCC_TRUE) {
3328 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3329 .addReg(TrueReg)
3330 .addReg(FalseReg);
3331
3332 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3333 return;
3334 }
3335
3336 static const int16_t Sub0_15[] = {
3337 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3338 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3339 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3340 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3341 };
3342
3343 static const int16_t Sub0_15_64[] = {
3344 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3345 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3346 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3347 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3348 };
3349
3350 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3351 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3352 const int16_t *SubIndices = Sub0_15;
3353 int NElts = DstSize / 32;
3354
3355 // 64-bit select is only available for SALU.
3356 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3357 if (Pred == SCC_TRUE) {
3358 if (NElts % 2) {
3359 SelOp = AMDGPU::S_CSELECT_B32;
3360 EltRC = &AMDGPU::SGPR_32RegClass;
3361 } else {
3362 SelOp = AMDGPU::S_CSELECT_B64;
3363 EltRC = &AMDGPU::SGPR_64RegClass;
3364 SubIndices = Sub0_15_64;
3365 NElts /= 2;
3366 }
3367 }
3368
3370 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3371
3372 I = MIB->getIterator();
3373
3375 for (int Idx = 0; Idx != NElts; ++Idx) {
3376 Register DstElt = MRI.createVirtualRegister(EltRC);
3377 Regs.push_back(DstElt);
3378
3379 unsigned SubIdx = SubIndices[Idx];
3380
3382 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3383 Select =
3384 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3385 .addReg(FalseReg, 0, SubIdx)
3386 .addReg(TrueReg, 0, SubIdx);
3387 } else {
3388 Select =
3389 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3390 .addReg(TrueReg, 0, SubIdx)
3391 .addReg(FalseReg, 0, SubIdx);
3392 }
3393
3394 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3396
3397 MIB.addReg(DstElt)
3398 .addImm(SubIdx);
3399 }
3400}
3401
3403 switch (MI.getOpcode()) {
3404 case AMDGPU::V_MOV_B16_t16_e32:
3405 case AMDGPU::V_MOV_B16_t16_e64:
3406 case AMDGPU::V_MOV_B32_e32:
3407 case AMDGPU::V_MOV_B32_e64:
3408 case AMDGPU::V_MOV_B64_PSEUDO:
3409 case AMDGPU::V_MOV_B64_e32:
3410 case AMDGPU::V_MOV_B64_e64:
3411 case AMDGPU::S_MOV_B32:
3412 case AMDGPU::S_MOV_B64:
3413 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3414 case AMDGPU::COPY:
3415 case AMDGPU::WWM_COPY:
3416 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3417 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3418 case AMDGPU::V_ACCVGPR_MOV_B32:
3419 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3420 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3421 return true;
3422 default:
3423 return false;
3424 }
3425}
3426
3428 switch (MI.getOpcode()) {
3429 case AMDGPU::V_MOV_B16_t16_e32:
3430 case AMDGPU::V_MOV_B16_t16_e64:
3431 return 2;
3432 case AMDGPU::V_MOV_B32_e32:
3433 case AMDGPU::V_MOV_B32_e64:
3434 case AMDGPU::V_MOV_B64_PSEUDO:
3435 case AMDGPU::V_MOV_B64_e32:
3436 case AMDGPU::V_MOV_B64_e64:
3437 case AMDGPU::S_MOV_B32:
3438 case AMDGPU::S_MOV_B64:
3439 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3440 case AMDGPU::COPY:
3441 case AMDGPU::WWM_COPY:
3442 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3443 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3444 case AMDGPU::V_ACCVGPR_MOV_B32:
3445 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3446 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3447 return 1;
3448 default:
3449 llvm_unreachable("MI is not a foldable copy");
3450 }
3451}
3452
3453static constexpr AMDGPU::OpName ModifierOpNames[] = {
3454 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3455 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3456 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3457
3459 unsigned Opc = MI.getOpcode();
3460 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3461 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3462 if (Idx >= 0)
3463 MI.removeOperand(Idx);
3464 }
3465}
3466
3468 const MCInstrDesc &NewDesc) const {
3469 MI.setDesc(NewDesc);
3470
3471 // Remove any leftover implicit operands from mutating the instruction. e.g.
3472 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3473 // anymore.
3474 const MCInstrDesc &Desc = MI.getDesc();
3475 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3476 Desc.implicit_defs().size();
3477
3478 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3479 MI.removeOperand(I);
3480}
3481
3482std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3483 unsigned SubRegIndex) {
3484 switch (SubRegIndex) {
3485 case AMDGPU::NoSubRegister:
3486 return Imm;
3487 case AMDGPU::sub0:
3488 return SignExtend64<32>(Imm);
3489 case AMDGPU::sub1:
3490 return SignExtend64<32>(Imm >> 32);
3491 case AMDGPU::lo16:
3492 return SignExtend64<16>(Imm);
3493 case AMDGPU::hi16:
3494 return SignExtend64<16>(Imm >> 16);
3495 case AMDGPU::sub1_lo16:
3496 return SignExtend64<16>(Imm >> 32);
3497 case AMDGPU::sub1_hi16:
3498 return SignExtend64<16>(Imm >> 48);
3499 default:
3500 return std::nullopt;
3501 }
3502
3503 llvm_unreachable("covered subregister switch");
3504}
3505
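// Map a MAC/MAD/FMA opcode to the corresponding "AK" form, which encodes the
// addend as a 32-bit literal, e.g.
//   v_fmaak_f32 v0, v1, v2, 0x42280000   ; v0 = v1 * v2 + 42.0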
3506static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3507 switch (Opc) {
3508 case AMDGPU::V_MAC_F16_e32:
3509 case AMDGPU::V_MAC_F16_e64:
3510 case AMDGPU::V_MAD_F16_e64:
3511 return AMDGPU::V_MADAK_F16;
3512 case AMDGPU::V_MAC_F32_e32:
3513 case AMDGPU::V_MAC_F32_e64:
3514 case AMDGPU::V_MAD_F32_e64:
3515 return AMDGPU::V_MADAK_F32;
3516 case AMDGPU::V_FMAC_F32_e32:
3517 case AMDGPU::V_FMAC_F32_e64:
3518 case AMDGPU::V_FMA_F32_e64:
3519 return AMDGPU::V_FMAAK_F32;
3520 case AMDGPU::V_FMAC_F16_e32:
3521 case AMDGPU::V_FMAC_F16_e64:
3522 case AMDGPU::V_FMAC_F16_t16_e64:
3523 case AMDGPU::V_FMAC_F16_fake16_e64:
3524 case AMDGPU::V_FMA_F16_e64:
3525 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3526 ? AMDGPU::V_FMAAK_F16_t16
3527 : AMDGPU::V_FMAAK_F16_fake16
3528 : AMDGPU::V_FMAAK_F16;
3529 case AMDGPU::V_FMAC_F64_e32:
3530 case AMDGPU::V_FMAC_F64_e64:
3531 case AMDGPU::V_FMA_F64_e64:
3532 return AMDGPU::V_FMAAK_F64;
3533 default:
3534 llvm_unreachable("invalid instruction");
3535 }
3536}
3537
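// Map a MAC/MAD/FMA opcode to the corresponding "MK" form, which encodes the
// multiplier as a 32-bit literal, e.g.
//   v_fmamk_f32 v0, v1, 0x42280000, v2   ; v0 = v1 * 42.0 + v2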
3538static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3539 switch (Opc) {
3540 case AMDGPU::V_MAC_F16_e32:
3541 case AMDGPU::V_MAC_F16_e64:
3542 case AMDGPU::V_MAD_F16_e64:
3543 return AMDGPU::V_MADMK_F16;
3544 case AMDGPU::V_MAC_F32_e32:
3545 case AMDGPU::V_MAC_F32_e64:
3546 case AMDGPU::V_MAD_F32_e64:
3547 return AMDGPU::V_MADMK_F32;
3548 case AMDGPU::V_FMAC_F32_e32:
3549 case AMDGPU::V_FMAC_F32_e64:
3550 case AMDGPU::V_FMA_F32_e64:
3551 return AMDGPU::V_FMAMK_F32;
3552 case AMDGPU::V_FMAC_F16_e32:
3553 case AMDGPU::V_FMAC_F16_e64:
3554 case AMDGPU::V_FMAC_F16_t16_e64:
3555 case AMDGPU::V_FMAC_F16_fake16_e64:
3556 case AMDGPU::V_FMA_F16_e64:
3557 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3558 ? AMDGPU::V_FMAMK_F16_t16
3559 : AMDGPU::V_FMAMK_F16_fake16
3560 : AMDGPU::V_FMAMK_F16;
3561 case AMDGPU::V_FMAC_F64_e32:
3562 case AMDGPU::V_FMAC_F64_e64:
3563 case AMDGPU::V_FMA_F64_e64:
3564 return AMDGPU::V_FMAMK_F64;
3565 default:
3566 llvm_unreachable("invalid instruction");
3567 }
3568}
3569
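// Fold the immediate defined in Reg by DefMI directly into UseMI: either
// rewrite a COPY into a move-immediate, or turn a MAD/MAC/FMA UseMI into its
// madak/madmk (fmaak/fmamk) form with the constant as a literal operand.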
3570 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3571 Register Reg, MachineRegisterInfo *MRI) const {
3572 int64_t Imm;
3573 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3574 return false;
3575
3576 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3577
3578 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3579
3580 unsigned Opc = UseMI.getOpcode();
3581 if (Opc == AMDGPU::COPY) {
3582 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3583
3584 Register DstReg = UseMI.getOperand(0).getReg();
3585 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3586
3587 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3588
3589 if (HasMultipleUses) {
3590 // TODO: This should fold in more cases with multiple use, but we need to
3591 // more carefully consider what those uses are.
3592 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3593
3594 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3595 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3596 return false;
3597
3598 // Most of the time folding a 32-bit inline constant is free (though this
3599 // might not be true if we can't later fold it into a real user).
3600 //
3601 // FIXME: This isInlineConstant check is imprecise if
3602 // getConstValDefinedInReg handled the tricky non-mov cases.
3603 if (ImmDefSize == 32 &&
3604 isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
3605 return false;
3606 }
3607
3608 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3609 RI.getSubRegIdxSize(UseSubReg) == 16;
3610
3611 if (Is16Bit) {
3612 if (RI.hasVGPRs(DstRC))
3613 return false; // Do not clobber vgpr_hi16
3614
3615 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3616 return false;
3617 }
3618
3619 MachineFunction *MF = UseMI.getMF();
3620
3621 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3622 MCRegister MovDstPhysReg =
3623 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3624
3625 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3626
3627 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3628 for (unsigned MovOp :
3629 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3630 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3631 const MCInstrDesc &MovDesc = get(MovOp);
3632
3633 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3634 if (Is16Bit) {
3635 // We just need to find a correctly sized register class, so the
3636 // subregister index compatibility doesn't matter since we're statically
3637 // extracting the immediate value.
3638 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3639 if (!MovDstRC)
3640 continue;
3641
3642 if (MovDstPhysReg) {
3643 // FIXME: We probably should not do this. If there is a live value in
3644 // the high half of the register, it will be corrupted.
3645 MovDstPhysReg =
3646 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3647 if (!MovDstPhysReg)
3648 continue;
3649 }
3650 }
3651
3652 // Result class isn't the right size, try the next instruction.
3653 if (MovDstPhysReg) {
3654 if (!MovDstRC->contains(MovDstPhysReg))
3655 return false;
3656 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3657 // TODO: This will be overly conservative in the case of 16-bit virtual
3658 // SGPRs. We could hack up the virtual register uses to use a compatible
3659 // 32-bit class.
3660 continue;
3661 }
3662
3663 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3664
3665 // Ensure the interpreted immediate value is a valid operand in the new
3666 // mov.
3667 //
3668 // FIXME: isImmOperandLegal should have form that doesn't require existing
3669 // MachineInstr or MachineOperand
3670 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3671 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3672 break;
3673
3674 NewOpc = MovOp;
3675 break;
3676 }
3677
3678 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3679 return false;
3680
3681 if (Is16Bit) {
3682 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3683 if (MovDstPhysReg)
3684 UseMI.getOperand(0).setReg(MovDstPhysReg);
3685 assert(UseMI.getOperand(1).getReg().isVirtual());
3686 }
3687
3688 const MCInstrDesc &NewMCID = get(NewOpc);
3689 UseMI.setDesc(NewMCID);
3690 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3691 UseMI.addImplicitDefUseOperands(*MF);
3692 return true;
3693 }
3694
3695 if (HasMultipleUses)
3696 return false;
3697
3698 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3699 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3700 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3701 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3702 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3703 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3704 Opc == AMDGPU::V_FMAC_F64_e64) {
3705 // Don't fold if we are using source or output modifiers. The new VOP2
3706 // instructions don't have them.
3707 if (hasAnyModifiersSet(UseMI))
3708 return false;
3709
3710 // If this is a free constant, there's no reason to do this.
3711 // TODO: We could fold this here instead of letting SIFoldOperands do it
3712 // later.
3713 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3714
3715 // Any src operand can be used for the legality check.
3716 if (isInlineConstant(UseMI, Src0Idx, Imm))
3717 return false;
3718
3719 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3720
3721 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3722 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3723
3724 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3725 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3726 (Src1->isReg() && Src1->getReg() == Reg)) {
3727 MachineOperand *RegSrc =
3728 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3729 if (!RegSrc->isReg())
3730 return false;
3731 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3732 ST.getConstantBusLimit(Opc) < 2)
3733 return false;
3734
3735 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3736 return false;
3737
3738 // If src2 is also a literal constant then we have to choose which one to
3739 // fold. In general it is better to choose madak so that the other literal
3740 // can be materialized in an sgpr instead of a vgpr:
3741 // s_mov_b32 s0, literal
3742 // v_madak_f32 v0, s0, v0, literal
3743 // Instead of:
3744 // v_mov_b32 v1, literal
3745 // v_madmk_f32 v0, v0, literal, v1
3746 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3747 if (Def && Def->isMoveImmediate() &&
3748 !isInlineConstant(Def->getOperand(1)))
3749 return false;
3750
3751 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3752 if (pseudoToMCOpcode(NewOpc) == -1)
3753 return false;
3754
3755 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3756 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3757 // restricting their register classes. For now just bail out.
3758 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3759 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3760 return false;
3761
3762 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3763 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3764
3765 // FIXME: This would be a lot easier if we could return a new instruction
3766 // instead of having to modify in place.
3767
3768 Register SrcReg = RegSrc->getReg();
3769 unsigned SrcSubReg = RegSrc->getSubReg();
3770 Src0->setReg(SrcReg);
3771 Src0->setSubReg(SrcSubReg);
3772 Src0->setIsKill(RegSrc->isKill());
3773
3774 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3775 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3776 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3777 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3778 UseMI.untieRegOperand(
3779 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3780
3781 Src1->ChangeToImmediate(*SubRegImm);
3782
3784 UseMI.setDesc(get(NewOpc));
3785
3786 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3787 if (DeleteDef)
3788 DefMI.eraseFromParent();
3789
3790 return true;
3791 }
3792
3793 // Added part is the constant: Use v_madak_{f16, f32}.
3794 if (Src2->isReg() && Src2->getReg() == Reg) {
3795 if (ST.getConstantBusLimit(Opc) < 2) {
3796 // Not allowed to use constant bus for another operand.
3797 // We can however allow an inline immediate as src0.
3798 bool Src0Inlined = false;
3799 if (Src0->isReg()) {
3800 // Try to inline constant if possible.
3801 // If the def is a move-immediate and this is its only use, we save
3802 // a VGPR here.
3803 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3804 if (Def && Def->isMoveImmediate() &&
3805 isInlineConstant(Def->getOperand(1)) &&
3806 MRI->hasOneNonDBGUse(Src0->getReg())) {
3807 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3808 Src0Inlined = true;
3809 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3810 RI.isSGPRReg(*MRI, Src0->getReg())) {
3811 return false;
3812 }
3813 // VGPR is okay as Src0 - fallthrough
3814 }
3815
3816 if (Src1->isReg() && !Src0Inlined) {
3817 // We have one slot for inlinable constant so far - try to fill it
3818 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3819 if (Def && Def->isMoveImmediate() &&
3820 isInlineConstant(Def->getOperand(1)) &&
3821 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3822 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3823 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3824 return false;
3825 // VGPR is okay as Src1 - fallthrough
3826 }
3827 }
3828
3829 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3830 if (pseudoToMCOpcode(NewOpc) == -1)
3831 return false;
3832
3833 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3834 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3835 // restricting their register classes. For now just bail out.
3836 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3837 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3838 return false;
3839
3840 // FIXME: This would be a lot easier if we could return a new instruction
3841 // instead of having to modify in place.
3842
3843 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3844 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3845 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3846 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3847 UseMI.untieRegOperand(
3848 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3849
3850 const std::optional<int64_t> SubRegImm =
3851 extractSubregFromImm(Imm, Src2->getSubReg());
3852
3853 // ChangingToImmediate adds Src2 back to the instruction.
3854 Src2->ChangeToImmediate(*SubRegImm);
3855
3856 // These come before src2.
3857 removeModOperands(UseMI);
3858 UseMI.setDesc(get(NewOpc));
3859 // It might happen that UseMI was commuted and we now have an SGPR as
3860 // src1. If so, the literal constant together with the SGPR can violate
3861 // the constant bus restriction, so legalize the operands.
3862 legalizeOperands(UseMI);
3863
3864 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3865 if (DeleteDef)
3866 DefMI.eraseFromParent();
3867
3868 return true;
3869 }
3870 }
3871
3872 return false;
3873}
3874
3875 static bool
3876 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3877 ArrayRef<const MachineOperand *> BaseOps2) {
3878 if (BaseOps1.size() != BaseOps2.size())
3879 return false;
3880 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3881 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3882 return false;
3883 }
3884 return true;
3885}
3886
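// Two accesses are disjoint when the lower one ends at or before the higher
// one starts, e.g. widths 8 and 4 at offsets 0 and 8 do not overlap.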
3887static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3888 LocationSize WidthB, int OffsetB) {
3889 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3890 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3891 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3892 return LowWidth.hasValue() &&
3893 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3894}
3895
3896bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3897 const MachineInstr &MIb) const {
3898 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3899 int64_t Offset0, Offset1;
3900 LocationSize Dummy0 = LocationSize::precise(0);
3901 LocationSize Dummy1 = LocationSize::precise(0);
3902 bool Offset0IsScalable, Offset1IsScalable;
3903 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3904 Dummy0, &RI) ||
3905 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3906 Dummy1, &RI))
3907 return false;
3908
3909 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3910 return false;
3911
3912 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3913 // FIXME: Handle ds_read2 / ds_write2.
3914 return false;
3915 }
3916 LocationSize Width0 = MIa.memoperands().front()->getSize();
3917 LocationSize Width1 = MIb.memoperands().front()->getSize();
3918 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3919}
3920
3921 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3922 const MachineInstr &MIb) const {
3923 assert(MIa.mayLoadOrStore() &&
3924 "MIa must load from or modify a memory location");
3925 assert(MIb.mayLoadOrStore() &&
3926 "MIb must load from or modify a memory location");
3927
3928 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3929 return false;
3930
3931 // XXX - Can we relax this between address spaces?
3932 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3933 return false;
3934
3935 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3936 return false;
3937
3938 if (MIa.isBundle() || MIb.isBundle())
3939 return false;
3940
3941 // TODO: Should we check the address space from the MachineMemOperand? That
3942 // would allow us to distinguish objects we know don't alias based on the
3943 // underlying address space, even if it was lowered to a different one,
3944 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3945 // buffer.
3946 if (isDS(MIa)) {
3947 if (isDS(MIb))
3948 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3949
3950 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3951 }
3952
3953 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3954 if (isMUBUF(MIb) || isMTBUF(MIb))
3955 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3956
3957 if (isFLAT(MIb))
3958 return isFLATScratch(MIb);
3959
3960 return !isSMRD(MIb);
3961 }
3962
3963 if (isSMRD(MIa)) {
3964 if (isSMRD(MIb))
3965 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3966
3967 if (isFLAT(MIb))
3968 return isFLATScratch(MIb);
3969
3970 return !isMUBUF(MIb) && !isMTBUF(MIb);
3971 }
3972
3973 if (isFLAT(MIa)) {
3974 if (isFLAT(MIb)) {
3975 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3976 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3977 return true;
3978
3979 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3980 }
3981
3982 return false;
3983 }
3984
3985 return false;
3986}
3987
3988 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3989 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3990 if (Reg.isPhysical())
3991 return false;
3992 auto *Def = MRI.getUniqueVRegDef(Reg);
3993 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3994 Imm = Def->getOperand(1).getImm();
3995 if (DefMI)
3996 *DefMI = Def;
3997 return true;
3998 }
3999 return false;
4000}
4001
4002static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4003 MachineInstr **DefMI = nullptr) {
4004 if (!MO->isReg())
4005 return false;
4006 const MachineFunction *MF = MO->getParent()->getMF();
4007 const MachineRegisterInfo &MRI = MF->getRegInfo();
4008 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4009}
4010
4011 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4012 MachineInstr &NewMI) {
4013 if (LV) {
4014 unsigned NumOps = MI.getNumOperands();
4015 for (unsigned I = 1; I < NumOps; ++I) {
4016 MachineOperand &Op = MI.getOperand(I);
4017 if (Op.isReg() && Op.isKill())
4018 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4019 }
4020 }
4021}
4022
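// Map a two-address MAC/FMAC opcode to the equivalent three-address MAD/FMA
// VOP3 opcode used when converting to three-address form.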
4023static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4024 switch (Opc) {
4025 case AMDGPU::V_MAC_F16_e32:
4026 case AMDGPU::V_MAC_F16_e64:
4027 return AMDGPU::V_MAD_F16_e64;
4028 case AMDGPU::V_MAC_F32_e32:
4029 case AMDGPU::V_MAC_F32_e64:
4030 return AMDGPU::V_MAD_F32_e64;
4031 case AMDGPU::V_MAC_LEGACY_F32_e32:
4032 case AMDGPU::V_MAC_LEGACY_F32_e64:
4033 return AMDGPU::V_MAD_LEGACY_F32_e64;
4034 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4035 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4036 return AMDGPU::V_FMA_LEGACY_F32_e64;
4037 case AMDGPU::V_FMAC_F16_e32:
4038 case AMDGPU::V_FMAC_F16_e64:
4039 case AMDGPU::V_FMAC_F16_t16_e64:
4040 case AMDGPU::V_FMAC_F16_fake16_e64:
4041 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4042 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4043 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4044 : AMDGPU::V_FMA_F16_gfx9_e64;
4045 case AMDGPU::V_FMAC_F32_e32:
4046 case AMDGPU::V_FMAC_F32_e64:
4047 return AMDGPU::V_FMA_F32_e64;
4048 case AMDGPU::V_FMAC_F64_e32:
4049 case AMDGPU::V_FMAC_F64_e64:
4050 return AMDGPU::V_FMA_F64_e64;
4051 default:
4052 llvm_unreachable("invalid instruction");
4053 }
4054}
4055
4056/// Helper struct for the implementation of 3-address conversion to communicate
4057/// updates made to instruction operands.
4058struct ThreeAddressUpdates {
4059 /// Other instruction whose def is no longer used by the converted
4060 /// instruction.
4061 MachineInstr *RemoveMIUse = nullptr;
4062};
4063
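// Convert a two-address MAC/FMAC (and MFMA/WMMA) instruction into its
// three-address form, e.g. a V_FMAC_F32 whose dst is tied to src2 becomes a
// V_FMA_F32_e64 (or a fmaak/fmamk form when an operand is a foldable
// immediate) with no tied operands.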
4064 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4065 LiveVariables *LV,
4066 LiveIntervals *LIS) const {
4067 MachineBasicBlock &MBB = *MI.getParent();
4068 MachineInstr *CandidateMI = &MI;
4069
4070 if (MI.isBundle()) {
4071 // This is a temporary placeholder for bundle handling that enables us to
4072 // exercise the relevant code paths in the two-address instruction pass.
4073 if (MI.getBundleSize() != 1)
4074 return nullptr;
4075 CandidateMI = MI.getNextNode();
4076 }
4077
4078 ThreeAddressUpdates U;
4079 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4080 if (!NewMI)
4081 return nullptr;
4082
4083 if (MI.isBundle()) {
4084 CandidateMI->eraseFromBundle();
4085
4086 for (MachineOperand &MO : MI.all_defs()) {
4087 if (MO.isTied())
4088 MI.untieRegOperand(MO.getOperandNo());
4089 }
4090 } else {
4091 updateLiveVariables(LV, MI, *NewMI);
4092 if (LIS) {
4093 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4094 // SlotIndex of defs needs to be updated when converting to early-clobber
4095 MachineOperand &Def = NewMI->getOperand(0);
4096 if (Def.isEarlyClobber() && Def.isReg() &&
4097 LIS->hasInterval(Def.getReg())) {
4098 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4099 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4100 auto &LI = LIS->getInterval(Def.getReg());
4101 auto UpdateDefIndex = [&](LiveRange &LR) {
4102 auto *S = LR.find(OldIndex);
4103 if (S != LR.end() && S->start == OldIndex) {
4104 assert(S->valno && S->valno->def == OldIndex);
4105 S->start = NewIndex;
4106 S->valno->def = NewIndex;
4107 }
4108 };
4109 UpdateDefIndex(LI);
4110 for (auto &SR : LI.subranges())
4111 UpdateDefIndex(SR);
4112 }
4113 }
4114 }
4115
4116 if (U.RemoveMIUse) {
4117 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4118 // The only user is the instruction which will be killed.
4119 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4120
4121 if (MRI.hasOneNonDBGUse(DefReg)) {
4122 // We cannot just remove the DefMI here, calling pass will crash.
4123 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4124 U.RemoveMIUse->getOperand(0).setIsDead(true);
4125 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4126 U.RemoveMIUse->removeOperand(I);
4127 if (LV)
4128 LV->getVarInfo(DefReg).AliveBlocks.clear();
4129 }
4130
4131 if (MI.isBundle()) {
4132 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4133 if (!VRI.Reads && !VRI.Writes) {
4134 for (MachineOperand &MO : MI.all_uses()) {
4135 if (MO.isReg() && MO.getReg() == DefReg) {
4136 assert(MO.getSubReg() == 0 &&
4137 "tied sub-registers in bundles currently not supported");
4138 MI.removeOperand(MO.getOperandNo());
4139 break;
4140 }
4141 }
4142
4143 if (LIS)
4144 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4145 }
4146 } else if (LIS) {
4147 LiveInterval &DefLI = LIS->getInterval(DefReg);
4148
4149 // We cannot delete the original instruction here, so hack out the use
4150 // in the original instruction with a dummy register so we can use
4151 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4152 // not have the complexity of deleting a use to consider here.
4153 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4154 for (MachineOperand &MIOp : MI.uses()) {
4155 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4156 MIOp.setIsUndef(true);
4157 MIOp.setReg(DummyReg);
4158 }
4159 }
4160
4161 if (MI.isBundle()) {
4162 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4163 if (!VRI.Reads && !VRI.Writes) {
4164 for (MachineOperand &MIOp : MI.uses()) {
4165 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4166 MIOp.setIsUndef(true);
4167 MIOp.setReg(DummyReg);
4168 }
4169 }
4170 }
4171
4172 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4173 false, /*isUndef=*/true));
4174 }
4175
4176 LIS->shrinkToUses(&DefLI);
4177 }
4178 }
4179
4180 return MI.isBundle() ? &MI : NewMI;
4181}
4182
4183 MachineInstr *
4184 SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4185 ThreeAddressUpdates &U) const {
4186 MachineBasicBlock &MBB = *MI.getParent();
4187 unsigned Opc = MI.getOpcode();
4188
4189 // Handle MFMA.
4190 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4191 if (NewMFMAOpc != -1) {
4192 MachineInstrBuilder MIB =
4193 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4194 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4195 MIB.add(MI.getOperand(I));
4196 return MIB;
4197 }
4198
4199 if (SIInstrInfo::isWMMA(MI)) {
4200 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4201 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4202 .setMIFlags(MI.getFlags());
4203 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4204 MIB->addOperand(MI.getOperand(I));
4205 return MIB;
4206 }
4207
4208 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4209 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4210 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4211 "present pre-RA");
4212
4213 // Handle MAC/FMAC.
4214 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4215 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4216 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4217 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4218 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4219 bool Src0Literal = false;
4220
4221 switch (Opc) {
4222 default:
4223 return nullptr;
4224 case AMDGPU::V_MAC_F16_e64:
4225 case AMDGPU::V_FMAC_F16_e64:
4226 case AMDGPU::V_FMAC_F16_t16_e64:
4227 case AMDGPU::V_FMAC_F16_fake16_e64:
4228 case AMDGPU::V_MAC_F32_e64:
4229 case AMDGPU::V_MAC_LEGACY_F32_e64:
4230 case AMDGPU::V_FMAC_F32_e64:
4231 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4232 case AMDGPU::V_FMAC_F64_e64:
4233 break;
4234 case AMDGPU::V_MAC_F16_e32:
4235 case AMDGPU::V_FMAC_F16_e32:
4236 case AMDGPU::V_MAC_F32_e32:
4237 case AMDGPU::V_MAC_LEGACY_F32_e32:
4238 case AMDGPU::V_FMAC_F32_e32:
4239 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4240 case AMDGPU::V_FMAC_F64_e32: {
4241 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4242 AMDGPU::OpName::src0);
4243 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4244 if (!Src0->isReg() && !Src0->isImm())
4245 return nullptr;
4246
4247 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4248 Src0Literal = true;
4249
4250 break;
4251 }
4252 }
4253
4254 MachineInstrBuilder MIB;
4255 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4256 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4257 const MachineOperand *Src0Mods =
4258 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4259 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4260 const MachineOperand *Src1Mods =
4261 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4262 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4263 const MachineOperand *Src2Mods =
4264 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4265 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4266 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4267 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4268
4269 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4270 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4271 // If we have an SGPR input, we will violate the constant bus restriction.
4272 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4273 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4274 MachineInstr *DefMI;
4275
4276 int64_t Imm;
4277 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4278 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4279 if (pseudoToMCOpcode(NewOpc) != -1) {
4280 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4281 .add(*Dst)
4282 .add(*Src0)
4283 .add(*Src1)
4284 .addImm(Imm)
4285 .setMIFlags(MI.getFlags());
4286 U.RemoveMIUse = DefMI;
4287 return MIB;
4288 }
4289 }
4290 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4291 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4292 if (pseudoToMCOpcode(NewOpc) != -1) {
4293 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4294 .add(*Dst)
4295 .add(*Src0)
4296 .addImm(Imm)
4297 .add(*Src2)
4298 .setMIFlags(MI.getFlags());
4299 U.RemoveMIUse = DefMI;
4300 return MIB;
4301 }
4302 }
4303 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4304 if (Src0Literal) {
4305 Imm = Src0->getImm();
4306 DefMI = nullptr;
4307 }
4308 if (pseudoToMCOpcode(NewOpc) != -1 &&
4309 isOperandLegal(
4310 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4311 Src1)) {
4312 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4313 .add(*Dst)
4314 .add(*Src1)
4315 .addImm(Imm)
4316 .add(*Src2)
4317 .setMIFlags(MI.getFlags());
4318 U.RemoveMIUse = DefMI;
4319 return MIB;
4320 }
4321 }
4322 }
4323
4324 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4325 // if VOP3 does not allow a literal operand.
4326 if (Src0Literal && !ST.hasVOP3Literal())
4327 return nullptr;
4328
4329 unsigned NewOpc = getNewFMAInst(ST, Opc);
4330
4331 if (pseudoToMCOpcode(NewOpc) == -1)
4332 return nullptr;
4333
4334 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4335 .add(*Dst)
4336 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4337 .add(*Src0)
4338 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4339 .add(*Src1)
4340 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4341 .add(*Src2)
4342 .addImm(Clamp ? Clamp->getImm() : 0)
4343 .addImm(Omod ? Omod->getImm() : 0)
4344 .setMIFlags(MI.getFlags());
4345 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4346 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4347 return MIB;
4348}
4349
4350// It's not generally safe to move VALU instructions across these since it will
4351// start using the register as a base index rather than directly.
4352 // XXX - Why isn't hasSideEffects sufficient for these?
4353 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4354 switch (MI.getOpcode()) {
4355 case AMDGPU::S_SET_GPR_IDX_ON:
4356 case AMDGPU::S_SET_GPR_IDX_MODE:
4357 case AMDGPU::S_SET_GPR_IDX_OFF:
4358 return true;
4359 default:
4360 return false;
4361 }
4362}
4363
4364 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4365 const MachineBasicBlock *MBB,
4366 const MachineFunction &MF) const {
4367 // Skipping the check for SP writes in the base implementation. It was
4368 // apparently added there due to compile-time concerns.
4369 //
4370 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4371 // but is probably avoidable.
4372
4373 // Copied from base implementation.
4374 // Terminators and labels can't be scheduled around.
4375 if (MI.isTerminator() || MI.isPosition())
4376 return true;
4377
4378 // INLINEASM_BR can jump to another block
4379 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4380 return true;
4381
4382 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4383 return true;
4384
4385 // Target-independent instructions do not have an implicit-use of EXEC, even
4386 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4387 // boundaries prevents incorrect movements of such instructions.
4388 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4389 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4390 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4391 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4392 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4393 changesVGPRIndexingMode(MI);
4394}
4395
4396 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4397 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4398 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4399 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4400}
4401
4403 // Instructions that access scratch use FLAT encoding or BUF encodings.
4404 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4405 return false;
4406
4407 // If scratch is not initialized, we can never access it.
4408 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4409 return false;
4410
4411 // SCRATCH instructions always access scratch.
4412 if (isFLATScratch(MI))
4413 return true;
4414
4415 // If there are no memory operands then conservatively assume the flat
4416 // operation may access scratch.
4417 if (MI.memoperands_empty())
4418 return true;
4419
4420 // See if any memory operand specifies an address space that involves scratch.
4421 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4422 unsigned AS = Memop->getAddrSpace();
4423 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4424 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4425 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4426 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4427 }
4428 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4429 });
4430}
4431
4432 bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4433 assert(isFLAT(MI));
4434
4435 // All flat instructions use the VMEM counter except prefetch.
4436 if (!usesVM_CNT(MI))
4437 return false;
4438
4439 // If there are no memory operands then conservatively assume the flat
4440 // operation may access VMEM.
4441 if (MI.memoperands_empty())
4442 return true;
4443
4444 // See if any memory operand specifies an address space that involves VMEM.
4445 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4446 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4447 // (GDS) address space is not supported by flat operations. Therefore, simply
4448 // return true unless only the LDS address space is found.
4449 for (const MachineMemOperand *Memop : MI.memoperands()) {
4450 unsigned AS = Memop->getAddrSpace();
4452 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4453 return true;
4454 }
4455
4456 return false;
4457}
4458
4459 bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4460 assert(isFLAT(MI));
4461
4462 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
4463 if (!usesLGKM_CNT(MI))
4464 return false;
4465
4466 // If in tgsplit mode then there can be no use of LDS.
4467 if (ST.isTgSplitEnabled())
4468 return false;
4469
4470 // If there are no memory operands then conservatively assume the flat
4471 // operation may access LDS.
4472 if (MI.memoperands_empty())
4473 return true;
4474
4475 // See if any memory operand specifies an address space that involves LDS.
4476 for (const MachineMemOperand *Memop : MI.memoperands()) {
4477 unsigned AS = Memop->getAddrSpace();
4478 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4479 return true;
4480 }
4481
4482 return false;
4483}
4484
4485 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4486 // Skip the full operand and register alias search modifiesRegister
4487 // does. There's only a handful of instructions that touch this, it's only an
4488 // implicit def, and doesn't alias any other registers.
4489 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4490}
4491
4492 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4493 unsigned Opcode = MI.getOpcode();
4494
4495 if (MI.mayStore() && isSMRD(MI))
4496 return true; // scalar store or atomic
4497
4498 // This will terminate the function when other lanes may need to continue.
4499 if (MI.isReturn())
4500 return true;
4501
4502 // These instructions cause shader I/O that may cause hardware lockups
4503 // when executed with an empty EXEC mask.
4504 //
4505 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4506 // EXEC = 0, but checking for that case here seems not worth it
4507 // given the typical code patterns.
4508 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4509 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4510 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4511 return true;
4512
4513 if (MI.isCall() || MI.isInlineAsm())
4514 return true; // conservative assumption
4515
4516 // Assume that barrier interactions are only intended with active lanes.
4517 if (isBarrier(Opcode))
4518 return true;
4519
4520 // A mode change is a scalar operation that influences vector instructions.
4521 if (modifiesModeRegister(MI))
4522 return true;
4523
4524 // These are like SALU instructions in terms of effects, so it's questionable
4525 // whether we should return true for those.
4526 //
4527 // However, executing them with EXEC = 0 causes them to operate on undefined
4528 // data, which we avoid by returning true here.
4529 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4530 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4531 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4532 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4533 return true;
4534
4535 return false;
4536}
4537
4538 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4539 const MachineInstr &MI) const {
4540 if (MI.isMetaInstruction())
4541 return false;
4542
4543 // This won't read exec if this is an SGPR->SGPR copy.
4544 if (MI.isCopyLike()) {
4545 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4546 return true;
4547
4548 // Make sure this isn't copying exec as a normal operand
4549 return MI.readsRegister(AMDGPU::EXEC, &RI);
4550 }
4551
4552 // Make a conservative assumption about the callee.
4553 if (MI.isCall())
4554 return true;
4555
4556 // Be conservative with any unhandled generic opcodes.
4557 if (!isTargetSpecificOpcode(MI.getOpcode()))
4558 return true;
4559
4560 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4561}
4562
4563bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4564 switch (Imm.getBitWidth()) {
4565 case 1: // This likely will be a condition code mask.
4566 return true;
4567
4568 case 32:
4569 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4570 ST.hasInv2PiInlineImm());
4571 case 64:
4572 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4573 ST.hasInv2PiInlineImm());
4574 case 16:
4575 return ST.has16BitInsts() &&
4576 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4577 ST.hasInv2PiInlineImm());
4578 default:
4579 llvm_unreachable("invalid bitwidth");
4580 }
4581}
4582
4583 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4584 APInt IntImm = Imm.bitcastToAPInt();
4585 int64_t IntImmVal = IntImm.getSExtValue();
4586 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4587 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4588 default:
4589 llvm_unreachable("invalid fltSemantics");
4590 case APFloatBase::S_IEEEsingle:
4591 case APFloatBase::S_IEEEdouble:
4592 return isInlineConstant(IntImm);
4593 case APFloatBase::S_BFloat:
4594 return ST.has16BitInsts() &&
4595 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4596 case APFloatBase::S_IEEEhalf:
4597 return ST.has16BitInsts() &&
4598 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4599 }
4600}
4601
4602bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4603 // MachineOperand provides no way to tell the true operand size, since it only
4604 // records a 64-bit value. We need to know the size to determine if a 32-bit
4605 // floating point immediate bit pattern is legal for an integer immediate. It
4606 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4607 switch (OperandType) {
4617 int32_t Trunc = static_cast<int32_t>(Imm);
4618 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4619 }
4625 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4628 // We would expect inline immediates to not be concerned with an integer/fp
4629 // distinction. However, in the case of 16-bit integer operations, the
4630 // "floating point" values appear to not work. It seems read the low 16-bits
4631 // of 32-bit immediates, which happens to always work for the integer
4632 // values.
4633 //
4634 // See llvm bugzilla 46302.
4635 //
4636 // TODO: Theoretically we could use op-sel to use the high bits of the
4637 // 32-bit FP values.
4649 return false;
4652 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4653 // A few special case instructions have 16-bit operands on subtargets
4654 // where 16-bit instructions are not legal.
4655 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4656 // constants in these cases
4657 int16_t Trunc = static_cast<int16_t>(Imm);
4658 return ST.has16BitInsts() &&
4659 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4660 }
4661
4662 return false;
4663 }
4666 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4667 int16_t Trunc = static_cast<int16_t>(Imm);
4668 return ST.has16BitInsts() &&
4669 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4670 }
4671 return false;
4672 }
4676 return false;
4678 return isLegalAV64PseudoImm(Imm);
4681 // Always embedded in the instruction for free.
4682 return true;
4692 // Just ignore anything else.
4693 return true;
4694 default:
4695 llvm_unreachable("invalid operand type");
4696 }
4697}
4698
4699static bool compareMachineOp(const MachineOperand &Op0,
4700 const MachineOperand &Op1) {
4701 if (Op0.getType() != Op1.getType())
4702 return false;
4703
4704 switch (Op0.getType()) {
4705 case MachineOperand::MO_Register:
4706 return Op0.getReg() == Op1.getReg();
4707 case MachineOperand::MO_Immediate:
4708 return Op0.getImm() == Op1.getImm();
4709 default:
4710 llvm_unreachable("Didn't expect to be comparing these operand types");
4711 }
4712}
4713
4714 bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4715 const MCOperandInfo &OpInfo) const {
4716 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4717 return true;
4718
4719 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4720 return false;
4721
4722 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4723 return true;
4724
4725 return ST.hasVOP3Literal();
4726}
4727
4728bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4729 int64_t ImmVal) const {
4730 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4731 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4732 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4733 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4734 AMDGPU::OpName::src2))
4735 return false;
4736 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4737 }
4738
4739 return isLiteralOperandLegal(InstDesc, OpInfo);
4740}
4741
4742bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4743 const MachineOperand &MO) const {
4744 if (MO.isImm())
4745 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4746
4747 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4748 "unexpected imm-like operand kind");
4749 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4750 return isLiteralOperandLegal(InstDesc, OpInfo);
4751}
4752
4753 bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4754 // 2 32-bit inline constants packed into one.
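// e.g. 0x3F80000040000000 (1.0f in the high half, 2.0f in the low half) is
// legal, while a value needing a real literal in either half is not.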
4755 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4756 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4757}
4758
4759bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4760 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4761 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4762 return false;
4763
4764 int Op32 = AMDGPU::getVOPe32(Opcode);
4765 if (Op32 == -1)
4766 return false;
4767
4768 return pseudoToMCOpcode(Op32) != -1;
4769}
4770
4771bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4772 // The src0_modifier operand is present on all instructions
4773 // that have modifiers.
4774
4775 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4776}
4777
4778 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4779 AMDGPU::OpName OpName) const {
4780 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4781 return Mods && Mods->getImm();
4782}
4783
4784 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4785 return any_of(ModifierOpNames,
4786 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4787}
4788
4789 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4790 const MachineRegisterInfo &MRI) const {
4791 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4792 // Can't shrink instruction with three operands.
4793 if (Src2) {
4794 switch (MI.getOpcode()) {
4795 default: return false;
4796
4797 case AMDGPU::V_ADDC_U32_e64:
4798 case AMDGPU::V_SUBB_U32_e64:
4799 case AMDGPU::V_SUBBREV_U32_e64: {
4800 const MachineOperand *Src1
4801 = getNamedOperand(MI, AMDGPU::OpName::src1);
4802 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4803 return false;
4804 // Additional verification is needed for sdst/src2.
4805 return true;
4806 }
4807 case AMDGPU::V_MAC_F16_e64:
4808 case AMDGPU::V_MAC_F32_e64:
4809 case AMDGPU::V_MAC_LEGACY_F32_e64:
4810 case AMDGPU::V_FMAC_F16_e64:
4811 case AMDGPU::V_FMAC_F16_t16_e64:
4812 case AMDGPU::V_FMAC_F16_fake16_e64:
4813 case AMDGPU::V_FMAC_F32_e64:
4814 case AMDGPU::V_FMAC_F64_e64:
4815 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4816 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4817 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4818 return false;
4819 break;
4820
4821 case AMDGPU::V_CNDMASK_B32_e64:
4822 break;
4823 }
4824 }
4825
4826 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4827 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4828 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4829 return false;
4830
4831 // We don't need to check src0, all input types are legal, so just make sure
4832 // src0 isn't using any modifiers.
4833 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4834 return false;
4835
4836 // Can it be shrunk to a valid 32 bit opcode?
4837 if (!hasVALU32BitEncoding(MI.getOpcode()))
4838 return false;
4839
4840 // Check output modifiers
4841 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4842 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4843 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4844 // TODO: Can we avoid checking bound_ctrl/fi here?
4845 // They are only used by permlane*_swap special case.
4846 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4847 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4848}
4849
4850// Set VCC operand with all flags from \p Orig, except for setting it as
4851 // implicit.
4852 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4853 const MachineOperand &Orig) {
4854
4855 for (MachineOperand &Use : MI.implicit_operands()) {
4856 if (Use.isUse() &&
4857 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4858 Use.setIsUndef(Orig.isUndef());
4859 Use.setIsKill(Orig.isKill());
4860 return;
4861 }
4862 }
4863}
4864
4865 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4866 unsigned Op32) const {
4867 MachineBasicBlock *MBB = MI.getParent();
4868
4869 const MCInstrDesc &Op32Desc = get(Op32);
4870 MachineInstrBuilder Inst32 =
4871 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4872 .setMIFlags(MI.getFlags());
4873
4874 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4875 // For VOPC instructions, this is replaced by an implicit def of vcc.
4876
4877 // We assume the defs of the shrunk opcode are in the same order, and the
4878 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4879 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4880 Inst32.add(MI.getOperand(I));
4881
4882 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4883
4884 int Idx = MI.getNumExplicitDefs();
4885 for (const MachineOperand &Use : MI.explicit_uses()) {
4886 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4887 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4888 continue;
4889
4890 if (&Use == Src2) {
4891 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4892 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4893 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4894 // of vcc was already added during the initial BuildMI, but we
4895 // 1) may need to change vcc to vcc_lo to preserve the original register
4896 // 2) have to preserve the original flags.
4897 copyFlagsToImplicitVCC(*Inst32, *Src2);
4898 continue;
4899 }
4900 }
4901
4902 Inst32.add(Use);
4903 }
4904
4905 // FIXME: Losing implicit operands
4906 fixImplicitOperands(*Inst32);
4907 return Inst32;
4908}
4909
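// The "constant bus" is the scalar operand path shared by SGPRs, literal
// constants, and a few implicit registers such as VCC and M0; each VALU
// encoding limits how many such operands a single instruction may read
// (see GCNSubtarget::getConstantBusLimit).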
4910 bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4911 // Null is free
4912 Register Reg = RegOp.getReg();
4913 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4914 return false;
4915
4916 // SGPRs use the constant bus
4917
4918 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4919 // physical register operands should also count, except for exec.
4920 if (RegOp.isImplicit())
4921 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4922
4923 // SGPRs use the constant bus
4924 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4925 AMDGPU::SReg_64RegClass.contains(Reg);
4926}
4927
4929 const MachineRegisterInfo &MRI) const {
4930 Register Reg = RegOp.getReg();
4931 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4932 : physRegUsesConstantBus(RegOp);
4933}
4934
4935 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4936 const MachineOperand &MO,
4937 const MCOperandInfo &OpInfo) const {
4938 // Literal constants use the constant bus.
4939 if (!MO.isReg())
4940 return !isInlineConstant(MO, OpInfo);
4941
4942 Register Reg = MO.getReg();
4943 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4944 : physRegUsesConstantBus(MO);
4945}
4946
4947 Register SIInstrInfo::findImplicitSGPRRead(const MachineInstr &MI) const {
4948 for (const MachineOperand &MO : MI.implicit_operands()) {
4949 // We only care about reads.
4950 if (MO.isDef())
4951 continue;
4952
4953 switch (MO.getReg()) {
4954 case AMDGPU::VCC:
4955 case AMDGPU::VCC_LO:
4956 case AMDGPU::VCC_HI:
4957 case AMDGPU::M0:
4958 case AMDGPU::FLAT_SCR:
4959 return MO.getReg();
4960
4961 default:
4962 break;
4963 }
4964 }
4965
4966 return Register();
4967}
4968
4969static bool shouldReadExec(const MachineInstr &MI) {
4970 if (SIInstrInfo::isVALU(MI)) {
4971 switch (MI.getOpcode()) {
4972 case AMDGPU::V_READLANE_B32:
4973 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4974 case AMDGPU::V_WRITELANE_B32:
4975 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4976 return false;
4977 }
4978
4979 return true;
4980 }
4981
4982 if (MI.isPreISelOpcode() ||
4983 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4984 SIInstrInfo::isSALU(MI) ||
4985 SIInstrInfo::isSMRD(MI))
4986 return false;
4987
4988 return true;
4989}
4990
4991static bool isRegOrFI(const MachineOperand &MO) {
4992 return MO.isReg() || MO.isFI();
4993}
4994
4995static bool isSubRegOf(const SIRegisterInfo &TRI,
4996 const MachineOperand &SuperVec,
4997 const MachineOperand &SubReg) {
4998 if (SubReg.getReg().isPhysical())
4999 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5000
5001 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5002 SubReg.getReg() == SuperVec.getReg();
5003}
5004
5005// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5006bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5007 const MachineRegisterInfo &MRI,
5008 StringRef &ErrInfo) const {
5009 Register DstReg = MI.getOperand(0).getReg();
5010 Register SrcReg = MI.getOperand(1).getReg();
5011 // This is a check for copy from vector register to SGPR
5012 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5013 ErrInfo = "illegal copy from vector register to SGPR";
5014 return false;
5015 }
5016 return true;
5017}
5018
5019 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5020 StringRef &ErrInfo) const {
5021 uint16_t Opcode = MI.getOpcode();
5022 const MachineFunction *MF = MI.getMF();
5023 const MachineRegisterInfo &MRI = MF->getRegInfo();
5024
5025 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5026 // Find a better property to recognize the point where instruction selection
5027 // is just done.
5028 // We can only enforce this check after SIFixSGPRCopies pass so that the
5029 // illegal copies are legalized and thereafter we don't expect a pass
5030 // inserting similar copies.
5031 if (!MRI.isSSA() && MI.isCopy())
5032 return verifyCopy(MI, MRI, ErrInfo);
5033
5034 if (SIInstrInfo::isGenericOpcode(Opcode))
5035 return true;
5036
5037 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5038 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5039 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5040 int Src3Idx = -1;
5041 if (Src0Idx == -1) {
5042 // VOPD V_DUAL_* instructions use different operand names.
5043 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5044 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5045 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5046 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5047 }
5048
5049 // Make sure the number of operands is correct.
5050 const MCInstrDesc &Desc = get(Opcode);
5051 if (!Desc.isVariadic() &&
5052 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5053 ErrInfo = "Instruction has wrong number of operands.";
5054 return false;
5055 }
5056
5057 if (MI.isInlineAsm()) {
5058 // Verify register classes for inlineasm constraints.
5059 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5060 I != E; ++I) {
5061 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5062 if (!RC)
5063 continue;
5064
5065 const MachineOperand &Op = MI.getOperand(I);
5066 if (!Op.isReg())
5067 continue;
5068
5069 Register Reg = Op.getReg();
5070 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5071 ErrInfo = "inlineasm operand has incorrect register class.";
5072 return false;
5073 }
5074 }
5075
5076 return true;
5077 }
5078
5079 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5080 ErrInfo = "missing memory operand from image instruction.";
5081 return false;
5082 }
5083
5084 // Make sure the register classes are correct.
5085 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5086 const MachineOperand &MO = MI.getOperand(i);
5087 if (MO.isFPImm()) {
5088 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5089 "all fp values to integers.";
5090 return false;
5091 }
5092
5093 const MCOperandInfo &OpInfo = Desc.operands()[i];
5094 int16_t RegClass = getOpRegClassID(OpInfo);
5095
5096 switch (OpInfo.OperandType) {
5098 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5099 ErrInfo = "Illegal immediate value for operand.";
5100 return false;
5101 }
5102 break;
5115 break;
5117 break;
5118 break;
5132 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5133 ErrInfo = "Illegal immediate value for operand.";
5134 return false;
5135 }
5136 break;
5137 }
5139 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5140 ErrInfo = "Expected inline constant for operand.";
5141 return false;
5142 }
5143 break;
5147 break;
5152 // Check if this operand is an immediate.
5153 // FrameIndex operands will be replaced by immediates, so they are
5154 // allowed.
5155 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5156 ErrInfo = "Expected immediate, but got non-immediate";
5157 return false;
5158 }
5159 break;
5163 break;
5164 default:
5165 if (OpInfo.isGenericType())
5166 continue;
5167 break;
5168 }
5169
5170 if (!MO.isReg())
5171 continue;
5172 Register Reg = MO.getReg();
5173 if (!Reg)
5174 continue;
5175
5176 // FIXME: Ideally we would have separate instruction definitions with the
5177 // aligned register constraint.
5178 // FIXME: We do not verify inline asm operands, but custom inline asm
5179 // verification is broken anyway
5180 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5181 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5182 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5183 if (const TargetRegisterClass *SubRC =
5184 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5185 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5186 if (RC)
5187 RC = SubRC;
5188 }
5189 }
5190
5191 // Check that this is the aligned version of the class.
5192 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5193 ErrInfo = "Subtarget requires even aligned vector registers";
5194 return false;
5195 }
5196 }
5197
5198 if (RegClass != -1) {
5199 if (Reg.isVirtual())
5200 continue;
5201
5202 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5203 if (!RC->contains(Reg)) {
5204 ErrInfo = "Operand has incorrect register class.";
5205 return false;
5206 }
5207 }
5208 }
5209
5210 // Verify SDWA
5211 if (isSDWA(MI)) {
5212 if (!ST.hasSDWA()) {
5213 ErrInfo = "SDWA is not supported on this target";
5214 return false;
5215 }
5216
5217 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5218 AMDGPU::OpName::dst_sel}) {
5219 const MachineOperand *MO = getNamedOperand(MI, Op);
5220 if (!MO)
5221 continue;
5222 int64_t Imm = MO->getImm();
5223 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5224 ErrInfo = "Invalid SDWA selection";
5225 return false;
5226 }
5227 }
5228
5229 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5230
5231 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5232 if (OpIdx == -1)
5233 continue;
5234 const MachineOperand &MO = MI.getOperand(OpIdx);
5235
5236 if (!ST.hasSDWAScalar()) {
5237 // Only VGPRs on VI
5238 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5239 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5240 return false;
5241 }
5242 } else {
5243 // No immediates on GFX9
5244 if (!MO.isReg()) {
5245 ErrInfo =
5246 "Only reg allowed as operands in SDWA instructions on GFX9+";
5247 return false;
5248 }
5249 }
5250 }
5251
5252 if (!ST.hasSDWAOmod()) {
5253 // No omod allowed on VI
5254 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5255 if (OMod != nullptr &&
5256 (!OMod->isImm() || OMod->getImm() != 0)) {
5257 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5258 return false;
5259 }
5260 }
5261
5262 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5263 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5264 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5265 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5266 const MachineOperand *Src0ModsMO =
5267 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5268 unsigned Mods = Src0ModsMO->getImm();
5269 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5270 Mods & SISrcMods::SEXT) {
5271 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5272 return false;
5273 }
5274 }
5275
5276 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5277 if (isVOPC(BasicOpcode)) {
5278 if (!ST.hasSDWASdst() && DstIdx != -1) {
5279 // Only vcc allowed as dst on VI for VOPC
5280 const MachineOperand &Dst = MI.getOperand(DstIdx);
5281 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5282 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5283 return false;
5284 }
5285 } else if (!ST.hasSDWAOutModsVOPC()) {
5286 // No clamp allowed on GFX9 for VOPC
5287 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5288 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5289 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5290 return false;
5291 }
5292
5293 // No omod allowed on GFX9 for VOPC
5294 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5295 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5296 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5297 return false;
5298 }
5299 }
5300 }
5301
5302 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5303 if (DstUnused && DstUnused->isImm() &&
5304 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5305 const MachineOperand &Dst = MI.getOperand(DstIdx);
5306 if (!Dst.isReg() || !Dst.isTied()) {
5307 ErrInfo = "Dst register should have tied register";
5308 return false;
5309 }
5310
5311 const MachineOperand &TiedMO =
5312 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5313 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5314 ErrInfo =
5315 "Dst register should be tied to implicit use of preserved register";
5316 return false;
5317 }
5318 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5319 ErrInfo = "Dst register should use same physical register as preserved";
5320 return false;
5321 }
5322 }
5323 }
5324
5325 // Verify MIMG / VIMAGE / VSAMPLE
5326 if (isImage(Opcode) && !MI.mayStore()) {
5327 // Ensure that the return type used is large enough for all the options
5328 // being used. TFE/LWE require an extra result register.
5329 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5330 if (DMask) {
5331 uint64_t DMaskImm = DMask->getImm();
5332 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5333 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5334 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5335 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5336
5337 // Adjust for packed 16 bit values
5338 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5339 RegCount = divideCeil(RegCount, 2);
5340
5341 // Adjust if using LWE or TFE
5342 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5343 RegCount += 1;
5344
5345 const uint32_t DstIdx =
5346 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5347 const MachineOperand &Dst = MI.getOperand(DstIdx);
5348 if (Dst.isReg()) {
5349 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5350 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5351 if (RegCount > DstSize) {
5352 ErrInfo = "Image instruction returns too many registers for dst "
5353 "register class";
5354 return false;
5355 }
5356 }
5357 }
5358 }
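// Worked example for the dmask check above (illustrative): an image load with
// dmask = 0b1011 returns popcount = 3 dwords; with tfe or lwe set this grows
// to 4, so the vdata register class must cover at least four 32-bit
// registers (e.g. a 128-bit class).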
5359
5360 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5361 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5362 unsigned ConstantBusCount = 0;
5363 bool UsesLiteral = false;
5364 const MachineOperand *LiteralVal = nullptr;
5365
5366 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5367 if (ImmIdx != -1) {
5368 ++ConstantBusCount;
5369 UsesLiteral = true;
5370 LiteralVal = &MI.getOperand(ImmIdx);
5371 }
5372
5373 SmallVector<Register, 2> SGPRsUsed;
5374 Register SGPRUsed;
5375
5376 // Only look at the true operands. Only a real operand can use the constant
5377 // bus, and we don't want to check pseudo-operands like the source modifier
5378 // flags.
5379 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5380 if (OpIdx == -1)
5381 continue;
5382 const MachineOperand &MO = MI.getOperand(OpIdx);
5383 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5384 if (MO.isReg()) {
5385 SGPRUsed = MO.getReg();
5386 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5387 ++ConstantBusCount;
5388 SGPRsUsed.push_back(SGPRUsed);
5389 }
5390 } else if (!MO.isFI()) { // Treat FI like a register.
5391 if (!UsesLiteral) {
5392 ++ConstantBusCount;
5393 UsesLiteral = true;
5394 LiteralVal = &MO;
5395 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5396 assert(isVOP2(MI) || isVOP3(MI));
5397 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5398 return false;
5399 }
5400 }
5401 }
5402 }
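// Rough example of the counting above: on GFX10+, a VOP3 add with one SGPR
// source and one 32-bit literal uses two constant bus slots, which fits the
// GFX10+ limit of two but would violate the single-slot limit of earlier
// subtargets.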
5403
5404 SGPRUsed = findImplicitSGPRRead(MI);
5405 if (SGPRUsed) {
5406 // Implicit uses may safely overlap true operands
5407 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5408 return !RI.regsOverlap(SGPRUsed, SGPR);
5409 })) {
5410 ++ConstantBusCount;
5411 SGPRsUsed.push_back(SGPRUsed);
5412 }
5413 }
5414
5415 // v_writelane_b32 is an exception to the constant bus restriction: vsrc0 can
5416 // be an SGPR, a constant, or m0, and the lane select an SGPR, m0, or an inline constant.
5417 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5418 Opcode != AMDGPU::V_WRITELANE_B32) {
5419 ErrInfo = "VOP* instruction violates constant bus restriction";
5420 return false;
5421 }
5422
5423 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5424 ErrInfo = "VOP3 instruction uses literal";
5425 return false;
5426 }
5427 }
5428
5429 // Special case for writelane - this can break the multiple constant bus rule,
5430 // but still can't use more than one SGPR register
5431 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5432 unsigned SGPRCount = 0;
5433 Register SGPRUsed;
5434
5435 for (int OpIdx : {Src0Idx, Src1Idx}) {
5436 if (OpIdx == -1)
5437 break;
5438
5439 const MachineOperand &MO = MI.getOperand(OpIdx);
5440
5441 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5442 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5443 if (MO.getReg() != SGPRUsed)
5444 ++SGPRCount;
5445 SGPRUsed = MO.getReg();
5446 }
5447 }
5448 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5449 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5450 return false;
5451 }
5452 }
5453 }
5454
5455 // Verify misc. restrictions on specific instructions.
5456 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5457 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5458 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5459 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5460 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5461 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5462 if (!compareMachineOp(Src0, Src1) &&
5463 !compareMachineOp(Src0, Src2)) {
5464 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5465 return false;
5466 }
5467 }
5468 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5469 SISrcMods::ABS) ||
5470 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5471 SISrcMods::ABS) ||
5472 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5473 SISrcMods::ABS)) {
5474 ErrInfo = "ABS not allowed in VOP3B instructions";
5475 return false;
5476 }
5477 }
5478
5479 if (isSOP2(MI) || isSOPC(MI)) {
5480 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5481 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5482
5483 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5484 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5485 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5486 !Src0.isIdenticalTo(Src1)) {
5487 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5488 return false;
5489 }
5490 }
5491
5492 if (isSOPK(MI)) {
5493 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5494 if (Desc.isBranch()) {
5495 if (!Op->isMBB()) {
5496 ErrInfo = "invalid branch target for SOPK instruction";
5497 return false;
5498 }
5499 } else {
5500 uint64_t Imm = Op->getImm();
5501 if (sopkIsZext(Opcode)) {
5502 if (!isUInt<16>(Imm)) {
5503 ErrInfo = "invalid immediate for SOPK instruction";
5504 return false;
5505 }
5506 } else {
5507 if (!isInt<16>(Imm)) {
5508 ErrInfo = "invalid immediate for SOPK instruction";
5509 return false;
5510 }
5511 }
5512 }
5513 }
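// Illustrative: sign-extended SOPK forms (e.g. S_MOVK_I32) accept
// -32768..32767, while the zero-extended forms identified by sopkIsZext()
// accept 0..65535.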
5514
5515 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5516 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5517 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5518 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5519 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5520 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5521
5522 const unsigned StaticNumOps =
5523 Desc.getNumOperands() + Desc.implicit_uses().size();
5524 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5525
5526 // Require additional implicit operands. This allows a fixup done by the
5527 // post RA scheduler where the main implicit operand is killed and
5528 // implicit-defs are added for sub-registers that remain live after this
5529 // instruction.
5530 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5531 ErrInfo = "missing implicit register operands";
5532 return false;
5533 }
5534
5535 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5536 if (IsDst) {
5537 if (!Dst->isUse()) {
5538 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5539 return false;
5540 }
5541
5542 unsigned UseOpIdx;
5543 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5544 UseOpIdx != StaticNumOps + 1) {
5545 ErrInfo = "movrel implicit operands should be tied";
5546 return false;
5547 }
5548 }
5549
5550 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5551 const MachineOperand &ImpUse
5552 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5553 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5554 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5555 ErrInfo = "src0 should be subreg of implicit vector use";
5556 return false;
5557 }
5558 }
5559
5560 // Make sure we aren't losing exec uses in the td files. This mostly requires
5561 // being careful when using 'let Uses' to add other use registers.
5562 if (shouldReadExec(MI)) {
5563 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5564 ErrInfo = "VALU instruction does not implicitly read exec mask";
5565 return false;
5566 }
5567 }
5568
5569 if (isSMRD(MI)) {
5570 if (MI.mayStore() &&
5571 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5572 // The register offset form of scalar stores may only use m0 as the
5573 // soffset register.
5574 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5575 if (Soff && Soff->getReg() != AMDGPU::M0) {
5576 ErrInfo = "scalar stores must use m0 as offset register";
5577 return false;
5578 }
5579 }
5580 }
5581
5582 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5583 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5584 if (Offset->getImm() != 0) {
5585 ErrInfo = "subtarget does not support offsets in flat instructions";
5586 return false;
5587 }
5588 }
5589
5590 if (isDS(MI) && !ST.hasGDS()) {
5591 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5592 if (GDSOp && GDSOp->getImm() != 0) {
5593 ErrInfo = "GDS is not supported on this subtarget";
5594 return false;
5595 }
5596 }
5597
5598 if (isImage(MI)) {
5599 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5600 if (DimOp) {
5601 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5602 AMDGPU::OpName::vaddr0);
5603 AMDGPU::OpName RSrcOpName =
5604 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5605 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5606 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5607 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5608 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5609 const AMDGPU::MIMGDimInfo *Dim =
5610 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5611
5612 if (!Dim) {
5613 ErrInfo = "dim is out of range";
5614 return false;
5615 }
5616
5617 bool IsA16 = false;
5618 if (ST.hasR128A16()) {
5619 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5620 IsA16 = R128A16->getImm() != 0;
5621 } else if (ST.hasA16()) {
5622 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5623 IsA16 = A16->getImm() != 0;
5624 }
5625
5626 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5627
5628 unsigned AddrWords =
5629 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5630
5631 unsigned VAddrWords;
5632 if (IsNSA) {
5633 VAddrWords = RsrcIdx - VAddr0Idx;
5634 if (ST.hasPartialNSAEncoding() &&
5635 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5636 unsigned LastVAddrIdx = RsrcIdx - 1;
5637 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5638 }
5639 } else {
5640 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5641 if (AddrWords > 12)
5642 AddrWords = 16;
5643 }
5644
5645 if (VAddrWords != AddrWords) {
5646 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5647 << " but got " << VAddrWords << "\n");
5648 ErrInfo = "bad vaddr size";
5649 return false;
5650 }
5651 }
5652 }
5653
5654 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5655 if (DppCt) {
5656 using namespace AMDGPU::DPP;
5657
5658 unsigned DC = DppCt->getImm();
5659 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5660 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5661 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5662 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5663 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5664 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5665 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5666 ErrInfo = "Invalid dpp_ctrl value";
5667 return false;
5668 }
5669 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5670 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5671 ErrInfo = "Invalid dpp_ctrl value: "
5672 "wavefront shifts are not supported on GFX10+";
5673 return false;
5674 }
5675 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5676 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5677 ErrInfo = "Invalid dpp_ctrl value: "
5678 "broadcasts are not supported on GFX10+";
5679 return false;
5680 }
5681 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5682 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5683 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5684 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5685 !ST.hasGFX90AInsts()) {
5686 ErrInfo = "Invalid dpp_ctrl value: "
5687 "row_newbroadcast/row_share is not supported before "
5688 "GFX90A/GFX10";
5689 return false;
5690 }
5691 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5692 ErrInfo = "Invalid dpp_ctrl value: "
5693 "row_share and row_xmask are not supported before GFX10";
5694 return false;
5695 }
5696 }
5697
5698 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5699 !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
5700 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5701 ErrInfo = "Invalid dpp_ctrl value: "
5702 "DP ALU dpp only support row_newbcast";
5703 return false;
5704 }
5705 }
5706
5707 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5708 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5709 AMDGPU::OpName DataName =
5710 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5711 const MachineOperand *Data = getNamedOperand(MI, DataName);
5712 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5713 if (Data && !Data->isReg())
5714 Data = nullptr;
5715
5716 if (ST.hasGFX90AInsts()) {
5717 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5718 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5719 ErrInfo = "Invalid register class: "
5720 "vdata and vdst should be both VGPR or AGPR";
5721 return false;
5722 }
5723 if (Data && Data2 &&
5724 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5725 ErrInfo = "Invalid register class: "
5726 "both data operands should be VGPR or AGPR";
5727 return false;
5728 }
5729 } else {
5730 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5731 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5732 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5733 ErrInfo = "Invalid register class: "
5734 "agpr loads and stores not supported on this GPU";
5735 return false;
5736 }
5737 }
5738 }
5739
5740 if (ST.needsAlignedVGPRs()) {
5741 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5742 const MachineOperand *Op = getNamedOperand(MI, OpName);
5743 if (!Op)
5744 return true;
5745 Register Reg = Op->getReg();
5746 if (Reg.isPhysical())
5747 return !(RI.getHWRegIndex(Reg) & 1);
5748 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5749 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5750 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5751 };
5752
5753 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5754 Opcode == AMDGPU::DS_GWS_BARRIER) {
5755
5756 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5757 ErrInfo = "Subtarget requires even aligned vector registers "
5758 "for DS_GWS instructions";
5759 return false;
5760 }
5761 }
5762
5763 if (isMIMG(MI)) {
5764 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5765 ErrInfo = "Subtarget requires even aligned vector registers "
5766 "for vaddr operand of image instructions";
5767 return false;
5768 }
5769 }
5770 }
5771
5772 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5773 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5774 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5775 ErrInfo = "Invalid register class: "
5776 "v_accvgpr_write with an SGPR is not supported on this GPU";
5777 return false;
5778 }
5779 }
5780
5781 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5782 const MachineOperand &SrcOp = MI.getOperand(1);
5783 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5784 ErrInfo = "pseudo expects only physical SGPRs";
5785 return false;
5786 }
5787 }
5788
5789 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5790 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5791 if (!ST.hasScaleOffset()) {
5792 ErrInfo = "Subtarget does not support offset scaling";
5793 return false;
5794 }
5795 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5796 ErrInfo = "Instruction does not support offset scaling";
5797 return false;
5798 }
5799 }
5800 }
5801
5802 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5803 // information.
5804 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5805 for (unsigned I = 0; I < 3; ++I) {
5806 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5807 return false;
5808 }
5809 }
5810
5811 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5812 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5813 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5814 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5815 &AMDGPU::SReg_64RegClass) ||
5816 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5817 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5818 return false;
5819 }
5820 }
5821
5822 return true;
5823}
5824
5825// It is more readable to list mapped opcodes on the same line.
5826// clang-format off
5827
5828 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5829 switch (MI.getOpcode()) {
5830 default: return AMDGPU::INSTRUCTION_LIST_END;
5831 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5832 case AMDGPU::COPY: return AMDGPU::COPY;
5833 case AMDGPU::PHI: return AMDGPU::PHI;
5834 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5835 case AMDGPU::WQM: return AMDGPU::WQM;
5836 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5837 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5838 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5839 case AMDGPU::S_MOV_B32: {
5840 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5841 return MI.getOperand(1).isReg() ||
5842 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5843 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5844 }
5845 case AMDGPU::S_ADD_I32:
5846 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5847 case AMDGPU::S_ADDC_U32:
5848 return AMDGPU::V_ADDC_U32_e32;
5849 case AMDGPU::S_SUB_I32:
5850 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5851 // FIXME: These are not consistently handled, and selected when the carry is
5852 // used.
5853 case AMDGPU::S_ADD_U32:
5854 return AMDGPU::V_ADD_CO_U32_e32;
5855 case AMDGPU::S_SUB_U32:
5856 return AMDGPU::V_SUB_CO_U32_e32;
5857 case AMDGPU::S_ADD_U64_PSEUDO:
5858 return AMDGPU::V_ADD_U64_PSEUDO;
5859 case AMDGPU::S_SUB_U64_PSEUDO:
5860 return AMDGPU::V_SUB_U64_PSEUDO;
5861 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5862 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5863 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5864 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5865 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5866 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5867 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5868 case AMDGPU::S_XNOR_B32:
5869 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5870 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5871 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5872 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5873 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5874 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5875 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5876 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5877 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5878 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5879 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5880 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5881 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5882 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5883 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5884 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5885 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5886 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5887 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5888 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5889 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5890 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5891 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5892 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5893 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5894 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5895 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5896 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5897 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5898 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5899 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5900 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5901 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5902 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5903 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5904 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5905 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5906 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5907 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5908 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5909 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5910 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5911 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5912 case AMDGPU::S_CVT_F32_F16:
5913 case AMDGPU::S_CVT_HI_F32_F16:
5914 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5915 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5916 case AMDGPU::S_CVT_F16_F32:
5917 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5918 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5919 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5920 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5921 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5922 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5923 case AMDGPU::S_CEIL_F16:
5924 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5925 : AMDGPU::V_CEIL_F16_fake16_e64;
5926 case AMDGPU::S_FLOOR_F16:
5927 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5928 : AMDGPU::V_FLOOR_F16_fake16_e64;
5929 case AMDGPU::S_TRUNC_F16:
5930 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5931 : AMDGPU::V_TRUNC_F16_fake16_e64;
5932 case AMDGPU::S_RNDNE_F16:
5933 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5934 : AMDGPU::V_RNDNE_F16_fake16_e64;
5935 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5936 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5937 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5938 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5939 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5940 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5941 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5942 case AMDGPU::S_ADD_F16:
5943 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5944 : AMDGPU::V_ADD_F16_fake16_e64;
5945 case AMDGPU::S_SUB_F16:
5946 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5947 : AMDGPU::V_SUB_F16_fake16_e64;
5948 case AMDGPU::S_MIN_F16:
5949 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5950 : AMDGPU::V_MIN_F16_fake16_e64;
5951 case AMDGPU::S_MAX_F16:
5952 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5953 : AMDGPU::V_MAX_F16_fake16_e64;
5954 case AMDGPU::S_MINIMUM_F16:
5955 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5956 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5957 case AMDGPU::S_MAXIMUM_F16:
5958 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5959 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5960 case AMDGPU::S_MUL_F16:
5961 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5962 : AMDGPU::V_MUL_F16_fake16_e64;
5963 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5964 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5965 case AMDGPU::S_FMAC_F16:
5966 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5967 : AMDGPU::V_FMAC_F16_fake16_e64;
5968 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5969 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5970 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5971 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5972 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5973 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5974 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5975 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5976 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5977 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5978 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5979 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5980 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5981 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5982 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5983 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5984 case AMDGPU::S_CMP_LT_F16:
5985 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5986 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5987 case AMDGPU::S_CMP_EQ_F16:
5988 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5989 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5990 case AMDGPU::S_CMP_LE_F16:
5991 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5992 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5993 case AMDGPU::S_CMP_GT_F16:
5994 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5995 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5996 case AMDGPU::S_CMP_LG_F16:
5997 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5998 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5999 case AMDGPU::S_CMP_GE_F16:
6000 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6001 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6002 case AMDGPU::S_CMP_O_F16:
6003 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6004 : AMDGPU::V_CMP_O_F16_fake16_e64;
6005 case AMDGPU::S_CMP_U_F16:
6006 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6007 : AMDGPU::V_CMP_U_F16_fake16_e64;
6008 case AMDGPU::S_CMP_NGE_F16:
6009 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6010 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6011 case AMDGPU::S_CMP_NLG_F16:
6012 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6013 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6014 case AMDGPU::S_CMP_NGT_F16:
6015 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6016 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6017 case AMDGPU::S_CMP_NLE_F16:
6018 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6019 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6020 case AMDGPU::S_CMP_NEQ_F16:
6021 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6022 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6023 case AMDGPU::S_CMP_NLT_F16:
6024 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6025 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6026 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6027 case AMDGPU::V_S_EXP_F16_e64:
6028 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6029 : AMDGPU::V_EXP_F16_fake16_e64;
6030 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6031 case AMDGPU::V_S_LOG_F16_e64:
6032 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6033 : AMDGPU::V_LOG_F16_fake16_e64;
6034 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6035 case AMDGPU::V_S_RCP_F16_e64:
6036 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6037 : AMDGPU::V_RCP_F16_fake16_e64;
6038 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6039 case AMDGPU::V_S_RSQ_F16_e64:
6040 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6041 : AMDGPU::V_RSQ_F16_fake16_e64;
6042 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6043 case AMDGPU::V_S_SQRT_F16_e64:
6044 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6045 : AMDGPU::V_SQRT_F16_fake16_e64;
6046 }
6048 "Unexpected scalar opcode without corresponding vector one!");
6049}
6050
6051// clang-format on
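// Illustrative use of the mapping above (approximate MIR): moveToVALU
// rewrites
//   %2:sreg_32 = S_AND_B32 %0, %1
// into
//   %2:vgpr_32 = V_AND_B32_e64 %0, %1
// with operand legalization and SCC handling performed separately.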
6052
6053 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6054 MachineBasicBlock &MBB,
6055 MachineBasicBlock::iterator MBBI,
6056 const DebugLoc &DL, Register Reg,
6057 bool IsSCCLive,
6058 SlotIndexes *Indexes) const {
6059 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6060 const SIInstrInfo *TII = ST.getInstrInfo();
6061 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6062 if (IsSCCLive) {
6063 // Insert two move instructions, one to save the original value of EXEC and
6064 // the other to turn on all bits in EXEC. This is required as we can't use
6065 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6066 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6067 .addReg(LMC.ExecReg);
6068 auto FlipExecMI =
6069 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6070 if (Indexes) {
6071 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6072 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6073 }
6074 } else {
6075 auto SaveExec =
6076 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6077 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6078 if (Indexes)
6079 Indexes->insertMachineInstrInMaps(*SaveExec);
6080 }
6081}
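// The sequence emitted above is roughly, for wave64 (wave32 uses the
// corresponding 32-bit opcodes):
//   s_mov_b64 <Reg>, exec
//   s_mov_b64 exec, -1
// when SCC must be preserved, or a single
//   s_or_saveexec_b64 <Reg>, -1
// otherwise.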
6082
6083 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6084 MachineBasicBlock::iterator MBBI,
6085 const DebugLoc &DL, Register Reg,
6086 SlotIndexes *Indexes) const {
6087 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6088 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6089 .addReg(Reg, RegState::Kill);
6090 if (Indexes)
6091 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6092}
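// i.e. a single s_mov_b64 exec, <Reg> (s_mov_b32 on wave32) that kills <Reg>.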
6093
6097 "Not a whole wave func");
6098 MachineBasicBlock &MBB = *MF.begin();
6099 for (MachineInstr &MI : MBB)
6100 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6101 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6102 return &MI;
6103
6104 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6105}
6106
6107 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6108 unsigned OpNo) const {
6109 const MCInstrDesc &Desc = get(MI.getOpcode());
6110 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6111 Desc.operands()[OpNo].RegClass == -1) {
6112 Register Reg = MI.getOperand(OpNo).getReg();
6113
6114 if (Reg.isVirtual()) {
6115 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6116 return MRI.getRegClass(Reg);
6117 }
6118 return RI.getPhysRegBaseClass(Reg);
6119 }
6120
6121 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6122 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6123}
6124
6125 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI,
6126 unsigned OpIdx) const {
6127 MachineBasicBlock *MBB = MI.getParent();
6128 MachineOperand &MO = MI.getOperand(OpIdx);
6129 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6130 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6131 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6132 unsigned Size = RI.getRegSizeInBits(*RC);
6133 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6134 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6135 : AMDGPU::V_MOV_B32_e32;
6136 if (MO.isReg())
6137 Opcode = AMDGPU::COPY;
6138 else if (RI.isSGPRClass(RC))
6139 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6140
6141 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6142 Register Reg = MRI.createVirtualRegister(VRC);
6143 DebugLoc DL = MBB->findDebugLoc(I);
6144 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6145 MO.ChangeToRegister(Reg, false);
6146}
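// Illustrative: a 32-bit literal that is illegal in its current position is
// rewritten as
//   %tmp:vgpr_32 = V_MOV_B32_e32 <imm>
// and the offending operand becomes a use of %tmp; register operands are
// copied instead.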
6147
6148 unsigned SIInstrInfo::buildExtractSubReg(
6149 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6150 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6151 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6152 if (!SuperReg.getReg().isVirtual())
6153 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6154
6155 MachineBasicBlock *MBB = MI->getParent();
6156 const DebugLoc &DL = MI->getDebugLoc();
6157 Register SubReg = MRI.createVirtualRegister(SubRC);
6158
6159 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6160 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6161 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6162 return SubReg;
6163}
6164
6165 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6166 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6167 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6168 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6169 if (Op.isImm()) {
6170 if (SubIdx == AMDGPU::sub0)
6171 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6172 if (SubIdx == AMDGPU::sub1)
6173 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6174
6175 llvm_unreachable("Unhandled register index for immediate");
6176 }
6177
6178 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6179 SubIdx, SubRC);
6180 return MachineOperand::CreateReg(SubReg, false);
6181}
6182
6183// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6184void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6185 assert(Inst.getNumExplicitOperands() == 3);
6186 MachineOperand Op1 = Inst.getOperand(1);
6187 Inst.removeOperand(1);
6188 Inst.addOperand(Op1);
6189}
6190
6191 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6192 const MCOperandInfo &OpInfo,
6193 const MachineOperand &MO) const {
6194 if (!MO.isReg())
6195 return false;
6196
6197 Register Reg = MO.getReg();
6198
6199 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6200 if (Reg.isPhysical())
6201 return DRC->contains(Reg);
6202
6203 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6204
6205 if (MO.getSubReg()) {
6206 const MachineFunction *MF = MO.getParent()->getMF();
6207 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6208 if (!SuperRC)
6209 return false;
6210 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6211 }
6212
6213 return RI.getCommonSubClass(DRC, RC) != nullptr;
6214}
6215
6216 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6217 const MachineOperand &MO) const {
6218 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6219 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6220 unsigned Opc = MI.getOpcode();
6221
6222 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6223 // information.
6224 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6225 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6226 constexpr AMDGPU::OpName OpNames[] = {
6227 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6228
6229 for (auto [I, OpName] : enumerate(OpNames)) {
6230 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6231 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6232 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6233 return false;
6234 }
6235 }
6236
6237 if (!isLegalRegOperand(MRI, OpInfo, MO))
6238 return false;
6239
6240 // check Accumulate GPR operand
6241 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6242 if (IsAGPR && !ST.hasMAIInsts())
6243 return false;
6244 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6245 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6246 return false;
6247 // Atomics should have both vdst and vdata either vgpr or agpr.
6248 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6249 const int DataIdx = AMDGPU::getNamedOperandIdx(
6250 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6251 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6252 MI.getOperand(DataIdx).isReg() &&
6253 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6254 return false;
6255 if ((int)OpIdx == DataIdx) {
6256 if (VDstIdx != -1 &&
6257 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6258 return false;
6259 // DS instructions with 2 src operands also must have tied RC.
6260 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6261 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6262 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6263 return false;
6264 }
6265
6266 // Check V_ACCVGPR_WRITE_B32_e64
6267 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6268 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6269 RI.isSGPRReg(MRI, MO.getReg()))
6270 return false;
6271
6272 if (ST.hasFlatScratchHiInB64InstHazard() &&
6273 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6274 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6275 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6276 64)
6277 return false;
6278 }
6279 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6280 return false;
6281 }
6282
6283 return true;
6284}
6285
6286 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6287 const MCOperandInfo &OpInfo,
6288 const MachineOperand &MO) const {
6289 if (MO.isReg())
6290 return isLegalRegOperand(MRI, OpInfo, MO);
6291
6292 // Handle non-register types that are treated like immediates.
6293 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6294 return true;
6295}
6296
6297 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6298 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6299 const MachineOperand *MO) const {
6300 constexpr unsigned NumOps = 3;
6301 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6302 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6303 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6304 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6305
6306 assert(SrcN < NumOps);
6307
6308 if (!MO) {
6309 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6310 if (SrcIdx == -1)
6311 return true;
6312 MO = &MI.getOperand(SrcIdx);
6313 }
6314
6315 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6316 return true;
6317
6318 int ModsIdx =
6319 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6320 if (ModsIdx == -1)
6321 return true;
6322
6323 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6324 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6325 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6326
6327 return !OpSel && !OpSelHi;
6328}
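// Illustrative: on GFX12+, a packed FP32 instruction such as v_pk_add_f32 may
// read an SGPR source only when both the op_sel and op_sel_hi bits for that
// source are clear; otherwise the value must first be copied into a VGPR.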
6329
6330 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6331 const MachineOperand *MO) const {
6332 const MachineFunction &MF = *MI.getMF();
6333 const MachineRegisterInfo &MRI = MF.getRegInfo();
6334 const MCInstrDesc &InstDesc = MI.getDesc();
6335 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6336 int64_t RegClass = getOpRegClassID(OpInfo);
6337 const TargetRegisterClass *DefinedRC =
6338 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6339 if (!MO)
6340 MO = &MI.getOperand(OpIdx);
6341
6342 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6343
6344 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6345 const MachineOperand *UsedLiteral = nullptr;
6346
6347 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6348 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6349
6350 // TODO: Be more permissive with frame indexes.
6351 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6352 if (!LiteralLimit--)
6353 return false;
6354
6355 UsedLiteral = MO;
6356 }
6357
6359 if (MO->isReg())
6360 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6361
6362 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6363 if (i == OpIdx)
6364 continue;
6365 const MachineOperand &Op = MI.getOperand(i);
6366 if (Op.isReg()) {
6367 if (Op.isUse()) {
6368 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6369 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6370 if (--ConstantBusLimit <= 0)
6371 return false;
6372 }
6373 }
6374 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6375 !isInlineConstant(Op, InstDesc.operands()[i])) {
6376 // The same literal may be used multiple times.
6377 if (!UsedLiteral)
6378 UsedLiteral = &Op;
6379 else if (UsedLiteral->isIdenticalTo(Op))
6380 continue;
6381
6382 if (!LiteralLimit--)
6383 return false;
6384 if (--ConstantBusLimit <= 0)
6385 return false;
6386 }
6387 }
6388 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6389 // There can be at most one literal operand, but it can be repeated.
6390 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6391 if (i == OpIdx)
6392 continue;
6393 const MachineOperand &Op = MI.getOperand(i);
6394 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6395 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6396 !Op.isIdenticalTo(*MO))
6397 return false;
6398
6399 // Do not fold a non-inlineable, non-register operand into an
6400 // instruction that already has a frame index. The frame index handling
6401 // code cannot cope with a frame index that co-exists with another
6402 // non-register operand, unless that operand is an inlineable immediate.
6403 if (Op.isFI())
6404 return false;
6405 }
6406 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6407 isF16PseudoScalarTrans(MI.getOpcode())) {
6408 return false;
6409 }
6410
6411 if (MO->isReg()) {
6412 if (!DefinedRC)
6413 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6414 return isLegalRegOperand(MI, OpIdx, *MO);
6415 }
6416
6417 if (MO->isImm()) {
6418 uint64_t Imm = MO->getImm();
6419 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6420 bool Is64BitOp = Is64BitFPOp ||
6421 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6422 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6423 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6424 if (Is64BitOp &&
6425 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6426 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6427 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6428 return false;
6429
6430 // FIXME: We can use sign extended 64-bit literals, but only for signed
6431 // operands. At the moment we do not know if an operand is signed.
6432 // Such an operand will be encoded as its low 32 bits and then either
6433 // correctly sign extended or incorrectly zero extended by HW.
6434 // If 64-bit literals are supported and the literal will be encoded
6435 // as a full 64-bit value, we can still use it.
6436 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6437 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6438 return false;
6439 }
6440 }
6441
6442 // Handle non-register types that are treated like immediates.
6443 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6444
6445 if (!DefinedRC) {
6446 // This operand expects an immediate.
6447 return true;
6448 }
6449
6450 return isImmOperandLegal(MI, OpIdx, *MO);
6451}
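// Examples of operands rejected above (illustrative): a non-inline literal in
// a VOP3 source on subtargets without VOP3 literal support, or a second,
// different literal anywhere in a VALU instruction.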
6452
6454 bool IsGFX950Only = ST.hasGFX950Insts();
6455 bool IsGFX940Only = ST.hasGFX940Insts();
6456
6457 if (!IsGFX950Only && !IsGFX940Only)
6458 return false;
6459
6460 if (!isVALU(MI))
6461 return false;
6462
6463 // V_COS, V_EXP, V_RCP, etc.
6464 if (isTRANS(MI))
6465 return true;
6466
6467 // DOT2, DOT2C, DOT4, etc.
6468 if (isDOT(MI))
6469 return true;
6470
6471 // MFMA, SMFMA
6472 if (isMFMA(MI))
6473 return true;
6474
6475 unsigned Opcode = MI.getOpcode();
6476 switch (Opcode) {
6477 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6478 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6479 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6480 case AMDGPU::V_MQSAD_U32_U8_e64:
6481 case AMDGPU::V_PK_ADD_F16:
6482 case AMDGPU::V_PK_ADD_F32:
6483 case AMDGPU::V_PK_ADD_I16:
6484 case AMDGPU::V_PK_ADD_U16:
6485 case AMDGPU::V_PK_ASHRREV_I16:
6486 case AMDGPU::V_PK_FMA_F16:
6487 case AMDGPU::V_PK_FMA_F32:
6488 case AMDGPU::V_PK_FMAC_F16_e32:
6489 case AMDGPU::V_PK_FMAC_F16_e64:
6490 case AMDGPU::V_PK_LSHLREV_B16:
6491 case AMDGPU::V_PK_LSHRREV_B16:
6492 case AMDGPU::V_PK_MAD_I16:
6493 case AMDGPU::V_PK_MAD_U16:
6494 case AMDGPU::V_PK_MAX_F16:
6495 case AMDGPU::V_PK_MAX_I16:
6496 case AMDGPU::V_PK_MAX_U16:
6497 case AMDGPU::V_PK_MIN_F16:
6498 case AMDGPU::V_PK_MIN_I16:
6499 case AMDGPU::V_PK_MIN_U16:
6500 case AMDGPU::V_PK_MOV_B32:
6501 case AMDGPU::V_PK_MUL_F16:
6502 case AMDGPU::V_PK_MUL_F32:
6503 case AMDGPU::V_PK_MUL_LO_U16:
6504 case AMDGPU::V_PK_SUB_I16:
6505 case AMDGPU::V_PK_SUB_U16:
6506 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6507 return true;
6508 default:
6509 return false;
6510 }
6511}
6512
6513 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6514 MachineInstr &MI) const {
6515 unsigned Opc = MI.getOpcode();
6516 const MCInstrDesc &InstrDesc = get(Opc);
6517
6518 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6519 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6520
6521 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6522 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6523
6524 // If there is an implicit SGPR use, such as the VCC use of v_addc_u32/v_subb_u32,
6525 // we may only have one constant bus use before GFX10.
6526 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6527 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6528 RI.isSGPRReg(MRI, Src0.getReg()))
6529 legalizeOpWithMove(MI, Src0Idx);
6530
6531 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6532 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6533 // src0/src1 with V_READFIRSTLANE.
6534 if (Opc == AMDGPU::V_WRITELANE_B32) {
6535 const DebugLoc &DL = MI.getDebugLoc();
6536 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6537 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6538 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6539 .add(Src0);
6540 Src0.ChangeToRegister(Reg, false);
6541 }
6542 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6543 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6544 const DebugLoc &DL = MI.getDebugLoc();
6545 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6546 .add(Src1);
6547 Src1.ChangeToRegister(Reg, false);
6548 }
6549 return;
6550 }
6551
6552 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6553 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6554 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6555 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6556 legalizeOpWithMove(MI, Src2Idx);
6557 }
6558
6559 // VOP2 instructions accept all operand types for src0, so we don't need to
6560 // check its legality. If src1 is already legal, we don't need to do anything.
6561 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6562 return;
6563
6564 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6565 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6566 // select is uniform.
6567 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6568 RI.isVGPR(MRI, Src1.getReg())) {
6569 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6570 const DebugLoc &DL = MI.getDebugLoc();
6571 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6572 .add(Src1);
6573 Src1.ChangeToRegister(Reg, false);
6574 return;
6575 }
6576
6577 // We do not use commuteInstruction here because it is too aggressive and will
6578 // commute if it is possible. We only want to commute here if it improves
6579 // legality. This can be called a fairly large number of times so don't waste
6580 // compile time pointlessly swapping and checking legality again.
6581 if (HasImplicitSGPR || !MI.isCommutable()) {
6582 legalizeOpWithMove(MI, Src1Idx);
6583 return;
6584 }
6585
6586 // If src0 can be used as src1, commuting will make the operands legal.
6587 // Otherwise we have to give up and insert a move.
6588 //
6589 // TODO: Other immediate-like operand kinds could be commuted if there was a
6590 // MachineOperand::ChangeTo* for them.
6591 if ((!Src1.isImm() && !Src1.isReg()) ||
6592 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6593 legalizeOpWithMove(MI, Src1Idx);
6594 return;
6595 }
6596
6597 int CommutedOpc = commuteOpcode(MI);
6598 if (CommutedOpc == -1) {
6599 legalizeOpWithMove(MI, Src1Idx);
6600 return;
6601 }
6602
6603 MI.setDesc(get(CommutedOpc));
6604
6605 Register Src0Reg = Src0.getReg();
6606 unsigned Src0SubReg = Src0.getSubReg();
6607 bool Src0Kill = Src0.isKill();
6608
6609 if (Src1.isImm())
6610 Src0.ChangeToImmediate(Src1.getImm());
6611 else if (Src1.isReg()) {
6612 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6613 Src0.setSubReg(Src1.getSubReg());
6614 } else
6615 llvm_unreachable("Should only have register or immediate operands");
6616
6617 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6618 Src1.setSubReg(Src0SubReg);
6619 fixImplicitOperands(MI);
6620}
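// Illustrative: v_sub_f32_e32 v0, v1, s2 has an SGPR in src1, which VOP2 does
// not accept; commuting to v_subrev_f32_e32 v0, s2, v1 makes both source
// operands legal without inserting a copy.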
6621
6622 // Legalize VOP3 operands. All operand types are supported for any operand,
6623 // but only one literal constant is allowed, and only starting from GFX10.
6624 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6625 MachineInstr &MI) const {
6626 unsigned Opc = MI.getOpcode();
6627
6628 int VOP3Idx[3] = {
6629 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6630 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6631 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6632 };
6633
6634 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6635 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6636 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6637 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6638 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6639 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6640 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6641 // src1 and src2 must be scalar
6642 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6643 const DebugLoc &DL = MI.getDebugLoc();
6644 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6645 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6646 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6647 .add(Src1);
6648 Src1.ChangeToRegister(Reg, false);
6649 }
6650 if (VOP3Idx[2] != -1) {
6651 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6652 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6653 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6654 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6655 .add(Src2);
6656 Src2.ChangeToRegister(Reg, false);
6657 }
6658 }
6659 }
6660
6661 // Find the one SGPR operand we are allowed to use.
6662 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6663 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6664 SmallDenseSet<unsigned> SGPRsUsed;
6665 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6666 if (SGPRReg) {
6667 SGPRsUsed.insert(SGPRReg);
6668 --ConstantBusLimit;
6669 }
6670
6671 for (int Idx : VOP3Idx) {
6672 if (Idx == -1)
6673 break;
6674 MachineOperand &MO = MI.getOperand(Idx);
6675
6676 if (!MO.isReg()) {
6677 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6678 continue;
6679
6680 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6681 --LiteralLimit;
6682 --ConstantBusLimit;
6683 continue;
6684 }
6685
6686 --LiteralLimit;
6687 --ConstantBusLimit;
6688 legalizeOpWithMove(MI, Idx);
6689 continue;
6690 }
6691
6692 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6693 continue; // VGPRs are legal
6694
6695 // We can use one SGPR in each VOP3 instruction prior to GFX10
6696 // and two starting from GFX10.
6697 if (SGPRsUsed.count(MO.getReg()))
6698 continue;
6699 if (ConstantBusLimit > 0) {
6700 SGPRsUsed.insert(MO.getReg());
6701 --ConstantBusLimit;
6702 continue;
6703 }
6704
6705 // If we make it this far, then the operand is not legal and we must
6706 // legalize it.
6707 legalizeOpWithMove(MI, Idx);
6708 }
6709
6710 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6711 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6712 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6713 legalizeOpWithMove(MI, VOP3Idx[2]);
6714
6715 // Fix the register class of packed FP32 instructions on gfx12+. See
6716 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6717 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6718 for (unsigned I = 0; I < 3; ++I) {
6719 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6720 legalizeOpWithMove(MI, VOP3Idx[I]);
6721 }
6722 }
6723}
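// Illustrative: before GFX10, v_fma_f32 with two SGPR sources exceeds the
// single constant bus slot, so one of the SGPRs is moved into a VGPR by
// legalizeOpWithMove above.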
6724
6727 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6728 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6729 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6730 if (DstRC)
6731 SRC = RI.getCommonSubClass(SRC, DstRC);
6732
6733 Register DstReg = MRI.createVirtualRegister(SRC);
6734 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6735
6736 if (RI.hasAGPRs(VRC)) {
6737 VRC = RI.getEquivalentVGPRClass(VRC);
6738 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6739 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6740 get(TargetOpcode::COPY), NewSrcReg)
6741 .addReg(SrcReg);
6742 SrcReg = NewSrcReg;
6743 }
6744
6745 if (SubRegs == 1) {
6746 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6747 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6748 .addReg(SrcReg);
6749 return DstReg;
6750 }
6751
6753 for (unsigned i = 0; i < SubRegs; ++i) {
6754 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6755 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6756 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6757 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6758 SRegs.push_back(SGPR);
6759 }
6760
6761 MachineInstrBuilder MIB =
6762 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6763 get(AMDGPU::REG_SEQUENCE), DstReg);
6764 for (unsigned i = 0; i < SubRegs; ++i) {
6765 MIB.addReg(SRegs[i]);
6766 MIB.addImm(RI.getSubRegFromChannel(i));
6767 }
6768 return DstReg;
6769}
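// For a 64-bit pointer held in a VGPR pair this emits two
// V_READFIRSTLANE_B32 instructions and recombines the results into an SGPR
// pair with a REG_SEQUENCE.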
6770
6771 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6772 MachineInstr &MI) const {
6773
6774 // If the pointer is stored in VGPRs, then we need to move it to
6775 // SGPRs using v_readfirstlane. This is safe because we only select
6776 // loads with uniform pointers to SMRD instructions, so we know the
6777 // pointer value is uniform.
6778 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6779 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6780 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6781 SBase->setReg(SGPR);
6782 }
6783 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6784 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6785 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6786 SOff->setReg(SGPR);
6787 }
6788}
6789
6790 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6791 unsigned Opc = Inst.getOpcode();
6792 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6793 if (OldSAddrIdx < 0)
6794 return false;
6795
6796 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6797
6798 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6799 if (NewOpc < 0)
6800 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6801 if (NewOpc < 0)
6802 return false;
6803
6804 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6805 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6806 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6807 return false;
6808
6809 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6810 if (NewVAddrIdx < 0)
6811 return false;
6812
6813 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6814
6815 // Check vaddr; it must be zero or absent.
6816 MachineInstr *VAddrDef = nullptr;
6817 if (OldVAddrIdx >= 0) {
6818 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6819 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6820 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6821 !VAddrDef->getOperand(1).isImm() ||
6822 VAddrDef->getOperand(1).getImm() != 0)
6823 return false;
6824 }
6825
6826 const MCInstrDesc &NewDesc = get(NewOpc);
6827 Inst.setDesc(NewDesc);
6828
6829 // Callers expect iterator to be valid after this call, so modify the
6830 // instruction in place.
6831 if (OldVAddrIdx == NewVAddrIdx) {
6832 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6833 // Clear use list from the old vaddr holding a zero register.
6834 MRI.removeRegOperandFromUseList(&NewVAddr);
6835 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6836 Inst.removeOperand(OldSAddrIdx);
6837 // Update the use list with the pointer we have just moved from vaddr to
6838 // saddr position. Otherwise new vaddr will be missing from the use list.
6839 MRI.removeRegOperandFromUseList(&NewVAddr);
6840 MRI.addRegOperandToUseList(&NewVAddr);
6841 } else {
6842 assert(OldSAddrIdx == NewVAddrIdx);
6843
6844 if (OldVAddrIdx >= 0) {
6845 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6846 AMDGPU::OpName::vdst_in);
6847
6848 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6849 // it asserts. Untie the operands for now and retie them afterwards.
6850 if (NewVDstIn != -1) {
6851 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6852 Inst.untieRegOperand(OldVDstIn);
6853 }
6854
6855 Inst.removeOperand(OldVAddrIdx);
6856
6857 if (NewVDstIn != -1) {
6858 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6859 Inst.tieOperands(NewVDst, NewVDstIn);
6860 }
6861 }
6862 }
6863
6864 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6865 VAddrDef->eraseFromParent();
6866
6867 return true;
6868}
6869
6870// FIXME: Remove this when SelectionDAG is obsoleted.
6871void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6872 MachineInstr &MI) const {
6873 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6874 return;
6875
6876 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6877 // analysis thinks they are uniform, so a readfirstlane should be valid.
6878 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6879 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6880 return;
6881
6882 if (moveFlatAddrToVGPR(MI))
6883 return;
6884
6885 const TargetRegisterClass *DeclaredRC =
6886 getRegClass(MI.getDesc(), SAddr->getOperandNo());
6887
6888 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6889 SAddr->setReg(ToSGPR);
6890}
6891
6892void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6893 MachineBasicBlock::iterator I,
6894 const TargetRegisterClass *DstRC,
6895 MachineOperand &Op,
6896 MachineRegisterInfo &MRI,
6897 const DebugLoc &DL) const {
6898 Register OpReg = Op.getReg();
6899 unsigned OpSubReg = Op.getSubReg();
6900
6901 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6902 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6903
6904 // Check if operand is already the correct register class.
6905 if (DstRC == OpRC)
6906 return;
6907
6908 Register DstReg = MRI.createVirtualRegister(DstRC);
6909 auto Copy =
6910 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6911 Op.setReg(DstReg);
6912
6913 MachineInstr *Def = MRI.getVRegDef(OpReg);
6914 if (!Def)
6915 return;
6916
6917 // Try to eliminate the copy if it is copying an immediate value.
6918 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6919 foldImmediate(*Copy, *Def, OpReg, &MRI);
6920
6921 bool ImpDef = Def->isImplicitDef();
6922 while (!ImpDef && Def && Def->isCopy()) {
6923 if (Def->getOperand(1).getReg().isPhysical())
6924 break;
6925 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6926 ImpDef = Def && Def->isImplicitDef();
6927 }
6928 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6929 !ImpDef)
6930 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6931}
6932
6933// Emit the actual waterfall loop, executing the wrapped instruction for each
6934// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6935// iteration, in the worst case we execute 64 (once per lane).
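// A sketch of one loop iteration for a single 32-bit ScalarOp in wave64 mode
// (illustrative only; the virtual register names here are invented):
//   %cur  = V_READFIRSTLANE_B32 %vscalarop      ; value held by the first active lane
//   %cond = V_CMP_EQ_U32_e64 %cur, %vscalarop   ; mask of lanes holding that same value
//   %save = S_AND_SAVEEXEC_B64 %cond            ; %save = exec, exec &= %cond
//   ...the wrapped instruction runs for the matching lanes using %cur...
//   exec  = S_XOR_B64_term exec, %save          ; clear the lanes just handled
//   SI_WATERFALL_LOOP %LoopBB                   ; loop while any lane remains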
6936static void
6937emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
6938 MachineRegisterInfo &MRI,
6939 MachineBasicBlock &LoopBB,
6940 MachineBasicBlock &BodyBB,
6941 const DebugLoc &DL,
6942 ArrayRef<MachineOperand *> ScalarOps) {
6943 MachineFunction &MF = *LoopBB.getParent();
6944 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6945 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6947 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6948
6949 MachineBasicBlock::iterator I = LoopBB.begin();
6950 Register CondReg;
6951
6952 for (MachineOperand *ScalarOp : ScalarOps) {
6953 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6954 unsigned NumSubRegs = RegSize / 32;
6955 Register VScalarOp = ScalarOp->getReg();
6956
6957 if (NumSubRegs == 1) {
6958 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6959
6960 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6961 .addReg(VScalarOp);
6962
6963 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6964
6965 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6966 .addReg(CurReg)
6967 .addReg(VScalarOp);
6968
6969 // Combine the comparison results with AND.
6970 if (!CondReg) // First.
6971 CondReg = NewCondReg;
6972 else { // If not the first, we create an AND.
6973 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6974 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6975 .addReg(CondReg)
6976 .addReg(NewCondReg);
6977 CondReg = AndReg;
6978 }
6979
6980 // Update ScalarOp operand to use the SGPR ScalarOp.
6981 ScalarOp->setReg(CurReg);
6982 ScalarOp->setIsKill();
6983 } else {
6984 SmallVector<Register, 8> ReadlanePieces;
6985 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6986 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6987 "Unhandled register size");
6988
6989 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6990 Register CurRegLo =
6991 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6992 Register CurRegHi =
6993 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6994
6995 // Read the next variant <- also loop target.
6996 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6997 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6998
6999 // Read the next variant <- also loop target.
7000 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7001 .addReg(VScalarOp, VScalarOpUndef,
7002 TRI->getSubRegFromChannel(Idx + 1));
7003
7004 ReadlanePieces.push_back(CurRegLo);
7005 ReadlanePieces.push_back(CurRegHi);
7006
7007 // Comparison is to be done as 64-bit.
7008 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7009 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7010 .addReg(CurRegLo)
7011 .addImm(AMDGPU::sub0)
7012 .addReg(CurRegHi)
7013 .addImm(AMDGPU::sub1);
7014
7015 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7016 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7017 NewCondReg)
7018 .addReg(CurReg);
7019 if (NumSubRegs <= 2)
7020 Cmp.addReg(VScalarOp);
7021 else
7022 Cmp.addReg(VScalarOp, VScalarOpUndef,
7023 TRI->getSubRegFromChannel(Idx, 2));
7024
7025 // Combine the comparison results with AND.
7026 if (!CondReg) // First.
7027 CondReg = NewCondReg;
7028 else { // If not the first, we create an AND.
7029 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7030 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7031 .addReg(CondReg)
7032 .addReg(NewCondReg);
7033 CondReg = AndReg;
7034 }
7035 } // End for loop.
7036
7037 const auto *SScalarOpRC =
7038 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7039 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7040
7041 // Build scalar ScalarOp.
7042 auto Merge =
7043 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7044 unsigned Channel = 0;
7045 for (Register Piece : ReadlanePieces) {
7046 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7047 }
7048
7049 // Update ScalarOp operand to use the SGPR ScalarOp.
7050 ScalarOp->setReg(SScalarOp);
7051 ScalarOp->setIsKill();
7052 }
7053 }
7054
7055 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7056 MRI.setSimpleHint(SaveExec, CondReg);
7057
7058 // Update EXEC to matching lanes, saving original to SaveExec.
7059 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7060 .addReg(CondReg, RegState::Kill);
7061
7062 // The original instruction is here; we insert the terminators after it.
7063 I = BodyBB.end();
7064
7065 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7066 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7067 .addReg(LMC.ExecReg)
7068 .addReg(SaveExec);
7069
7070 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7071}
7072
7073// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7074// with SGPRs by iterating over all unique values across all lanes.
7075// Returns the loop basic block that now contains \p MI.
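// The transformation below produces a control-flow shape like this (sketch):
//
//   MBB -> LoopBB -> BodyBB -> RemainderBB
//            ^__________/
//
// LoopBB re-reads the scalar operands via V_READFIRSTLANE, BodyBB holds \p MI
// plus the EXEC update, and BodyBB branches back to LoopBB until all lanes
// have been processed; RemainderBB restores SCC and the original EXEC mask.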
7076static MachineBasicBlock *
7077loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
7078 ArrayRef<MachineOperand *> ScalarOps,
7079 MachineDominatorTree *MDT,
7080 MachineBasicBlock::iterator Begin = nullptr,
7081 MachineBasicBlock::iterator End = nullptr) {
7082 MachineBasicBlock &MBB = *MI.getParent();
7083 MachineFunction &MF = *MBB.getParent();
7084 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7085 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7086 MachineRegisterInfo &MRI = MF.getRegInfo();
7087 if (!Begin.isValid())
7088 Begin = &MI;
7089 if (!End.isValid()) {
7090 End = &MI;
7091 ++End;
7092 }
7093 const DebugLoc &DL = MI.getDebugLoc();
7095 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7096
7097 // Save SCC. Waterfall Loop may overwrite SCC.
7098 Register SaveSCCReg;
7099
7100 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7101 // rather than an unlimited scan everywhere
7102 bool SCCNotDead =
7103 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7104 std::numeric_limits<unsigned>::max()) !=
7105 MachineBasicBlock::LQR_Dead;
7106 if (SCCNotDead) {
7107 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7108 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7109 .addImm(1)
7110 .addImm(0);
7111 }
7112
7113 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7114
7115 // Save the EXEC mask
7116 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7117
7118 // Killed uses in the instruction we are waterfalling around will be
7119 // incorrect due to the added control-flow.
7120 MachineBasicBlock::iterator AfterMI = MI;
7121 ++AfterMI;
7122 for (auto I = Begin; I != AfterMI; I++) {
7123 for (auto &MO : I->all_uses())
7124 MRI.clearKillFlags(MO.getReg());
7125 }
7126
7127 // To insert the loop we need to split the block. Move everything after this
7128 // point to a new block, and insert a new empty block between the two.
7129 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7130 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7131 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7132 MachineFunction::iterator MBBI(MBB);
7133 ++MBBI;
7134
7135 MF.insert(MBBI, LoopBB);
7136 MF.insert(MBBI, BodyBB);
7137 MF.insert(MBBI, RemainderBB);
7138
7139 LoopBB->addSuccessor(BodyBB);
7140 BodyBB->addSuccessor(LoopBB);
7141 BodyBB->addSuccessor(RemainderBB);
7142
7143 // Move the instructions from Begin to MI into BodyBB, and the remainder of
7144 // the block to RemainderBB.
7145 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7146 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7147 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7148
7149 MBB.addSuccessor(LoopBB);
7150
7151 // Update dominators. We know that MBB immediately dominates LoopBB, that
7152 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7153 // RemainderBB. RemainderBB immediately dominates all of the successors
7154 // transferred to it from MBB that MBB used to properly dominate.
7155 if (MDT) {
7156 MDT->addNewBlock(LoopBB, &MBB);
7157 MDT->addNewBlock(BodyBB, LoopBB);
7158 MDT->addNewBlock(RemainderBB, BodyBB);
7159 for (auto &Succ : RemainderBB->successors()) {
7160 if (MDT->properlyDominates(&MBB, Succ)) {
7161 MDT->changeImmediateDominator(Succ, RemainderBB);
7162 }
7163 }
7164 }
7165
7166 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7167
7168 MachineBasicBlock::iterator First = RemainderBB->begin();
7169 // Restore SCC
7170 if (SCCNotDead) {
7171 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7172 .addReg(SaveSCCReg, RegState::Kill)
7173 .addImm(0);
7174 }
7175
7176 // Restore the EXEC mask
7177 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7178 .addReg(SaveExec);
7179 return BodyBB;
7180}
7181
7182// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
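// The replacement descriptor built below looks like (sketch):
//   NewSRsrc = { Zero64 (base = 0), RSRC_DATA_FORMAT[31:0], RSRC_DATA_FORMAT[63:32] }
// i.e. the base address is zeroed out and only the default data format is kept,
// while the real 64-bit pointer is returned separately as RsrcPtr.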
7183static std::tuple<unsigned, unsigned>
7184extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7185 MachineBasicBlock &MBB = *MI.getParent();
7186 MachineFunction &MF = *MBB.getParent();
7187 MachineRegisterInfo &MRI = MF.getRegInfo();
7188
7189 // Extract the ptr from the resource descriptor.
7190 unsigned RsrcPtr =
7191 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7192 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7193
7194 // Create an empty resource descriptor
7195 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7196 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7197 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7198 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7199 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7200
7201 // Zero64 = 0
7202 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7203 .addImm(0);
7204
7205 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7206 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7207 .addImm(Lo_32(RsrcDataFormat));
7208
7209 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7210 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7211 .addImm(Hi_32(RsrcDataFormat));
7212
7213 // NewSRsrc = {Zero64, SRsrcFormat}
7214 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7215 .addReg(Zero64)
7216 .addImm(AMDGPU::sub0_sub1)
7217 .addReg(SRsrcFormatLo)
7218 .addImm(AMDGPU::sub2)
7219 .addReg(SRsrcFormatHi)
7220 .addImm(AMDGPU::sub3);
7221
7222 return std::tuple(RsrcPtr, NewSRsrc);
7223}
7224
7225MachineBasicBlock *
7226SIInstrInfo::legalizeOperands(MachineInstr &MI,
7227 MachineDominatorTree *MDT) const {
7228 MachineFunction &MF = *MI.getMF();
7229 MachineRegisterInfo &MRI = MF.getRegInfo();
7230 MachineBasicBlock *CreatedBB = nullptr;
7231
7232 // Legalize VOP2
7233 if (isVOP2(MI) || isVOPC(MI)) {
7234 legalizeOperandsVOP2(MRI, MI);
7235 return CreatedBB;
7236 }
7237
7238 // Legalize VOP3
7239 if (isVOP3(MI)) {
7240 legalizeOperandsVOP3(MRI, MI);
7241 return CreatedBB;
7242 }
7243
7244 // Legalize SMRD
7245 if (isSMRD(MI)) {
7246 legalizeOperandsSMRD(MRI, MI);
7247 return CreatedBB;
7248 }
7249
7250 // Legalize FLAT
7251 if (isFLAT(MI)) {
7252 legalizeOperandsFLAT(MRI, MI);
7253 return CreatedBB;
7254 }
7255
7256 // Legalize REG_SEQUENCE and PHI
7257 // The register class of the operands must be the same type as the register
7258 // class of the output.
7259 if (MI.getOpcode() == AMDGPU::PHI) {
7260 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7261 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7262 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7263 continue;
7264 const TargetRegisterClass *OpRC =
7265 MRI.getRegClass(MI.getOperand(i).getReg());
7266 if (RI.hasVectorRegisters(OpRC)) {
7267 VRC = OpRC;
7268 } else {
7269 SRC = OpRC;
7270 }
7271 }
7272
7273 // If any of the operands are VGPR registers, then they all must be VGPRs;
7274 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7275 // them.
7276 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7277 if (!VRC) {
7278 assert(SRC);
7279 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7280 VRC = &AMDGPU::VReg_1RegClass;
7281 } else
7282 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7283 ? RI.getEquivalentAGPRClass(SRC)
7284 : RI.getEquivalentVGPRClass(SRC);
7285 } else {
7286 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7287 ? RI.getEquivalentAGPRClass(VRC)
7288 : RI.getEquivalentVGPRClass(VRC);
7289 }
7290 RC = VRC;
7291 } else {
7292 RC = SRC;
7293 }
7294
7295 // Update all the operands so they have the same type.
7296 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7297 MachineOperand &Op = MI.getOperand(I);
7298 if (!Op.isReg() || !Op.getReg().isVirtual())
7299 continue;
7300
7301 // MI is a PHI instruction.
7302 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7303 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7304
7305 // Avoid creating no-op copies with the same src and dst reg class. These
7306 // confuse some of the machine passes.
7307 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7308 }
7309 }
7310
7311 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7312 // VGPR dest type and SGPR sources, insert copies so all operands are
7313 // VGPRs. This seems to help operand folding / the register coalescer.
7314 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7315 MachineBasicBlock *MBB = MI.getParent();
7316 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7317 if (RI.hasVGPRs(DstRC)) {
7318 // Update all the operands so they are VGPR register classes. These may
7319 // not be the same register class because REG_SEQUENCE supports mixing
7320 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7321 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7322 MachineOperand &Op = MI.getOperand(I);
7323 if (!Op.isReg() || !Op.getReg().isVirtual())
7324 continue;
7325
7326 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7327 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7328 if (VRC == OpRC)
7329 continue;
7330
7331 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7332 Op.setIsKill();
7333 }
7334 }
7335
7336 return CreatedBB;
7337 }
7338
7339 // Legalize INSERT_SUBREG
7340 // src0 must have the same register class as dst
7341 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7342 Register Dst = MI.getOperand(0).getReg();
7343 Register Src0 = MI.getOperand(1).getReg();
7344 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7345 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7346 if (DstRC != Src0RC) {
7347 MachineBasicBlock *MBB = MI.getParent();
7348 MachineOperand &Op = MI.getOperand(1);
7349 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7350 }
7351 return CreatedBB;
7352 }
7353
7354 // Legalize SI_INIT_M0
7355 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7356 MachineOperand &Src = MI.getOperand(0);
7357 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7358 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7359 return CreatedBB;
7360 }
7361
7362 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7363 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7364 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7365 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7366 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7367 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7368 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7369 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7370 MachineOperand &Src = MI.getOperand(1);
7371 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7372 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7373 return CreatedBB;
7374 }
7375
7376 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7377 //
7378 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7379 // scratch memory access. In both cases, the legalization never involves
7380 // conversion to the addr64 form.
7381 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7382 (isMUBUF(MI) || isMTBUF(MI)))) {
7383 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7384 ? AMDGPU::OpName::rsrc
7385 : AMDGPU::OpName::srsrc;
7386 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7387 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7388 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7389
7390 AMDGPU::OpName SampOpName =
7391 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7392 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7393 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7394 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7395
7396 return CreatedBB;
7397 }
7398
7399 // Legalize SI_CALL
7400 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7401 MachineOperand *Dest = &MI.getOperand(0);
7402 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7403 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and the
7404 // following copies; we also need to move copies from and to physical
7405 // registers into the loop block.
7406 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7407 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7408
7409 // Also move the copies to physical registers into the loop block
7410 MachineBasicBlock &MBB = *MI.getParent();
7411 MachineBasicBlock::iterator Start(&MI);
7412 while (Start->getOpcode() != FrameSetupOpcode)
7413 --Start;
7414 MachineBasicBlock::iterator End(&MI);
7415 while (End->getOpcode() != FrameDestroyOpcode)
7416 ++End;
7417 // Also include following copies of the return value
7418 ++End;
7419 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7420 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7421 ++End;
7422 CreatedBB =
7423 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7424 }
7425 }
7426
7427 // Legalize s_sleep_var.
7428 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7429 const DebugLoc &DL = MI.getDebugLoc();
7430 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7431 int Src0Idx =
7432 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7433 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7434 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7435 .add(Src0);
7436 Src0.ChangeToRegister(Reg, false);
7437 return nullptr;
7438 }
7439
7440 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7441 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7442 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7443 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7444 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7445 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7446 for (MachineOperand &Src : MI.explicit_operands()) {
7447 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7448 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7449 }
7450 return CreatedBB;
7451 }
7452
7453 // Legalize MUBUF instructions.
7454 bool isSoffsetLegal = true;
7455 int SoffsetIdx =
7456 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7457 if (SoffsetIdx != -1) {
7458 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7459 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7460 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7461 isSoffsetLegal = false;
7462 }
7463 }
7464
7465 bool isRsrcLegal = true;
7466 int RsrcIdx =
7467 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7468 if (RsrcIdx != -1) {
7469 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7470 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7471 isRsrcLegal = false;
7472 }
7473
7474 // The operands are legal.
7475 if (isRsrcLegal && isSoffsetLegal)
7476 return CreatedBB;
7477
7478 if (!isRsrcLegal) {
7479 // Legalize a VGPR Rsrc
7480 //
7481 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7482 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7483 // a zero-value SRsrc.
7484 //
7485 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7486 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7487 // above.
7488 //
7489 // Otherwise we are on non-ADDR64 hardware, and/or we have
7490 // idxen/offen/bothen and we fall back to a waterfall loop.
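// For the ADDR64 path the 64-bit pointer pulled out of the VGPR Rsrc is simply
// added to VAddr with a carry chain, roughly (illustrative vregs):
//   %lo, %cc = V_ADD_CO_U32_e64  RsrcPtr.sub0, VAddr.sub0
//   %hi      = V_ADDC_U32_e64    RsrcPtr.sub1, VAddr.sub1, %cc
//   NewVAddr = REG_SEQUENCE %lo, sub0, %hi, sub1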
7491
7492 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7493 MachineBasicBlock &MBB = *MI.getParent();
7494
7495 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7496 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7497 // This is already an ADDR64 instruction so we need to add the pointer
7498 // extracted from the resource descriptor to the current value of VAddr.
7499 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7500 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7501 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7502
7503 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7504 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7505 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7506
7507 unsigned RsrcPtr, NewSRsrc;
7508 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7509
7510 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7511 const DebugLoc &DL = MI.getDebugLoc();
7512 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7513 .addDef(CondReg0)
7514 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7515 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7516 .addImm(0);
7517
7518 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7519 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7520 .addDef(CondReg1, RegState::Dead)
7521 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7522 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7523 .addReg(CondReg0, RegState::Kill)
7524 .addImm(0);
7525
7526 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7527 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7528 .addReg(NewVAddrLo)
7529 .addImm(AMDGPU::sub0)
7530 .addReg(NewVAddrHi)
7531 .addImm(AMDGPU::sub1);
7532
7533 VAddr->setReg(NewVAddr);
7534 Rsrc->setReg(NewSRsrc);
7535 } else if (!VAddr && ST.hasAddr64()) {
7536 // This instruction is the _OFFSET variant, so we need to convert it to
7537 // ADDR64.
7538 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7539 "FIXME: Need to emit flat atomics here");
7540
7541 unsigned RsrcPtr, NewSRsrc;
7542 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7543
7544 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7545 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7546 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7547 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7548 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7549
7550 // Atomics with return have an additional tied operand and are
7551 // missing some of the special bits.
7552 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7553 MachineInstr *Addr64;
7554
7555 if (!VDataIn) {
7556 // Regular buffer load / store.
7557 MachineInstrBuilder MIB =
7558 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7559 .add(*VData)
7560 .addReg(NewVAddr)
7561 .addReg(NewSRsrc)
7562 .add(*SOffset)
7563 .add(*Offset);
7564
7565 if (const MachineOperand *CPol =
7566 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7567 MIB.addImm(CPol->getImm());
7568 }
7569
7570 if (const MachineOperand *TFE =
7571 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7572 MIB.addImm(TFE->getImm());
7573 }
7574
7575 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7576
7577 MIB.cloneMemRefs(MI);
7578 Addr64 = MIB;
7579 } else {
7580 // Atomics with return.
7581 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7582 .add(*VData)
7583 .add(*VDataIn)
7584 .addReg(NewVAddr)
7585 .addReg(NewSRsrc)
7586 .add(*SOffset)
7587 .add(*Offset)
7588 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7589 .cloneMemRefs(MI);
7590 }
7591
7592 MI.removeFromParent();
7593
7594 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7595 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7596 NewVAddr)
7597 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7598 .addImm(AMDGPU::sub0)
7599 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7600 .addImm(AMDGPU::sub1);
7601 } else {
7602 // Legalize a VGPR Rsrc and soffset together.
7603 if (!isSoffsetLegal) {
7604 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7605 CreatedBB =
7606 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7607 return CreatedBB;
7608 }
7609 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7610 return CreatedBB;
7611 }
7612 }
7613
7614 // Legalize a VGPR soffset.
7615 if (!isSoffsetLegal) {
7616 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7617 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7618 return CreatedBB;
7619 }
7620 return CreatedBB;
7621}
7622
7623void SIInstrWorklist::insert(MachineInstr *MI) {
7624 InstrList.insert(MI);
7625 // Add MBUF instructions to the deferred list.
7626 int RsrcIdx =
7627 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7628 if (RsrcIdx != -1) {
7629 DeferredList.insert(MI);
7630 }
7631}
7632
7633bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7634 return DeferredList.contains(MI);
7635}
7636
7637// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7638// lowering (change sgpr to vgpr).
7639// This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7640// different sizes. We need to legalize the operand sizes during the vgpr
7641// lowering chain. This can be removed after we have sgpr16 in place.
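// For example (sketch, hypothetical vregs): a 16-bit VGPR value feeding an
// operand that expects a 32-bit register class is widened as
//   %undef:vgpr_16 = IMPLICIT_DEF
//   %wide:vgpr_32  = REG_SEQUENCE %val:vgpr_16, lo16, %undef:vgpr_16, hi16
// while the opposite mismatch is handled by reading the lo16 subregister.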
7642void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7643 MachineRegisterInfo &MRI) const {
7644 if (!ST.useRealTrue16Insts())
7645 return;
7646
7647 unsigned Opcode = MI.getOpcode();
7648 MachineBasicBlock *MBB = MI.getParent();
7649 // Legalize operands and check for size mismatch
7650 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7651 OpIdx >= get(Opcode).getNumOperands() ||
7652 get(Opcode).operands()[OpIdx].RegClass == -1)
7653 return;
7654
7655 MachineOperand &Op = MI.getOperand(OpIdx);
7656 if (!Op.isReg() || !Op.getReg().isVirtual())
7657 return;
7658
7659 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7660 if (!RI.isVGPRClass(CurrRC))
7661 return;
7662
7663 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7664 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7665 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7666 Op.setSubReg(AMDGPU::lo16);
7667 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7668 const DebugLoc &DL = MI.getDebugLoc();
7669 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7670 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7671 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7672 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7673 .addReg(Op.getReg())
7674 .addImm(AMDGPU::lo16)
7675 .addReg(Undef)
7676 .addImm(AMDGPU::hi16);
7677 Op.setReg(NewDstReg);
7678 }
7679}
7680void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7681 MachineRegisterInfo &MRI) const {
7682 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7683 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7684}
7685
7686void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7687 MachineDominatorTree *MDT) const {
7688
7689 while (!Worklist.empty()) {
7690 MachineInstr &Inst = *Worklist.top();
7691 Worklist.erase_top();
7692 // Skip MachineInstr in the deferred list.
7693 if (Worklist.isDeferred(&Inst))
7694 continue;
7695 moveToVALUImpl(Worklist, MDT, Inst);
7696 }
7697
7698 // The deferred list of instructions will be processed once
7699 // all the MachineInstrs in the worklist are done.
7700 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7701 moveToVALUImpl(Worklist, MDT, *Inst);
7702 assert(Worklist.empty() &&
7703 "Deferred MachineInstr are not supposed to re-populate worklist");
7704 }
7705}
7706
7707void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7708 MachineDominatorTree *MDT,
7709 MachineInstr &Inst) const {
7710
7711 MachineBasicBlock *MBB = Inst.getParent();
7712 if (!MBB)
7713 return;
7714 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7715 unsigned Opcode = Inst.getOpcode();
7716 unsigned NewOpcode = getVALUOp(Inst);
7717 const DebugLoc &DL = Inst.getDebugLoc();
7718
7719 // Handle some special cases
7720 switch (Opcode) {
7721 default:
7722 break;
7723 case AMDGPU::S_ADD_I32:
7724 case AMDGPU::S_SUB_I32: {
7725 // FIXME: The u32 versions currently selected use the carry.
7726 bool Changed;
7727 MachineBasicBlock *CreatedBBTmp = nullptr;
7728 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7729 if (Changed)
7730 return;
7731
7732 // Default handling
7733 break;
7734 }
7735
7736 case AMDGPU::S_MUL_U64:
7737 if (ST.hasVectorMulU64()) {
7738 NewOpcode = AMDGPU::V_MUL_U64_e64;
7739 break;
7740 }
7741 // Split s_mul_u64 in 32-bit vector multiplications.
7742 splitScalarSMulU64(Worklist, Inst, MDT);
7743 Inst.eraseFromParent();
7744 return;
7745
7746 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7747 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7748 // This is a special case of s_mul_u64 where all the operands are either
7749 // zero extended or sign extended.
7750 splitScalarSMulPseudo(Worklist, Inst, MDT);
7751 Inst.eraseFromParent();
7752 return;
7753
7754 case AMDGPU::S_AND_B64:
7755 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7756 Inst.eraseFromParent();
7757 return;
7758
7759 case AMDGPU::S_OR_B64:
7760 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7761 Inst.eraseFromParent();
7762 return;
7763
7764 case AMDGPU::S_XOR_B64:
7765 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7766 Inst.eraseFromParent();
7767 return;
7768
7769 case AMDGPU::S_NAND_B64:
7770 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7771 Inst.eraseFromParent();
7772 return;
7773
7774 case AMDGPU::S_NOR_B64:
7775 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7776 Inst.eraseFromParent();
7777 return;
7778
7779 case AMDGPU::S_XNOR_B64:
7780 if (ST.hasDLInsts())
7781 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7782 else
7783 splitScalar64BitXnor(Worklist, Inst, MDT);
7784 Inst.eraseFromParent();
7785 return;
7786
7787 case AMDGPU::S_ANDN2_B64:
7788 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7789 Inst.eraseFromParent();
7790 return;
7791
7792 case AMDGPU::S_ORN2_B64:
7793 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7794 Inst.eraseFromParent();
7795 return;
7796
7797 case AMDGPU::S_BREV_B64:
7798 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7799 Inst.eraseFromParent();
7800 return;
7801
7802 case AMDGPU::S_NOT_B64:
7803 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7804 Inst.eraseFromParent();
7805 return;
7806
7807 case AMDGPU::S_BCNT1_I32_B64:
7808 splitScalar64BitBCNT(Worklist, Inst);
7809 Inst.eraseFromParent();
7810 return;
7811
7812 case AMDGPU::S_BFE_I64:
7813 splitScalar64BitBFE(Worklist, Inst);
7814 Inst.eraseFromParent();
7815 return;
7816
7817 case AMDGPU::S_FLBIT_I32_B64:
7818 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7819 Inst.eraseFromParent();
7820 return;
7821 case AMDGPU::S_FF1_I32_B64:
7822 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7823 Inst.eraseFromParent();
7824 return;
7825
7826 case AMDGPU::S_LSHL_B32:
7827 if (ST.hasOnlyRevVALUShifts()) {
7828 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7829 swapOperands(Inst);
7830 }
7831 break;
7832 case AMDGPU::S_ASHR_I32:
7833 if (ST.hasOnlyRevVALUShifts()) {
7834 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7835 swapOperands(Inst);
7836 }
7837 break;
7838 case AMDGPU::S_LSHR_B32:
7839 if (ST.hasOnlyRevVALUShifts()) {
7840 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7841 swapOperands(Inst);
7842 }
7843 break;
7844 case AMDGPU::S_LSHL_B64:
7845 if (ST.hasOnlyRevVALUShifts()) {
7846 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7847 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7848 : AMDGPU::V_LSHLREV_B64_e64;
7849 swapOperands(Inst);
7850 }
7851 break;
7852 case AMDGPU::S_ASHR_I64:
7853 if (ST.hasOnlyRevVALUShifts()) {
7854 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7855 swapOperands(Inst);
7856 }
7857 break;
7858 case AMDGPU::S_LSHR_B64:
7859 if (ST.hasOnlyRevVALUShifts()) {
7860 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7861 swapOperands(Inst);
7862 }
7863 break;
7864
7865 case AMDGPU::S_ABS_I32:
7866 lowerScalarAbs(Worklist, Inst);
7867 Inst.eraseFromParent();
7868 return;
7869
7870 case AMDGPU::S_ABSDIFF_I32:
7871 lowerScalarAbsDiff(Worklist, Inst);
7872 Inst.eraseFromParent();
7873 return;
7874
7875 case AMDGPU::S_CBRANCH_SCC0:
7876 case AMDGPU::S_CBRANCH_SCC1: {
7877 // Clear unused bits of vcc
7878 Register CondReg = Inst.getOperand(1).getReg();
7879 bool IsSCC = CondReg == AMDGPU::SCC;
7881 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7882 .addReg(LMC.ExecReg)
7883 .addReg(IsSCC ? LMC.VccReg : CondReg);
7884 Inst.removeOperand(1);
7885 } break;
7886
7887 case AMDGPU::S_BFE_U64:
7888 case AMDGPU::S_BFM_B64:
7889 llvm_unreachable("Moving this op to VALU not implemented");
7890
7891 case AMDGPU::S_PACK_LL_B32_B16:
7892 case AMDGPU::S_PACK_LH_B32_B16:
7893 case AMDGPU::S_PACK_HL_B32_B16:
7894 case AMDGPU::S_PACK_HH_B32_B16:
7895 movePackToVALU(Worklist, MRI, Inst);
7896 Inst.eraseFromParent();
7897 return;
7898
7899 case AMDGPU::S_XNOR_B32:
7900 lowerScalarXnor(Worklist, Inst);
7901 Inst.eraseFromParent();
7902 return;
7903
7904 case AMDGPU::S_NAND_B32:
7905 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7906 Inst.eraseFromParent();
7907 return;
7908
7909 case AMDGPU::S_NOR_B32:
7910 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7911 Inst.eraseFromParent();
7912 return;
7913
7914 case AMDGPU::S_ANDN2_B32:
7915 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7916 Inst.eraseFromParent();
7917 return;
7918
7919 case AMDGPU::S_ORN2_B32:
7920 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7921 Inst.eraseFromParent();
7922 return;
7923
7924 // TODO: remove as soon as everything is ready
7925 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7926 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7927 // can only be selected from the uniform SDNode.
7928 case AMDGPU::S_ADD_CO_PSEUDO:
7929 case AMDGPU::S_SUB_CO_PSEUDO: {
7930 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7931 ? AMDGPU::V_ADDC_U32_e64
7932 : AMDGPU::V_SUBB_U32_e64;
7933 const auto *CarryRC = RI.getWaveMaskRegClass();
7934
7935 Register CarryInReg = Inst.getOperand(4).getReg();
7936 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7937 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7938 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7939 .addReg(CarryInReg);
7940 }
7941
7942 Register CarryOutReg = Inst.getOperand(1).getReg();
7943
7944 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7945 MRI.getRegClass(Inst.getOperand(0).getReg())));
7946 MachineInstr *CarryOp =
7947 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7948 .addReg(CarryOutReg, RegState::Define)
7949 .add(Inst.getOperand(2))
7950 .add(Inst.getOperand(3))
7951 .addReg(CarryInReg)
7952 .addImm(0);
7953 legalizeOperands(*CarryOp);
7954 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7955 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7956 Inst.eraseFromParent();
7957 }
7958 return;
7959 case AMDGPU::S_UADDO_PSEUDO:
7960 case AMDGPU::S_USUBO_PSEUDO: {
7961 MachineOperand &Dest0 = Inst.getOperand(0);
7962 MachineOperand &Dest1 = Inst.getOperand(1);
7963 MachineOperand &Src0 = Inst.getOperand(2);
7964 MachineOperand &Src1 = Inst.getOperand(3);
7965
7966 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7967 ? AMDGPU::V_ADD_CO_U32_e64
7968 : AMDGPU::V_SUB_CO_U32_e64;
7969 const TargetRegisterClass *NewRC =
7970 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7971 Register DestReg = MRI.createVirtualRegister(NewRC);
7972 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7973 .addReg(Dest1.getReg(), RegState::Define)
7974 .add(Src0)
7975 .add(Src1)
7976 .addImm(0); // clamp bit
7977
7978 legalizeOperands(*NewInstr, MDT);
7979 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7980 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7981 Inst.eraseFromParent();
7982 }
7983 return;
7984 case AMDGPU::S_LSHL1_ADD_U32:
7985 case AMDGPU::S_LSHL2_ADD_U32:
7986 case AMDGPU::S_LSHL3_ADD_U32:
7987 case AMDGPU::S_LSHL4_ADD_U32: {
7988 MachineOperand &Dest = Inst.getOperand(0);
7989 MachineOperand &Src0 = Inst.getOperand(1);
7990 MachineOperand &Src1 = Inst.getOperand(2);
7991 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
7992 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
7993 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
7994 : 4);
7995
7996 const TargetRegisterClass *NewRC =
7997 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
7998 Register DestReg = MRI.createVirtualRegister(NewRC);
7999 MachineInstr *NewInstr =
8000 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8001 .add(Src0)
8002 .addImm(ShiftAmt)
8003 .add(Src1);
8004
8005 legalizeOperands(*NewInstr, MDT);
8006 MRI.replaceRegWith(Dest.getReg(), DestReg);
8007 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8008 Inst.eraseFromParent();
8009 }
8010 return;
8011 case AMDGPU::S_CSELECT_B32:
8012 case AMDGPU::S_CSELECT_B64:
8013 lowerSelect(Worklist, Inst, MDT);
8014 Inst.eraseFromParent();
8015 return;
8016 case AMDGPU::S_CMP_EQ_I32:
8017 case AMDGPU::S_CMP_LG_I32:
8018 case AMDGPU::S_CMP_GT_I32:
8019 case AMDGPU::S_CMP_GE_I32:
8020 case AMDGPU::S_CMP_LT_I32:
8021 case AMDGPU::S_CMP_LE_I32:
8022 case AMDGPU::S_CMP_EQ_U32:
8023 case AMDGPU::S_CMP_LG_U32:
8024 case AMDGPU::S_CMP_GT_U32:
8025 case AMDGPU::S_CMP_GE_U32:
8026 case AMDGPU::S_CMP_LT_U32:
8027 case AMDGPU::S_CMP_LE_U32:
8028 case AMDGPU::S_CMP_EQ_U64:
8029 case AMDGPU::S_CMP_LG_U64:
8030 case AMDGPU::S_CMP_LT_F32:
8031 case AMDGPU::S_CMP_EQ_F32:
8032 case AMDGPU::S_CMP_LE_F32:
8033 case AMDGPU::S_CMP_GT_F32:
8034 case AMDGPU::S_CMP_LG_F32:
8035 case AMDGPU::S_CMP_GE_F32:
8036 case AMDGPU::S_CMP_O_F32:
8037 case AMDGPU::S_CMP_U_F32:
8038 case AMDGPU::S_CMP_NGE_F32:
8039 case AMDGPU::S_CMP_NLG_F32:
8040 case AMDGPU::S_CMP_NGT_F32:
8041 case AMDGPU::S_CMP_NLE_F32:
8042 case AMDGPU::S_CMP_NEQ_F32:
8043 case AMDGPU::S_CMP_NLT_F32: {
8044 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8045 auto NewInstr =
8046 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8047 .setMIFlags(Inst.getFlags());
8048 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8049 0) {
8050 NewInstr
8051 .addImm(0) // src0_modifiers
8052 .add(Inst.getOperand(0)) // src0
8053 .addImm(0) // src1_modifiers
8054 .add(Inst.getOperand(1)) // src1
8055 .addImm(0); // clamp
8056 } else {
8057 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8058 }
8059 legalizeOperands(*NewInstr, MDT);
8060 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8061 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8062 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8063 Inst.eraseFromParent();
8064 return;
8065 }
8066 case AMDGPU::S_CMP_LT_F16:
8067 case AMDGPU::S_CMP_EQ_F16:
8068 case AMDGPU::S_CMP_LE_F16:
8069 case AMDGPU::S_CMP_GT_F16:
8070 case AMDGPU::S_CMP_LG_F16:
8071 case AMDGPU::S_CMP_GE_F16:
8072 case AMDGPU::S_CMP_O_F16:
8073 case AMDGPU::S_CMP_U_F16:
8074 case AMDGPU::S_CMP_NGE_F16:
8075 case AMDGPU::S_CMP_NLG_F16:
8076 case AMDGPU::S_CMP_NGT_F16:
8077 case AMDGPU::S_CMP_NLE_F16:
8078 case AMDGPU::S_CMP_NEQ_F16:
8079 case AMDGPU::S_CMP_NLT_F16: {
8080 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8081 auto NewInstr =
8082 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8083 .setMIFlags(Inst.getFlags());
8084 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8085 NewInstr
8086 .addImm(0) // src0_modifiers
8087 .add(Inst.getOperand(0)) // src0
8088 .addImm(0) // src1_modifiers
8089 .add(Inst.getOperand(1)) // src1
8090 .addImm(0); // clamp
8091 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8092 NewInstr.addImm(0); // op_sel0
8093 } else {
8094 NewInstr
8095 .add(Inst.getOperand(0))
8096 .add(Inst.getOperand(1));
8097 }
8098 legalizeOperandsVALUt16(*NewInstr, MRI);
8099 legalizeOperands(*NewInstr, MDT);
8100 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8101 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8102 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8103 Inst.eraseFromParent();
8104 return;
8105 }
8106 case AMDGPU::S_CVT_HI_F32_F16: {
8107 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8108 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8109 if (ST.useRealTrue16Insts()) {
8110 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8111 .add(Inst.getOperand(1));
8112 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8113 .addImm(0) // src0_modifiers
8114 .addReg(TmpReg, 0, AMDGPU::hi16)
8115 .addImm(0) // clamp
8116 .addImm(0) // omod
8117 .addImm(0); // op_sel0
8118 } else {
8119 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8120 .addImm(16)
8121 .add(Inst.getOperand(1));
8122 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8123 .addImm(0) // src0_modifiers
8124 .addReg(TmpReg)
8125 .addImm(0) // clamp
8126 .addImm(0); // omod
8127 }
8128
8129 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8130 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8131 Inst.eraseFromParent();
8132 return;
8133 }
8134 case AMDGPU::S_MINIMUM_F32:
8135 case AMDGPU::S_MAXIMUM_F32: {
8136 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8137 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8138 .addImm(0) // src0_modifiers
8139 .add(Inst.getOperand(1))
8140 .addImm(0) // src1_modifiers
8141 .add(Inst.getOperand(2))
8142 .addImm(0) // clamp
8143 .addImm(0); // omod
8144 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8145
8146 legalizeOperands(*NewInstr, MDT);
8147 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8148 Inst.eraseFromParent();
8149 return;
8150 }
8151 case AMDGPU::S_MINIMUM_F16:
8152 case AMDGPU::S_MAXIMUM_F16: {
8153 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8154 ? &AMDGPU::VGPR_16RegClass
8155 : &AMDGPU::VGPR_32RegClass);
8156 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8157 .addImm(0) // src0_modifiers
8158 .add(Inst.getOperand(1))
8159 .addImm(0) // src1_modifiers
8160 .add(Inst.getOperand(2))
8161 .addImm(0) // clamp
8162 .addImm(0) // omod
8163 .addImm(0); // opsel0
8164 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8165 legalizeOperandsVALUt16(*NewInstr, MRI);
8166 legalizeOperands(*NewInstr, MDT);
8167 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8168 Inst.eraseFromParent();
8169 return;
8170 }
8171 case AMDGPU::V_S_EXP_F16_e64:
8172 case AMDGPU::V_S_LOG_F16_e64:
8173 case AMDGPU::V_S_RCP_F16_e64:
8174 case AMDGPU::V_S_RSQ_F16_e64:
8175 case AMDGPU::V_S_SQRT_F16_e64: {
8176 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8177 ? &AMDGPU::VGPR_16RegClass
8178 : &AMDGPU::VGPR_32RegClass);
8179 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8180 .add(Inst.getOperand(1)) // src0_modifiers
8181 .add(Inst.getOperand(2))
8182 .add(Inst.getOperand(3)) // clamp
8183 .add(Inst.getOperand(4)) // omod
8184 .setMIFlags(Inst.getFlags());
8185 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8186 NewInstr.addImm(0); // opsel0
8187 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8188 legalizeOperandsVALUt16(*NewInstr, MRI);
8189 legalizeOperands(*NewInstr, MDT);
8190 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8191 Inst.eraseFromParent();
8192 return;
8193 }
8194 }
8195
8196 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8197 // We cannot move this instruction to the VALU, so we should try to
8198 // legalize its operands instead.
8199 legalizeOperands(Inst, MDT);
8200 return;
8201 }
8202 // Handle converting generic instructions like COPY-to-SGPR into
8203 // COPY-to-VGPR.
8204 if (NewOpcode == Opcode) {
8205 Register DstReg = Inst.getOperand(0).getReg();
8206 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8207
8208 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8209 // hope for the best.
8210 if (Inst.isCopy() && DstReg.isPhysical() &&
8211 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8212 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8213 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8214 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8215 .add(Inst.getOperand(1));
8216 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8217 DstReg)
8218 .addReg(NewDst);
8219
8220 Inst.eraseFromParent();
8221 return;
8222 }
8223
8224 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8225 Register NewDstReg = Inst.getOperand(1).getReg();
8226 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8227 if (const TargetRegisterClass *CommonRC =
8228 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8229 // Instead of creating a copy where src and dst are the same register
8230 // class, we just replace all uses of dst with src. These kinds of
8231 // copies interfere with the heuristics MachineSink uses to decide
8232 // whether or not to split a critical edge, since the pass assumes
8233 // that copies will end up as machine instructions and not be
8234 // eliminated.
8235 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8236 MRI.replaceRegWith(DstReg, NewDstReg);
8237 MRI.clearKillFlags(NewDstReg);
8238 Inst.getOperand(0).setReg(DstReg);
8239
8240 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8241 llvm_unreachable("failed to constrain register");
8242
8243 Inst.eraseFromParent();
8244 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8245 for (MachineOperand &MO :
8246 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8247 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8248 }
8249
8250 return;
8251 }
8252 }
8253
8254 // If this is a v2s copy between a 16-bit and a 32-bit reg,
8255 // replace the vgpr copy with a reg_sequence/extract_subreg.
8256 // This can be removed after we have sgpr16 in place.
8257 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8258 Inst.getOperand(1).getReg().isVirtual() &&
8259 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8260 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8261 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8262 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8263 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8264 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8265 get(AMDGPU::IMPLICIT_DEF), Undef);
8266 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8267 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8268 .addReg(Inst.getOperand(1).getReg())
8269 .addImm(AMDGPU::lo16)
8270 .addReg(Undef)
8271 .addImm(AMDGPU::hi16);
8272 Inst.eraseFromParent();
8273 MRI.replaceRegWith(DstReg, NewDstReg);
8274 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8275 return;
8276 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8277 AMDGPU::lo16)) {
8278 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8279 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8280 MRI.replaceRegWith(DstReg, NewDstReg);
8281 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8282 return;
8283 }
8284 }
8285
8286 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8287 MRI.replaceRegWith(DstReg, NewDstReg);
8288 legalizeOperands(Inst, MDT);
8289 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8290 return;
8291 }
8292
8293 // Use the new VALU Opcode.
8294 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8295 .setMIFlags(Inst.getFlags());
8296 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8297 // Intersperse VOP3 modifiers among the SALU operands.
8298 NewInstr->addOperand(Inst.getOperand(0));
8299 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8300 AMDGPU::OpName::src0_modifiers) >= 0)
8301 NewInstr.addImm(0);
8302 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8303 const MachineOperand &Src = Inst.getOperand(1);
8304 NewInstr->addOperand(Src);
8305 }
8306
8307 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8308 // We are converting these to a BFE, so we need to add the missing
8309 // operands for the size and offset.
8310 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8311 NewInstr.addImm(0);
8312 NewInstr.addImm(Size);
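 // For example (sketch): S_SEXT_I32_I8 %x becomes a V_BFE with offset 0 and
 // width 8, and S_SEXT_I32_I16 %x becomes the same BFE with width 16.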
8313 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8314 // The VALU version adds the second operand to the result, so insert an
8315 // extra 0 operand.
8316 NewInstr.addImm(0);
8317 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8318 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8319 // If we need to move this to VGPRs, we need to unpack the second
8320 // operand back into the 2 separate ones for bit offset and width.
8321 assert(OffsetWidthOp.isImm() &&
8322 "Scalar BFE is only implemented for constant width and offset");
8323 uint32_t Imm = OffsetWidthOp.getImm();
8324
8325 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8326 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
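 // Worked example (sketch): Imm = 0x00100008 yields Offset = 8 and
 // BitWidth = 0x10 = 16, i.e. extract 16 bits starting at bit 8.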
8327 NewInstr.addImm(Offset);
8328 NewInstr.addImm(BitWidth);
8329 } else {
8330 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8331 AMDGPU::OpName::src1_modifiers) >= 0)
8332 NewInstr.addImm(0);
8333 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8334 NewInstr->addOperand(Inst.getOperand(2));
8335 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8336 AMDGPU::OpName::src2_modifiers) >= 0)
8337 NewInstr.addImm(0);
8338 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8339 NewInstr->addOperand(Inst.getOperand(3));
8340 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8341 NewInstr.addImm(0);
8342 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8343 NewInstr.addImm(0);
8344 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8345 NewInstr.addImm(0);
8346 }
8347 } else {
8348 // Just copy the SALU operands.
8349 for (const MachineOperand &Op : Inst.explicit_operands())
8350 NewInstr->addOperand(Op);
8351 }
8352
8353 // Remove any references to SCC. Vector instructions can't read from it, and
8354 // we're just about to add the implicit use / defs of VCC, and we don't want
8355 // both.
8356 for (MachineOperand &Op : Inst.implicit_operands()) {
8357 if (Op.getReg() == AMDGPU::SCC) {
8358 // Only propagate through live-def of SCC.
8359 if (Op.isDef() && !Op.isDead())
8360 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8361 if (Op.isUse())
8362 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8363 }
8364 }
8365 Inst.eraseFromParent();
8366 Register NewDstReg;
8367 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8368 Register DstReg = NewInstr->getOperand(0).getReg();
8369 assert(DstReg.isVirtual());
8370 // Update the destination register class.
8371 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8372 assert(NewDstRC);
8373 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8374 MRI.replaceRegWith(DstReg, NewDstReg);
8375 }
8376 fixImplicitOperands(*NewInstr);
8377
8378 legalizeOperandsVALUt16(*NewInstr, MRI);
8379
8380 // Legalize the operands
8381 legalizeOperands(*NewInstr, MDT);
8382 if (NewDstReg)
8383 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8384}
8385
8386// Add/sub require special handling to deal with carry outs.
8387std::pair<bool, MachineBasicBlock *>
8388SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8389 MachineDominatorTree *MDT) const {
8390 if (ST.hasAddNoCarry()) {
8391 // Assume there is no user of scc since we don't select this in that case.
8392 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8393 // is used.
8394
8395 MachineBasicBlock &MBB = *Inst.getParent();
8396 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8397
8398 Register OldDstReg = Inst.getOperand(0).getReg();
8399 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8400
8401 unsigned Opc = Inst.getOpcode();
8402 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8403
8404 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8405 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8406
8407 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8408 Inst.removeOperand(3);
8409
8410 Inst.setDesc(get(NewOpc));
8411 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8412 Inst.addImplicitDefUseOperands(*MBB.getParent());
8413 MRI.replaceRegWith(OldDstReg, ResultReg);
8414 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8415
8416 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8417 return std::pair(true, NewBB);
8418 }
8419
8420 return std::pair(false, nullptr);
8421}
8422
8423void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8424 MachineDominatorTree *MDT) const {
8425
8426 MachineBasicBlock &MBB = *Inst.getParent();
8427 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8428 MachineBasicBlock::iterator MII = Inst;
8429 DebugLoc DL = Inst.getDebugLoc();
8430
8431 MachineOperand &Dest = Inst.getOperand(0);
8432 MachineOperand &Src0 = Inst.getOperand(1);
8433 MachineOperand &Src1 = Inst.getOperand(2);
8434 MachineOperand &Cond = Inst.getOperand(3);
8435
8436 Register CondReg = Cond.getReg();
8437 bool IsSCC = (CondReg == AMDGPU::SCC);
8438
8439 // If this is a trivial select where the condition is effectively not SCC
8440 // (CondReg is a source of copy to SCC), then the select is semantically
8441 // equivalent to copying CondReg. Hence, there is no need to create
8442 // V_CNDMASK, we can just use that and bail out.
8443 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8444 (Src1.getImm() == 0)) {
8445 MRI.replaceRegWith(Dest.getReg(), CondReg);
8446 return;
8447 }
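// Illustrative example (not from the source): for a lowered
//   %dst = S_CSELECT_B32/B64 -1, 0   (condition held in %cond, not in SCC)
// the selected value is exactly the lane mask already in %cond, so the
// replaceRegWith above suffices and no V_CNDMASK needs to be emitted.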
8448
8449 Register NewCondReg = CondReg;
8450 if (IsSCC) {
8451 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8452 NewCondReg = MRI.createVirtualRegister(TC);
8453
8454 // Now look for the closest SCC def if it is a copy
8455 // replacing the CondReg with the COPY source register
8456 bool CopyFound = false;
8457 for (MachineInstr &CandI :
8458 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8459 Inst.getParent()->rend())) {
8460 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8461 -1) {
8462 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8463 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8464 .addReg(CandI.getOperand(1).getReg());
8465 CopyFound = true;
8466 }
8467 break;
8468 }
8469 }
8470 if (!CopyFound) {
8471 // SCC def is not a copy
8472 // Insert a trivial select instead of creating a copy, because a copy from
8473 // SCC would semantically mean just copying a single bit, but we may need
8474 // the result to be a vector condition mask that needs preserving.
8475 unsigned Opcode =
8476 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8477 auto NewSelect =
8478 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8479 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8480 }
8481 }
8482
8483 Register NewDestReg = MRI.createVirtualRegister(
8484 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8485 MachineInstr *NewInst;
8486 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8487 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8488 .addImm(0)
8489 .add(Src1) // False
8490 .addImm(0)
8491 .add(Src0) // True
8492 .addReg(NewCondReg);
8493 } else {
8494 NewInst =
8495 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8496 .add(Src1) // False
8497 .add(Src0) // True
8498 .addReg(NewCondReg);
8499 }
8500 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8501 legalizeOperands(*NewInst, MDT);
8502 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8503}
8504
8505void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8506 MachineInstr &Inst) const {
8507 MachineBasicBlock &MBB = *Inst.getParent();
8508 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8509 MachineBasicBlock::iterator MII = Inst;
8510 DebugLoc DL = Inst.getDebugLoc();
8511
8512 MachineOperand &Dest = Inst.getOperand(0);
8513 MachineOperand &Src = Inst.getOperand(1);
8514 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8515 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8516
8517 unsigned SubOp = ST.hasAddNoCarry() ?
8518 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8519
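// abs(x) is lowered as max(x, 0 - x): the subtract below materializes -x
// in TmpReg and V_MAX_I32 then selects the non-negative value, e.g. for
// x = -5 we get TmpReg = 5 and ResultReg = 5.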
8520 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8521 .addImm(0)
8522 .addReg(Src.getReg());
8523
8524 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8525 .addReg(Src.getReg())
8526 .addReg(TmpReg);
8527
8528 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8529 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8530}
8531
8532void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8533 MachineInstr &Inst) const {
8534 MachineBasicBlock &MBB = *Inst.getParent();
8535 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8536 MachineBasicBlock::iterator MII = Inst;
8537 const DebugLoc &DL = Inst.getDebugLoc();
8538
8539 MachineOperand &Dest = Inst.getOperand(0);
8540 MachineOperand &Src1 = Inst.getOperand(1);
8541 MachineOperand &Src2 = Inst.getOperand(2);
8542 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8543 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8544 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8545
8546 unsigned SubOp =
8547 ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8548
8549 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8550 .addReg(Src1.getReg())
8551 .addReg(Src2.getReg());
8552
8553 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8554
8555 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8556 .addReg(SubResultReg)
8557 .addReg(TmpReg);
8558
8559 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8560 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8561}
8562
8563void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8564 MachineInstr &Inst) const {
8565 MachineBasicBlock &MBB = *Inst.getParent();
8566 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8567 MachineBasicBlock::iterator MII = Inst;
8568 const DebugLoc &DL = Inst.getDebugLoc();
8569
8570 MachineOperand &Dest = Inst.getOperand(0);
8571 MachineOperand &Src0 = Inst.getOperand(1);
8572 MachineOperand &Src1 = Inst.getOperand(2);
8573
8574 if (ST.hasDLInsts()) {
8575 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8576 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8577 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8578
8579 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8580 .add(Src0)
8581 .add(Src1);
8582
8583 MRI.replaceRegWith(Dest.getReg(), NewDest);
8584 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8585 } else {
8586 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8587 // invert either source and then perform the XOR. If either source is a
8588 // scalar register, then we can leave the inversion on the scalar unit to
8589 // achieve a better distribution of scalar and vector instructions.
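// Quick per-bit check of the identity (illustrative): for x = 1, y = 0,
// ~(1 ^ 0) = 0, (~1) ^ 0 = 0 and 1 ^ (~0) = 0, so the NOT may be folded
// into either operand.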
8590 bool Src0IsSGPR = Src0.isReg() &&
8591 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8592 bool Src1IsSGPR = Src1.isReg() &&
8593 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8594 MachineInstr *Xor;
8595 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8596 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8597
8598 // Build a pair of scalar instructions and add them to the work list.
8599 // The next iteration over the work list will lower these to the vector
8600 // unit as necessary.
8601 if (Src0IsSGPR) {
8602 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8603 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8604 .addReg(Temp)
8605 .add(Src1);
8606 } else if (Src1IsSGPR) {
8607 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8608 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8609 .add(Src0)
8610 .addReg(Temp);
8611 } else {
8612 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8613 .add(Src0)
8614 .add(Src1);
8615 MachineInstr *Not =
8616 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8617 Worklist.insert(Not);
8618 }
8619
8620 MRI.replaceRegWith(Dest.getReg(), NewDest);
8621
8622 Worklist.insert(Xor);
8623
8624 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8625 }
8626}
8627
8628void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8629 MachineInstr &Inst,
8630 unsigned Opcode) const {
8631 MachineBasicBlock &MBB = *Inst.getParent();
8632 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8633 MachineBasicBlock::iterator MII = Inst;
8634 const DebugLoc &DL = Inst.getDebugLoc();
8635
8636 MachineOperand &Dest = Inst.getOperand(0);
8637 MachineOperand &Src0 = Inst.getOperand(1);
8638 MachineOperand &Src1 = Inst.getOperand(2);
8639
8640 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8641 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8642
8643 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8644 .add(Src0)
8645 .add(Src1);
8646
8647 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8648 .addReg(Interm);
8649
8650 Worklist.insert(&Op);
8651 Worklist.insert(&Not);
8652
8653 MRI.replaceRegWith(Dest.getReg(), NewDest);
8654 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8655}
8656
8657void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8658 MachineInstr &Inst,
8659 unsigned Opcode) const {
8660 MachineBasicBlock &MBB = *Inst.getParent();
8661 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8662 MachineBasicBlock::iterator MII = Inst;
8663 const DebugLoc &DL = Inst.getDebugLoc();
8664
8665 MachineOperand &Dest = Inst.getOperand(0);
8666 MachineOperand &Src0 = Inst.getOperand(1);
8667 MachineOperand &Src1 = Inst.getOperand(2);
8668
8669 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8670 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8671
8672 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8673 .add(Src1);
8674
8675 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8676 .add(Src0)
8677 .addReg(Interm);
8678
8679 Worklist.insert(&Not);
8680 Worklist.insert(&Op);
8681
8682 MRI.replaceRegWith(Dest.getReg(), NewDest);
8683 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8684}
8685
8686void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8687 MachineInstr &Inst, unsigned Opcode,
8688 bool Swap) const {
8689 MachineBasicBlock &MBB = *Inst.getParent();
8690 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8691
8692 MachineOperand &Dest = Inst.getOperand(0);
8693 MachineOperand &Src0 = Inst.getOperand(1);
8694 DebugLoc DL = Inst.getDebugLoc();
8695
8696 MachineBasicBlock::iterator MII = Inst;
8697
8698 const MCInstrDesc &InstDesc = get(Opcode);
8699 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8700 MRI.getRegClass(Src0.getReg()) :
8701 &AMDGPU::SGPR_32RegClass;
8702
8703 const TargetRegisterClass *Src0SubRC =
8704 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8705
8706 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8707 AMDGPU::sub0, Src0SubRC);
8708
8709 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8710 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8711 const TargetRegisterClass *NewDestSubRC =
8712 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8713
8714 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8715 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8716
8717 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8718 AMDGPU::sub1, Src0SubRC);
8719
8720 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8721 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8722
8723 if (Swap)
8724 std::swap(DestSub0, DestSub1);
8725
8726 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8727 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8728 .addReg(DestSub0)
8729 .addImm(AMDGPU::sub0)
8730 .addReg(DestSub1)
8731 .addImm(AMDGPU::sub1);
8732
8733 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8734
8735 Worklist.insert(&LoHalf);
8736 Worklist.insert(&HiHalf);
8737
8738 // We don't need to legalizeOperands here because for a single operand, src0
8739 // will support any kind of input.
8740
8741 // Move all users of this moved value.
8742 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8743}
8744
8745 // There is no vector equivalent of s_mul_u64. For this reason, we need to
8746 // split the s_mul_u64 into 32-bit vector multiplications.
8747void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8748 MachineInstr &Inst,
8749 MachineDominatorTree *MDT) const {
8750 MachineBasicBlock &MBB = *Inst.getParent();
8751 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8752
8753 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8754 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8755 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8756
8757 MachineOperand &Dest = Inst.getOperand(0);
8758 MachineOperand &Src0 = Inst.getOperand(1);
8759 MachineOperand &Src1 = Inst.getOperand(2);
8760 const DebugLoc &DL = Inst.getDebugLoc();
8761 MachineBasicBlock::iterator MII = Inst;
8762
8763 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8764 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8765 const TargetRegisterClass *Src0SubRC =
8766 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8767 if (RI.isSGPRClass(Src0SubRC))
8768 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8769 const TargetRegisterClass *Src1SubRC =
8770 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8771 if (RI.isSGPRClass(Src1SubRC))
8772 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8773
8774 // First, we extract the low 32-bit and high 32-bit values from each of the
8775 // operands.
8776 MachineOperand Op0L =
8777 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8778 MachineOperand Op1L =
8779 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8780 MachineOperand Op0H =
8781 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8782 MachineOperand Op1H =
8783 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8784
8785 // The multiplication is done as follows:
8786 //
8787 // Op1H Op1L
8788 // * Op0H Op0L
8789 // --------------------
8790 // Op1H*Op0L Op1L*Op0L
8791 // + Op1H*Op0H Op1L*Op0H
8792 // -----------------------------------------
8793 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8794 //
8795 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8796 // value and that would overflow.
8797 // The low 32-bit value is Op1L*Op0L.
8798 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
8799
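// A small worked example (illustrative only): with
//   Op0 = 0x00000002'00000003 (Op0H = 2, Op0L = 3) and
//   Op1 = 0x00000004'00000005 (Op1H = 4, Op1L = 5),
// Op1L*Op0L = 15 (the carry from its mul_hi is 0), Op1L*Op0H = 10 and
// Op1H*Op0L = 12, so the high half is 10 + 12 + 0 = 22 and the truncated
// product is 0x00000016'0000000F, i.e. (Op0 * Op1) mod 2^64.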
8800 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8801 MachineInstr *Op1L_Op0H =
8802 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8803 .add(Op1L)
8804 .add(Op0H);
8805
8806 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8807 MachineInstr *Op1H_Op0L =
8808 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8809 .add(Op1H)
8810 .add(Op0L);
8811
8812 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8813 MachineInstr *Carry =
8814 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8815 .add(Op1L)
8816 .add(Op0L);
8817
8818 MachineInstr *LoHalf =
8819 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8820 .add(Op1L)
8821 .add(Op0L);
8822
8823 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8824 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8825 .addReg(Op1L_Op0H_Reg)
8826 .addReg(Op1H_Op0L_Reg);
8827
8828 MachineInstr *HiHalf =
8829 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8830 .addReg(AddReg)
8831 .addReg(CarryReg);
8832
8833 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8834 .addReg(DestSub0)
8835 .addImm(AMDGPU::sub0)
8836 .addReg(DestSub1)
8837 .addImm(AMDGPU::sub1);
8838
8839 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8840
8841 // Try to legalize the operands in case we need to swap the order to keep it
8842 // valid.
8843 legalizeOperands(*Op1L_Op0H, MDT);
8844 legalizeOperands(*Op1H_Op0L, MDT);
8845 legalizeOperands(*Carry, MDT);
8846 legalizeOperands(*LoHalf, MDT);
8847 legalizeOperands(*Add, MDT);
8848 legalizeOperands(*HiHalf, MDT);
8849
8850 // Move all users of this moved value.
8851 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8852}
8853
8854 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8855// multiplications.
8856void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8857 MachineInstr &Inst,
8858 MachineDominatorTree *MDT) const {
8859 MachineBasicBlock &MBB = *Inst.getParent();
8860 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8861
8862 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8863 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8864 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8865
8866 MachineOperand &Dest = Inst.getOperand(0);
8867 MachineOperand &Src0 = Inst.getOperand(1);
8868 MachineOperand &Src1 = Inst.getOperand(2);
8869 const DebugLoc &DL = Inst.getDebugLoc();
8870 MachineBasicBlock::iterator MII = Inst;
8871
8872 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8873 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8874 const TargetRegisterClass *Src0SubRC =
8875 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8876 if (RI.isSGPRClass(Src0SubRC))
8877 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8878 const TargetRegisterClass *Src1SubRC =
8879 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8880 if (RI.isSGPRClass(Src1SubRC))
8881 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8882
8883 // First, we extract the low 32-bit and high 32-bit values from each of the
8884 // operands.
8885 MachineOperand Op0L =
8886 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8887 MachineOperand Op1L =
8888 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8889
8890 unsigned Opc = Inst.getOpcode();
8891 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8892 ? AMDGPU::V_MUL_HI_U32_e64
8893 : AMDGPU::V_MUL_HI_I32_e64;
8894 MachineInstr *HiHalf =
8895 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8896
8897 MachineInstr *LoHalf =
8898 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8899 .add(Op1L)
8900 .add(Op0L);
8901
8902 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8903 .addReg(DestSub0)
8904 .addImm(AMDGPU::sub0)
8905 .addReg(DestSub1)
8906 .addImm(AMDGPU::sub1);
8907
8908 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8909
8910 // Try to legalize the operands in case we need to swap the order to keep it
8911 // valid.
8912 legalizeOperands(*HiHalf, MDT);
8913 legalizeOperands(*LoHalf, MDT);
8914
8915 // Move all users of this moved value.
8916 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8917}
8918
8919void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8920 MachineInstr &Inst, unsigned Opcode,
8921 MachineDominatorTree *MDT) const {
8922 MachineBasicBlock &MBB = *Inst.getParent();
8923 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8924
8925 MachineOperand &Dest = Inst.getOperand(0);
8926 MachineOperand &Src0 = Inst.getOperand(1);
8927 MachineOperand &Src1 = Inst.getOperand(2);
8928 DebugLoc DL = Inst.getDebugLoc();
8929
8930 MachineBasicBlock::iterator MII = Inst;
8931
8932 const MCInstrDesc &InstDesc = get(Opcode);
8933 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8934 MRI.getRegClass(Src0.getReg()) :
8935 &AMDGPU::SGPR_32RegClass;
8936
8937 const TargetRegisterClass *Src0SubRC =
8938 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8939 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8940 MRI.getRegClass(Src1.getReg()) :
8941 &AMDGPU::SGPR_32RegClass;
8942
8943 const TargetRegisterClass *Src1SubRC =
8944 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8945
8946 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8947 AMDGPU::sub0, Src0SubRC);
8948 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8949 AMDGPU::sub0, Src1SubRC);
8950 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8951 AMDGPU::sub1, Src0SubRC);
8952 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8953 AMDGPU::sub1, Src1SubRC);
8954
8955 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8956 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8957 const TargetRegisterClass *NewDestSubRC =
8958 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8959
8960 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8961 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8962 .add(SrcReg0Sub0)
8963 .add(SrcReg1Sub0);
8964
8965 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8966 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8967 .add(SrcReg0Sub1)
8968 .add(SrcReg1Sub1);
8969
8970 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8971 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8972 .addReg(DestSub0)
8973 .addImm(AMDGPU::sub0)
8974 .addReg(DestSub1)
8975 .addImm(AMDGPU::sub1);
8976
8977 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8978
8979 Worklist.insert(&LoHalf);
8980 Worklist.insert(&HiHalf);
8981
8982 // Move all users of this moved value.
8983 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8984}
8985
8986void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8987 MachineInstr &Inst,
8988 MachineDominatorTree *MDT) const {
8989 MachineBasicBlock &MBB = *Inst.getParent();
8990 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8991
8992 MachineOperand &Dest = Inst.getOperand(0);
8993 MachineOperand &Src0 = Inst.getOperand(1);
8994 MachineOperand &Src1 = Inst.getOperand(2);
8995 const DebugLoc &DL = Inst.getDebugLoc();
8996
8997 MachineBasicBlock::iterator MII = Inst;
8998
8999 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9000
9001 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9002
9003 MachineOperand* Op0;
9004 MachineOperand* Op1;
9005
9006 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9007 Op0 = &Src0;
9008 Op1 = &Src1;
9009 } else {
9010 Op0 = &Src1;
9011 Op1 = &Src0;
9012 }
9013
9014 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9015 .add(*Op0);
9016
9017 Register NewDest = MRI.createVirtualRegister(DestRC);
9018
9019 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9020 .addReg(Interm)
9021 .add(*Op1);
9022
9023 MRI.replaceRegWith(Dest.getReg(), NewDest);
9024
9025 Worklist.insert(&Xor);
9026}
9027
9028void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9029 MachineInstr &Inst) const {
9030 MachineBasicBlock &MBB = *Inst.getParent();
9031 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9032
9033 MachineBasicBlock::iterator MII = Inst;
9034 const DebugLoc &DL = Inst.getDebugLoc();
9035
9036 MachineOperand &Dest = Inst.getOperand(0);
9037 MachineOperand &Src = Inst.getOperand(1);
9038
9039 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9040 const TargetRegisterClass *SrcRC = Src.isReg() ?
9041 MRI.getRegClass(Src.getReg()) :
9042 &AMDGPU::SGPR_32RegClass;
9043
9044 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9045 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9046
9047 const TargetRegisterClass *SrcSubRC =
9048 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9049
9050 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9051 AMDGPU::sub0, SrcSubRC);
9052 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9053 AMDGPU::sub1, SrcSubRC);
9054
9055 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9056
9057 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9058
9059 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9060
9061 // We don't need to legalize operands here. src0 for either instruction can be
9062 // an SGPR, and the second input is unused or determined here.
9063 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9064}
9065
9066void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9067 MachineInstr &Inst) const {
9068 MachineBasicBlock &MBB = *Inst.getParent();
9069 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9070 MachineBasicBlock::iterator MII = Inst;
9071 const DebugLoc &DL = Inst.getDebugLoc();
9072
9073 MachineOperand &Dest = Inst.getOperand(0);
9074 uint32_t Imm = Inst.getOperand(2).getImm();
9075 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9076 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9077
9078 (void) Offset;
9079
9080 // Only sext_inreg cases handled.
9081 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9082 Offset == 0 && "Not implemented");
9083
9084 if (BitWidth < 32) {
9085 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9086 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9087 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9088
9089 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9090 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
9091 .addImm(0)
9092 .addImm(BitWidth);
9093
9094 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9095 .addImm(31)
9096 .addReg(MidRegLo);
9097
9098 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9099 .addReg(MidRegLo)
9100 .addImm(AMDGPU::sub0)
9101 .addReg(MidRegHi)
9102 .addImm(AMDGPU::sub1);
9103
9104 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9105 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9106 return;
9107 }
9108
9109 MachineOperand &Src = Inst.getOperand(1);
9110 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9111 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9112
9113 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9114 .addImm(31)
9115 .addReg(Src.getReg(), 0, AMDGPU::sub0);
9116
9117 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9118 .addReg(Src.getReg(), 0, AMDGPU::sub0)
9119 .addImm(AMDGPU::sub0)
9120 .addReg(TmpReg)
9121 .addImm(AMDGPU::sub1);
9122
9123 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9124 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9125}
9126
9127void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9128 MachineInstr &Inst, unsigned Opcode,
9129 MachineDominatorTree *MDT) const {
9130 // (S_FLBIT_I32_B64 hi:lo) ->
9131 // (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9132 // (S_FF1_I32_B64 hi:lo) ->
9133 // (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
9134
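// Worked example for the ctlz pattern (illustrative only): for the value
// 0x00000000'0000F000 the high half is 0, so V_FFBH_U32 on it returns -1;
// uaddsat(V_FFBH_U32(lo), 32) = uaddsat(16, 32) = 48, and the final umin
// selects 48, the correct number of leading zeros of the 64-bit value.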
9135 MachineBasicBlock &MBB = *Inst.getParent();
9136 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9137 MachineBasicBlock::iterator MII = Inst;
9138 const DebugLoc &DL = Inst.getDebugLoc();
9139
9140 MachineOperand &Dest = Inst.getOperand(0);
9141 MachineOperand &Src = Inst.getOperand(1);
9142
9143 const MCInstrDesc &InstDesc = get(Opcode);
9144
9145 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9146 unsigned OpcodeAdd =
9147 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
9148
9149 const TargetRegisterClass *SrcRC =
9150 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9151 const TargetRegisterClass *SrcSubRC =
9152 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9153
9154 MachineOperand SrcRegSub0 =
9155 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9156 MachineOperand SrcRegSub1 =
9157 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9158
9159 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9160 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9161 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9162 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9163
9164 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9165
9166 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9167
9168 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9169 .addReg(IsCtlz ? MidReg1 : MidReg2)
9170 .addImm(32)
9171 .addImm(1); // enable clamp
9172
9173 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9174 .addReg(MidReg3)
9175 .addReg(IsCtlz ? MidReg2 : MidReg1);
9176
9177 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9178
9179 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9180}
9181
9182void SIInstrInfo::addUsersToMoveToVALUWorklist(
9183 Register DstReg, MachineRegisterInfo &MRI,
9184 SIInstrWorklist &Worklist) const {
9185 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9186 MachineInstr &UseMI = *MO.getParent();
9187
9188 unsigned OpNo = 0;
9189
9190 switch (UseMI.getOpcode()) {
9191 case AMDGPU::COPY:
9192 case AMDGPU::WQM:
9193 case AMDGPU::SOFT_WQM:
9194 case AMDGPU::STRICT_WWM:
9195 case AMDGPU::STRICT_WQM:
9196 case AMDGPU::REG_SEQUENCE:
9197 case AMDGPU::PHI:
9198 case AMDGPU::INSERT_SUBREG:
9199 break;
9200 default:
9201 OpNo = MO.getOperandNo();
9202 break;
9203 }
9204
9205 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9206 MRI.constrainRegClass(DstReg, OpRC);
9207
9208 if (!RI.hasVectorRegisters(OpRC))
9209 Worklist.insert(&UseMI);
9210 else
9211 // Legalization could change user list.
9212 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9213 }
9214}
9215
9216void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9217 MachineRegisterInfo &MRI,
9218 MachineInstr &Inst) const {
9219 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9220 MachineBasicBlock *MBB = Inst.getParent();
9221 MachineOperand &Src0 = Inst.getOperand(1);
9222 MachineOperand &Src1 = Inst.getOperand(2);
9223 const DebugLoc &DL = Inst.getDebugLoc();
9224
9225 if (ST.useRealTrue16Insts()) {
9226 Register SrcReg0, SrcReg1;
9227 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9228 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9229 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
9230 } else {
9231 SrcReg0 = Src0.getReg();
9232 }
9233
9234 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9235 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9236 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
9237 } else {
9238 SrcReg1 = Src1.getReg();
9239 }
9240
9241 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9242 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9243
9244 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9245 switch (Inst.getOpcode()) {
9246 case AMDGPU::S_PACK_LL_B32_B16:
9247 NewMI
9248 .addReg(SrcReg0, 0,
9249 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9250 .addImm(AMDGPU::lo16)
9251 .addReg(SrcReg1, 0,
9252 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9253 .addImm(AMDGPU::hi16);
9254 break;
9255 case AMDGPU::S_PACK_LH_B32_B16:
9256 NewMI
9257 .addReg(SrcReg0, 0,
9258 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9259 .addImm(AMDGPU::lo16)
9260 .addReg(SrcReg1, 0, AMDGPU::hi16)
9261 .addImm(AMDGPU::hi16);
9262 break;
9263 case AMDGPU::S_PACK_HL_B32_B16:
9264 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9265 .addImm(AMDGPU::lo16)
9266 .addReg(SrcReg1, 0,
9267 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9268 .addImm(AMDGPU::hi16);
9269 break;
9270 case AMDGPU::S_PACK_HH_B32_B16:
9271 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9272 .addImm(AMDGPU::lo16)
9273 .addReg(SrcReg1, 0, AMDGPU::hi16)
9274 .addImm(AMDGPU::hi16);
9275 break;
9276 default:
9277 llvm_unreachable("unhandled s_pack_* instruction");
9278 }
9279
9280 MachineOperand &Dest = Inst.getOperand(0);
9281 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9282 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9283 return;
9284 }
9285
9286 switch (Inst.getOpcode()) {
9287 case AMDGPU::S_PACK_LL_B32_B16: {
9288 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9289 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9290
9291 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9292 // 0.
9293 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9294 .addImm(0xffff);
9295
9296 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9297 .addReg(ImmReg, RegState::Kill)
9298 .add(Src0);
9299
9300 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9301 .add(Src1)
9302 .addImm(16)
9303 .addReg(TmpReg, RegState::Kill);
9304 break;
9305 }
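// Illustrative summary of the expansion above: it computes
//   ResultReg = ((Src1 & 0xffff) << 16) | (Src0 & 0xffff)
// which places the low 16 bits of Src0 in the low half and the low 16
// bits of Src1 in the high half, matching the semantics of s_pack_ll.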
9306 case AMDGPU::S_PACK_LH_B32_B16: {
9307 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9308 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9309 .addImm(0xffff);
9310 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9311 .addReg(ImmReg, RegState::Kill)
9312 .add(Src0)
9313 .add(Src1);
9314 break;
9315 }
9316 case AMDGPU::S_PACK_HL_B32_B16: {
9317 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9318 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9319 .addImm(16)
9320 .add(Src0);
9321 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9322 .add(Src1)
9323 .addImm(16)
9324 .addReg(TmpReg, RegState::Kill);
9325 break;
9326 }
9327 case AMDGPU::S_PACK_HH_B32_B16: {
9328 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9329 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9330 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9331 .addImm(16)
9332 .add(Src0);
9333 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9334 .addImm(0xffff0000);
9335 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9336 .add(Src1)
9337 .addReg(ImmReg, RegState::Kill)
9338 .addReg(TmpReg, RegState::Kill);
9339 break;
9340 }
9341 default:
9342 llvm_unreachable("unhandled s_pack_* instruction");
9343 }
9344
9345 MachineOperand &Dest = Inst.getOperand(0);
9346 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9347 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9348}
9349
9350void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9351 MachineInstr &SCCDefInst,
9352 SIInstrWorklist &Worklist,
9353 Register NewCond) const {
9354
9355 // Ensure that def inst defines SCC, which is still live.
9356 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9357 !Op.isDead() && Op.getParent() == &SCCDefInst);
9358 SmallVector<MachineInstr *, 4> CopyToDelete;
9359 // This assumes that all the users of SCC are in the same block
9360 // as the SCC def.
9361 for (MachineInstr &MI : // Skip the def inst itself.
9362 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9363 SCCDefInst.getParent()->end())) {
9364 // Check if SCC is used first.
9365 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9366 if (SCCIdx != -1) {
9367 if (MI.isCopy()) {
9368 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9369 Register DestReg = MI.getOperand(0).getReg();
9370
9371 MRI.replaceRegWith(DestReg, NewCond);
9372 CopyToDelete.push_back(&MI);
9373 } else {
9374
9375 if (NewCond.isValid())
9376 MI.getOperand(SCCIdx).setReg(NewCond);
9377
9378 Worklist.insert(&MI);
9379 }
9380 }
9381 // Exit if we find another SCC def.
9382 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9383 break;
9384 }
9385 for (auto &Copy : CopyToDelete)
9386 Copy->eraseFromParent();
9387}
9388
9389// Instructions that use SCC may be converted to VALU instructions. When that
9390// happens, the SCC register is changed to VCC_LO. The instruction that defines
9391// SCC must be changed to an instruction that defines VCC. This function makes
9392// sure that the instruction that defines SCC is added to the moveToVALU
9393// worklist.
9394void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9395 SIInstrWorklist &Worklist) const {
9396 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9397 // then there is nothing to do because the defining instruction has been
9398 // converted to a VALU already. If SCC then that instruction needs to be
9399 // converted to a VALU.
9400 for (MachineInstr &MI :
9401 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9402 SCCUseInst->getParent()->rend())) {
9403 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9404 break;
9405 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9406 Worklist.insert(&MI);
9407 break;
9408 }
9409 }
9410}
9411
9412const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9413 const MachineInstr &Inst) const {
9414 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9415
9416 switch (Inst.getOpcode()) {
9417 // For target instructions, getOpRegClass just returns the virtual register
9418 // class associated with the operand, so we need to find an equivalent VGPR
9419 // register class in order to move the instruction to the VALU.
9420 case AMDGPU::COPY:
9421 case AMDGPU::PHI:
9422 case AMDGPU::REG_SEQUENCE:
9423 case AMDGPU::INSERT_SUBREG:
9424 case AMDGPU::WQM:
9425 case AMDGPU::SOFT_WQM:
9426 case AMDGPU::STRICT_WWM:
9427 case AMDGPU::STRICT_WQM: {
9428 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9429 if (RI.isAGPRClass(SrcRC)) {
9430 if (RI.isAGPRClass(NewDstRC))
9431 return nullptr;
9432
9433 switch (Inst.getOpcode()) {
9434 case AMDGPU::PHI:
9435 case AMDGPU::REG_SEQUENCE:
9436 case AMDGPU::INSERT_SUBREG:
9437 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9438 break;
9439 default:
9440 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9441 }
9442
9443 if (!NewDstRC)
9444 return nullptr;
9445 } else {
9446 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9447 return nullptr;
9448
9449 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9450 if (!NewDstRC)
9451 return nullptr;
9452 }
9453
9454 return NewDstRC;
9455 }
9456 default:
9457 return NewDstRC;
9458 }
9459}
9460
9461// Find the one SGPR operand we are allowed to use.
9462Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9463 int OpIndices[3]) const {
9464 const MCInstrDesc &Desc = MI.getDesc();
9465
9466 // Find the one SGPR operand we are allowed to use.
9467 //
9468 // First we need to consider the instruction's operand requirements before
9469 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9470 // of VCC, but we are still bound by the constant bus requirement to only use
9471 // one.
9472 //
9473 // If the operand's class is an SGPR, we can never move it.
9474
9475 Register SGPRReg = findImplicitSGPRRead(MI);
9476 if (SGPRReg)
9477 return SGPRReg;
9478
9479 Register UsedSGPRs[3] = {Register()};
9480 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9481
9482 for (unsigned i = 0; i < 3; ++i) {
9483 int Idx = OpIndices[i];
9484 if (Idx == -1)
9485 break;
9486
9487 const MachineOperand &MO = MI.getOperand(Idx);
9488 if (!MO.isReg())
9489 continue;
9490
9491 // Is this operand statically required to be an SGPR based on the operand
9492 // constraints?
9493 const TargetRegisterClass *OpRC =
9494 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9495 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9496 if (IsRequiredSGPR)
9497 return MO.getReg();
9498
9499 // If this could be a VGPR or an SGPR, check the dynamic register class.
9500 Register Reg = MO.getReg();
9501 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9502 if (RI.isSGPRClass(RegRC))
9503 UsedSGPRs[i] = Reg;
9504 }
9505
9506 // We don't have a required SGPR operand, so we have a bit more freedom in
9507 // selecting operands to move.
9508
9509 // Try to select the most used SGPR. If an SGPR is equal to one of the
9510 // others, we choose that.
9511 //
9512 // e.g.
9513 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9514 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9515
9516 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9517 // prefer those.
9518
9519 if (UsedSGPRs[0]) {
9520 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9521 SGPRReg = UsedSGPRs[0];
9522 }
9523
9524 if (!SGPRReg && UsedSGPRs[1]) {
9525 if (UsedSGPRs[1] == UsedSGPRs[2])
9526 SGPRReg = UsedSGPRs[1];
9527 }
9528
9529 return SGPRReg;
9530}
9531
9532 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9533 AMDGPU::OpName OperandName) const {
9534 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9535 return nullptr;
9536
9537 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9538 if (Idx == -1)
9539 return nullptr;
9540
9541 return &MI.getOperand(Idx);
9542}
9543
9544 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9545 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9546 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9547 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9548 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9549 return (Format << 44) |
9550 (1ULL << 56) | // RESOURCE_LEVEL = 1
9551 (3ULL << 60); // OOB_SELECT = 3
9552 }
9553
9554 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9555 if (ST.isAmdHsaOS()) {
9556 // Set ATC = 1. GFX9 doesn't have this bit.
9557 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9558 RsrcDataFormat |= (1ULL << 56);
9559
9560 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9561 // BTW, it disables TC L2 and therefore decreases performance.
9562 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9563 RsrcDataFormat |= (2ULL << 59);
9564 }
9565
9566 return RsrcDataFormat;
9567}
9568
9569 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9570 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9571 AMDGPU::RSRC_TID_ENABLE |
9572 0xffffffff; // Size;
9573
9574 // GFX9 doesn't have ELEMENT_SIZE.
9575 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9576 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9577 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9578 }
9579
9580 // IndexStride = 64 / 32.
9581 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9582 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9583
9584 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9585 // Clear them unless we want a huge stride.
9586 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9587 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9588 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9589
9590 return Rsrc23;
9591}
9592
9593 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9594 unsigned Opc = MI.getOpcode();
9595
9596 return isSMRD(Opc);
9597}
9598
9599 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9600 return get(Opc).mayLoad() &&
9601 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9602}
9603
9604 Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9605 int &FrameIndex) const {
9606 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9607 if (!Addr || !Addr->isFI())
9608 return Register();
9609
9610 assert(!MI.memoperands_empty() &&
9611 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9612
9613 FrameIndex = Addr->getIndex();
9614 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9615}
9616
9617 Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9618 int &FrameIndex) const {
9619 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9620 assert(Addr && Addr->isFI());
9621 FrameIndex = Addr->getIndex();
9622 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9623}
9624
9625 Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9626 int &FrameIndex) const {
9627 if (!MI.mayLoad())
9628 return Register();
9629
9630 if (isMUBUF(MI) || isVGPRSpill(MI))
9631 return isStackAccess(MI, FrameIndex);
9632
9633 if (isSGPRSpill(MI))
9634 return isSGPRStackAccess(MI, FrameIndex);
9635
9636 return Register();
9637}
9638
9639 Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9640 int &FrameIndex) const {
9641 if (!MI.mayStore())
9642 return Register();
9643
9644 if (isMUBUF(MI) || isVGPRSpill(MI))
9645 return isStackAccess(MI, FrameIndex);
9646
9647 if (isSGPRSpill(MI))
9648 return isSGPRStackAccess(MI, FrameIndex);
9649
9650 return Register();
9651}
9652
9653 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9654 unsigned Size = 0;
9655 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9656 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9657 while (++I != E && I->isInsideBundle()) {
9658 assert(!I->isBundle() && "No nested bundle!");
9659 Size += getInstSizeInBytes(*I);
9660 }
9661
9662 return Size;
9663}
9664
9665 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9666 unsigned Opc = MI.getOpcode();
9667 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9668 unsigned DescSize = Desc.getSize();
9669
9670 // If we have a definitive size, we can use it. Otherwise we need to inspect
9671 // the operands to know the size.
9672 if (isFixedSize(MI)) {
9673 unsigned Size = DescSize;
9674
9675 // If we hit the buggy offset, an extra nop will be inserted in MC so
9676 // estimate the worst case.
9677 if (MI.isBranch() && ST.hasOffset3fBug())
9678 Size += 4;
9679
9680 return Size;
9681 }
9682
9683 // Instructions may have a 32-bit literal encoded after them. Check
9684 // operands that could ever be literals.
9685 if (isVALU(MI) || isSALU(MI)) {
9686 if (isDPP(MI))
9687 return DescSize;
9688 bool HasLiteral = false;
9689 unsigned LiteralSize = 4;
9690 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9691 const MachineOperand &Op = MI.getOperand(I);
9692 const MCOperandInfo &OpInfo = Desc.operands()[I];
9693 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9694 HasLiteral = true;
9695 if (ST.has64BitLiterals()) {
9696 switch (OpInfo.OperandType) {
9697 default:
9698 break;
9699 case AMDGPU::OPERAND_REG_IMM_FP64:
9700 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9701 LiteralSize = 8;
9702 break;
9703 case AMDGPU::OPERAND_REG_IMM_INT64:
9704 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9705 LiteralSize = 8;
9706 break;
9707 }
9708 }
9709 break;
9710 }
9711 }
9712 return HasLiteral ? DescSize + LiteralSize : DescSize;
9713 }
9714
9715 // Check whether we have extra NSA words.
9716 if (isMIMG(MI)) {
9717 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9718 if (VAddr0Idx < 0)
9719 return 8;
9720
9721 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9722 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9723 }
9724
9725 switch (Opc) {
9726 case TargetOpcode::BUNDLE:
9727 return getInstBundleSize(MI);
9728 case TargetOpcode::INLINEASM:
9729 case TargetOpcode::INLINEASM_BR: {
9730 const MachineFunction *MF = MI.getMF();
9731 const char *AsmStr = MI.getOperand(0).getSymbolName();
9732 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9733 }
9734 default:
9735 if (MI.isMetaInstruction())
9736 return 0;
9737
9738 // If D16 Pseudo inst, get correct MC code size
9739 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9740 if (D16Info) {
9741 // Assume d16_lo/hi insts are always the same size.
9742 unsigned LoInstOpcode = D16Info->LoOp;
9743 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9744 DescSize = Desc.getSize();
9745 }
9746
9747 // If FMA Pseudo inst, get correct MC code size
9748 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9749 // All potential lowerings are the same size; arbitrarily pick one.
9750 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9751 DescSize = Desc.getSize();
9752 }
9753
9754 return DescSize;
9755 }
9756}
9757
9759 if (!isFLAT(MI))
9760 return false;
9761
9762 if (MI.memoperands_empty())
9763 return true;
9764
9765 for (const MachineMemOperand *MMO : MI.memoperands()) {
9766 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9767 return true;
9768 }
9769 return false;
9770}
9771
9774 static const std::pair<int, const char *> TargetIndices[] = {
9775 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9776 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9777 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9778 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9779 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9780 return ArrayRef(TargetIndices);
9781}
9782
9783/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9784/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9785 ScheduleHazardRecognizer *
9786 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9787 const ScheduleDAG *DAG) const {
9788 return new GCNHazardRecognizer(DAG->MF);
9789}
9790
9791/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9792/// pass.
9793 ScheduleHazardRecognizer *
9794 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
9795 return new GCNHazardRecognizer(MF);
9796}
9797
9798// Called during:
9799// - pre-RA scheduling and post-RA scheduling
9800 ScheduleHazardRecognizer *
9801 SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9802 const ScheduleDAGMI *DAG) const {
9803 // Borrowed from Arm Target
9804 // We would like to restrict this hazard recognizer to only
9805 // post-RA scheduling; we can tell that we're post-RA because we don't
9806 // track VRegLiveness.
9807 if (!DAG->hasVRegLiveness())
9808 return new GCNHazardRecognizer(DAG->MF);
9809 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9810}
9811
9812std::pair<unsigned, unsigned>
9813 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9814 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9815}
9816
9817 ArrayRef<std::pair<unsigned, const char *>>
9818 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9819 static const std::pair<unsigned, const char *> TargetFlags[] = {
9820 {MO_GOTPCREL, "amdgpu-gotprel"},
9821 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9822 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9823 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9824 {MO_REL32_LO, "amdgpu-rel32-lo"},
9825 {MO_REL32_HI, "amdgpu-rel32-hi"},
9826 {MO_REL64, "amdgpu-rel64"},
9827 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9828 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9829 {MO_ABS64, "amdgpu-abs64"},
9830 };
9831
9832 return ArrayRef(TargetFlags);
9833}
9834
9835 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9836 SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9837 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9838 {
9839 {MONoClobber, "amdgpu-noclobber"},
9840 {MOLastUse, "amdgpu-last-use"},
9841 {MOCooperative, "amdgpu-cooperative"},
9842 };
9843
9844 return ArrayRef(TargetFlags);
9845}
9846
9847 unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9848 const MachineFunction &MF) const {
9849 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9850 assert(SrcReg.isVirtual());
9851 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9852 return AMDGPU::WWM_COPY;
9853
9854 return AMDGPU::COPY;
9855}
9856
9857 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9858 Register Reg) const {
9859 // We need to handle instructions which may be inserted during register
9860 // allocation to handle the prolog. The initial prolog instruction may have
9861 // been separated from the start of the block by spills and copies inserted
9862 // for the prolog. However, the insertions for scalar registers can
9863 // always be placed at the BB top as they are independent of the exec mask
9864 // value.
9865 const MachineFunction *MF = MI.getMF();
9866 bool IsNullOrVectorRegister = true;
9867 if (Reg) {
9868 const MachineRegisterInfo &MRI = MF->getRegInfo();
9869 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9870 }
9871
9872 uint16_t Opcode = MI.getOpcode();
9873 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9874 return IsNullOrVectorRegister &&
9875 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9876 (Opcode == AMDGPU::IMPLICIT_DEF &&
9877 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9878 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9879 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9880}
9881
9882 MachineInstrBuilder
9883 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9884 MachineBasicBlock::iterator I,
9885 const DebugLoc &DL,
9886 Register DestReg) const {
9887 if (ST.hasAddNoCarry())
9888 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9889
9890 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9891 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9892 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9893
9894 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9895 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9896}
9897
9898 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9899 MachineBasicBlock::iterator I,
9900 const DebugLoc &DL,
9901 Register DestReg,
9902 RegScavenger &RS) const {
9903 if (ST.hasAddNoCarry())
9904 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9905
9906 // If available, prefer to use vcc.
9907 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9908 ? Register(RI.getVCC())
9909 : RS.scavengeRegisterBackwards(
9910 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9911 0, /* AllowSpill */ false);
9912
9913 // TODO: Users need to deal with this.
9914 if (!UnusedCarry.isValid())
9915 return MachineInstrBuilder();
9916
9917 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9918 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9919}
9920
9921bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9922 switch (Opcode) {
9923 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9924 case AMDGPU::SI_KILL_I1_TERMINATOR:
9925 return true;
9926 default:
9927 return false;
9928 }
9929}
9930
9931 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
9932 switch (Opcode) {
9933 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9934 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9935 case AMDGPU::SI_KILL_I1_PSEUDO:
9936 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9937 default:
9938 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9939 }
9940}
9941
9942bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9943 return Imm <= getMaxMUBUFImmOffset(ST);
9944}
9945
9946 unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9947 // GFX12 field is non-negative 24-bit signed byte offset.
9948 const unsigned OffsetBits =
9949 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9950 return (1 << OffsetBits) - 1;
9951}
9952
9953 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9954 if (!ST.isWave32())
9955 return;
9956
9957 if (MI.isInlineAsm())
9958 return;
9959
9960 for (auto &Op : MI.implicit_operands()) {
9961 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9962 Op.setReg(AMDGPU::VCC_LO);
9963 }
9964}
9965
9966 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9967 if (!isSMRD(MI))
9968 return false;
9969
9970 // Check that it is using a buffer resource.
9971 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9972 if (Idx == -1) // e.g. s_memtime
9973 return false;
9974
9975 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
9976 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9977}
9978
9979// Given Imm, split it into the values to put into the SOffset and ImmOffset
9980// fields in an MUBUF instruction. Return false if it is not possible (due to a
9981// hardware bug needing a workaround).
9982//
9983// The required alignment ensures that individual address components remain
9984// aligned if they are aligned to begin with. It also ensures that additional
9985// offsets within the given alignment can be added to the resulting ImmOffset.
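// A rough worked example (illustrative, assuming a 12-bit immediate field,
// i.e. MaxOffset = 4095, and Alignment = 4): for Imm = 5000 the inline
// SOffset path does not apply (5000 > 4092 + 64), so the code below picks
// ImmOffset = 5004 & 4095 = 908 and SOffset = (5004 & ~4095) - 4 = 4092;
// the two components sum back to 5000 and each stays 4-byte aligned.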
9986 bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9987 uint32_t &ImmOffset, Align Alignment) const {
9988 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9989 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9990 uint32_t Overflow = 0;
9991
9992 if (Imm > MaxImm) {
9993 if (Imm <= MaxImm + 64) {
9994 // Use an SOffset inline constant for 4..64
9995 Overflow = Imm - MaxImm;
9996 Imm = MaxImm;
9997 } else {
9998 // Try to keep the same value in SOffset for adjacent loads, so that
9999 // the corresponding register contents can be re-used.
10000 //
10001 // Load values with all low-bits (except for alignment bits) set into
10002 // SOffset, so that a larger range of values can be covered using
10003 // s_movk_i32.
10004 //
10005 // Atomic operations fail to work correctly when individual address
10006 // components are unaligned, even if their sum is aligned.
10007 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10008 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10009 Imm = Low;
10010 Overflow = High - Alignment.value();
10011 }
10012 }
10013
10014 if (Overflow > 0) {
10015 // There is a hardware bug in SI and CI which prevents address clamping in
10016 // MUBUF instructions from working correctly with SOffsets. The immediate
10017 // offset is unaffected.
10018 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10019 return false;
10020
10021 // It is not possible to set immediate in SOffset field on some targets.
10022 if (ST.hasRestrictedSOffset())
10023 return false;
10024 }
10025
10026 ImmOffset = Imm;
10027 SOffset = Overflow;
10028 return true;
10029}
10030
10031// Depending on the used address space and instructions, some immediate offsets
10032// are allowed and some are not.
10033// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10034// scratch instruction offsets can also be negative. On GFX12, offsets can be
10035// negative for all variants.
10036//
10037// There are several bugs related to these offsets:
10038// On gfx10.1, flat instructions that go into the global address space cannot
10039// use an offset.
10040//
10041// For scratch instructions, the address can be either an SGPR or a VGPR.
10042// The following offsets can be used, depending on the architecture (x means
10043// cannot be used):
10044// +----------------------------+------+------+
10045// | Address-Mode | SGPR | VGPR |
10046// +----------------------------+------+------+
10047// | gfx9 | | |
10048// | negative, 4-aligned offset | x | ok |
10049// | negative, unaligned offset | x | ok |
10050// +----------------------------+------+------+
10051// | gfx10 | | |
10052// | negative, 4-aligned offset | ok | ok |
10053// | negative, unaligned offset | ok | x |
10054// +----------------------------+------+------+
10055// | gfx10.3 | | |
10056// | negative, 4-aligned offset | ok | ok |
10057// | negative, unaligned offset | ok | ok |
10058// +----------------------------+------+------+
10059//
10060// This function ignores the addressing mode, so if an offset cannot be used in
10061// one addressing mode, it is considered illegal.
10062bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10063 uint64_t FlatVariant) const {
10064 // TODO: Should 0 be special cased?
10065 if (!ST.hasFlatInstOffsets())
10066 return false;
10067
10068 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10069 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10070 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10071 return false;
10072
10073 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10074 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10075 (Offset % 4) != 0) {
10076 return false;
10077 }
10078
10079 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10080 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10081 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10082}
10083
10084// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10085std::pair<int64_t, int64_t>
10086SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10087 uint64_t FlatVariant) const {
10088 int64_t RemainderOffset = COffsetVal;
10089 int64_t ImmField = 0;
10090
10091 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10092 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10093
10094 if (AllowNegative) {
10095 // Use signed division by a power of two to truncate towards 0.
10096 int64_t D = 1LL << NumBits;
10097 RemainderOffset = (COffsetVal / D) * D;
10098 ImmField = COffsetVal - RemainderOffset;
10099
10100 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10101 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10102 (ImmField % 4) != 0) {
10103 // Make ImmField a multiple of 4
10104 RemainderOffset += ImmField % 4;
10105 ImmField -= ImmField % 4;
10106 }
10107 } else if (COffsetVal >= 0) {
10108 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10109 RemainderOffset = COffsetVal - ImmField;
10110 }
10111
10112 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10113 assert(RemainderOffset + ImmField == COffsetVal);
10114 return {ImmField, RemainderOffset};
10115}
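// Illustrative annotation (not from the LLVM source): a worked example of the
// split above, assuming 13 signed offset bits (NumBits = 12, so D = 4096) and
// a variant that allows negative offsets:
//   splitFlatOffset( 9000) -> {ImmField =  808, RemainderOffset =  8192}
//   splitFlatOffset(-9000) -> {ImmField = -808, RemainderOffset = -8192}
// Signed division truncates towards zero, so RemainderOffset is always a
// multiple of D and ImmField stays within the immediate field's range.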
10116
10117bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
10118 if (ST.hasNegativeScratchOffsetBug() &&
10119 FlatVariant == SIInstrFlags::FlatScratch)
10120 return false;
10121
10122 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10123}
10124
10125static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10126 switch (ST.getGeneration()) {
10127 default:
10128 break;
10129 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
10130 case AMDGPUSubtarget::SEA_ISLANDS:
10131 return SIEncodingFamily::SI;
10132 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
10133 case AMDGPUSubtarget::GFX9:
10134 return SIEncodingFamily::VI;
10135 case AMDGPUSubtarget::GFX10:
10136 return SIEncodingFamily::GFX10;
10137 case AMDGPUSubtarget::GFX11:
10138 return SIEncodingFamily::GFX11;
10139 case AMDGPUSubtarget::GFX12:
10140 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10141 : SIEncodingFamily::GFX12;
10142 }
10143 llvm_unreachable("Unknown subtarget generation!");
10144}
10145
10146bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10147 switch(MCOp) {
10148 // These opcodes use indirect register addressing so
10149 // they need special handling by codegen (currently missing).
10150 // Therefore it is too risky to allow these opcodes
10151 // to be selected by dpp combiner or sdwa peepholer.
10152 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10153 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10154 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10155 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10156 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10157 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10158 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10159 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10160 return true;
10161 default:
10162 return false;
10163 }
10164}
10165
10166#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10167 case OPCODE##_dpp: \
10168 case OPCODE##_e32: \
10169 case OPCODE##_e64: \
10170 case OPCODE##_e64_dpp: \
10171 case OPCODE##_sdwa:
10172
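// Illustrative annotation (not from the LLVM source): for example,
// GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32) expands to
//   case AMDGPU::V_ADD_U32_dpp:
//   case AMDGPU::V_ADD_U32_e32:
//   case AMDGPU::V_ADD_U32_e64:
//   case AMDGPU::V_ADD_U32_e64_dpp:
//   case AMDGPU::V_ADD_U32_sdwa:
// so every encoding variant of a renamed opcode is matched below.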
10173static bool isRenamedInGFX9(int Opcode) {
10174 switch (Opcode) {
10175 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10176 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10177 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10178 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10179 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10180 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10181 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10182 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10183 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10184 //
10185 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10186 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10187 case AMDGPU::V_FMA_F16_gfx9_e64:
10188 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10189 case AMDGPU::V_INTERP_P2_F16:
10190 case AMDGPU::V_MAD_F16_e64:
10191 case AMDGPU::V_MAD_U16_e64:
10192 case AMDGPU::V_MAD_I16_e64:
10193 return true;
10194 default:
10195 return false;
10196 }
10197}
10198
10199int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10200 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
10201
10202 unsigned Gen = subtargetEncodingFamily(ST);
10203
10204 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10205 Gen = SIEncodingFamily::GFX9;
10206
10207 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10208 // subtarget has UnpackedD16VMem feature.
10209 // TODO: remove this when we discard GFX80 encoding.
10210 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10211 Gen = SIEncodingFamily::GFX80;
10212
10213 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10214 switch (ST.getGeneration()) {
10215 default:
10216 Gen = SIEncodingFamily::SDWA;
10217 break;
10218 case AMDGPUSubtarget::GFX9:
10219 Gen = SIEncodingFamily::SDWA9;
10220 break;
10221 case AMDGPUSubtarget::GFX10:
10222 Gen = SIEncodingFamily::SDWA10;
10223 break;
10224 }
10225 }
10226
10227 if (isMAI(Opcode)) {
10228 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10229 if (MFMAOp != -1)
10230 Opcode = MFMAOp;
10231 }
10232
10233 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10234
10235 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10236 MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);
10237
10238 // -1 means that Opcode is already a native instruction.
10239 if (MCOp == -1)
10240 return Opcode;
10241
10242 if (ST.hasGFX90AInsts()) {
10243 uint16_t NMCOp = (uint16_t)-1;
10244 if (ST.hasGFX940Insts())
10245 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
10246 if (NMCOp == (uint16_t)-1)
10247 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
10248 if (NMCOp == (uint16_t)-1)
10249 NMCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10250 if (NMCOp != (uint16_t)-1)
10251 MCOp = NMCOp;
10252 }
10253
10254 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10255 // no encoding in the given subtarget generation.
10256 if (MCOp == (uint16_t)-1)
10257 return -1;
10258
10259 if (isAsmOnlyOpcode(MCOp))
10260 return -1;
10261
10262 return MCOp;
10263}
10264
10265static
10266TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10267 assert(RegOpnd.isReg());
10268 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10269 getRegSubRegPair(RegOpnd);
10270}
10271
10272TargetInstrInfo::RegSubRegPair llvm::getRegSequenceSubReg(MachineInstr &MI,
10273 unsigned SubReg) {
10274 assert(MI.isRegSequence());
10275 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10276 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10277 auto &RegOp = MI.getOperand(1 + 2 * I);
10278 return getRegOrUndef(RegOp);
10279 }
10280 return TargetInstrInfo::RegSubRegPair();
10281}
10282
10283// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10284// Following a subreg of reg:subreg isn't supported
10285static bool followSubRegDef(MachineInstr &MI,
10286 TargetInstrInfo::RegSubRegPair &RSR) {
10287 if (!RSR.SubReg)
10288 return false;
10289 switch (MI.getOpcode()) {
10290 default: break;
10291 case AMDGPU::REG_SEQUENCE:
10292 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10293 return true;
10294 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10295 case AMDGPU::INSERT_SUBREG:
10296 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10297 // inserted the subreg we're looking for
10298 RSR = getRegOrUndef(MI.getOperand(2));
10299 else { // the subreg in the rest of the reg
10300 auto R1 = getRegOrUndef(MI.getOperand(1));
10301 if (R1.SubReg) // subreg of subreg isn't supported
10302 return false;
10303 RSR.Reg = R1.Reg;
10304 }
10305 return true;
10306 }
10307 return false;
10308}
10309
10310MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10311 const MachineRegisterInfo &MRI) {
10312 assert(MRI.isSSA());
10313 if (!P.Reg.isVirtual())
10314 return nullptr;
10315
10316 auto RSR = P;
10317 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10318 while (auto *MI = DefInst) {
10319 DefInst = nullptr;
10320 switch (MI->getOpcode()) {
10321 case AMDGPU::COPY:
10322 case AMDGPU::V_MOV_B32_e32: {
10323 auto &Op1 = MI->getOperand(1);
10324 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10325 if (Op1.isUndef())
10326 return nullptr;
10327 RSR = getRegSubRegPair(Op1);
10328 DefInst = MRI.getVRegDef(RSR.Reg);
10329 }
10330 break;
10331 }
10332 default:
10333 if (followSubRegDef(*MI, RSR)) {
10334 if (!RSR.Reg)
10335 return nullptr;
10336 DefInst = MRI.getVRegDef(RSR.Reg);
10337 }
10338 }
10339 if (!DefInst)
10340 return MI;
10341 }
10342 return nullptr;
10343}
10344
10345bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10346 Register VReg,
10347 const MachineInstr &DefMI,
10348 const MachineInstr &UseMI) {
10349 assert(MRI.isSSA() && "Must be run on SSA");
10350
10351 auto *TRI = MRI.getTargetRegisterInfo();
10352 auto *DefBB = DefMI.getParent();
10353
10354 // Don't bother searching between blocks, although it is possible this block
10355 // doesn't modify exec.
10356 if (UseMI.getParent() != DefBB)
10357 return true;
10358
10359 const int MaxInstScan = 20;
10360 int NumInst = 0;
10361
10362 // Stop scan at the use.
10363 auto E = UseMI.getIterator();
10364 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10365 if (I->isDebugInstr())
10366 continue;
10367
10368 if (++NumInst > MaxInstScan)
10369 return true;
10370
10371 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10372 return true;
10373 }
10374
10375 return false;
10376}
10377
10378bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10379 Register VReg,
10380 const MachineInstr &DefMI) {
10381 assert(MRI.isSSA() && "Must be run on SSA");
10382
10383 auto *TRI = MRI.getTargetRegisterInfo();
10384 auto *DefBB = DefMI.getParent();
10385
10386 const int MaxUseScan = 10;
10387 int NumUse = 0;
10388
10389 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10390 auto &UseInst = *Use.getParent();
10391 // Don't bother searching between blocks, although it is possible this block
10392 // doesn't modify exec.
10393 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10394 return true;
10395
10396 if (++NumUse > MaxUseScan)
10397 return true;
10398 }
10399
10400 if (NumUse == 0)
10401 return false;
10402
10403 const int MaxInstScan = 20;
10404 int NumInst = 0;
10405
10406 // Stop scan when we have seen all the uses.
10407 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10408 assert(I != DefBB->end());
10409
10410 if (I->isDebugInstr())
10411 continue;
10412
10413 if (++NumInst > MaxInstScan)
10414 return true;
10415
10416 for (const MachineOperand &Op : I->operands()) {
10417 // We don't check reg masks here as they're used only on calls:
10418 // 1. EXEC is only considered const within one BB
10419 // 2. Call should be a terminator instruction if present in a BB
10420
10421 if (!Op.isReg())
10422 continue;
10423
10424 Register Reg = Op.getReg();
10425 if (Op.isUse()) {
10426 if (Reg == VReg && --NumUse == 0)
10427 return false;
10428 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10429 return true;
10430 }
10431 }
10432}
10433
10434MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10435 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10436 const DebugLoc &DL, Register Src, Register Dst) const {
10437 auto Cur = MBB.begin();
10438 if (Cur != MBB.end())
10439 do {
10440 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10441 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10442 ++Cur;
10443 } while (Cur != MBB.end() && Cur != LastPHIIt);
10444
10445 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10446 Dst);
10447}
10448
10449MachineInstr *SIInstrInfo::createPHISourceCopy(
10450 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10451 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10452 if (InsPt != MBB.end() &&
10453 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10454 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10455 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10456 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10457 InsPt++;
10458 return BuildMI(MBB, InsPt, DL,
10459 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10460 .addReg(Src, 0, SrcSubReg)
10461 .addReg(AMDGPU::EXEC, RegState::Implicit);
10462 }
10463 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10464 Dst);
10465}
10466
10467bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10468
10469MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10470 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10471 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10472 VirtRegMap *VRM) const {
10473 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10474 //
10475 // %0:sreg_32 = COPY $m0
10476 //
10477 // We explicitly chose SReg_32 for the virtual register so such a copy might
10478 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10479 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10480 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10481 // TargetInstrInfo::foldMemoryOperand() is going to try.
10482 // A similar issue also exists with spilling and reloading $exec registers.
10483 //
10484 // To prevent that, constrain the %0 register class here.
10485 if (isFullCopyInstr(MI)) {
10486 Register DstReg = MI.getOperand(0).getReg();
10487 Register SrcReg = MI.getOperand(1).getReg();
10488 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10489 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10490 MachineRegisterInfo &MRI = MF.getRegInfo();
10491 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10492 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10493 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10494 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10495 return nullptr;
10496 }
10497 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10498 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10499 return nullptr;
10500 }
10501 }
10502 }
10503
10504 return nullptr;
10505}
10506
10507unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10508 const MachineInstr &MI,
10509 unsigned *PredCost) const {
10510 if (MI.isBundle()) {
10511 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10512 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10513 unsigned Lat = 0, Count = 0;
10514 for (++I; I != E && I->isBundledWithPred(); ++I) {
10515 ++Count;
10516 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10517 }
10518 return Lat + Count - 1;
10519 }
10520
10521 return SchedModel.computeInstrLatency(&MI);
10522}
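// Illustrative annotation (not from the LLVM source): for the bundle case
// above, a bundle with three bundled instructions of individual latency 4, 2
// and 4 cycles gives Lat = max(4, 2, 4) = 4 and Count = 3, so the reported
// latency is Lat + Count - 1 = 6.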
10523
10524InstructionUniformity
10525SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10526 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10527 unsigned Opcode = MI.getOpcode();
10528
10529 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10530 Register Dst = MI.getOperand(0).getReg();
10531 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10532 : MI.getOperand(1).getReg();
10533 LLT DstTy = MRI.getType(Dst);
10534 LLT SrcTy = MRI.getType(Src);
10535 unsigned DstAS = DstTy.getAddressSpace();
10536 unsigned SrcAS = SrcTy.getAddressSpace();
10537 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10538 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10539 ST.hasGloballyAddressableScratch()
10540 ? InstructionUniformity::NeverUniform
10541 : InstructionUniformity::Default;
10542 };
10543
10544 // If the target supports globally addressable scratch, the mapping from
10545 // scratch memory to the flat aperture changes therefore an address space cast
10546 // is no longer uniform.
10547 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10548 return HandleAddrSpaceCast(MI);
10549
10550 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10551 auto IID = GI->getIntrinsicID();
10552 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10553 return InstructionUniformity::NeverUniform;
10554 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10555 return InstructionUniformity::AlwaysUniform;
10556
10557 switch (IID) {
10558 case Intrinsic::amdgcn_addrspacecast_nonnull:
10559 return HandleAddrSpaceCast(MI);
10560 case Intrinsic::amdgcn_if:
10561 case Intrinsic::amdgcn_else:
10562 // FIXME: Uniform if second result
10563 break;
10564 }
10565
10566 return InstructionUniformity::Default;
10567 }
10568
10569 // Loads from the private and flat address spaces are divergent, because
10570 // threads can execute the load instruction with the same inputs and get
10571 // different results.
10572 //
10573 // All other loads are not divergent, because if threads issue loads with the
10574 // same arguments, they will always get the same result.
10575 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10576 Opcode == AMDGPU::G_SEXTLOAD) {
10577 if (MI.memoperands_empty())
10578 return InstructionUniformity::NeverUniform; // conservative assumption
10579
10580 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10581 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10582 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10583 })) {
10584 // At least one MMO in a non-global address space.
10585 return InstructionUniformity::NeverUniform;
10586 }
10587 return InstructionUniformity::Default;
10588 }
10589
10590 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10591 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10592 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10593 AMDGPU::isGenericAtomic(Opcode)) {
10594 return InstructionUniformity::NeverUniform;
10595 }
10596 return InstructionUniformity::Default;
10597}
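// Illustrative annotation (not from the LLVM source): under the generic-load
// rule above, a G_LOAD whose only memory operand is in the global address
// space falls back to the default divergence analysis, while a G_LOAD with a
// memory operand in the private or flat address space (or with no memory
// operands at all) is reported as NeverUniform.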
10598
10599InstructionUniformity
10600SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10601
10602 if (isNeverUniform(MI))
10603 return InstructionUniformity::NeverUniform;
10604
10605 unsigned opcode = MI.getOpcode();
10606 if (opcode == AMDGPU::V_READLANE_B32 ||
10607 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10608 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10609 return InstructionUniformity::AlwaysUniform;
10610
10611 if (isCopyInstr(MI)) {
10612 const MachineOperand &srcOp = MI.getOperand(1);
10613 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10614 const TargetRegisterClass *regClass =
10615 RI.getPhysRegBaseClass(srcOp.getReg());
10616 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10617 : InstructionUniformity::NeverUniform;
10618 }
10619 return InstructionUniformity::Default;
10620 }
10621
10622 // GMIR handling
10623 if (MI.isPreISelOpcode())
10624 return getGenericInstructionUniformity(MI);
10625
10626 // Atomics are divergent because they are executed sequentially: when an
10627 // atomic operation refers to the same address in each thread, then each
10628 // thread after the first sees the value written by the previous thread as
10629 // its original value.
10630
10631 if (isAtomic(MI))
10632 return InstructionUniformity::NeverUniform;
10633
10634 // Loads from the private and flat address spaces are divergent, because
10635 // threads can execute the load instruction with the same inputs and get
10636 // different results.
10637 if (isFLAT(MI) && MI.mayLoad()) {
10638 if (MI.memoperands_empty())
10639 return InstructionUniformity::NeverUniform; // conservative assumption
10640
10641 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10642 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10643 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10644 })) {
10645 // At least one MMO in a non-global address space.
10646 return InstructionUniformity::NeverUniform;
10647 }
10648
10649 return InstructionUniformity::Default;
10650 }
10651
10652 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10653 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10654
10655 // FIXME: It's conceptually broken to report this for an instruction, and not
10656 // a specific def operand. For inline asm in particular, there could be mixed
10657 // uniform and divergent results.
10658 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10659 const MachineOperand &SrcOp = MI.getOperand(I);
10660 if (!SrcOp.isReg())
10661 continue;
10662
10663 Register Reg = SrcOp.getReg();
10664 if (!Reg || !SrcOp.readsReg())
10665 continue;
10666
10667 // If RegBank is null, this is unassigned or an unallocatable special
10668 // register, which are all scalars.
10669 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10670 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10671 return InstructionUniformity::NeverUniform;
10672 }
10673
10674 // TODO: Uniformity check conditions above can be rearranged for more
10675 // readability
10676
10677 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10678 // currently turned into no-op COPYs by SelectionDAG ISel and are
10679 // therefore no longer recognizable.
10680
10681 return InstructionUniformity::AlwaysUniform;
10682}
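// Illustrative annotation (not from the LLVM source): combining the rules
// above, V_READFIRSTLANE_B32 and copies from physical SGPRs are
// AlwaysUniform, atomics and FLAT loads that may touch private or flat memory
// are NeverUniform, and an instruction that only reads SGPR-bank (or
// unallocatable) registers falls through to the final AlwaysUniform return.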
10683
10685 switch (MF.getFunction().getCallingConv()) {
10686 case CallingConv::AMDGPU_PS:
10687 return 1;
10688 case CallingConv::AMDGPU_VS:
10689 return 2;
10690 case CallingConv::AMDGPU_GS:
10691 return 3;
10692 case CallingConv::AMDGPU_HS:
10693 case CallingConv::AMDGPU_LS:
10694 case CallingConv::AMDGPU_ES: {
10695 const Function &F = MF.getFunction();
10696 F.getContext().diagnose(DiagnosticInfoUnsupported(
10697 F, "ds_ordered_count unsupported for this calling conv"));
10698 [[fallthrough]];
10699 }
10700 case CallingConv::AMDGPU_CS:
10701 case CallingConv::AMDGPU_KERNEL:
10702 case CallingConv::C:
10703 case CallingConv::Fast:
10704 default:
10705 // Assume other calling conventions are various compute callable functions
10706 return 0;
10707 }
10708}
10709
10710bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10711 Register &SrcReg2, int64_t &CmpMask,
10712 int64_t &CmpValue) const {
10713 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10714 return false;
10715
10716 switch (MI.getOpcode()) {
10717 default:
10718 break;
10719 case AMDGPU::S_CMP_EQ_U32:
10720 case AMDGPU::S_CMP_EQ_I32:
10721 case AMDGPU::S_CMP_LG_U32:
10722 case AMDGPU::S_CMP_LG_I32:
10723 case AMDGPU::S_CMP_LT_U32:
10724 case AMDGPU::S_CMP_LT_I32:
10725 case AMDGPU::S_CMP_GT_U32:
10726 case AMDGPU::S_CMP_GT_I32:
10727 case AMDGPU::S_CMP_LE_U32:
10728 case AMDGPU::S_CMP_LE_I32:
10729 case AMDGPU::S_CMP_GE_U32:
10730 case AMDGPU::S_CMP_GE_I32:
10731 case AMDGPU::S_CMP_EQ_U64:
10732 case AMDGPU::S_CMP_LG_U64:
10733 SrcReg = MI.getOperand(0).getReg();
10734 if (MI.getOperand(1).isReg()) {
10735 if (MI.getOperand(1).getSubReg())
10736 return false;
10737 SrcReg2 = MI.getOperand(1).getReg();
10738 CmpValue = 0;
10739 } else if (MI.getOperand(1).isImm()) {
10740 SrcReg2 = Register();
10741 CmpValue = MI.getOperand(1).getImm();
10742 } else {
10743 return false;
10744 }
10745 CmpMask = ~0;
10746 return true;
10747 case AMDGPU::S_CMPK_EQ_U32:
10748 case AMDGPU::S_CMPK_EQ_I32:
10749 case AMDGPU::S_CMPK_LG_U32:
10750 case AMDGPU::S_CMPK_LG_I32:
10751 case AMDGPU::S_CMPK_LT_U32:
10752 case AMDGPU::S_CMPK_LT_I32:
10753 case AMDGPU::S_CMPK_GT_U32:
10754 case AMDGPU::S_CMPK_GT_I32:
10755 case AMDGPU::S_CMPK_LE_U32:
10756 case AMDGPU::S_CMPK_LE_I32:
10757 case AMDGPU::S_CMPK_GE_U32:
10758 case AMDGPU::S_CMPK_GE_I32:
10759 SrcReg = MI.getOperand(0).getReg();
10760 SrcReg2 = Register();
10761 CmpValue = MI.getOperand(1).getImm();
10762 CmpMask = ~0;
10763 return true;
10764 }
10765
10766 return false;
10767}
10768
10769// SCC is already valid after SCCValid.
10770// SCCRedefine will redefine SCC to the same value already available after
10771// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
10772// update kill/dead flags if necessary.
10773static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
10774 const SIRegisterInfo &RI) {
10775 MachineInstr *KillsSCC = nullptr;
10776 if (SCCValid->getParent() != SCCRedefine->getParent())
10777 return false;
10778 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
10779 SCCRedefine->getIterator())) {
10780 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10781 return false;
10782 if (MI.killsRegister(AMDGPU::SCC, &RI))
10783 KillsSCC = &MI;
10784 }
10785 if (MachineOperand *SccDef =
10786 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
10787 SccDef->setIsDead(false);
10788 if (KillsSCC)
10789 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
10790 SCCRedefine->eraseFromParent();
10791 return true;
10792}
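// Illustrative annotation (not from the LLVM source): a hypothetical MIR
// sketch of what optimizeSCC() does:
//   %a:sreg_32 = S_AND_B32 %x, %y, implicit-def $scc   ; SCCValid: SCC = (%a != 0)
//   %v:vgpr_32 = V_MOV_B32_e32 0, implicit $exec       ; does not modify SCC
//   S_CMP_LG_U32 %a, 0, implicit-def $scc              ; SCCRedefine: recomputes the same SCC
// With no intervening SCC writes, S_CMP_LG_U32 is erased, the SCC def on the
// S_AND_B32 is marked as live (not dead), and any stale SCC kill flags in
// between are cleared.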
10793
10794static bool foldableSelect(const MachineInstr &Def) {
10795 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
10796 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
10797 return false;
10798 bool Op1IsNonZeroImm =
10799 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
10800 bool Op2IsZeroImm =
10801 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
10802 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
10803 return false;
10804 return true;
10805}
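// Illustrative annotation (not from the LLVM source): hypothetical examples
// for foldableSelect():
//   %s:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc    -> foldable (non-zero imm, zero imm)
//   %s:sreg_64 = S_CSELECT_B64 -1, 0, implicit $scc   -> foldable
//   %s:sreg_32 = S_CSELECT_B32 %a, 0, implicit $scc   -> not foldable (operand 1 is not an immediate)
// In the foldable cases SCC already holds exactly the value a following
// "s_cmp_lg_* %s, 0" would recompute, which is what optimizeCmpSelect exploits.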
10806
10807bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10808 Register SrcReg2, int64_t CmpMask,
10809 int64_t CmpValue,
10810 const MachineRegisterInfo *MRI) const {
10811 if (!SrcReg || SrcReg.isPhysical())
10812 return false;
10813
10814 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10815 return false;
10816
10817 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
10818 this]() -> bool {
10819 if (CmpValue != 0)
10820 return false;
10821
10822 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10823 if (!Def)
10824 return false;
10825
10826 // For S_OP that set SCC = DST!=0, do the transformation
10827 //
10828 // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
10829
10830 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
10831 // for S_CSELECT* already has the same value that will be calculated by
10832 // s_cmp_lg_*
10833 //
10834 // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
10835 // imm), 0)
10836 if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
10837 return false;
10838
10839 if (!optimizeSCC(Def, &CmpInstr, RI))
10840 return false;
10841
10842 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
10843 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
10844 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
10845 // sX = s_cselect_b64 (non-zero imm), 0
10846 // sLo = copy sX.sub0
10847 // sHi = copy sX.sub1
10848 // sY = s_or_b32 sLo, sHi
10849 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
10850 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
10851 const MachineOperand &OrOpnd1 = Def->getOperand(1);
10852 const MachineOperand &OrOpnd2 = Def->getOperand(2);
10853 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
10854 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
10855 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
10856 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
10857 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
10858 Def2->getOperand(1).isReg() &&
10859 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
10860 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
10861 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
10862 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
10863 if (Select && foldableSelect(*Select))
10864 optimizeSCC(Select, Def, RI);
10865 }
10866 }
10867 }
10868 return true;
10869 };
10870
10871 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10872 this](int64_t ExpectedValue, unsigned SrcSize,
10873 bool IsReversible, bool IsSigned) -> bool {
10874 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10875 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10876 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10877 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10878 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10879 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10880 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10881 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10882 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10883 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10884 //
10885 // Signed ge/gt are not used for the sign bit.
10886 //
10887 // If result of the AND is unused except in the compare:
10888 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10889 //
10890 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10891 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10892 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10893 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10894 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10895 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
10896
10897 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10898 if (!Def)
10899 return false;
10900
10901 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10902 Def->getOpcode() != AMDGPU::S_AND_B64)
10903 return false;
10904
10905 int64_t Mask;
10906 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10907 if (MO->isImm())
10908 Mask = MO->getImm();
10909 else if (!getFoldableImm(MO, Mask))
10910 return false;
10911 Mask &= maxUIntN(SrcSize);
10912 return isPowerOf2_64(Mask);
10913 };
10914
10915 MachineOperand *SrcOp = &Def->getOperand(1);
10916 if (isMask(SrcOp))
10917 SrcOp = &Def->getOperand(2);
10918 else if (isMask(&Def->getOperand(2)))
10919 SrcOp = &Def->getOperand(1);
10920 else
10921 return false;
10922
10923 // A valid Mask is required to have a single bit set, hence a non-zero and
10924 // power-of-two value. This verifies that we will not do 64-bit shift below.
10925 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10926 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10927 if (IsSigned && BitNo == SrcSize - 1)
10928 return false;
10929
10930 ExpectedValue <<= BitNo;
10931
10932 bool IsReversedCC = false;
10933 if (CmpValue != ExpectedValue) {
10934 if (!IsReversible)
10935 return false;
10936 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10937 if (!IsReversedCC)
10938 return false;
10939 }
10940
10941 Register DefReg = Def->getOperand(0).getReg();
10942 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10943 return false;
10944
10945 if (!optimizeSCC(Def, &CmpInstr, RI))
10946 return false;
10947
10948 if (!MRI->use_nodbg_empty(DefReg)) {
10949 assert(!IsReversedCC);
10950 return true;
10951 }
10952
10953 // Replace AND with unused result with a S_BITCMP.
10954 MachineBasicBlock *MBB = Def->getParent();
10955
10956 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10957 : AMDGPU::S_BITCMP1_B32
10958 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10959 : AMDGPU::S_BITCMP1_B64;
10960
10961 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10962 .add(*SrcOp)
10963 .addImm(BitNo);
10964 Def->eraseFromParent();
10965
10966 return true;
10967 };
10968
10969 switch (CmpInstr.getOpcode()) {
10970 default:
10971 break;
10972 case AMDGPU::S_CMP_EQ_U32:
10973 case AMDGPU::S_CMP_EQ_I32:
10974 case AMDGPU::S_CMPK_EQ_U32:
10975 case AMDGPU::S_CMPK_EQ_I32:
10976 return optimizeCmpAnd(1, 32, true, false);
10977 case AMDGPU::S_CMP_GE_U32:
10978 case AMDGPU::S_CMPK_GE_U32:
10979 return optimizeCmpAnd(1, 32, false, false);
10980 case AMDGPU::S_CMP_GE_I32:
10981 case AMDGPU::S_CMPK_GE_I32:
10982 return optimizeCmpAnd(1, 32, false, true);
10983 case AMDGPU::S_CMP_EQ_U64:
10984 return optimizeCmpAnd(1, 64, true, false);
10985 case AMDGPU::S_CMP_LG_U32:
10986 case AMDGPU::S_CMP_LG_I32:
10987 case AMDGPU::S_CMPK_LG_U32:
10988 case AMDGPU::S_CMPK_LG_I32:
10989 return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
10990 case AMDGPU::S_CMP_GT_U32:
10991 case AMDGPU::S_CMPK_GT_U32:
10992 return optimizeCmpAnd(0, 32, false, false);
10993 case AMDGPU::S_CMP_GT_I32:
10994 case AMDGPU::S_CMPK_GT_I32:
10995 return optimizeCmpAnd(0, 32, false, true);
10996 case AMDGPU::S_CMP_LG_U64:
10997 return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
10998 }
10999
11000 return false;
11001}
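// Illustrative annotation (not from the LLVM source): a hypothetical example
// of the optimizeCmpAnd() rewrite, for a single-bit mask whose AND result has
// no other users:
//   %a:sreg_32 = S_AND_B32 %src, 4, implicit-def $scc   ; 4 == 1 << 2
//   S_CMP_EQ_U32 %a, 0, implicit-def $scc
// becomes
//   S_BITCMP0_B32 %src, 2
// i.e. SCC is set iff bit 2 of %src is clear, matching the comparison with 0.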
11002
11003void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
11004 AMDGPU::OpName OpName) const {
11005 if (!ST.needsAlignedVGPRs())
11006 return;
11007
11008 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11009 if (OpNo < 0)
11010 return;
11011 MachineOperand &Op = MI.getOperand(OpNo);
11012 if (getOpSize(MI, OpNo) > 4)
11013 return;
11014
11015 // Add implicit aligned super-reg to force alignment on the data operand.
11016 const DebugLoc &DL = MI.getDebugLoc();
11017 MachineBasicBlock *BB = MI.getParent();
11018 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11019 Register DataReg = Op.getReg();
11020 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11021 Register Undef = MRI.createVirtualRegister(
11022 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11023 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11024 Register NewVR =
11025 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11026 : &AMDGPU::VReg_64_Align2RegClass);
11027 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11028 .addReg(DataReg, 0, Op.getSubReg())
11029 .addImm(AMDGPU::sub0)
11030 .addReg(Undef)
11031 .addImm(AMDGPU::sub1);
11032 Op.setReg(NewVR);
11033 Op.setSubReg(AMDGPU::sub0);
11034 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11035}
11036
11037bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
11038 if (isIGLP(*MI))
11039 return false;
11040
11041 return TargetInstrInfo::isGlobalMemoryObject(MI);
11042}
11043
11044bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
11045 if (!isWMMA(MI) && !isSWMMAC(MI))
11046 return false;
11047
11048 if (AMDGPU::isGFX1250(ST))
11049 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11050
11051 return true;
11052}
11053
11054bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
11055 unsigned Opcode = MI.getOpcode();
11056
11057 if (AMDGPU::isGFX12Plus(ST))
11058 return isDOT(MI) || isXDLWMMA(MI);
11059
11060 if (!isMAI(MI) || isDGEMM(Opcode) ||
11061 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11062 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11063 return false;
11064
11065 if (!ST.hasGFX940Insts())
11066 return true;
11067
11068 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11069}
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
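A hedged sketch of splitFlatOffset: the illustrative helper splitForFlat assumes the AMDGPUAS::FLAT_ADDRESS address-space enumerator listed elsewhere on this page and the SIInstrFlags::FLAT variant value from SIDefines.h; the immediate part goes into the instruction's offset field and the remainder must be added to the address register by the caller.
#include "SIDefines.h"
#include "SIInstrInfo.h"
// Illustrative only: split a byte offset for a plain FLAT access.
static void splitForFlat(const llvm::SIInstrInfo *TII, int64_t COffsetVal) {
  auto [ImmOffset, RemainderOffset] = TII->splitFlatOffset(
      COffsetVal, llvm::AMDGPUAS::FLAT_ADDRESS, llvm::SIInstrFlags::FLAT);
  (void)ImmOffset;       // fits the encoded offset field
  (void)RemainderOffset; // left for a register add
}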
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const override
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to handle 16-bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool setsSCCifResultIsNonZero(const MachineInstr &MI)
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
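A short usage sketch with a hypothetical helper readOffsetOperand: look up the 'offset' operand by name and read its immediate; getNamedOperand returns nullptr when the instruction has no operand with that name, so the result must be checked before use.
#include "SIInstrInfo.h"
static void readOffsetOperand(const llvm::SIInstrInfo *TII,
                              llvm::MachineInstr &MI) {
  if (llvm::MachineOperand *Off =
          TII->getNamedOperand(MI, llvm::AMDGPU::OpName::offset)) {
    int64_t ByteOffset = Off->getImm(); // valid only for immediate operands
    (void)ByteOffset;
  }
}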
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:587
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:589
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:586
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:588
@ TI_CONSTDATA_START
Definition AMDGPU.h:585
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
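A small self-contained sketch of maxUIntN; the illustrative function name is arbitrary and the expected values follow directly from the definition (largest N-bit unsigned value).
#include "llvm/Support/MathExtras.h"
#include <cassert>
// maxUIntN(N) == 2^N - 1, e.g. the largest 12-bit unsigned offset is 4095.
void maxUIntNExample() {
  assert(llvm::maxUIntN(12) == 4095u);
  assert(llvm::maxUIntN(16) == 65535u);
}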
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
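A minimal sketch of isInt<N>, the signed range check used for offsets and branch displacements; the function name below is illustrative.
#include "llvm/Support/MathExtras.h"
#include <cassert>
// isInt<16> accepts exactly the values representable as a 16-bit signed int.
void isIntExample() {
  assert(llvm::isInt<16>(32767));
  assert(llvm::isInt<16>(-32768));
  assert(!llvm::isInt<16>(32768));
}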
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
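A sketch of the early-increment pattern with an ordinary std::map standing in for a machine basic block: the wrapped iterator advances before the loop body runs, so erasing the current element is safe. The helper name is illustrative.
#include "llvm/ADT/STLExtras.h"
#include <map>
// Drop all entries whose value is zero while iterating.
void dropZeroValues(std::map<int, int> &M) {
  for (auto &KV : llvm::make_early_inc_range(M))
    if (KV.second == 0)
      M.erase(KV.first); // safe: the range iterator already moved past KV
}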
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
Definition MathExtras.h:546
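A quick sketch of alignDown with the default zero skew; the function name is illustrative.
#include "llvm/Support/MathExtras.h"
#include <cassert>
// alignDown rounds down to the nearest multiple of the alignment.
void alignDownExample() {
  assert(llvm::alignDown(30, 8) == 24);
  assert(llvm::alignDown(32, 8) == 32);
}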
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
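A combined sketch of the two bit-counting helpers above (popcount and countr_zero), as used for lane-mask style arithmetic; the function name is illustrative.
#include "llvm/ADT/bit.h"
#include <cassert>
void bitCountExamples() {
  assert(llvm::popcount(0xF0u) == 4);    // four bits set
  assert(llvm::countr_zero(0x10u) == 4); // four trailing zeros
}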
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
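A sketch combining Lo_32 with Hi_32 (documented a few entries above) to split a 64-bit immediate into its two 32-bit halves, the usual first step when a constant must be materialized with 32-bit moves; the function name is illustrative.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
void splitImm64Example() {
  uint64_t Imm = 0x1122334455667788ULL;
  assert(llvm::Lo_32(Imm) == 0x55667788u); // low half
  assert(llvm::Hi_32(Imm) == 0x11223344u); // high half
}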
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
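A sketch of divideCeil, e.g. for computing how many 32-bit registers a value of a given bit width occupies; the function name is illustrative.
#include "llvm/Support/MathExtras.h"
#include <cassert>
void divideCeilExample() {
  assert(llvm::divideCeil(96, 32) == 3); // exact multiple
  assert(llvm::divideCeil(65, 32) == 3); // rounds up
}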
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
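A sketch of SignExtend64<B>, the kind of widening applied when decoding a B-bit signed offset field; the function name is illustrative.
#include "llvm/Support/MathExtras.h"
#include <cassert>
void signExtendExample() {
  assert(llvm::SignExtend64<13>(0x1FFF) == -1);   // all 13 bits set -> -1
  assert(llvm::SignExtend64<13>(0x0FFF) == 4095); // sign bit clear
}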
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
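A sketch of maskTrailingOnes, which builds the all-ones mask for an N-bit field; the function name is illustrative.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
void maskExample() {
  assert(llvm::maskTrailingOnes<uint32_t>(12) == 0xFFFu);
  assert(llvm::maskTrailingOnes<uint64_t>(0) == 0); // empty mask for N == 0
}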
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.