1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
63 cl::ReallyHidden);
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
67 AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
86 AMDGPU::OpName OpName) {
87 unsigned Opc0 = N0->getMachineOpcode();
88 unsigned Opc1 = N1->getMachineOpcode();
89
90 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
91 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
92
93 if (Op0Idx == -1 && Op1Idx == -1)
94 return true;
95
96
97 if ((Op0Idx == -1 && Op1Idx != -1) ||
98 (Op1Idx == -1 && Op0Idx != -1))
99 return false;
100
101 // getNamedOperandIdx returns the index for the MachineInstr's operands,
102 // which includes the result as the first operand. We are indexing into the
103 // MachineSDNode's operands, so we need to skip the result operand to get
104 // the real index.
105 --Op0Idx;
106 --Op1Idx;
107
108 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
109}
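// Illustrative sketch (not part of the upstream file): the index adjustment
// performed above. getNamedOperandIdx() counts the MachineInstr's def as
// operand 0, while a MachineSDNode's operand list starts at the first use,
// so the named index must be shifted down by one before indexing the node.
// Hypothetical use:
//   int Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
//   if (Idx != -1)
//     SDValue Off = Node->getOperand(Idx - 1); // skip the result operand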
110
111static bool canRemat(const MachineInstr &MI) {
112
113 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
114 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
115 SIInstrInfo::isSALU(MI))
116 return true;
117
118 if (SIInstrInfo::isSMRD(MI)) {
119 return !MI.memoperands_empty() &&
120 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
121 return MMO->isLoad() && MMO->isInvariant();
122 });
123 }
124
125 return false;
126}
127
128bool SIInstrInfo::isReallyTriviallyReMaterializable(
129 const MachineInstr &MI) const {
130
131 if (canRemat(MI)) {
132 // Normally VALU use of exec would block the rematerialization, but that
133 // is OK in this case to have an implicit exec read as all VALU do.
134 // We really want all of the generic logic for this except for this.
135
136 // Another potential implicit use is mode register. The core logic of
137 // the RA will not attempt rematerialization if mode is set anywhere
138 // in the function, otherwise it is safe since mode is not changed.
139
140 // This differs from the generic method, which does not allow
141 // rematerialization if there are virtual register uses. We allow this,
142 // and therefore this method handles SOP instructions as well.
143 if (!MI.hasImplicitDef() &&
144 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
145 !MI.mayRaiseFPException())
146 return true;
147 }
148
149 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
150}
151
152// Returns true if the scalar result of a VALU instruction depends on exec.
153bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
154 // Ignore comparisons which are only used masked with exec.
155 // This allows some hoisting/sinking of VALU comparisons.
156 if (MI.isCompare()) {
157 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
158 if (!Dst)
159 return true;
160
161 Register DstReg = Dst->getReg();
162 if (!DstReg.isVirtual())
163 return true;
164
165 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
166 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
167 switch (Use.getOpcode()) {
168 case AMDGPU::S_AND_SAVEEXEC_B32:
169 case AMDGPU::S_AND_SAVEEXEC_B64:
170 break;
171 case AMDGPU::S_AND_B32:
172 case AMDGPU::S_AND_B64:
173 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
174 return true;
175 break;
176 default:
177 return true;
178 }
179 }
180 return false;
181 }
182
183 switch (MI.getOpcode()) {
184 default:
185 break;
186 case AMDGPU::V_READFIRSTLANE_B32:
187 return true;
188 }
189
190 return false;
191}
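// Illustrative sketch (not part of the upstream file): the MIR shape the
// compare special case above is meant to accept. A VALU compare whose only
// uses mask EXEC does not observably depend on the incoming EXEC value:
//   %c:sreg_64 = V_CMP_LT_I32_e64 %a, %b, implicit $exec
//   %e:sreg_64 = S_AND_SAVEEXEC_B64 %c, implicit-def $exec, implicit $exec
// Any other use of %c (or an S_AND that does not read $exec) makes
// resultDependsOnExec() return true.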
192
193bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
194 // Any implicit use of exec by VALU is not a real register read.
195 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
196 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
197}
198
199bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
200 MachineBasicBlock *SuccToSinkTo,
201 MachineCycleInfo *CI) const {
202 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
203 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
204 return true;
205
206 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
207 // Check if sinking of MI would create temporal divergent use.
208 for (auto Op : MI.uses()) {
209 if (Op.isReg() && Op.getReg().isVirtual() &&
210 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
211 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
212
213 // SgprDef defined inside cycle
214 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
215 if (FromCycle == nullptr)
216 continue;
217
218 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
219 // Check if there is a FromCycle that contains SgprDef's basic block but
220 // does not contain SuccToSinkTo and also has divergent exit condition.
221 while (FromCycle && !FromCycle->contains(ToCycle)) {
222 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
223 FromCycle->getExitingBlocks(ExitingBlocks);
224
225 // FromCycle has divergent exit condition.
226 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
227 if (hasDivergentBranch(ExitingBlock))
228 return false;
229 }
230
231 FromCycle = FromCycle->getParentCycle();
232 }
233 }
234 }
235
236 return true;
237}
238
239bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
240 int64_t &Offset0,
241 int64_t &Offset1) const {
242 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
243 return false;
244
245 unsigned Opc0 = Load0->getMachineOpcode();
246 unsigned Opc1 = Load1->getMachineOpcode();
247
248 // Make sure both are actually loads.
249 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
250 return false;
251
252 // A mayLoad instruction without a def is not a load. Likely a prefetch.
253 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
254 return false;
255
256 if (isDS(Opc0) && isDS(Opc1)) {
257
258 // FIXME: Handle this case:
259 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
260 return false;
261
262 // Check base reg.
263 if (Load0->getOperand(0) != Load1->getOperand(0))
264 return false;
265
266 // Skip read2 / write2 variants for simplicity.
267 // TODO: We should report true if the used offsets are adjacent (excluding
268 // st64 versions).
269 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
270 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
271 if (Offset0Idx == -1 || Offset1Idx == -1)
272 return false;
273
274 // XXX - be careful of dataless loads
275 // getNamedOperandIdx returns the index for MachineInstrs. Since they
276 // include the output in the operand list, but SDNodes don't, we need to
277 // subtract the index by one.
278 Offset0Idx -= get(Opc0).NumDefs;
279 Offset1Idx -= get(Opc1).NumDefs;
280 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
281 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
282 return true;
283 }
284
285 if (isSMRD(Opc0) && isSMRD(Opc1)) {
286 // Skip time and cache invalidation instructions.
287 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
288 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
289 return false;
290
291 unsigned NumOps = getNumOperandsNoGlue(Load0);
292 if (NumOps != getNumOperandsNoGlue(Load1))
293 return false;
294
295 // Check base reg.
296 if (Load0->getOperand(0) != Load1->getOperand(0))
297 return false;
298
299 // Match register offsets, if both register and immediate offsets present.
300 assert(NumOps == 4 || NumOps == 5);
301 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
302 return false;
303
304 const ConstantSDNode *Load0Offset =
305 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
306 const ConstantSDNode *Load1Offset =
307 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
308
309 if (!Load0Offset || !Load1Offset)
310 return false;
311
312 Offset0 = Load0Offset->getZExtValue();
313 Offset1 = Load1Offset->getZExtValue();
314 return true;
315 }
316
317 // MUBUF and MTBUF can access the same addresses.
318 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
319
320 // MUBUF and MTBUF have vaddr at different indices.
321 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
323 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
324 return false;
325
326 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
327 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
328
329 if (OffIdx0 == -1 || OffIdx1 == -1)
330 return false;
331
332 // getNamedOperandIdx returns the index for MachineInstrs. Since they
333 // include the output in the operand list, but SDNodes don't, we need to
334 // subtract the index by one.
335 OffIdx0 -= get(Opc0).NumDefs;
336 OffIdx1 -= get(Opc1).NumDefs;
337
338 SDValue Off0 = Load0->getOperand(OffIdx0);
339 SDValue Off1 = Load1->getOperand(OffIdx1);
340
341 // The offset might be a FrameIndexSDNode.
342 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
343 return false;
344
345 Offset0 = Off0->getAsZExtVal();
346 Offset1 = Off1->getAsZExtVal();
347 return true;
348 }
349
350 return false;
351}
352
353static bool isStride64(unsigned Opc) {
354 switch (Opc) {
355 case AMDGPU::DS_READ2ST64_B32:
356 case AMDGPU::DS_READ2ST64_B64:
357 case AMDGPU::DS_WRITE2ST64_B32:
358 case AMDGPU::DS_WRITE2ST64_B64:
359 return true;
360 default:
361 return false;
362 }
363}
364
365bool SIInstrInfo::getMemOperandsWithOffsetWidth(
366 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
367 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
368 const TargetRegisterInfo *TRI) const {
369 if (!LdSt.mayLoadOrStore())
370 return false;
371
372 unsigned Opc = LdSt.getOpcode();
373 OffsetIsScalable = false;
374 const MachineOperand *BaseOp, *OffsetOp;
375 int DataOpIdx;
376
377 if (isDS(LdSt)) {
378 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
379 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
380 if (OffsetOp) {
381 // Normal, single offset LDS instruction.
382 if (!BaseOp) {
383 // DS_CONSUME/DS_APPEND use M0 for the base address.
384 // TODO: find the implicit use operand for M0 and use that as BaseOp?
385 return false;
386 }
387 BaseOps.push_back(BaseOp);
388 Offset = OffsetOp->getImm();
389 // Get appropriate operand, and compute width accordingly.
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
391 if (DataOpIdx == -1)
392 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
393 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
394 Width = LocationSize::precise(64);
395 else
396 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
397 } else {
398 // The 2 offset instructions use offset0 and offset1 instead. We can treat
399 // these as a load with a single offset if the 2 offsets are consecutive.
400 // We will use this for some partially aligned loads.
401 const MachineOperand *Offset0Op =
402 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
403 const MachineOperand *Offset1Op =
404 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
405
406 unsigned Offset0 = Offset0Op->getImm() & 0xff;
407 unsigned Offset1 = Offset1Op->getImm() & 0xff;
408 if (Offset0 + 1 != Offset1)
409 return false;
410
411 // Each of these offsets is in element sized units, so we need to convert
412 // to bytes of the individual reads.
413
414 unsigned EltSize;
415 if (LdSt.mayLoad())
416 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
417 else {
418 assert(LdSt.mayStore());
419 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
421 }
422
423 if (isStride64(Opc))
424 EltSize *= 64;
425
426 BaseOps.push_back(BaseOp);
427 Offset = EltSize * Offset0;
428 // Get appropriate operand(s), and compute width accordingly.
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
430 if (DataOpIdx == -1) {
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
432 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
433 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
434 Width = LocationSize::precise(
435 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
436 } else {
437 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
438 }
439 }
440 return true;
441 }
442
443 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
444 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
445 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
446 return false;
447 BaseOps.push_back(RSrc);
448 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
449 if (BaseOp && !BaseOp->isFI())
450 BaseOps.push_back(BaseOp);
451 const MachineOperand *OffsetImm =
452 getNamedOperand(LdSt, AMDGPU::OpName::offset);
453 Offset = OffsetImm->getImm();
454 const MachineOperand *SOffset =
455 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
456 if (SOffset) {
457 if (SOffset->isReg())
458 BaseOps.push_back(SOffset);
459 else
460 Offset += SOffset->getImm();
461 }
462 // Get appropriate operand, and compute width accordingly.
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
464 if (DataOpIdx == -1)
465 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
466 if (DataOpIdx == -1) // LDS DMA
467 return false;
468 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
469 return true;
470 }
471
472 if (isImage(LdSt)) {
473 auto RsrcOpName =
474 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
475 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
476 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
477 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
478 if (VAddr0Idx >= 0) {
479 // GFX10 possible NSA encoding.
480 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
481 BaseOps.push_back(&LdSt.getOperand(I));
482 } else {
483 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
484 }
485 Offset = 0;
486 // Get appropriate operand, and compute width accordingly.
487 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
488 if (DataOpIdx == -1)
489 return false; // no return sampler
490 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
491 return true;
492 }
493
494 if (isSMRD(LdSt)) {
495 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
496 if (!BaseOp) // e.g. S_MEMTIME
497 return false;
498 BaseOps.push_back(BaseOp);
499 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
500 Offset = OffsetOp ? OffsetOp->getImm() : 0;
501 // Get appropriate operand, and compute width accordingly.
502 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
503 if (DataOpIdx == -1)
504 return false;
505 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
506 return true;
507 }
508
509 if (isFLAT(LdSt)) {
510 // Instructions have either vaddr or saddr or both or none.
511 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
512 if (BaseOp)
513 BaseOps.push_back(BaseOp);
514 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
515 if (BaseOp)
516 BaseOps.push_back(BaseOp);
517 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
518 // Get appropriate operand, and compute width accordingly.
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
520 if (DataOpIdx == -1)
521 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
522 if (DataOpIdx == -1) // LDS DMA
523 return false;
524 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
525 return true;
526 }
527
528 return false;
529}
530
531static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
532 ArrayRef<const MachineOperand *> BaseOps1,
533 const MachineInstr &MI2,
534 ArrayRef<const MachineOperand *> BaseOps2) {
535 // Only examine the first "base" operand of each instruction, on the
536 // assumption that it represents the real base address of the memory access.
537 // Other operands are typically offsets or indices from this base address.
538 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
539 return true;
540
541 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
542 return false;
543
544 auto *MO1 = *MI1.memoperands_begin();
545 auto *MO2 = *MI2.memoperands_begin();
546 if (MO1->getAddrSpace() != MO2->getAddrSpace())
547 return false;
548
549 const auto *Base1 = MO1->getValue();
550 const auto *Base2 = MO2->getValue();
551 if (!Base1 || !Base2)
552 return false;
553 Base1 = getUnderlyingObject(Base1);
554 Base2 = getUnderlyingObject(Base2);
555
556 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
557 return false;
558
559 return Base1 == Base2;
560}
561
562bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
563 int64_t Offset1, bool OffsetIsScalable1,
564 ArrayRef<const MachineOperand *> BaseOps2,
565 int64_t Offset2, bool OffsetIsScalable2,
566 unsigned ClusterSize,
567 unsigned NumBytes) const {
568 // If the mem ops (to be clustered) do not have the same base ptr, then they
569 // should not be clustered
570 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
571 if (!BaseOps1.empty() && !BaseOps2.empty()) {
572 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
573 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
574 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
575 return false;
576
577 const SIMachineFunctionInfo *MFI =
578 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
579 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
580 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
581 // If only one base op is empty, they do not have the same base ptr
582 return false;
583 }
584
585 // In order to avoid register pressure, on an average, the number of DWORDS
586 // loaded together by all clustered mem ops should not exceed
587 // MaxMemoryClusterDWords. This is an empirical value based on certain
588 // observations and performance related experiments.
589 // The good thing about this heuristic is - it avoids clustering of too many
590 // sub-word loads, and also avoids clustering of wide loads. Below is the
591 // brief summary of how the heuristic behaves for various `LoadSize` when
592 // MaxMemoryClusterDWords is 8.
593 //
594 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
595 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
596 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
597 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
598 // (5) LoadSize >= 17: do not cluster
599 const unsigned LoadSize = NumBytes / ClusterSize;
600 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
601 return NumDWords <= MaxMemoryClusterDWords;
602}
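// Illustrative sketch (not part of the upstream file): the DWORD arithmetic of
// the heuristic above written as a standalone helper. With
// MaxMemoryClusterDWords == 8, eight 4-byte loads cluster (8 DWORDs), four
// 8-byte loads cluster (8 DWORDs), but two 20-byte loads do not (10 DWORDs).
//   static bool wouldCluster(unsigned NumBytes, unsigned ClusterSize,
//                            unsigned MaxMemoryClusterDWords = 8) {
//     const unsigned LoadSize = NumBytes / ClusterSize;
//     const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
//     return NumDWords <= MaxMemoryClusterDWords;
//   }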
603
604// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
605// the first 16 loads will be interleaved with the stores, and the next 16 will
606// be clustered as expected. It should really split into 2 16 store batches.
607//
608// Loads are clustered until this returns false, rather than trying to schedule
609// groups of stores. This also means we have to deal with saying different
610// address space loads should be clustered, and ones which might cause bank
611// conflicts.
612//
613// This might be deprecated so it might not be worth that much effort to fix.
614bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
615 int64_t Offset0, int64_t Offset1,
616 unsigned NumLoads) const {
617 assert(Offset1 > Offset0 &&
618 "Second offset should be larger than first offset!");
619 // If we have less than 16 loads in a row, and the offsets are within 64
620 // bytes, then schedule together.
621
622 // A cacheline is 64 bytes (for global memory).
623 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
624}
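// Illustrative sketch (not part of the upstream file): with the thresholds
// above, loads at offsets 0x100 and 0x130 (48 bytes apart, NumLoads = 4) are
// scheduled near each other, while offsets 0x100 and 0x140 (64 bytes apart)
// are not, since they can no longer share a single 64-byte cacheline:
//   bool Near = NumLoads <= 16 && (Offset1 - Offset0) < 64;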
625
626static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
627 MachineBasicBlock::iterator MI,
628 const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 const char *Msg = "illegal VGPR to SGPR copy") {
631 MachineFunction *MF = MBB.getParent();
632
633 LLVMContext &C = MF->getFunction().getContext();
634 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
635
636 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
637 .addReg(SrcReg, getKillRegState(KillSrc));
638}
639
640/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
641/// possible to have a direct copy in these cases on GFX908, so an intermediate
642/// VGPR copy is required.
643static void indirectCopyToAGPR(const SIInstrInfo &TII,
644 MachineBasicBlock &MBB,
645 MachineBasicBlock::iterator MI,
646 const DebugLoc &DL, MCRegister DestReg,
647 MCRegister SrcReg, bool KillSrc,
648 RegScavenger &RS, bool RegsOverlap,
649 Register ImpDefSuperReg = Register(),
650 Register ImpUseSuperReg = Register()) {
651 assert((TII.getSubtarget().hasMAIInsts() &&
652 !TII.getSubtarget().hasGFX90AInsts()) &&
653 "Expected GFX908 subtarget.");
654
655 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
656 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
657 "Source register of the copy should be either an SGPR or an AGPR.");
658
659 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
660 "Destination register of the copy should be an AGPR.");
661
662 const SIRegisterInfo &RI = TII.getRegisterInfo();
663
664 // First try to find defining accvgpr_write to avoid temporary registers.
665 // In the case of copies of overlapping AGPRs, we conservatively do not
666 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
667 // an accvgpr_write used for this same copy due to implicit-defs
668 if (!RegsOverlap) {
669 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
670 --Def;
671
672 if (!Def->modifiesRegister(SrcReg, &RI))
673 continue;
674
675 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
676 Def->getOperand(0).getReg() != SrcReg)
677 break;
678
679 MachineOperand &DefOp = Def->getOperand(1);
680 assert(DefOp.isReg() || DefOp.isImm());
681
682 if (DefOp.isReg()) {
683 bool SafeToPropagate = true;
684 // Check that register source operand is not clobbered before MI.
685 // Immediate operands are always safe to propagate.
686 for (auto I = Def; I != MI && SafeToPropagate; ++I)
687 if (I->modifiesRegister(DefOp.getReg(), &RI))
688 SafeToPropagate = false;
689
690 if (!SafeToPropagate)
691 break;
692
693 for (auto I = Def; I != MI; ++I)
694 I->clearRegisterKills(DefOp.getReg(), &RI);
695 }
696
697 MachineInstrBuilder Builder =
698 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
699 .add(DefOp);
700 if (ImpDefSuperReg)
701 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
702
703 if (ImpUseSuperReg) {
704 Builder.addReg(ImpUseSuperReg,
705 getKillRegState(KillSrc) | RegState::Implicit);
706 }
707
708 return;
709 }
710 }
711
712 RS.enterBasicBlockEnd(MBB);
713 RS.backward(std::next(MI));
714
715 // Ideally we want to have three registers for a long reg_sequence copy
716 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
717 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
718 *MBB.getParent());
719
720 // Registers in the sequence are allocated contiguously so we can just
721 // use register number to pick one of three round-robin temps.
722 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
723 Register Tmp =
724 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
725 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
726 "VGPR used for an intermediate copy should have been reserved.");
727
728 // Only loop through if there are any free registers left. We don't want to
729 // spill.
730 while (RegNo--) {
731 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
732 /* RestoreAfter */ false, 0,
733 /* AllowSpill */ false);
734 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
735 break;
736 Tmp = Tmp2;
737 RS.setRegUsed(Tmp);
738 }
739
740 // Insert copy to temporary VGPR.
741 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
742 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
743 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
744 } else {
745 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
746 }
747
748 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
749 .addReg(SrcReg, getKillRegState(KillSrc));
750 if (ImpUseSuperReg) {
751 UseBuilder.addReg(ImpUseSuperReg,
752 getKillRegState(KillSrc) | RegState::Implicit);
753 }
754
755 MachineInstrBuilder DefBuilder
756 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
757 .addReg(Tmp, RegState::Kill);
758
759 if (ImpDefSuperReg)
760 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
761}
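// Illustrative sketch (not part of the upstream file): the round-robin temp
// selection above. AGPRs in a reg_sequence copy are allocated contiguously,
// so taking the destination register number modulo 3 cycles through up to
// three temporary VGPRs and hides the v_mov_b32 -> v_accvgpr_write wait
// states. Hypothetical picks:
//   (AGPR3 - AGPR0) % 3 == 0, (AGPR4 - AGPR0) % 3 == 1, (AGPR5 - AGPR0) % 3 == 2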
762
763static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
764 MachineBasicBlock::iterator MI, const DebugLoc &DL,
765 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
766 const TargetRegisterClass *RC, bool Forward) {
767 const SIRegisterInfo &RI = TII.getRegisterInfo();
768 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
769 MachineBasicBlock::iterator I = MI;
770 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
771
772 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
773 int16_t SubIdx = BaseIndices[Idx];
774 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
775 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
776 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
777 unsigned Opcode = AMDGPU::S_MOV_B32;
778
779 // Is SGPR aligned? If so try to combine with next.
780 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
781 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
782 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
783 // Can use SGPR64 copy
784 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
785 SubIdx = RI.getSubRegFromChannel(Channel, 2);
786 DestSubReg = RI.getSubReg(DestReg, SubIdx);
787 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
788 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
789 Opcode = AMDGPU::S_MOV_B64;
790 Idx++;
791 }
792
793 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
794 .addReg(SrcSubReg)
795 .addReg(SrcReg, RegState::Implicit);
796
797 if (!FirstMI)
798 FirstMI = LastMI;
799
800 if (!Forward)
801 I--;
802 }
803
804 assert(FirstMI && LastMI);
805 if (!Forward)
806 std::swap(FirstMI, LastMI);
807
808 FirstMI->addOperand(
809 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
810
811 if (KillSrc)
812 LastMI->addRegisterKilled(SrcReg, &RI);
813}
814
815void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
816 MachineBasicBlock::iterator MI,
817 const DebugLoc &DL, Register DestReg,
818 Register SrcReg, bool KillSrc, bool RenamableDest,
819 bool RenamableSrc) const {
820 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
821 unsigned Size = RI.getRegSizeInBits(*RC);
822 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
823 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
824
825 // The rest of copyPhysReg assumes Src and Dst size are the same size.
826 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
827 // we remove Fix16BitCopies and this code block?
828 if (Fix16BitCopies) {
829 if (((Size == 16) != (SrcSize == 16))) {
830 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
831 assert(ST.useRealTrue16Insts());
832 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
833 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
834 RegToFix = SubReg;
835
836 if (DestReg == SrcReg) {
837 // Identity copy. Insert empty bundle since ExpandPostRA expects an
838 // instruction here.
839 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
840 return;
841 }
842 RC = RI.getPhysRegBaseClass(DestReg);
843 Size = RI.getRegSizeInBits(*RC);
844 SrcRC = RI.getPhysRegBaseClass(SrcReg);
845 SrcSize = RI.getRegSizeInBits(*SrcRC);
846 }
847 }
848
849 if (RC == &AMDGPU::VGPR_32RegClass) {
850 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
851 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
852 AMDGPU::AGPR_32RegClass.contains(SrcReg));
853 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
854 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
855 BuildMI(MBB, MI, DL, get(Opc), DestReg)
856 .addReg(SrcReg, getKillRegState(KillSrc));
857 return;
858 }
859
860 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
861 RC == &AMDGPU::SReg_32RegClass) {
862 if (SrcReg == AMDGPU::SCC) {
863 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
864 .addImm(1)
865 .addImm(0);
866 return;
867 }
868
869 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
870 if (DestReg == AMDGPU::VCC_LO) {
871 // FIXME: Hack until VReg_1 removed.
872 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
873 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
874 .addImm(0)
875 .addReg(SrcReg, getKillRegState(KillSrc));
876 return;
877 }
878
879 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
880 return;
881 }
882
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 if (RC == &AMDGPU::SReg_64RegClass) {
889 if (SrcReg == AMDGPU::SCC) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
891 .addImm(1)
892 .addImm(0);
893 return;
894 }
895
896 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
897 if (DestReg == AMDGPU::VCC) {
898 // FIXME: Hack until VReg_1 removed.
899 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
900 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
901 .addImm(0)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
907 return;
908 }
909
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
911 .addReg(SrcReg, getKillRegState(KillSrc));
912 return;
913 }
914
915 if (DestReg == AMDGPU::SCC) {
916 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
917 // but SelectionDAG emits such copies for i1 sources.
918 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
919 // This copy can only be produced by patterns
920 // with explicit SCC, which are known to be enabled
921 // only for subtargets with S_CMP_LG_U64 present.
922 assert(ST.hasScalarCompareEq64());
923 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
924 .addReg(SrcReg, getKillRegState(KillSrc))
925 .addImm(0);
926 } else {
927 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
928 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
929 .addReg(SrcReg, getKillRegState(KillSrc))
930 .addImm(0);
931 }
932
933 return;
934 }
935
936 if (RC == &AMDGPU::AGPR_32RegClass) {
937 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
938 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
939 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
940 .addReg(SrcReg, getKillRegState(KillSrc));
941 return;
942 }
943
944 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
945 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
946 .addReg(SrcReg, getKillRegState(KillSrc));
947 return;
948 }
949
950 // FIXME: Pass should maintain scavenger to avoid scan through the block on
951 // every AGPR spill.
952 RegScavenger RS;
953 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
954 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
955 return;
956 }
957
958 if (Size == 16) {
959 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
960 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
961 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
962
963 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
964 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
965 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
966 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
967 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
968 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
969 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
970 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
971
972 if (IsSGPRDst) {
973 if (!IsSGPRSrc) {
974 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
975 return;
976 }
977
978 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
979 .addReg(NewSrcReg, getKillRegState(KillSrc));
980 return;
981 }
982
983 if (IsAGPRDst || IsAGPRSrc) {
984 if (!DstLow || !SrcLow) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
986 "Cannot use hi16 subreg with an AGPR!");
987 }
988
989 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
990 return;
991 }
992
993 if (ST.useRealTrue16Insts()) {
994 if (IsSGPRSrc) {
995 assert(SrcLow);
996 SrcReg = NewSrcReg;
997 }
998 // Use the smaller instruction encoding if possible.
999 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1000 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1001 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1002 .addReg(SrcReg);
1003 } else {
1004 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1005 .addImm(0) // src0_modifiers
1006 .addReg(SrcReg)
1007 .addImm(0); // op_sel
1008 }
1009 return;
1010 }
1011
1012 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1013 if (!DstLow || !SrcLow) {
1014 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1015 "Cannot use hi16 subreg on VI!");
1016 }
1017
1018 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1019 .addReg(NewSrcReg, getKillRegState(KillSrc));
1020 return;
1021 }
1022
1023 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1024 .addImm(0) // src0_modifiers
1025 .addReg(NewSrcReg)
1026 .addImm(0) // clamp
1027 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1028 : AMDGPU::SDWA::SdwaSel::WORD_1)
1029 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1030 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1031 : AMDGPU::SDWA::SdwaSel::WORD_1)
1032 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1033 // First implicit operand is $exec.
1034 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1035 return;
1036 }
1037
1038 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1039 if (ST.hasMovB64()) {
1040 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1041 .addReg(SrcReg, getKillRegState(KillSrc));
1042 return;
1043 }
1044 if (ST.hasPkMovB32()) {
1045 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1046 .addImm(SISrcMods::OP_SEL_1)
1047 .addReg(SrcReg)
1048 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1049 .addReg(SrcReg)
1050 .addImm(0) // op_sel_lo
1051 .addImm(0) // op_sel_hi
1052 .addImm(0) // neg_lo
1053 .addImm(0) // neg_hi
1054 .addImm(0) // clamp
1055 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1056 return;
1057 }
1058 }
1059
1060 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1061 if (RI.isSGPRClass(RC)) {
1062 if (!RI.isSGPRClass(SrcRC)) {
1063 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1064 return;
1065 }
1066 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1067 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1068 Forward);
1069 return;
1070 }
1071
1072 unsigned EltSize = 4;
1073 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1074 if (RI.isAGPRClass(RC)) {
1075 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1076 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1077 else if (RI.hasVGPRs(SrcRC) ||
1078 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1079 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1080 else
1081 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1082 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1083 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1084 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1085 (RI.isProperlyAlignedRC(*RC) &&
1086 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1087 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1088 if (ST.hasMovB64()) {
1089 Opcode = AMDGPU::V_MOV_B64_e32;
1090 EltSize = 8;
1091 } else if (ST.hasPkMovB32()) {
1092 Opcode = AMDGPU::V_PK_MOV_B32;
1093 EltSize = 8;
1094 }
1095 }
1096
1097 // For the cases where we need an intermediate instruction/temporary register
1098 // (destination is an AGPR), we need a scavenger.
1099 //
1100 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1101 // whole block for every handled copy.
1102 std::unique_ptr<RegScavenger> RS;
1103 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1104 RS = std::make_unique<RegScavenger>();
1105
1106 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1107
1108 // If there is an overlap, we can't kill the super-register on the last
1109 // instruction, since it will also kill the components made live by this def.
1110 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1111 const bool CanKillSuperReg = KillSrc && !Overlap;
1112
1113 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1114 unsigned SubIdx;
1115 if (Forward)
1116 SubIdx = SubIndices[Idx];
1117 else
1118 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1119 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1120 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1121 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1122
1123 bool IsFirstSubreg = Idx == 0;
1124 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1125
1126 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1127 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1128 Register ImpUseSuper = SrcReg;
1129 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1130 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1131 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1132 MachineInstrBuilder MIB =
1133 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1134 .addImm(SISrcMods::OP_SEL_1)
1135 .addReg(SrcSubReg)
1136 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1137 .addReg(SrcSubReg)
1138 .addImm(0) // op_sel_lo
1139 .addImm(0) // op_sel_hi
1140 .addImm(0) // neg_lo
1141 .addImm(0) // neg_hi
1142 .addImm(0) // clamp
1143 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144 if (IsFirstSubreg)
1145 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1146 } else {
1147 MachineInstrBuilder Builder =
1148 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1149 if (IsFirstSubreg)
1150 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1151
1152 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1153 }
1154 }
1155}
1156
1157int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1158 int NewOpc;
1159
1160 // Try to map original to commuted opcode
1161 NewOpc = AMDGPU::getCommuteRev(Opcode);
1162 if (NewOpc != -1)
1163 // Check if the commuted (REV) opcode exists on the target.
1164 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1165
1166 // Try to map commuted to original opcode
1167 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1168 if (NewOpc != -1)
1169 // Check if the original (non-REV) opcode exists on the target.
1170 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1171
1172 return Opcode;
1173}
1174
1175const TargetRegisterClass *
1176SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1177 return &AMDGPU::VGPR_32RegClass;
1178}
1179
1180void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1181 MachineBasicBlock::iterator I,
1182 const DebugLoc &DL, Register DstReg,
1183 ArrayRef<MachineOperand> Cond,
1184 Register TrueReg,
1185 Register FalseReg) const {
1186 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1187 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1189 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1190 "Not a VGPR32 reg");
1191
1192 if (Cond.size() == 1) {
1193 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1194 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1195 .add(Cond[0]);
1196 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1197 .addImm(0)
1198 .addReg(FalseReg)
1199 .addImm(0)
1200 .addReg(TrueReg)
1201 .addReg(SReg);
1202 } else if (Cond.size() == 2) {
1203 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1204 switch (Cond[0].getImm()) {
1205 case SIInstrInfo::SCC_TRUE: {
1206 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1207 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1208 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1209 .addImm(0)
1210 .addReg(FalseReg)
1211 .addImm(0)
1212 .addReg(TrueReg)
1213 .addReg(SReg);
1214 break;
1215 }
1216 case SIInstrInfo::SCC_FALSE: {
1217 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1218 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1219 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1220 .addImm(0)
1221 .addReg(FalseReg)
1222 .addImm(0)
1223 .addReg(TrueReg)
1224 .addReg(SReg);
1225 break;
1226 }
1227 case SIInstrInfo::VCCNZ: {
1228 MachineOperand RegOp = Cond[1];
1229 RegOp.setImplicit(false);
1230 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1231 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1232 .add(RegOp);
1233 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1234 .addImm(0)
1235 .addReg(FalseReg)
1236 .addImm(0)
1237 .addReg(TrueReg)
1238 .addReg(SReg);
1239 break;
1240 }
1241 case SIInstrInfo::VCCZ: {
1242 MachineOperand RegOp = Cond[1];
1243 RegOp.setImplicit(false);
1244 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1245 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1246 .add(RegOp);
1247 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1248 .addImm(0)
1249 .addReg(TrueReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addReg(SReg);
1253 break;
1254 }
1255 case SIInstrInfo::EXECNZ: {
1256 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1257 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1258 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1259 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1260 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addReg(SReg);
1266 break;
1267 }
1268 case SIInstrInfo::EXECZ: {
1269 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1270 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1271 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1272 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1273 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1274 .addImm(0)
1275 .addReg(FalseReg)
1276 .addImm(0)
1277 .addReg(TrueReg)
1278 .addReg(SReg);
1279 llvm_unreachable("Unhandled branch predicate EXECZ");
1280 break;
1281 }
1282 default:
1283 llvm_unreachable("invalid branch predicate");
1284 }
1285 } else {
1286 llvm_unreachable("Can only handle Cond size 1 or 2");
1287 }
1288}
1289
1290Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1291 MachineBasicBlock::iterator I,
1292 const DebugLoc &DL,
1293 Register SrcReg, int Value) const {
1294 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1295 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1296 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1297 .addImm(Value)
1298 .addReg(SrcReg);
1299
1300 return Reg;
1301}
1302
1303Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1304 MachineBasicBlock::iterator I,
1305 const DebugLoc &DL,
1306 Register SrcReg, int Value) const {
1307 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1308 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1309 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1310 .addImm(Value)
1311 .addReg(SrcReg);
1312
1313 return Reg;
1314}
1315
1316bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1317 const Register Reg,
1318 int64_t &ImmVal) const {
1319 switch (MI.getOpcode()) {
1320 case AMDGPU::V_MOV_B32_e32:
1321 case AMDGPU::S_MOV_B32:
1322 case AMDGPU::S_MOVK_I32:
1323 case AMDGPU::S_MOV_B64:
1324 case AMDGPU::V_MOV_B64_e32:
1325 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1326 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1327 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1328 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1329 case AMDGPU::V_MOV_B64_PSEUDO:
1330 case AMDGPU::V_MOV_B16_t16_e32: {
1331 const MachineOperand &Src0 = MI.getOperand(1);
1332 if (Src0.isImm()) {
1333 ImmVal = Src0.getImm();
1334 return MI.getOperand(0).getReg() == Reg;
1335 }
1336
1337 return false;
1338 }
1339 case AMDGPU::V_MOV_B16_t16_e64: {
1340 const MachineOperand &Src0 = MI.getOperand(2);
1341 if (Src0.isImm() && !MI.getOperand(1).getImm()) {
1342 ImmVal = Src0.getImm();
1343 return MI.getOperand(0).getReg() == Reg;
1344 }
1345
1346 return false;
1347 }
1348 case AMDGPU::S_BREV_B32:
1349 case AMDGPU::V_BFREV_B32_e32:
1350 case AMDGPU::V_BFREV_B32_e64: {
1351 const MachineOperand &Src0 = MI.getOperand(1);
1352 if (Src0.isImm()) {
1353 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1354 return MI.getOperand(0).getReg() == Reg;
1355 }
1356
1357 return false;
1358 }
1359 case AMDGPU::S_NOT_B32:
1360 case AMDGPU::V_NOT_B32_e32:
1361 case AMDGPU::V_NOT_B32_e64: {
1362 const MachineOperand &Src0 = MI.getOperand(1);
1363 if (Src0.isImm()) {
1364 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1365 return MI.getOperand(0).getReg() == Reg;
1366 }
1367
1368 return false;
1369 }
1370 default:
1371 return false;
1372 }
1373}
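// Illustrative sketch (not part of the upstream file): the "inverted"
// materializations folded above. Hypothetical results:
//   %r = S_BREV_B32 0x00010000  -> ImmVal = 0x00008000 (bit-reversed)
//   %r = S_NOT_B32  0           -> ImmVal = -1 (0xffffffff in the low 32 bits)
// Both report true only when %r is the register being queried.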
1374
1375unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1376
1377 if (RI.isAGPRClass(DstRC))
1378 return AMDGPU::COPY;
1379 if (RI.getRegSizeInBits(*DstRC) == 16) {
1380 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1381 // before RA.
1382 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1383 }
1384 if (RI.getRegSizeInBits(*DstRC) == 32)
1385 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1386 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1387 return AMDGPU::S_MOV_B64;
1388 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1389 return AMDGPU::V_MOV_B64_PSEUDO;
1390 return AMDGPU::COPY;
1391}
1392
1393const MCInstrDesc &
1394SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1395 bool IsIndirectSrc) const {
1396 if (IsIndirectSrc) {
1397 if (VecSize <= 32) // 4 bytes
1398 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1399 if (VecSize <= 64) // 8 bytes
1400 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1401 if (VecSize <= 96) // 12 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1403 if (VecSize <= 128) // 16 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1405 if (VecSize <= 160) // 20 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1407 if (VecSize <= 192) // 24 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1409 if (VecSize <= 224) // 28 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1411 if (VecSize <= 256) // 32 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1413 if (VecSize <= 288) // 36 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1415 if (VecSize <= 320) // 40 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1417 if (VecSize <= 352) // 44 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1419 if (VecSize <= 384) // 48 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1421 if (VecSize <= 512) // 64 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1423 if (VecSize <= 1024) // 128 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425
1426 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1427 }
1428
1429 if (VecSize <= 32) // 4 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1431 if (VecSize <= 64) // 8 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1433 if (VecSize <= 96) // 12 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1435 if (VecSize <= 128) // 16 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1437 if (VecSize <= 160) // 20 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1439 if (VecSize <= 192) // 24 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1441 if (VecSize <= 224) // 28 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1443 if (VecSize <= 256) // 32 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1445 if (VecSize <= 288) // 36 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1447 if (VecSize <= 320) // 40 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1449 if (VecSize <= 352) // 44 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1451 if (VecSize <= 384) // 48 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1453 if (VecSize <= 512) // 64 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1455 if (VecSize <= 1024) // 128 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1457
1458 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1459}
1460
1461static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1462 if (VecSize <= 32) // 4 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1464 if (VecSize <= 64) // 8 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1466 if (VecSize <= 96) // 12 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1468 if (VecSize <= 128) // 16 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1470 if (VecSize <= 160) // 20 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1472 if (VecSize <= 192) // 24 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1474 if (VecSize <= 224) // 28 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1476 if (VecSize <= 256) // 32 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1478 if (VecSize <= 288) // 36 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1480 if (VecSize <= 320) // 40 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1482 if (VecSize <= 352) // 44 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1484 if (VecSize <= 384) // 48 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1486 if (VecSize <= 512) // 64 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1488 if (VecSize <= 1024) // 128 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1490
1491 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1492}
1493
1494static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1495 if (VecSize <= 32) // 4 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1497 if (VecSize <= 64) // 8 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1499 if (VecSize <= 96) // 12 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1501 if (VecSize <= 128) // 16 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1503 if (VecSize <= 160) // 20 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1505 if (VecSize <= 192) // 24 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1507 if (VecSize <= 224) // 28 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1509 if (VecSize <= 256) // 32 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1511 if (VecSize <= 288) // 36 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1513 if (VecSize <= 320) // 40 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1515 if (VecSize <= 352) // 44 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1517 if (VecSize <= 384) // 48 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1519 if (VecSize <= 512) // 64 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1521 if (VecSize <= 1024) // 128 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1523
1524 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1525}
1526
1527static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1528 if (VecSize <= 64) // 8 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1530 if (VecSize <= 128) // 16 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1532 if (VecSize <= 256) // 32 bytes
1533 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1534 if (VecSize <= 512) // 64 bytes
1535 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1536 if (VecSize <= 1024) // 128 bytes
1537 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1538
1539 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1540}
1541
1542const MCInstrDesc &
1543SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1544 bool IsSGPR) const {
1545 if (IsSGPR) {
1546 switch (EltSize) {
1547 case 32:
1548 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1549 case 64:
1550 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1551 default:
1552 llvm_unreachable("invalid reg indexing elt size");
1553 }
1554 }
1555
1556 assert(EltSize == 32 && "invalid reg indexing elt size");
1557 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1558}
1559
1560static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1561 switch (Size) {
1562 case 4:
1563 return AMDGPU::SI_SPILL_S32_SAVE;
1564 case 8:
1565 return AMDGPU::SI_SPILL_S64_SAVE;
1566 case 12:
1567 return AMDGPU::SI_SPILL_S96_SAVE;
1568 case 16:
1569 return AMDGPU::SI_SPILL_S128_SAVE;
1570 case 20:
1571 return AMDGPU::SI_SPILL_S160_SAVE;
1572 case 24:
1573 return AMDGPU::SI_SPILL_S192_SAVE;
1574 case 28:
1575 return AMDGPU::SI_SPILL_S224_SAVE;
1576 case 32:
1577 return AMDGPU::SI_SPILL_S256_SAVE;
1578 case 36:
1579 return AMDGPU::SI_SPILL_S288_SAVE;
1580 case 40:
1581 return AMDGPU::SI_SPILL_S320_SAVE;
1582 case 44:
1583 return AMDGPU::SI_SPILL_S352_SAVE;
1584 case 48:
1585 return AMDGPU::SI_SPILL_S384_SAVE;
1586 case 64:
1587 return AMDGPU::SI_SPILL_S512_SAVE;
1588 case 128:
1589 return AMDGPU::SI_SPILL_S1024_SAVE;
1590 default:
1591 llvm_unreachable("unknown register size");
1592 }
1593}
1594
1595static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1596 switch (Size) {
1597 case 2:
1598 return AMDGPU::SI_SPILL_V16_SAVE;
1599 case 4:
1600 return AMDGPU::SI_SPILL_V32_SAVE;
1601 case 8:
1602 return AMDGPU::SI_SPILL_V64_SAVE;
1603 case 12:
1604 return AMDGPU::SI_SPILL_V96_SAVE;
1605 case 16:
1606 return AMDGPU::SI_SPILL_V128_SAVE;
1607 case 20:
1608 return AMDGPU::SI_SPILL_V160_SAVE;
1609 case 24:
1610 return AMDGPU::SI_SPILL_V192_SAVE;
1611 case 28:
1612 return AMDGPU::SI_SPILL_V224_SAVE;
1613 case 32:
1614 return AMDGPU::SI_SPILL_V256_SAVE;
1615 case 36:
1616 return AMDGPU::SI_SPILL_V288_SAVE;
1617 case 40:
1618 return AMDGPU::SI_SPILL_V320_SAVE;
1619 case 44:
1620 return AMDGPU::SI_SPILL_V352_SAVE;
1621 case 48:
1622 return AMDGPU::SI_SPILL_V384_SAVE;
1623 case 64:
1624 return AMDGPU::SI_SPILL_V512_SAVE;
1625 case 128:
1626 return AMDGPU::SI_SPILL_V1024_SAVE;
1627 default:
1628 llvm_unreachable("unknown register size");
1629 }
1630}
1631
1632static unsigned getAVSpillSaveOpcode(unsigned Size) {
1633 switch (Size) {
1634 case 4:
1635 return AMDGPU::SI_SPILL_AV32_SAVE;
1636 case 8:
1637 return AMDGPU::SI_SPILL_AV64_SAVE;
1638 case 12:
1639 return AMDGPU::SI_SPILL_AV96_SAVE;
1640 case 16:
1641 return AMDGPU::SI_SPILL_AV128_SAVE;
1642 case 20:
1643 return AMDGPU::SI_SPILL_AV160_SAVE;
1644 case 24:
1645 return AMDGPU::SI_SPILL_AV192_SAVE;
1646 case 28:
1647 return AMDGPU::SI_SPILL_AV224_SAVE;
1648 case 32:
1649 return AMDGPU::SI_SPILL_AV256_SAVE;
1650 case 36:
1651 return AMDGPU::SI_SPILL_AV288_SAVE;
1652 case 40:
1653 return AMDGPU::SI_SPILL_AV320_SAVE;
1654 case 44:
1655 return AMDGPU::SI_SPILL_AV352_SAVE;
1656 case 48:
1657 return AMDGPU::SI_SPILL_AV384_SAVE;
1658 case 64:
1659 return AMDGPU::SI_SPILL_AV512_SAVE;
1660 case 128:
1661 return AMDGPU::SI_SPILL_AV1024_SAVE;
1662 default:
1663 llvm_unreachable("unknown register size");
1664 }
1665}
1666
1667static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1668 bool IsVectorSuperClass) {
1669 // Currently, only 32-bit WWM register spills are needed.
1670 if (Size != 4)
1671 llvm_unreachable("unknown wwm register spill size");
1672
1673 if (IsVectorSuperClass)
1674 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1675
1676 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1677}
1678
1679unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1680 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1681 const SIMachineFunctionInfo &MFI) const {
1682 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1683
1684 // Choose the right opcode if spilling a WWM register.
1685 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1686 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1687
1688 // TODO: Check if AGPRs are available
1689 if (ST.hasMAIInsts())
1690 return getAVSpillSaveOpcode(Size);
1691
1692 return getVGPRSpillSaveOpcode(Size);
1693}
1694
1695void SIInstrInfo::storeRegToStackSlot(
1696 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1697 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1698 MachineInstr::MIFlag Flags) const {
1699 MachineFunction *MF = MBB.getParent();
1700 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1701 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1702 const DebugLoc &DL = MBB.findDebugLoc(MI);
1703
1704 MachinePointerInfo PtrInfo
1705 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1706 MachineMemOperand *MMO = MF->getMachineMemOperand(
1707 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1708 FrameInfo.getObjectAlign(FrameIndex));
1709 unsigned SpillSize = RI.getSpillSize(*RC);
1710
1711 MachineRegisterInfo &MRI = MF->getRegInfo();
1712 if (RI.isSGPRClass(RC)) {
1713 MFI->setHasSpilledSGPRs();
1714 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1715 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1716 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1717
1718 // We are only allowed to create one new instruction when spilling
1719 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1720 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1721
1722 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1723 // need to make sure we are using the correct register class.
1724 if (SrcReg.isVirtual() && SpillSize == 4) {
1725 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1726 }
1727
1728 BuildMI(MBB, MI, DL, OpDesc)
1729 .addReg(SrcReg, getKillRegState(isKill)) // data
1730 .addFrameIndex(FrameIndex) // addr
1731 .addMemOperand(MMO)
1732 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1733
1734 if (RI.spillSGPRToVGPR())
1735 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1736 return;
1737 }
1738
1739 unsigned Opcode =
1740 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1741 MFI->setHasSpilledVGPRs();
1742
1743 BuildMI(MBB, MI, DL, get(Opcode))
1744 .addReg(SrcReg, getKillRegState(isKill)) // data
1745 .addFrameIndex(FrameIndex) // addr
1746 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1747 .addImm(0) // offset
1748 .addMemOperand(MMO);
1749}
1750
1751static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1752 switch (Size) {
1753 case 4:
1754 return AMDGPU::SI_SPILL_S32_RESTORE;
1755 case 8:
1756 return AMDGPU::SI_SPILL_S64_RESTORE;
1757 case 12:
1758 return AMDGPU::SI_SPILL_S96_RESTORE;
1759 case 16:
1760 return AMDGPU::SI_SPILL_S128_RESTORE;
1761 case 20:
1762 return AMDGPU::SI_SPILL_S160_RESTORE;
1763 case 24:
1764 return AMDGPU::SI_SPILL_S192_RESTORE;
1765 case 28:
1766 return AMDGPU::SI_SPILL_S224_RESTORE;
1767 case 32:
1768 return AMDGPU::SI_SPILL_S256_RESTORE;
1769 case 36:
1770 return AMDGPU::SI_SPILL_S288_RESTORE;
1771 case 40:
1772 return AMDGPU::SI_SPILL_S320_RESTORE;
1773 case 44:
1774 return AMDGPU::SI_SPILL_S352_RESTORE;
1775 case 48:
1776 return AMDGPU::SI_SPILL_S384_RESTORE;
1777 case 64:
1778 return AMDGPU::SI_SPILL_S512_RESTORE;
1779 case 128:
1780 return AMDGPU::SI_SPILL_S1024_RESTORE;
1781 default:
1782 llvm_unreachable("unknown register size");
1783 }
1784}
1785
1786static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1787 switch (Size) {
1788 case 2:
1789 return AMDGPU::SI_SPILL_V16_RESTORE;
1790 case 4:
1791 return AMDGPU::SI_SPILL_V32_RESTORE;
1792 case 8:
1793 return AMDGPU::SI_SPILL_V64_RESTORE;
1794 case 12:
1795 return AMDGPU::SI_SPILL_V96_RESTORE;
1796 case 16:
1797 return AMDGPU::SI_SPILL_V128_RESTORE;
1798 case 20:
1799 return AMDGPU::SI_SPILL_V160_RESTORE;
1800 case 24:
1801 return AMDGPU::SI_SPILL_V192_RESTORE;
1802 case 28:
1803 return AMDGPU::SI_SPILL_V224_RESTORE;
1804 case 32:
1805 return AMDGPU::SI_SPILL_V256_RESTORE;
1806 case 36:
1807 return AMDGPU::SI_SPILL_V288_RESTORE;
1808 case 40:
1809 return AMDGPU::SI_SPILL_V320_RESTORE;
1810 case 44:
1811 return AMDGPU::SI_SPILL_V352_RESTORE;
1812 case 48:
1813 return AMDGPU::SI_SPILL_V384_RESTORE;
1814 case 64:
1815 return AMDGPU::SI_SPILL_V512_RESTORE;
1816 case 128:
1817 return AMDGPU::SI_SPILL_V1024_RESTORE;
1818 default:
1819 llvm_unreachable("unknown register size");
1820 }
1821}
1822
1823static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1824 switch (Size) {
1825 case 4:
1826 return AMDGPU::SI_SPILL_AV32_RESTORE;
1827 case 8:
1828 return AMDGPU::SI_SPILL_AV64_RESTORE;
1829 case 12:
1830 return AMDGPU::SI_SPILL_AV96_RESTORE;
1831 case 16:
1832 return AMDGPU::SI_SPILL_AV128_RESTORE;
1833 case 20:
1834 return AMDGPU::SI_SPILL_AV160_RESTORE;
1835 case 24:
1836 return AMDGPU::SI_SPILL_AV192_RESTORE;
1837 case 28:
1838 return AMDGPU::SI_SPILL_AV224_RESTORE;
1839 case 32:
1840 return AMDGPU::SI_SPILL_AV256_RESTORE;
1841 case 36:
1842 return AMDGPU::SI_SPILL_AV288_RESTORE;
1843 case 40:
1844 return AMDGPU::SI_SPILL_AV320_RESTORE;
1845 case 44:
1846 return AMDGPU::SI_SPILL_AV352_RESTORE;
1847 case 48:
1848 return AMDGPU::SI_SPILL_AV384_RESTORE;
1849 case 64:
1850 return AMDGPU::SI_SPILL_AV512_RESTORE;
1851 case 128:
1852 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1853 default:
1854 llvm_unreachable("unknown register size");
1855 }
1856}
1857
1858static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1859 bool IsVectorSuperClass) {
1860 // Currently, only 32-bit WWM register spills are needed.
1861 if (Size != 4)
1862 llvm_unreachable("unknown wwm register spill size");
1863
1864 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1865 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1866
1867 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1868}
1869
1870unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1871 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1872 const SIMachineFunctionInfo &MFI) const {
1873 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1874
1875 // Choose the right opcode if restoring a WWM register.
1876 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1877 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1878
1879 // TODO: Check if AGPRs are available
1880 if (ST.hasMAIInsts())
1881 return getAVSpillRestoreOpcode(Size);
1882
1883 assert(!RI.isAGPRClass(RC));
1884 return getVGPRSpillRestoreOpcode(Size);
1885}
1886
1887void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1888 MachineBasicBlock::iterator MI,
1889 Register DestReg, int FrameIndex,
1890 const TargetRegisterClass *RC,
1891 Register VReg, unsigned SubReg,
1892 MachineInstr::MIFlag Flags) const {
1893 MachineFunction *MF = MBB.getParent();
1894 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1895 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1896 const DebugLoc &DL = MBB.findDebugLoc(MI);
1897 unsigned SpillSize = RI.getSpillSize(*RC);
1898
1899 MachinePointerInfo PtrInfo
1900 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1901
1902 MachineMemOperand *MMO = MF->getMachineMemOperand(
1903 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1904 FrameInfo.getObjectAlign(FrameIndex));
1905
1906 if (RI.isSGPRClass(RC)) {
1907 MFI->setHasSpilledSGPRs();
1908 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1909 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1910 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1911
1912 // FIXME: Maybe this should not include a memoperand because it will be
1913 // lowered to non-memory instructions.
1914 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1915 if (DestReg.isVirtual() && SpillSize == 4) {
1916 MachineRegisterInfo &MRI = MF->getRegInfo();
1917 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1918 }
1919
1920 if (RI.spillSGPRToVGPR())
1921 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1922 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1923 .addFrameIndex(FrameIndex) // addr
1924 .addMemOperand(MMO)
1925 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1926
1927 return;
1928 }
1929
1930 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1931 SpillSize, *MFI);
1932 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1933 .addFrameIndex(FrameIndex) // vaddr
1934 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1935 .addImm(0) // offset
1936 .addMemOperand(MMO);
1937}
1938
1939void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1940 MachineBasicBlock::iterator MI) const {
1941 insertNoops(MBB, MI, 1);
1942}
1943
1944void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1945 MachineBasicBlock::iterator MI,
1946 unsigned Quantity) const {
1947 DebugLoc DL = MBB.findDebugLoc(MI);
1948 unsigned MaxSNopCount = 1u << ST.getSNopBits();
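  // Illustrative (hypothetical values): Quantity == 10 with getSNopBits() == 3
  // gives MaxSNopCount == 8, so the loop below emits "s_nop 7" (8 no-ops)
  // followed by "s_nop 1" (2 no-ops), since the immediate encodes one less
  // than the count.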
1949 while (Quantity > 0) {
1950 unsigned Arg = std::min(Quantity, MaxSNopCount);
1951 Quantity -= Arg;
1952 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1953 }
1954}
1955
1956void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1957 auto *MF = MBB.getParent();
1958 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1959
1960 assert(Info->isEntryFunction());
1961
1962 if (MBB.succ_empty()) {
1963 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1964 if (HasNoTerminator) {
1965 if (Info->returnsVoid()) {
1966 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1967 } else {
1968 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1969 }
1970 }
1971 }
1972}
1973
1974MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1975 MachineBasicBlock &MBB,
1976 MachineInstr &MI,
1977 const DebugLoc &DL) const {
1978 MachineFunction *MF = MBB.getParent();
1979 constexpr unsigned DoorbellIDMask = 0x3ff;
1980 constexpr unsigned ECQueueWaveAbort = 0x400;
1981
1982 MachineBasicBlock *TrapBB = &MBB;
1983 MachineBasicBlock *ContBB = &MBB;
1984 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1985
1986 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1987 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1988 TrapBB = MF->CreateMachineBasicBlock();
1989 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1990 MF->push_back(TrapBB);
1991 MBB.addSuccessor(TrapBB);
1992 } else {
1993 // Since we're adding HaltLoopBB and modifying the CFG, we must return a
1994 // different block to signal the change.
1995 ContBB = HaltLoopBB;
1996 }
1997
1998 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1999 // this will be a nop.
2000 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2001 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2002 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2003 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2004 DoorbellReg)
2006 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2007 .addUse(AMDGPU::M0);
2008 Register DoorbellRegMasked =
2009 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2010 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2011 .addUse(DoorbellReg)
2012 .addImm(DoorbellIDMask);
2013 Register SetWaveAbortBit =
2014 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2015 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2016 .addUse(DoorbellRegMasked)
2017 .addImm(ECQueueWaveAbort);
2018 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2019 .addUse(SetWaveAbortBit);
2020 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2022 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2023 .addUse(AMDGPU::TTMP2);
2024 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2025 TrapBB->addSuccessor(HaltLoopBB);
2026
2027 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2028 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2029 .addMBB(HaltLoopBB);
2030 MF->push_back(HaltLoopBB);
2031 HaltLoopBB->addSuccessor(HaltLoopBB);
2032
2033 return ContBB;
2034}
2035
2037 switch (MI.getOpcode()) {
2038 default:
2039 if (MI.isMetaInstruction())
2040 return 0;
2041 return 1; // FIXME: Do wait states equal cycles?
2042
2043 case AMDGPU::S_NOP:
2044 return MI.getOperand(0).getImm() + 1;
2045 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2046 // hazard, even if one exists, won't really be visible. Should we handle it?
2047 }
2048}
2049
2051 MachineBasicBlock &MBB = *MI.getParent();
2052 DebugLoc DL = MBB.findDebugLoc(MI);
2054 switch (MI.getOpcode()) {
2055 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2056 case AMDGPU::S_MOV_B64_term:
2057 // This is only a terminator to get the correct spill code placement during
2058 // register allocation.
2059 MI.setDesc(get(AMDGPU::S_MOV_B64));
2060 break;
2061
2062 case AMDGPU::S_MOV_B32_term:
2063 // This is only a terminator to get the correct spill code placement during
2064 // register allocation.
2065 MI.setDesc(get(AMDGPU::S_MOV_B32));
2066 break;
2067
2068 case AMDGPU::S_XOR_B64_term:
2069 // This is only a terminator to get the correct spill code placement during
2070 // register allocation.
2071 MI.setDesc(get(AMDGPU::S_XOR_B64));
2072 break;
2073
2074 case AMDGPU::S_XOR_B32_term:
2075 // This is only a terminator to get the correct spill code placement during
2076 // register allocation.
2077 MI.setDesc(get(AMDGPU::S_XOR_B32));
2078 break;
2079 case AMDGPU::S_OR_B64_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_OR_B64));
2083 break;
2084 case AMDGPU::S_OR_B32_term:
2085 // This is only a terminator to get the correct spill code placement during
2086 // register allocation.
2087 MI.setDesc(get(AMDGPU::S_OR_B32));
2088 break;
2089
2090 case AMDGPU::S_ANDN2_B64_term:
2091 // This is only a terminator to get the correct spill code placement during
2092 // register allocation.
2093 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2094 break;
2095
2096 case AMDGPU::S_ANDN2_B32_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2100 break;
2101
2102 case AMDGPU::S_AND_B64_term:
2103 // This is only a terminator to get the correct spill code placement during
2104 // register allocation.
2105 MI.setDesc(get(AMDGPU::S_AND_B64));
2106 break;
2107
2108 case AMDGPU::S_AND_B32_term:
2109 // This is only a terminator to get the correct spill code placement during
2110 // register allocation.
2111 MI.setDesc(get(AMDGPU::S_AND_B32));
2112 break;
2113
2114 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2115 // This is only a terminator to get the correct spill code placement during
2116 // register allocation.
2117 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2118 break;
2119
2120 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2121 // This is only a terminator to get the correct spill code placement during
2122 // register allocation.
2123 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2124 break;
2125
2126 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2127 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2128 break;
2129
2130 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2131 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2132 break;
2133 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2134 Register Dst = MI.getOperand(0).getReg();
2135 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2136 MI.setDesc(
2137 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2138 break;
2139 }
2140 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2141 Register Dst = MI.getOperand(0).getReg();
2142 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2143 int64_t Imm = MI.getOperand(1).getImm();
2144
2145 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2146 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2147 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2150 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2151 .addImm(SignExtend64<32>(Imm >> 32))
2153 MI.eraseFromParent();
2154 break;
2155 }
2156
2157 [[fallthrough]];
2158 }
2159 case AMDGPU::V_MOV_B64_PSEUDO: {
2160 Register Dst = MI.getOperand(0).getReg();
2161 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2162 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2163
2164 const MachineOperand &SrcOp = MI.getOperand(1);
2165 // FIXME: Will this work for 64-bit floating point immediates?
2166 assert(!SrcOp.isFPImm());
2167 if (ST.hasMovB64()) {
2168 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2169 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2170 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2171 break;
2172 }
2173 if (SrcOp.isImm()) {
2174 APInt Imm(64, SrcOp.getImm());
2175 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2176 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
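      // Illustrative (hypothetical immediate): 0x0000000100000001 splits into
      // Lo == Hi == 1, an inline constant, so a single V_PK_MOV_B32 suffices
      // on subtargets that have it; otherwise two V_MOV_B32_e32 writes of the
      // low and high halves are emitted below.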
2177 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2178 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2180 .addImm(Lo.getSExtValue())
2182 .addImm(Lo.getSExtValue())
2183 .addImm(0) // op_sel_lo
2184 .addImm(0) // op_sel_hi
2185 .addImm(0) // neg_lo
2186 .addImm(0) // neg_hi
2187 .addImm(0); // clamp
2188 } else {
2189 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2190 .addImm(Lo.getSExtValue())
2192 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2193 .addImm(Hi.getSExtValue())
2195 }
2196 } else {
2197 assert(SrcOp.isReg());
2198 if (ST.hasPkMovB32() &&
2199 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2200 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2201 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2202 .addReg(SrcOp.getReg())
2204 .addReg(SrcOp.getReg())
2205 .addImm(0) // op_sel_lo
2206 .addImm(0) // op_sel_hi
2207 .addImm(0) // neg_lo
2208 .addImm(0) // neg_hi
2209 .addImm(0); // clamp
2210 } else {
2211 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2212 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2214 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2215 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2217 }
2218 }
2219 MI.eraseFromParent();
2220 break;
2221 }
2222 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2224 break;
2225 }
2226 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2227 const MachineOperand &SrcOp = MI.getOperand(1);
2228 assert(!SrcOp.isFPImm());
2229
2230 if (ST.has64BitLiterals()) {
2231 MI.setDesc(get(AMDGPU::S_MOV_B64));
2232 break;
2233 }
2234
2235 APInt Imm(64, SrcOp.getImm());
2236 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2237 MI.setDesc(get(AMDGPU::S_MOV_B64));
2238 break;
2239 }
2240
2241 Register Dst = MI.getOperand(0).getReg();
2242 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2243 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2244
2245 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2246 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2247 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2248 .addImm(Lo.getSExtValue())
2250 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2251 .addImm(Hi.getSExtValue())
2253 MI.eraseFromParent();
2254 break;
2255 }
2256 case AMDGPU::V_SET_INACTIVE_B32: {
2257 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2258 Register DstReg = MI.getOperand(0).getReg();
2259 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2260 .add(MI.getOperand(3))
2261 .add(MI.getOperand(4))
2262 .add(MI.getOperand(1))
2263 .add(MI.getOperand(2))
2264 .add(MI.getOperand(5));
2265 MI.eraseFromParent();
2266 break;
2267 }
2268 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2269 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2270 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2271 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2272 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2273 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2274 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2275 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2276 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2277 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2278 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2279 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2280 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2281 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2282 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2283 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2284 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2285 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2286 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2287 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2288 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2289 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2290 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2291 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2292 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2293 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2294 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2295 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2296 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2297 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2298 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2299 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2300 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2301 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2302
2303 unsigned Opc;
2304 if (RI.hasVGPRs(EltRC)) {
2305 Opc = AMDGPU::V_MOVRELD_B32_e32;
2306 } else {
2307 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2308 : AMDGPU::S_MOVRELD_B32;
2309 }
2310
2311 const MCInstrDesc &OpDesc = get(Opc);
2312 Register VecReg = MI.getOperand(0).getReg();
2313 bool IsUndef = MI.getOperand(1).isUndef();
2314 unsigned SubReg = MI.getOperand(3).getImm();
2315 assert(VecReg == MI.getOperand(1).getReg());
2316
2318 BuildMI(MBB, MI, DL, OpDesc)
2319 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2320 .add(MI.getOperand(2))
2322 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2323
2324 const int ImpDefIdx =
2325 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2326 const int ImpUseIdx = ImpDefIdx + 1;
2327 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2328 MI.eraseFromParent();
2329 break;
2330 }
2331 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2332 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2333 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2334 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2335 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2336 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2337 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2338 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2339 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2340 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2341 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2342 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2343 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2344 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2345 assert(ST.useVGPRIndexMode());
2346 Register VecReg = MI.getOperand(0).getReg();
2347 bool IsUndef = MI.getOperand(1).isUndef();
2348 MachineOperand &Idx = MI.getOperand(3);
2349 Register SubReg = MI.getOperand(4).getImm();
2350
2351 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2352 .add(Idx)
2354 SetOn->getOperand(3).setIsUndef();
2355
2356 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2358 BuildMI(MBB, MI, DL, OpDesc)
2359 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2360 .add(MI.getOperand(2))
2362 .addReg(VecReg,
2363 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2364
2365 const int ImpDefIdx =
2366 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2367 const int ImpUseIdx = ImpDefIdx + 1;
2368 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2369
2370 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2371
2372 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2373
2374 MI.eraseFromParent();
2375 break;
2376 }
2377 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2378 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2379 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2380 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2381 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2382 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2383 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2384 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2385 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2386 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2387 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2388 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2389 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2390 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2391 assert(ST.useVGPRIndexMode());
2392 Register Dst = MI.getOperand(0).getReg();
2393 Register VecReg = MI.getOperand(1).getReg();
2394 bool IsUndef = MI.getOperand(1).isUndef();
2395 Register Idx = MI.getOperand(2).getReg();
2396 Register SubReg = MI.getOperand(3).getImm();
2397
2398 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2399 .addReg(Idx)
2401 SetOn->getOperand(3).setIsUndef();
2402
2403 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2404 .addDef(Dst)
2405 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2406 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2407
2408 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2409
2410 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2411
2412 MI.eraseFromParent();
2413 break;
2414 }
2415 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2416 MachineFunction &MF = *MBB.getParent();
2417 Register Reg = MI.getOperand(0).getReg();
2418 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2419 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2420 MachineOperand OpLo = MI.getOperand(1);
2421 MachineOperand OpHi = MI.getOperand(2);
2422
2423 // Create a bundle so these instructions won't be re-ordered by the
2424 // post-RA scheduler.
2425 MIBundleBuilder Bundler(MBB, MI);
2426 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2427
2428 // What we want here is an offset from the value returned by s_getpc (which
2429 // is the address of the s_add_u32 instruction) to the global variable, but
2430 // since the encoding of $symbol starts 4 bytes after the start of the
2431 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2432 // small. This requires us to add 4 to the global variable offset in order
2433 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2434 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2435 // instruction.
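    // Concretely (an illustrative layout, assuming 4-byte s_getpc_b64 and
    // 8-byte s_add_u32/s_addc_u32 encodings): with s_getpc_b64 at address P,
    // it returns P + 4, the address of the s_add_u32. That instruction's
    // 4-byte literal sits at P + 8, so a PC-relative fixup there resolves
    // 4 bytes short of the desired "symbol - (P + 4)", hence the "+ 4" below;
    // the s_addc_u32 literal sits at P + 16, i.e. 12 bytes past P + 4, hence
    // the "+ 12".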
2436
2437 int64_t Adjust = 0;
2438 if (ST.hasGetPCZeroExtension()) {
2439 // Fix up hardware that does not sign-extend the 48-bit PC value by
2440 // inserting: s_sext_i32_i16 reghi, reghi
2441 Bundler.append(
2442 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2443 Adjust += 4;
2444 }
2445
2446 if (OpLo.isGlobal())
2447 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2448 Bundler.append(
2449 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2450
2451 if (OpHi.isGlobal())
2452 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2453 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2454 .addReg(RegHi)
2455 .add(OpHi));
2456
2457 finalizeBundle(MBB, Bundler.begin());
2458
2459 MI.eraseFromParent();
2460 break;
2461 }
2462 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2463 MachineFunction &MF = *MBB.getParent();
2464 Register Reg = MI.getOperand(0).getReg();
2465 MachineOperand Op = MI.getOperand(1);
2466
2467 // Create a bundle so these instructions won't be re-ordered by the
2468 // post-RA scheduler.
2469 MIBundleBuilder Bundler(MBB, MI);
2470 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2471 if (Op.isGlobal())
2472 Op.setOffset(Op.getOffset() + 4);
2473 Bundler.append(
2474 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2475
2476 finalizeBundle(MBB, Bundler.begin());
2477
2478 MI.eraseFromParent();
2479 break;
2480 }
2481 case AMDGPU::ENTER_STRICT_WWM: {
2482 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2483 // Whole Wave Mode is entered.
2484 MI.setDesc(get(LMC.OrSaveExecOpc));
2485 break;
2486 }
2487 case AMDGPU::ENTER_STRICT_WQM: {
2488 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2489 // STRICT_WQM is entered.
2490 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2491 .addReg(LMC.ExecReg);
2492 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2493
2494 MI.eraseFromParent();
2495 break;
2496 }
2497 case AMDGPU::EXIT_STRICT_WWM:
2498 case AMDGPU::EXIT_STRICT_WQM: {
2499 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2500 // WWM/STRICT_WQM is exited.
2501 MI.setDesc(get(LMC.MovOpc));
2502 break;
2503 }
2504 case AMDGPU::SI_RETURN: {
2505 const MachineFunction *MF = MBB.getParent();
2506 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2507 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2508 // Hiding the return address use with SI_RETURN may lead to extra kills in
2509 // the function and missing live-ins. We are fine in practice because callee
2510 // saved register handling ensures the register value is restored before
2511 // RET, but we need the undef flag here to appease the MachineVerifier
2512 // liveness checks.
2514 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2515 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2516
2517 MIB.copyImplicitOps(MI);
2518 MI.eraseFromParent();
2519 break;
2520 }
2521
2522 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2523 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2524 MI.setDesc(get(AMDGPU::S_MUL_U64));
2525 break;
2526
2527 case AMDGPU::S_GETPC_B64_pseudo:
2528 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2529 if (ST.hasGetPCZeroExtension()) {
2530 Register Dst = MI.getOperand(0).getReg();
2531 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2532 // Fix up hardware that does not sign-extend the 48-bit PC value by
2533 // inserting: s_sext_i32_i16 dsthi, dsthi
2534 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2535 DstHi)
2536 .addReg(DstHi);
2537 }
2538 break;
2539
2540 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2541 assert(ST.hasBF16PackedInsts());
2542 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2543 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2544 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2545 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2546 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2547 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2548 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2549 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2550 break;
2551 }
2552
2553 return true;
2554}
2555
2558 unsigned SubIdx,
2559 const MachineInstr &Orig) const {
2560
2561 // Try shrinking the instruction to remat only the part needed for the
2562 // current context.
2563 // TODO: Handle more cases.
2564 unsigned Opcode = Orig.getOpcode();
2565 switch (Opcode) {
2566 case AMDGPU::S_LOAD_DWORDX16_IMM:
2567 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2568 if (SubIdx != 0)
2569 break;
2570
2571 if (I == MBB.end())
2572 break;
2573
2574 if (I->isBundled())
2575 break;
2576
2577 // Look for a single use of the register that is also a subreg.
2578 Register RegToFind = Orig.getOperand(0).getReg();
2579 MachineOperand *UseMO = nullptr;
2580 for (auto &CandMO : I->operands()) {
2581 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2582 continue;
2583 if (UseMO) {
2584 UseMO = nullptr;
2585 break;
2586 }
2587 UseMO = &CandMO;
2588 }
2589 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2590 break;
2591
2592 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2593 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
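    // Illustrative case (hypothetical use): a single user reading
    // sub4_sub5_sub6_sub7 of an S_LOAD_DWORDX8_IMM result gives
    // Offset == 128 bits and SubregSize == 128, so the clone below becomes an
    // S_LOAD_DWORDX4_IMM with its immediate offset bumped by
    // Offset / 8 == 16 bytes.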
2594
2595 MachineFunction *MF = MBB.getParent();
2597 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2598
2599 unsigned NewOpcode = -1;
2600 if (SubregSize == 256)
2601 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2602 else if (SubregSize == 128)
2603 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2604 else
2605 break;
2606
2607 const MCInstrDesc &TID = get(NewOpcode);
2608 const TargetRegisterClass *NewRC =
2609 RI.getAllocatableClass(getRegClass(TID, 0));
2610 MRI.setRegClass(DestReg, NewRC);
2611
2612 UseMO->setReg(DestReg);
2613 UseMO->setSubReg(AMDGPU::NoSubRegister);
2614
2615 // Use a smaller load with the desired size, possibly with an updated offset.
2616 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2617 MI->setDesc(TID);
2618 MI->getOperand(0).setReg(DestReg);
2619 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2620 if (Offset) {
2621 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2622 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2623 OffsetMO->setImm(FinalOffset);
2624 }
2626 for (const MachineMemOperand *MemOp : Orig.memoperands())
2627 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2628 SubregSize / 8));
2629 MI->setMemRefs(*MF, NewMMOs);
2630
2631 MBB.insert(I, MI);
2632 return;
2633 }
2634
2635 default:
2636 break;
2637 }
2638
2639 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
2640}
2641
2642std::pair<MachineInstr*, MachineInstr*>
2644 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2645
2646 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2648 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2649 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2650 return std::pair(&MI, nullptr);
2651 }
2652
2653 MachineBasicBlock &MBB = *MI.getParent();
2654 DebugLoc DL = MBB.findDebugLoc(MI);
2655 MachineFunction *MF = MBB.getParent();
2657 Register Dst = MI.getOperand(0).getReg();
2658 unsigned Part = 0;
2659 MachineInstr *Split[2];
2660
2661 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2662 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2663 if (Dst.isPhysical()) {
2664 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2665 } else {
2666 assert(MRI.isSSA());
2667 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2668 MovDPP.addDef(Tmp);
2669 }
2670
2671 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2672 const MachineOperand &SrcOp = MI.getOperand(I);
2673 assert(!SrcOp.isFPImm());
2674 if (SrcOp.isImm()) {
2675 APInt Imm(64, SrcOp.getImm());
2676 Imm.ashrInPlace(Part * 32);
2677 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2678 } else {
2679 assert(SrcOp.isReg());
2680 Register Src = SrcOp.getReg();
2681 if (Src.isPhysical())
2682 MovDPP.addReg(RI.getSubReg(Src, Sub));
2683 else
2684 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2685 }
2686 }
2687
2688 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2689 MovDPP.addImm(MO.getImm());
2690
2691 Split[Part] = MovDPP;
2692 ++Part;
2693 }
2694
2695 if (Dst.isVirtual())
2696 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2697 .addReg(Split[0]->getOperand(0).getReg())
2698 .addImm(AMDGPU::sub0)
2699 .addReg(Split[1]->getOperand(0).getReg())
2700 .addImm(AMDGPU::sub1);
2701
2702 MI.eraseFromParent();
2703 return std::pair(Split[0], Split[1]);
2704}
2705
2706std::optional<DestSourcePair>
2708 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2709 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2710
2711 return std::nullopt;
2712}
2713
2715 AMDGPU::OpName Src0OpName,
2716 MachineOperand &Src1,
2717 AMDGPU::OpName Src1OpName) const {
2718 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2719 if (!Src0Mods)
2720 return false;
2721
2722 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2723 assert(Src1Mods &&
2724 "All commutable instructions have both src0 and src1 modifiers");
2725
2726 int Src0ModsVal = Src0Mods->getImm();
2727 int Src1ModsVal = Src1Mods->getImm();
2728
2729 Src1Mods->setImm(Src0ModsVal);
2730 Src0Mods->setImm(Src1ModsVal);
2731 return true;
2732}
2733
2735 MachineOperand &RegOp,
2736 MachineOperand &NonRegOp) {
2737 Register Reg = RegOp.getReg();
2738 unsigned SubReg = RegOp.getSubReg();
2739 bool IsKill = RegOp.isKill();
2740 bool IsDead = RegOp.isDead();
2741 bool IsUndef = RegOp.isUndef();
2742 bool IsDebug = RegOp.isDebug();
2743
2744 if (NonRegOp.isImm())
2745 RegOp.ChangeToImmediate(NonRegOp.getImm());
2746 else if (NonRegOp.isFI())
2747 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2748 else if (NonRegOp.isGlobal()) {
2749 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2750 NonRegOp.getTargetFlags());
2751 } else
2752 return nullptr;
2753
2754 // Make sure we don't reinterpret a subreg index in the target flags.
2755 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2756
2757 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2758 NonRegOp.setSubReg(SubReg);
2759
2760 return &MI;
2761}
2762
2764 MachineOperand &NonRegOp1,
2765 MachineOperand &NonRegOp2) {
2766 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2767 int64_t NonRegVal = NonRegOp1.getImm();
2768
2769 NonRegOp1.setImm(NonRegOp2.getImm());
2770 NonRegOp2.setImm(NonRegVal);
2771 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2772 NonRegOp2.setTargetFlags(TargetFlags);
2773 return &MI;
2774}
2775
2776bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2777 unsigned OpIdx1) const {
2778 const MCInstrDesc &InstDesc = MI.getDesc();
2779 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2780 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2781
2782 unsigned Opc = MI.getOpcode();
2783 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2784
2785 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2786 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2787
2788 // The swap must not breach constant bus or literal limits.
2789 // It may move a literal to a position other than src0, which is not allowed
2790 // pre-gfx10. However, most test cases need literals in Src0 for VOP.
2791 // FIXME: After gfx9, a literal can be placed somewhere other than Src0.
2792 if (isVALU(MI)) {
2793 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2794 !isInlineConstant(MO0, OpInfo1))
2795 return false;
2796 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2797 !isInlineConstant(MO1, OpInfo0))
2798 return false;
2799 }
2800
2801 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2802 if (OpInfo1.RegClass == -1)
2803 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2804 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2805 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2806 }
2807 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2808 if (OpInfo0.RegClass == -1)
2809 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2810 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2811 isLegalRegOperand(MI, OpIdx0, MO1);
2812 }
2813
2814 // No need to check 64-bit literals since swapping does not bring new
2815 // 64-bit literals into the current instruction to fold to 32-bit.
2816
2817 return isImmOperandLegal(MI, OpIdx1, MO0);
2818}
2819
2821 unsigned Src0Idx,
2822 unsigned Src1Idx) const {
2823 assert(!NewMI && "this should never be used");
2824
2825 unsigned Opc = MI.getOpcode();
2826 int CommutedOpcode = commuteOpcode(Opc);
2827 if (CommutedOpcode == -1)
2828 return nullptr;
2829
2830 if (Src0Idx > Src1Idx)
2831 std::swap(Src0Idx, Src1Idx);
2832
2833 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2834 static_cast<int>(Src0Idx) &&
2835 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2836 static_cast<int>(Src1Idx) &&
2837 "inconsistency with findCommutedOpIndices");
2838
2839 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2840 return nullptr;
2841
2842 MachineInstr *CommutedMI = nullptr;
2843 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2844 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2845 if (Src0.isReg() && Src1.isReg()) {
2846 // Be sure to copy the source modifiers to the right place.
2847 CommutedMI =
2848 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2849 } else if (Src0.isReg() && !Src1.isReg()) {
2850 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2851 } else if (!Src0.isReg() && Src1.isReg()) {
2852 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2853 } else if (Src0.isImm() && Src1.isImm()) {
2854 CommutedMI = swapImmOperands(MI, Src0, Src1);
2855 } else {
2856 // FIXME: Found two non-register operands to commute. This does happen.
2857 return nullptr;
2858 }
2859
2860 if (CommutedMI) {
2861 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2862 Src1, AMDGPU::OpName::src1_modifiers);
2863
2864 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2865 AMDGPU::OpName::src1_sel);
2866
2867 CommutedMI->setDesc(get(CommutedOpcode));
2868 }
2869
2870 return CommutedMI;
2871}
2872
2873// This needs to be implemented because the source modifiers may be inserted
2874// between the true commutable operands, and the base
2875// TargetInstrInfo::commuteInstruction uses it.
2877 unsigned &SrcOpIdx0,
2878 unsigned &SrcOpIdx1) const {
2879 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2880}
2881
2883 unsigned &SrcOpIdx0,
2884 unsigned &SrcOpIdx1) const {
2885 if (!Desc.isCommutable())
2886 return false;
2887
2888 unsigned Opc = Desc.getOpcode();
2889 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2890 if (Src0Idx == -1)
2891 return false;
2892
2893 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2894 if (Src1Idx == -1)
2895 return false;
2896
2897 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2898}
2899
2901 int64_t BrOffset) const {
2902 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2903 // because its dest block is unanalyzable.
2904 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2905
2906 // Convert to dwords.
2907 BrOffset /= 4;
2908
2909 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2910 // from the next instruction.
2911 BrOffset -= 1;
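  // Worked example (hypothetical offset): a target 260 bytes ahead of the
  // branch needs SIMM16 == (260 - 4) / 4 == 64, which is exactly what the
  // divide (65) and the decrement (64) above produce before the range check.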
2912
2913 return isIntN(BranchOffsetBits, BrOffset);
2914}
2915
2918 return MI.getOperand(0).getMBB();
2919}
2920
2922 for (const MachineInstr &MI : MBB->terminators()) {
2923 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2924 MI.getOpcode() == AMDGPU::SI_LOOP)
2925 return true;
2926 }
2927 return false;
2928}
2929
2931 MachineBasicBlock &DestBB,
2932 MachineBasicBlock &RestoreBB,
2933 const DebugLoc &DL, int64_t BrOffset,
2934 RegScavenger *RS) const {
2935 assert(MBB.empty() &&
2936 "new block should be inserted for expanding unconditional branch");
2937 assert(MBB.pred_size() == 1);
2938 assert(RestoreBB.empty() &&
2939 "restore block should be inserted for restoring clobbered registers");
2940
2941 MachineFunction *MF = MBB.getParent();
2944 auto I = MBB.end();
2945 auto &MCCtx = MF->getContext();
2946
2947 if (ST.useAddPC64Inst()) {
2948 MCSymbol *Offset =
2949 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2950 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2952 MCSymbol *PostAddPCLabel =
2953 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2954 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2955 auto *OffsetExpr = MCBinaryExpr::createSub(
2956 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2957 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2958 Offset->setVariableValue(OffsetExpr);
2959 return;
2960 }
2961
2962 assert(RS && "RegScavenger required for long branching");
2963
2964 // FIXME: Virtual register workaround for RegScavenger not working with empty
2965 // blocks.
2966 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2967
2968 // Note: as this is used after the hazard recognizer, we need to apply some
2969 // hazard workarounds directly.
2970 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2971 ST.hasVALUReadSGPRHazard();
2972 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2973 if (FlushSGPRWrites)
2974 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2976 };
2977
2978 // We need to compute the offset relative to the instruction immediately after
2979 // s_getpc_b64. Insert the PC arithmetic code before the last terminator.
2980 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2981 ApplyHazardWorkarounds();
2982
2983 MCSymbol *PostGetPCLabel =
2984 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2985 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2986
2987 MCSymbol *OffsetLo =
2988 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2989 MCSymbol *OffsetHi =
2990 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2991 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2992 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2993 .addReg(PCReg, 0, AMDGPU::sub0)
2994 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2995 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2996 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2997 .addReg(PCReg, 0, AMDGPU::sub1)
2998 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2999 ApplyHazardWorkarounds();
3000
3001 // Insert the indirect branch after the other terminator.
3002 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
3003 .addReg(PCReg);
3004
3005 // If a spill is needed for the pc register pair, we need to insert a spill
3006 // restore block right before the destination block, and insert a short branch
3007 // into the old destination block's fallthrough predecessor.
3008 // e.g.:
3009 //
3010 // s_cbranch_scc0 skip_long_branch:
3011 //
3012 // long_branch_bb:
3013 // spill s[8:9]
3014 // s_getpc_b64 s[8:9]
3015 // s_add_u32 s8, s8, restore_bb
3016 // s_addc_u32 s9, s9, 0
3017 // s_setpc_b64 s[8:9]
3018 //
3019 // skip_long_branch:
3020 // foo;
3021 //
3022 // .....
3023 //
3024 // dest_bb_fallthrough_predecessor:
3025 // bar;
3026 // s_branch dest_bb
3027 //
3028 // restore_bb:
3029 // restore s[8:9]
3030 // fallthrough dest_bb
3031 //
3032 // dest_bb:
3033 // buzz;
3034
3035 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3036 Register Scav;
3037
3038 // If we've previously reserved a register for long branches,
3039 // avoid running the scavenger and just use that register.
3040 if (LongBranchReservedReg) {
3041 RS->enterBasicBlock(MBB);
3042 Scav = LongBranchReservedReg;
3043 } else {
3044 RS->enterBasicBlockEnd(MBB);
3045 Scav = RS->scavengeRegisterBackwards(
3046 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3047 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3048 }
3049 if (Scav) {
3050 RS->setRegUsed(Scav);
3051 MRI.replaceRegWith(PCReg, Scav);
3052 MRI.clearVirtRegs();
3053 } else {
3054 // Since spilling an SGPR needs a VGPR, we reuse the temporary VGPR's spill
3055 // slot for the SGPR spill.
3056 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3057 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3058 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3059 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3060 MRI.clearVirtRegs();
3061 }
3062
3063 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3064 // Now that the destination is known, the branch distance can be defined.
3066 MCSymbolRefExpr::create(DestLabel, MCCtx),
3067 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3068 // Add offset assignments.
3069 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3070 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3071 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3072 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3073}
3074
3075unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3076 switch (Cond) {
3077 case SIInstrInfo::SCC_TRUE:
3078 return AMDGPU::S_CBRANCH_SCC1;
3079 case SIInstrInfo::SCC_FALSE:
3080 return AMDGPU::S_CBRANCH_SCC0;
3081 case SIInstrInfo::VCCNZ:
3082 return AMDGPU::S_CBRANCH_VCCNZ;
3083 case SIInstrInfo::VCCZ:
3084 return AMDGPU::S_CBRANCH_VCCZ;
3085 case SIInstrInfo::EXECNZ:
3086 return AMDGPU::S_CBRANCH_EXECNZ;
3087 case SIInstrInfo::EXECZ:
3088 return AMDGPU::S_CBRANCH_EXECZ;
3089 default:
3090 llvm_unreachable("invalid branch predicate");
3091 }
3092}
3093
3094SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3095 switch (Opcode) {
3096 case AMDGPU::S_CBRANCH_SCC0:
3097 return SCC_FALSE;
3098 case AMDGPU::S_CBRANCH_SCC1:
3099 return SCC_TRUE;
3100 case AMDGPU::S_CBRANCH_VCCNZ:
3101 return VCCNZ;
3102 case AMDGPU::S_CBRANCH_VCCZ:
3103 return VCCZ;
3104 case AMDGPU::S_CBRANCH_EXECNZ:
3105 return EXECNZ;
3106 case AMDGPU::S_CBRANCH_EXECZ:
3107 return EXECZ;
3108 default:
3109 return INVALID_BR;
3110 }
3111}
3112
3116 MachineBasicBlock *&FBB,
3118 bool AllowModify) const {
3119 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3120 // Unconditional Branch
3121 TBB = I->getOperand(0).getMBB();
3122 return false;
3123 }
3124
3125 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3126 if (Pred == INVALID_BR)
3127 return true;
3128
3129 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3130 Cond.push_back(MachineOperand::CreateImm(Pred));
3131 Cond.push_back(I->getOperand(1)); // Save the branch register.
3132
3133 ++I;
3134
3135 if (I == MBB.end()) {
3136 // Conditional branch followed by fall-through.
3137 TBB = CondBB;
3138 return false;
3139 }
3140
3141 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3142 TBB = CondBB;
3143 FBB = I->getOperand(0).getMBB();
3144 return false;
3145 }
3146
3147 return true;
3148}
3149
3151 MachineBasicBlock *&FBB,
3153 bool AllowModify) const {
3154 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3155 auto E = MBB.end();
3156 if (I == E)
3157 return false;
3158
3159 // Skip over the instructions that are artificial terminators for special
3160 // exec management.
3161 while (I != E && !I->isBranch() && !I->isReturn()) {
3162 switch (I->getOpcode()) {
3163 case AMDGPU::S_MOV_B64_term:
3164 case AMDGPU::S_XOR_B64_term:
3165 case AMDGPU::S_OR_B64_term:
3166 case AMDGPU::S_ANDN2_B64_term:
3167 case AMDGPU::S_AND_B64_term:
3168 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3169 case AMDGPU::S_MOV_B32_term:
3170 case AMDGPU::S_XOR_B32_term:
3171 case AMDGPU::S_OR_B32_term:
3172 case AMDGPU::S_ANDN2_B32_term:
3173 case AMDGPU::S_AND_B32_term:
3174 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3175 break;
3176 case AMDGPU::SI_IF:
3177 case AMDGPU::SI_ELSE:
3178 case AMDGPU::SI_KILL_I1_TERMINATOR:
3179 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3180 // FIXME: It's messy that these need to be considered here at all.
3181 return true;
3182 default:
3183 llvm_unreachable("unexpected non-branch terminator inst");
3184 }
3185
3186 ++I;
3187 }
3188
3189 if (I == E)
3190 return false;
3191
3192 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3193}
3194
3196 int *BytesRemoved) const {
3197 unsigned Count = 0;
3198 unsigned RemovedSize = 0;
3199 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3200 // Skip over artificial terminators when removing instructions.
3201 if (MI.isBranch() || MI.isReturn()) {
3202 RemovedSize += getInstSizeInBytes(MI);
3203 MI.eraseFromParent();
3204 ++Count;
3205 }
3206 }
3207
3208 if (BytesRemoved)
3209 *BytesRemoved = RemovedSize;
3210
3211 return Count;
3212}
3213
3214// Copy the flags onto the implicit condition register operand.
3216 const MachineOperand &OrigCond) {
3217 CondReg.setIsUndef(OrigCond.isUndef());
3218 CondReg.setIsKill(OrigCond.isKill());
3219}
3220
3223 MachineBasicBlock *FBB,
3225 const DebugLoc &DL,
3226 int *BytesAdded) const {
3227 if (!FBB && Cond.empty()) {
3228 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3229 .addMBB(TBB);
3230 if (BytesAdded)
3231 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3232 return 1;
3233 }
3234
3235 assert(TBB && Cond[0].isImm());
3236
3237 unsigned Opcode
3238 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3239
3240 if (!FBB) {
3241 MachineInstr *CondBr =
3242 BuildMI(&MBB, DL, get(Opcode))
3243 .addMBB(TBB);
3244
3245 // Copy the flags onto the implicit condition register operand.
3246 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3247 fixImplicitOperands(*CondBr);
3248
3249 if (BytesAdded)
3250 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3251 return 1;
3252 }
3253
3254 assert(TBB && FBB);
3255
3256 MachineInstr *CondBr =
3257 BuildMI(&MBB, DL, get(Opcode))
3258 .addMBB(TBB);
3259 fixImplicitOperands(*CondBr);
3260 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3261 .addMBB(FBB);
3262
3263 MachineOperand &CondReg = CondBr->getOperand(1);
3264 CondReg.setIsUndef(Cond[1].isUndef());
3265 CondReg.setIsKill(Cond[1].isKill());
3266
3267 if (BytesAdded)
3268 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3269
3270 return 2;
3271}
3272
3275 if (Cond.size() != 2) {
3276 return true;
3277 }
3278
3279 if (Cond[0].isImm()) {
3280 Cond[0].setImm(-Cond[0].getImm());
3281 return false;
3282 }
3283
3284 return true;
3285}
3286
3289 Register DstReg, Register TrueReg,
3290 Register FalseReg, int &CondCycles,
3291 int &TrueCycles, int &FalseCycles) const {
3292 switch (Cond[0].getImm()) {
3293 case VCCNZ:
3294 case VCCZ: {
3295 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3296 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3297 if (MRI.getRegClass(FalseReg) != RC)
3298 return false;
3299
3300 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3301 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3302
3303 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3304 return RI.hasVGPRs(RC) && NumInsts <= 6;
3305 }
3306 case SCC_TRUE:
3307 case SCC_FALSE: {
3308 // FIXME: We could insert for VGPRs if we could replace the original compare
3309 // with a vector one.
3310 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3311 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3312 if (MRI.getRegClass(FalseReg) != RC)
3313 return false;
3314
3315 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3316
3317 // Sizes that are a multiple of 64 bits (8 bytes) can use s_cselect_b64.
3318 if (NumInsts % 2 == 0)
3319 NumInsts /= 2;
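    // E.g. (illustrative): a 128-bit SGPR select has NumInsts == 4 (128 / 32),
    // halved to 2 since each s_cselect_b64 handles 64 bits at a time.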
3320
3321 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3322 return RI.isSGPRClass(RC);
3323 }
3324 default:
3325 return false;
3326 }
3327}
3328
3332 Register TrueReg, Register FalseReg) const {
3333 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3334 if (Pred == VCCZ || Pred == SCC_FALSE) {
3335 Pred = static_cast<BranchPredicate>(-Pred);
3336 std::swap(TrueReg, FalseReg);
3337 }
3338
3339 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3340 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3341 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3342
3343 if (DstSize == 32) {
3345 if (Pred == SCC_TRUE) {
3346 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3347 .addReg(TrueReg)
3348 .addReg(FalseReg);
3349 } else {
3350 // The instruction's operands are backwards from what is expected.
3351 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3352 .addReg(FalseReg)
3353 .addReg(TrueReg);
3354 }
3355
3356 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3357 return;
3358 }
3359
3360 if (DstSize == 64 && Pred == SCC_TRUE) {
3362 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3363 .addReg(TrueReg)
3364 .addReg(FalseReg);
3365
3366 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3367 return;
3368 }
3369
3370 static const int16_t Sub0_15[] = {
3371 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3372 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3373 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3374 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3375 };
3376
3377 static const int16_t Sub0_15_64[] = {
3378 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3379 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3380 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3381 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3382 };
3383
3384 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3385 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3386 const int16_t *SubIndices = Sub0_15;
3387 int NElts = DstSize / 32;
3388
3389 // 64-bit select is only available for SALU.
3390 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3391 if (Pred == SCC_TRUE) {
3392 if (NElts % 2) {
3393 SelOp = AMDGPU::S_CSELECT_B32;
3394 EltRC = &AMDGPU::SGPR_32RegClass;
3395 } else {
3396 SelOp = AMDGPU::S_CSELECT_B64;
3397 EltRC = &AMDGPU::SGPR_64RegClass;
3398 SubIndices = Sub0_15_64;
3399 NElts /= 2;
3400 }
3401 }
3402
3404 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3405
3406 I = MIB->getIterator();
3407
3409 for (int Idx = 0; Idx != NElts; ++Idx) {
3410 Register DstElt = MRI.createVirtualRegister(EltRC);
3411 Regs.push_back(DstElt);
3412
3413 unsigned SubIdx = SubIndices[Idx];
3414
3416 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3417 Select =
3418 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3419 .addReg(FalseReg, 0, SubIdx)
3420 .addReg(TrueReg, 0, SubIdx);
3421 } else {
3422 Select =
3423 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3424 .addReg(TrueReg, 0, SubIdx)
3425 .addReg(FalseReg, 0, SubIdx);
3426 }
3427
3428 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3430
3431 MIB.addReg(DstElt)
3432 .addImm(SubIdx);
3433 }
3434}
3435
3437 switch (MI.getOpcode()) {
3438 case AMDGPU::V_MOV_B16_t16_e32:
3439 case AMDGPU::V_MOV_B16_t16_e64:
3440 case AMDGPU::V_MOV_B32_e32:
3441 case AMDGPU::V_MOV_B32_e64:
3442 case AMDGPU::V_MOV_B64_PSEUDO:
3443 case AMDGPU::V_MOV_B64_e32:
3444 case AMDGPU::V_MOV_B64_e64:
3445 case AMDGPU::S_MOV_B32:
3446 case AMDGPU::S_MOV_B64:
3447 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3448 case AMDGPU::COPY:
3449 case AMDGPU::WWM_COPY:
3450 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3451 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3452 case AMDGPU::V_ACCVGPR_MOV_B32:
3453 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3454 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3455 return true;
3456 default:
3457 return false;
3458 }
3459}
3460
3462 switch (MI.getOpcode()) {
3463 case AMDGPU::V_MOV_B16_t16_e32:
3464 case AMDGPU::V_MOV_B16_t16_e64:
3465 return 2;
3466 case AMDGPU::V_MOV_B32_e32:
3467 case AMDGPU::V_MOV_B32_e64:
3468 case AMDGPU::V_MOV_B64_PSEUDO:
3469 case AMDGPU::V_MOV_B64_e32:
3470 case AMDGPU::V_MOV_B64_e64:
3471 case AMDGPU::S_MOV_B32:
3472 case AMDGPU::S_MOV_B64:
3473 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3474 case AMDGPU::COPY:
3475 case AMDGPU::WWM_COPY:
3476 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3477 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3478 case AMDGPU::V_ACCVGPR_MOV_B32:
3479 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3480 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3481 return 1;
3482 default:
3483 llvm_unreachable("MI is not a foldable copy");
3484 }
3485}
3486
3487static constexpr AMDGPU::OpName ModifierOpNames[] = {
3488 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3489 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3490 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3491
3492void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3493 unsigned Opc = MI.getOpcode();
3494 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3495 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3496 if (Idx >= 0)
3497 MI.removeOperand(Idx);
3498 }
3499}
3500
3502 const MCInstrDesc &NewDesc) const {
3503 MI.setDesc(NewDesc);
3504
3505 // Remove any leftover implicit operands from mutating the instruction. e.g.
3506 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3507 // anymore.
3508 const MCInstrDesc &Desc = MI.getDesc();
3509 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3510 Desc.implicit_defs().size();
3511
3512 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3513 MI.removeOperand(I);
3514}
3515
3516std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3517 unsigned SubRegIndex) {
3518 switch (SubRegIndex) {
3519 case AMDGPU::NoSubRegister:
3520 return Imm;
3521 case AMDGPU::sub0:
3522 return SignExtend64<32>(Imm);
3523 case AMDGPU::sub1:
3524 return SignExtend64<32>(Imm >> 32);
3525 case AMDGPU::lo16:
3526 return SignExtend64<16>(Imm);
3527 case AMDGPU::hi16:
3528 return SignExtend64<16>(Imm >> 16);
3529 case AMDGPU::sub1_lo16:
3530 return SignExtend64<16>(Imm >> 32);
3531 case AMDGPU::sub1_hi16:
3532 return SignExtend64<16>(Imm >> 48);
3533 default:
3534 return std::nullopt;
3535 }
3536
3537 llvm_unreachable("covered subregister switch");
3538}
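// For example, with Imm = 0x0123456789ABCDEF the cases above extract:
//   sub0      -> 0xFFFFFFFF89ABCDEF (low 32 bits, sign-extended)
//   sub1      -> 0x0000000001234567 (high 32 bits, sign-extended)
//   lo16      -> 0xFFFFFFFFFFFFCDEF
//   hi16      -> 0xFFFFFFFFFFFF89AB
//   sub1_lo16 -> 0x0000000000004567
//   sub1_hi16 -> 0x0000000000000123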
3539
3540static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3541 switch (Opc) {
3542 case AMDGPU::V_MAC_F16_e32:
3543 case AMDGPU::V_MAC_F16_e64:
3544 case AMDGPU::V_MAD_F16_e64:
3545 return AMDGPU::V_MADAK_F16;
3546 case AMDGPU::V_MAC_F32_e32:
3547 case AMDGPU::V_MAC_F32_e64:
3548 case AMDGPU::V_MAD_F32_e64:
3549 return AMDGPU::V_MADAK_F32;
3550 case AMDGPU::V_FMAC_F32_e32:
3551 case AMDGPU::V_FMAC_F32_e64:
3552 case AMDGPU::V_FMA_F32_e64:
3553 return AMDGPU::V_FMAAK_F32;
3554 case AMDGPU::V_FMAC_F16_e32:
3555 case AMDGPU::V_FMAC_F16_e64:
3556 case AMDGPU::V_FMAC_F16_t16_e64:
3557 case AMDGPU::V_FMAC_F16_fake16_e64:
3558 case AMDGPU::V_FMAC_F16_t16_e32:
3559 case AMDGPU::V_FMAC_F16_fake16_e32:
3560 case AMDGPU::V_FMA_F16_e64:
3561 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3562 ? AMDGPU::V_FMAAK_F16_t16
3563 : AMDGPU::V_FMAAK_F16_fake16
3564 : AMDGPU::V_FMAAK_F16;
3565 case AMDGPU::V_FMAC_F64_e32:
3566 case AMDGPU::V_FMAC_F64_e64:
3567 case AMDGPU::V_FMA_F64_e64:
3568 return AMDGPU::V_FMAAK_F64;
3569 default:
3570 llvm_unreachable("invalid instruction");
3571 }
3572}
3573
3574static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3575 switch (Opc) {
3576 case AMDGPU::V_MAC_F16_e32:
3577 case AMDGPU::V_MAC_F16_e64:
3578 case AMDGPU::V_MAD_F16_e64:
3579 return AMDGPU::V_MADMK_F16;
3580 case AMDGPU::V_MAC_F32_e32:
3581 case AMDGPU::V_MAC_F32_e64:
3582 case AMDGPU::V_MAD_F32_e64:
3583 return AMDGPU::V_MADMK_F32;
3584 case AMDGPU::V_FMAC_F32_e32:
3585 case AMDGPU::V_FMAC_F32_e64:
3586 case AMDGPU::V_FMA_F32_e64:
3587 return AMDGPU::V_FMAMK_F32;
3588 case AMDGPU::V_FMAC_F16_e32:
3589 case AMDGPU::V_FMAC_F16_e64:
3590 case AMDGPU::V_FMAC_F16_t16_e64:
3591 case AMDGPU::V_FMAC_F16_fake16_e64:
3592 case AMDGPU::V_FMAC_F16_t16_e32:
3593 case AMDGPU::V_FMAC_F16_fake16_e32:
3594 case AMDGPU::V_FMA_F16_e64:
3595 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3596 ? AMDGPU::V_FMAMK_F16_t16
3597 : AMDGPU::V_FMAMK_F16_fake16
3598 : AMDGPU::V_FMAMK_F16;
3599 case AMDGPU::V_FMAC_F64_e32:
3600 case AMDGPU::V_FMAC_F64_e64:
3601 case AMDGPU::V_FMA_F64_e64:
3602 return AMDGPU::V_FMAMK_F64;
3603 default:
3604 llvm_unreachable("invalid instruction");
3605 }
3606}
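// Roughly, the two VOP2 forms selected by the helpers above differ in which
// source the 32-bit literal K replaces: v_fmaak computes dst = src0 * src1 + K,
// while v_fmamk computes dst = src0 * K + src1. For example:
//   v_fmaak_f32 v0, v1, v2, 0x40490fdb   ; v0 = v1 * v2 + 3.14159...
//   v_fmamk_f32 v0, v1, 0x40490fdb, v2   ; v0 = v1 * 3.14159... + v2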
3607
3608bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3609 Register Reg, MachineRegisterInfo *MRI) const {
3610 int64_t Imm;
3611 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3612 return false;
3613
3614 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3615
3616 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3617
3618 unsigned Opc = UseMI.getOpcode();
3619 if (Opc == AMDGPU::COPY) {
3620 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3621
3622 Register DstReg = UseMI.getOperand(0).getReg();
3623 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3624
3625 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3626
3627 if (HasMultipleUses) {
3628 // TODO: This should fold in more cases with multiple use, but we need to
3629 // more carefully consider what those uses are.
3630 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3631
3632 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3633 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3634 return false;
3635
3636 // Most of the time folding a 32-bit inline constant is free (though this
3637 // might not be true if we can't later fold it into a real user).
3638 //
3639 // FIXME: This isInlineConstant check is imprecise if
3640 // getConstValDefinedInReg handled the tricky non-mov cases.
3641 if (ImmDefSize == 32 &&
3642 !isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
3643 return false;
3644 }
3645
3646 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3647 RI.getSubRegIdxSize(UseSubReg) == 16;
3648
3649 if (Is16Bit) {
3650 if (RI.hasVGPRs(DstRC))
3651 return false; // Do not clobber vgpr_hi16
3652
3653 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3654 return false;
3655 }
3656
3657 MachineFunction *MF = UseMI.getMF();
3658
3659 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3660 MCRegister MovDstPhysReg =
3661 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3662
3663 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3664
3665 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3666 for (unsigned MovOp :
3667 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3668 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3669 const MCInstrDesc &MovDesc = get(MovOp);
3670
3671 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3672 if (Is16Bit) {
3673 // We just need to find a correctly sized register class, so the
3674 // subregister index compatibility doesn't matter since we're statically
3675 // extracting the immediate value.
3676 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3677 if (!MovDstRC)
3678 continue;
3679
3680 if (MovDstPhysReg) {
3681 // FIXME: We probably should not do this. If there is a live value in
3682 // the high half of the register, it will be corrupted.
3683 MovDstPhysReg =
3684 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3685 if (!MovDstPhysReg)
3686 continue;
3687 }
3688 }
3689
3690 // Result class isn't the right size, try the next instruction.
3691 if (MovDstPhysReg) {
3692 if (!MovDstRC->contains(MovDstPhysReg))
3693 return false;
3694 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3695 // TODO: This will be overly conservative in the case of 16-bit virtual
3696 // SGPRs. We could hack up the virtual register uses to use a compatible
3697 // 32-bit class.
3698 continue;
3699 }
3700
3701 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3702
3703 // Ensure the interpreted immediate value is a valid operand in the new
3704 // mov.
3705 //
3706 // FIXME: isImmOperandLegal should have form that doesn't require existing
3707 // MachineInstr or MachineOperand
3708 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3709 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3710 break;
3711
3712 NewOpc = MovOp;
3713 break;
3714 }
3715
3716 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3717 return false;
3718
3719 if (Is16Bit) {
3720 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3721 if (MovDstPhysReg)
3722 UseMI.getOperand(0).setReg(MovDstPhysReg);
3723 assert(UseMI.getOperand(1).getReg().isVirtual());
3724 }
3725
3726 const MCInstrDesc &NewMCID = get(NewOpc);
3727 UseMI.setDesc(NewMCID);
3728 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3729 UseMI.addImplicitDefUseOperands(*MF);
3730 return true;
3731 }
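// A sketch of the fold performed above, shown as schematic MIR: a copy of a
// constant def such as
//   %0:sreg_64 = S_MOV_B64 42
//   %1:sgpr_32 = COPY %0.sub0
// is rewritten into a direct move of the extracted immediate,
//   %1:sgpr_32 = S_MOV_B32 42
// leaving the original S_MOV_B64 to be cleaned up by the caller once it is
// dead.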
3732
3733 if (HasMultipleUses)
3734 return false;
3735
3736 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3737 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3738 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3739 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3740 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3741 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3742 Opc == AMDGPU::V_FMAC_F64_e64) {
3743 // Don't fold if we are using source or output modifiers. The new VOP2
3744 // instructions don't have them.
3745 if (hasAnyModifiersSet(UseMI))
3746 return false;
3747
3748 // If this is a free constant, there's no reason to do this.
3749 // TODO: We could fold this here instead of letting SIFoldOperands do it
3750 // later.
3751 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3752
3753 // Any src operand can be used for the legality check.
3754 if (isInlineConstant(UseMI, Src0Idx, Imm))
3755 return false;
3756
3757 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3758
3759 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3760 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3761
3762 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3763 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3764 (Src1->isReg() && Src1->getReg() == Reg)) {
3765 MachineOperand *RegSrc =
3766 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3767 if (!RegSrc->isReg())
3768 return false;
3769 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3770 ST.getConstantBusLimit(Opc) < 2)
3771 return false;
3772
3773 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3774 return false;
3775
3776 // If src2 is also a literal constant then we have to choose which one to
3777 // fold. In general it is better to choose madak so that the other literal
3778 // can be materialized in an sgpr instead of a vgpr:
3779 // s_mov_b32 s0, literal
3780 // v_madak_f32 v0, s0, v0, literal
3781 // Instead of:
3782 // v_mov_b32 v1, literal
3783 // v_madmk_f32 v0, v0, literal, v1
3784 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3785 if (Def && Def->isMoveImmediate() &&
3786 !isInlineConstant(Def->getOperand(1)))
3787 return false;
3788
3789 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3790 if (pseudoToMCOpcode(NewOpc) == -1)
3791 return false;
3792
3793 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3794 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3795
3796 // FIXME: This would be a lot easier if we could return a new instruction
3797 // instead of having to modify in place.
3798
3799 Register SrcReg = RegSrc->getReg();
3800 unsigned SrcSubReg = RegSrc->getSubReg();
3801 Src0->setReg(SrcReg);
3802 Src0->setSubReg(SrcSubReg);
3803 Src0->setIsKill(RegSrc->isKill());
3804
3805 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3806 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3807 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3808 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3809 UseMI.untieRegOperand(
3810 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3811
3812 Src1->ChangeToImmediate(*SubRegImm);
3813
3814 removeModOperands(UseMI);
3815 UseMI.setDesc(get(NewOpc));
3816
3817 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3818 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3819 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3820 auto Tmp = MRI->createVirtualRegister(NewRC);
3821 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3822 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3823 UseMI.getOperand(0).getReg())
3824 .addReg(Tmp, RegState::Kill);
3825 UseMI.getOperand(0).setReg(Tmp);
3826 if (UseMI.getOperand(1).isReg() &&
3827 RI.isVGPR(*MRI, UseMI.getOperand(1).getReg())) {
3828 auto Tmp = MRI->createVirtualRegister(NewRC);
3829 BuildMI(*UseMI.getParent(), UseMI.getIterator(), UseMI.getDebugLoc(),
3830 get(AMDGPU::COPY), Tmp)
3831 .addReg(UseMI.getOperand(1).getReg());
3832 UseMI.getOperand(1).setReg(Tmp);
3833 UseMI.getOperand(1).setIsKill();
3834 }
3835 if (UseMI.getOperand(3).isReg() &&
3836 RI.isVGPR(*MRI, UseMI.getOperand(3).getReg())) {
3837 auto Tmp = MRI->createVirtualRegister(NewRC);
3838 BuildMI(*UseMI.getParent(), UseMI.getIterator(), UseMI.getDebugLoc(),
3839 get(AMDGPU::COPY), Tmp)
3840 .addReg(UseMI.getOperand(3).getReg());
3841 UseMI.getOperand(3).setReg(Tmp);
3842 UseMI.getOperand(3).setIsKill();
3843 }
3844 }
3845
3846 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3847 if (DeleteDef)
3848 DefMI.eraseFromParent();
3849
3850 return true;
3851 }
3852
3853 // Added part is the constant: Use v_madak_{f16, f32}.
3854 if (Src2->isReg() && Src2->getReg() == Reg) {
3855 if (ST.getConstantBusLimit(Opc) < 2) {
3856 // Not allowed to use constant bus for another operand.
3857 // We can however allow an inline immediate as src0.
3858 bool Src0Inlined = false;
3859 if (Src0->isReg()) {
3860 // Try to inline the constant if possible.
3861 // If the def is a move of an immediate and this is its only use,
3862 // we save a VGPR here.
3863 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3864 if (Def && Def->isMoveImmediate() &&
3865 isInlineConstant(Def->getOperand(1)) &&
3866 MRI->hasOneNonDBGUse(Src0->getReg())) {
3867 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3868 Src0Inlined = true;
3869 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3870 RI.isSGPRReg(*MRI, Src0->getReg())) {
3871 return false;
3872 }
3873 // VGPR is okay as Src0 - fallthrough
3874 }
3875
3876 if (Src1->isReg() && !Src0Inlined) {
3877 // We have one slot for inlinable constant so far - try to fill it
3878 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3879 if (Def && Def->isMoveImmediate() &&
3880 isInlineConstant(Def->getOperand(1)) &&
3881 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3882 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3883 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3884 return false;
3885 // VGPR is okay as Src1 - fallthrough
3886 }
3887 }
3888
3889 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3890 if (pseudoToMCOpcode(NewOpc) == -1)
3891 return false;
3892
3893 // FIXME: This would be a lot easier if we could return a new instruction
3894 // instead of having to modify in place.
3895
3896 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3897 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3898 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3899 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3900 UseMI.untieRegOperand(
3901 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3902
3903 const std::optional<int64_t> SubRegImm =
3904 extractSubregFromImm(Imm, Src2->getSubReg());
3905
3906 // ChangeToImmediate adds Src2 back to the instruction.
3907 Src2->ChangeToImmediate(*SubRegImm);
3908
3909 // These come before src2.
3910 removeModOperands(UseMI);
3911 UseMI.setDesc(get(NewOpc));
3912
3913 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3914 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3915 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3916 auto Tmp = MRI->createVirtualRegister(NewRC);
3917 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3918 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3919 UseMI.getOperand(0).getReg())
3920 .addReg(Tmp, RegState::Kill);
3921 UseMI.getOperand(0).setReg(Tmp);
3922 if (UseMI.getOperand(1).isReg() &&
3923 RI.isVGPR(*MRI, UseMI.getOperand(1).getReg())) {
3924 auto Tmp = MRI->createVirtualRegister(NewRC);
3925 BuildMI(*UseMI.getParent(), UseMI.getIterator(), UseMI.getDebugLoc(),
3926 get(AMDGPU::COPY), Tmp)
3927 .addReg(UseMI.getOperand(1).getReg());
3928 UseMI.getOperand(1).setReg(Tmp);
3929 UseMI.getOperand(1).setIsKill();
3930 }
3931 if (UseMI.getOperand(2).isReg() &&
3932 RI.isVGPR(*MRI, UseMI.getOperand(2).getReg())) {
3933 auto Tmp = MRI->createVirtualRegister(NewRC);
3934 BuildMI(*UseMI.getParent(), UseMI.getIterator(), UseMI.getDebugLoc(),
3935 get(AMDGPU::COPY), Tmp)
3936 .addReg(UseMI.getOperand(2).getReg());
3937 UseMI.getOperand(2).setReg(Tmp);
3938 UseMI.getOperand(2).setIsKill();
3939 }
3940 }
3941
3942 // It might happen that UseMI was commuted and we now have an SGPR as
3943 // src1. If so, the combination of an inline constant and an SGPR is
3944 // illegal, so re-legalize the operands.
3945 legalizeOperands(UseMI);
3946
3947 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3948 if (DeleteDef)
3949 DefMI.eraseFromParent();
3950
3951 return true;
3952 }
3953 }
3954
3955 return false;
3956}
3957
3958static bool
3959memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3960 ArrayRef<const MachineOperand *> BaseOps2) {
3961 if (BaseOps1.size() != BaseOps2.size())
3962 return false;
3963 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3964 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3965 return false;
3966 }
3967 return true;
3968}
3969
3970static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3971 LocationSize WidthB, int OffsetB) {
3972 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3973 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3974 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3975 return LowWidth.hasValue() &&
3976 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3977}
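// For example, an 8-byte access at offset 0 and a 4-byte access at offset 8 do
// not overlap (0 + 8 <= 8), while an 8-byte access at offset 0 and a 4-byte
// access at offset 4 do (0 + 8 > 4).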
3978
3979bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3980 const MachineInstr &MIb) const {
3981 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3982 int64_t Offset0, Offset1;
3983 LocationSize Dummy0 = LocationSize::precise(0);
3984 LocationSize Dummy1 = LocationSize::precise(0);
3985 bool Offset0IsScalable, Offset1IsScalable;
3986 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3987 Dummy0, &RI) ||
3988 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3989 Dummy1, &RI))
3990 return false;
3991
3992 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3993 return false;
3994
3995 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3996 // FIXME: Handle ds_read2 / ds_write2.
3997 return false;
3998 }
3999 LocationSize Width0 = MIa.memoperands().front()->getSize();
4000 LocationSize Width1 = MIb.memoperands().front()->getSize();
4001 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
4002}
4003
4004bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
4005 const MachineInstr &MIb) const {
4006 assert(MIa.mayLoadOrStore() &&
4007 "MIa must load from or modify a memory location");
4008 assert(MIb.mayLoadOrStore() &&
4009 "MIb must load from or modify a memory location");
4010
4011 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
4012 return false;
4013
4014 // XXX - Can we relax this between address spaces?
4015 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4016 return false;
4017
4018 if (isLDSDMA(MIa) || isLDSDMA(MIb))
4019 return false;
4020
4021 if (MIa.isBundle() || MIb.isBundle())
4022 return false;
4023
4024 // TODO: Should we check the address space from the MachineMemOperand? That
4025 // would allow us to distinguish objects we know don't alias based on the
4026 // underlying address space, even if it was lowered to a different one,
4027 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4028 // buffer.
4029 if (isDS(MIa)) {
4030 if (isDS(MIb))
4031 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4032
4033 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
4034 }
4035
4036 if (isMUBUF(MIa) || isMTBUF(MIa)) {
4037 if (isMUBUF(MIb) || isMTBUF(MIb))
4038 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4039
4040 if (isFLAT(MIb))
4041 return isFLATScratch(MIb);
4042
4043 return !isSMRD(MIb);
4044 }
4045
4046 if (isSMRD(MIa)) {
4047 if (isSMRD(MIb))
4048 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4049
4050 if (isFLAT(MIb))
4051 return isFLATScratch(MIb);
4052
4053 return !isMUBUF(MIb) && !isMTBUF(MIb);
4054 }
4055
4056 if (isFLAT(MIa)) {
4057 if (isFLAT(MIb)) {
4058 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4059 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4060 return true;
4061
4062 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4063 }
4064
4065 return false;
4066 }
4067
4068 return false;
4069}
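// Summarizing the checks above: accesses that provably target different
// hardware memory (e.g. a DS/LDS access vs. a scratch-only FLAT access, or an
// SMRD constant load vs. FLAT scratch) are reported as trivially disjoint,
// while accesses of the same kind fall back to the base-operand and offset
// overlap check.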
4070
4071static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
4072 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4073 if (Reg.isPhysical())
4074 return false;
4075 auto *Def = MRI.getUniqueVRegDef(Reg);
4076 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4077 Imm = Def->getOperand(1).getImm();
4078 if (DefMI)
4079 *DefMI = Def;
4080 return true;
4081 }
4082 return false;
4083}
4084
4085static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4086 MachineInstr **DefMI = nullptr) {
4087 if (!MO->isReg())
4088 return false;
4089 const MachineFunction *MF = MO->getParent()->getMF();
4090 const MachineRegisterInfo &MRI = MF->getRegInfo();
4091 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4092}
4093
4094static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4095 MachineInstr &NewMI) {
4096 if (LV) {
4097 unsigned NumOps = MI.getNumOperands();
4098 for (unsigned I = 1; I < NumOps; ++I) {
4099 MachineOperand &Op = MI.getOperand(I);
4100 if (Op.isReg() && Op.isKill())
4101 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4102 }
4103 }
4104}
4105
4106static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4107 switch (Opc) {
4108 case AMDGPU::V_MAC_F16_e32:
4109 case AMDGPU::V_MAC_F16_e64:
4110 return AMDGPU::V_MAD_F16_e64;
4111 case AMDGPU::V_MAC_F32_e32:
4112 case AMDGPU::V_MAC_F32_e64:
4113 return AMDGPU::V_MAD_F32_e64;
4114 case AMDGPU::V_MAC_LEGACY_F32_e32:
4115 case AMDGPU::V_MAC_LEGACY_F32_e64:
4116 return AMDGPU::V_MAD_LEGACY_F32_e64;
4117 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4118 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4119 return AMDGPU::V_FMA_LEGACY_F32_e64;
4120 case AMDGPU::V_FMAC_F16_e32:
4121 case AMDGPU::V_FMAC_F16_e64:
4122 case AMDGPU::V_FMAC_F16_t16_e64:
4123 case AMDGPU::V_FMAC_F16_fake16_e64:
4124 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4125 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4126 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4127 : AMDGPU::V_FMA_F16_gfx9_e64;
4128 case AMDGPU::V_FMAC_F32_e32:
4129 case AMDGPU::V_FMAC_F32_e64:
4130 return AMDGPU::V_FMA_F32_e64;
4131 case AMDGPU::V_FMAC_F64_e32:
4132 case AMDGPU::V_FMAC_F64_e64:
4133 return AMDGPU::V_FMA_F64_e64;
4134 default:
4135 llvm_unreachable("invalid instruction");
4136 }
4137}
4138
4139/// Helper struct for the implementation of 3-address conversion to communicate
4140/// updates made to instruction operands.
4141struct ThreeAddressUpdates {
4142 /// Other instruction whose def is no longer used by the converted
4143 /// instruction.
4144 MachineInstr *RemoveMIUse = nullptr;
4145};
4146
4147MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4148 LiveVariables *LV,
4149 LiveIntervals *LIS) const {
4150 MachineBasicBlock &MBB = *MI.getParent();
4151 MachineInstr *CandidateMI = &MI;
4152
4153 if (MI.isBundle()) {
4154 // This is a temporary placeholder for bundle handling that enables us to
4155 // exercise the relevant code paths in the two-address instruction pass.
4156 if (MI.getBundleSize() != 1)
4157 return nullptr;
4158 CandidateMI = MI.getNextNode();
4159 }
4160
4161 ThreeAddressUpdates U;
4162 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4163 if (!NewMI)
4164 return nullptr;
4165
4166 if (MI.isBundle()) {
4167 CandidateMI->eraseFromBundle();
4168
4169 for (MachineOperand &MO : MI.all_defs()) {
4170 if (MO.isTied())
4171 MI.untieRegOperand(MO.getOperandNo());
4172 }
4173 } else {
4174 updateLiveVariables(LV, MI, *NewMI);
4175 if (LIS) {
4176 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4177 // SlotIndex of defs needs to be updated when converting to early-clobber
4178 MachineOperand &Def = NewMI->getOperand(0);
4179 if (Def.isEarlyClobber() && Def.isReg() &&
4180 LIS->hasInterval(Def.getReg())) {
4181 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4182 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4183 auto &LI = LIS->getInterval(Def.getReg());
4184 auto UpdateDefIndex = [&](LiveRange &LR) {
4185 auto *S = LR.find(OldIndex);
4186 if (S != LR.end() && S->start == OldIndex) {
4187 assert(S->valno && S->valno->def == OldIndex);
4188 S->start = NewIndex;
4189 S->valno->def = NewIndex;
4190 }
4191 };
4192 UpdateDefIndex(LI);
4193 for (auto &SR : LI.subranges())
4194 UpdateDefIndex(SR);
4195 }
4196 }
4197 }
4198
4199 if (U.RemoveMIUse) {
4200 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4201 // The only user is the instruction which will be killed.
4202 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4203
4204 if (MRI.hasOneNonDBGUse(DefReg)) {
4205 // We cannot just remove the DefMI here, calling pass will crash.
4206 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4207 U.RemoveMIUse->getOperand(0).setIsDead(true);
4208 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4209 U.RemoveMIUse->removeOperand(I);
4210 if (LV)
4211 LV->getVarInfo(DefReg).AliveBlocks.clear();
4212 }
4213
4214 if (MI.isBundle()) {
4215 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4216 if (!VRI.Reads && !VRI.Writes) {
4217 for (MachineOperand &MO : MI.all_uses()) {
4218 if (MO.isReg() && MO.getReg() == DefReg) {
4219 assert(MO.getSubReg() == 0 &&
4220 "tied sub-registers in bundles currently not supported");
4221 MI.removeOperand(MO.getOperandNo());
4222 break;
4223 }
4224 }
4225
4226 if (LIS)
4227 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4228 }
4229 } else if (LIS) {
4230 LiveInterval &DefLI = LIS->getInterval(DefReg);
4231
4232 // We cannot delete the original instruction here, so hack out the use
4233 // in the original instruction with a dummy register so we can use
4234 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4235 // not have the complexity of deleting a use to consider here.
4236 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4237 for (MachineOperand &MIOp : MI.uses()) {
4238 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4239 MIOp.setIsUndef(true);
4240 MIOp.setReg(DummyReg);
4241 }
4242 }
4243
4244 if (MI.isBundle()) {
4245 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4246 if (!VRI.Reads && !VRI.Writes) {
4247 for (MachineOperand &MIOp : MI.uses()) {
4248 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4249 MIOp.setIsUndef(true);
4250 MIOp.setReg(DummyReg);
4251 }
4252 }
4253 }
4254
4255 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4256 false, /*isUndef=*/true));
4257 }
4258
4259 LIS->shrinkToUses(&DefLI);
4260 }
4261 }
4262
4263 return MI.isBundle() ? &MI : NewMI;
4264}
4265
4266MachineInstr *
4267SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4268 ThreeAddressUpdates &U) const {
4269 MachineBasicBlock &MBB = *MI.getParent();
4270 unsigned Opc = MI.getOpcode();
4271
4272 // Handle MFMA.
4273 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4274 if (NewMFMAOpc != -1) {
4275 MachineInstrBuilder MIB =
4276 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4277 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4278 MIB.add(MI.getOperand(I));
4279 return MIB;
4280 }
4281
4282 if (SIInstrInfo::isWMMA(MI)) {
4283 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4284 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4285 .setMIFlags(MI.getFlags());
4286 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4287 MIB->addOperand(MI.getOperand(I));
4288 return MIB;
4289 }
4290
4291 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4292 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4293 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4294 "present pre-RA");
4295
4296 // Handle MAC/FMAC.
4297 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4298 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4299 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4300 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4301 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4302 bool Src0Literal = false;
4303
4304 switch (Opc) {
4305 default:
4306 return nullptr;
4307 case AMDGPU::V_MAC_F16_e64:
4308 case AMDGPU::V_FMAC_F16_e64:
4309 case AMDGPU::V_FMAC_F16_t16_e64:
4310 case AMDGPU::V_FMAC_F16_fake16_e64:
4311 case AMDGPU::V_MAC_F32_e64:
4312 case AMDGPU::V_MAC_LEGACY_F32_e64:
4313 case AMDGPU::V_FMAC_F32_e64:
4314 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4315 case AMDGPU::V_FMAC_F64_e64:
4316 break;
4317 case AMDGPU::V_MAC_F16_e32:
4318 case AMDGPU::V_FMAC_F16_e32:
4319 case AMDGPU::V_MAC_F32_e32:
4320 case AMDGPU::V_MAC_LEGACY_F32_e32:
4321 case AMDGPU::V_FMAC_F32_e32:
4322 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4323 case AMDGPU::V_FMAC_F64_e32: {
4324 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4325 AMDGPU::OpName::src0);
4326 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4327 if (!Src0->isReg() && !Src0->isImm())
4328 return nullptr;
4329
4330 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4331 Src0Literal = true;
4332
4333 break;
4334 }
4335 }
4336
4337 MachineInstrBuilder MIB;
4338 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4339 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4340 const MachineOperand *Src0Mods =
4341 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4342 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4343 const MachineOperand *Src1Mods =
4344 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4345 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4346 const MachineOperand *Src2Mods =
4347 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4348 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4349 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4350 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4351
4352 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4353 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4354 // If we have an SGPR input, we will violate the constant bus restriction.
4355 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4356 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4357 MachineInstr *DefMI;
4358
4359 int64_t Imm;
4360 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4361 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4362 if (pseudoToMCOpcode(NewOpc) != -1) {
4363 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4364 .add(*Dst)
4365 .add(*Src0)
4366 .add(*Src1)
4367 .addImm(Imm)
4368 .setMIFlags(MI.getFlags());
4369 U.RemoveMIUse = DefMI;
4370 return MIB;
4371 }
4372 }
4373 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4374 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4375 if (pseudoToMCOpcode(NewOpc) != -1) {
4376 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4377 .add(*Dst)
4378 .add(*Src0)
4379 .addImm(Imm)
4380 .add(*Src2)
4381 .setMIFlags(MI.getFlags());
4382 U.RemoveMIUse = DefMI;
4383 return MIB;
4384 }
4385 }
4386 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4387 if (Src0Literal) {
4388 Imm = Src0->getImm();
4389 DefMI = nullptr;
4390 }
4391 if (pseudoToMCOpcode(NewOpc) != -1 &&
4392 isOperandLegal(
4393 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4394 Src1)) {
4395 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4396 .add(*Dst)
4397 .add(*Src1)
4398 .addImm(Imm)
4399 .add(*Src2)
4400 .setMIFlags(MI.getFlags());
4401 U.RemoveMIUse = DefMI;
4402 return MIB;
4403 }
4404 }
4405 }
4406
4407 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4408 // if VOP3 does not allow a literal operand.
4409 if (Src0Literal && !ST.hasVOP3Literal())
4410 return nullptr;
4411
4412 unsigned NewOpc = getNewFMAInst(ST, Opc);
4413
4414 if (pseudoToMCOpcode(NewOpc) == -1)
4415 return nullptr;
4416
4417 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4418 .add(*Dst)
4419 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4420 .add(*Src0)
4421 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4422 .add(*Src1)
4423 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4424 .add(*Src2)
4425 .addImm(Clamp ? Clamp->getImm() : 0)
4426 .addImm(Omod ? Omod->getImm() : 0)
4427 .setMIFlags(MI.getFlags());
4428 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4429 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4430 return MIB;
4431}
4432
4433// It's not generally safe to move VALU instructions across these since it will
4434// start using the register as a base index rather than directly.
4435// XXX - Why isn't hasSideEffects sufficient for these?
4436static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4437 switch (MI.getOpcode()) {
4438 case AMDGPU::S_SET_GPR_IDX_ON:
4439 case AMDGPU::S_SET_GPR_IDX_MODE:
4440 case AMDGPU::S_SET_GPR_IDX_OFF:
4441 return true;
4442 default:
4443 return false;
4444 }
4445}
4446
4448 const MachineBasicBlock *MBB,
4449 const MachineFunction &MF) const {
4450 // Skipping the check for SP writes in the base implementation. It was
4451 // apparently added due to compile-time concerns.
4452 //
4453 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4454 // but is probably avoidable.
4455
4456 // Copied from base implementation.
4457 // Terminators and labels can't be scheduled around.
4458 if (MI.isTerminator() || MI.isPosition())
4459 return true;
4460
4461 // INLINEASM_BR can jump to another block
4462 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4463 return true;
4464
4465 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4466 return true;
4467
4468 // Target-independent instructions do not have an implicit-use of EXEC, even
4469 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4470 // boundaries prevents incorrect movements of such instructions.
4471 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4472 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4473 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4474 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4475 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4476 changesVGPRIndexingMode(MI);
4477}
4478
4479bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4480 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4481 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4482 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4483}
4484
4486 // Instructions that access scratch use FLAT encoding or BUF encodings.
4487 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4488 return false;
4489
4490 // If scratch is not initialized, we can never access it.
4491 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4492 return false;
4493
4494 // SCRATCH instructions always access scratch.
4495 if (isFLATScratch(MI))
4496 return true;
4497
4498 // If there are no memory operands then conservatively assume the flat
4499 // operation may access scratch.
4500 if (MI.memoperands_empty())
4501 return true;
4502
4503 // See if any memory operand specifies an address space that involves scratch.
4504 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4505 unsigned AS = Memop->getAddrSpace();
4506 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4507 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4508 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4509 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4510 }
4511 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4512 });
4513}
4514
4516 assert(isFLAT(MI));
4517
4518 // All flat instructions use the VMEM counter except prefetch.
4519 if (!usesVM_CNT(MI))
4520 return false;
4521
4522 // If there are no memory operands then conservatively assume the flat
4523 // operation may access VMEM.
4524 if (MI.memoperands_empty())
4525 return true;
4526
4527 // See if any memory operand specifies an address space that involves VMEM.
4528 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
4529 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4530 // (GDS) address space is not supported by flat operations. Therefore, simply
4531 // return true unless only the LDS address space is found.
4532 for (const MachineMemOperand *Memop : MI.memoperands()) {
4533 unsigned AS = Memop->getAddrSpace();
4535 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4536 return true;
4537 }
4538
4539 return false;
4540}
4541
4542bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4543 assert(isFLAT(MI));
4544
4545 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4546 if (!usesLGKM_CNT(MI))
4547 return false;
4548
4549 // If in tgsplit mode then there can be no use of LDS.
4550 if (ST.isTgSplitEnabled())
4551 return false;
4552
4553 // If there are no memory operands then conservatively assume the flat
4554 // operation may access LDS.
4555 if (MI.memoperands_empty())
4556 return true;
4557
4558 // See if any memory operand specifies an address space that involves LDS.
4559 for (const MachineMemOperand *Memop : MI.memoperands()) {
4560 unsigned AS = Memop->getAddrSpace();
4561 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4562 return true;
4563 }
4564
4565 return false;
4566}
4567
4568bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4569 // Skip the full operand and register alias search modifiesRegister
4570 // does. There's only a handful of instructions that touch this, it's only an
4571 // implicit def, and doesn't alias any other registers.
4572 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4573}
4574
4575bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4576 unsigned Opcode = MI.getOpcode();
4577
4578 if (MI.mayStore() && isSMRD(MI))
4579 return true; // scalar store or atomic
4580
4581 // This will terminate the function when other lanes may need to continue.
4582 if (MI.isReturn())
4583 return true;
4584
4585 // These instructions cause shader I/O that may cause hardware lockups
4586 // when executed with an empty EXEC mask.
4587 //
4588 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4589 // EXEC = 0, but checking for that case here seems not worth it
4590 // given the typical code patterns.
4591 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4592 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4593 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4594 return true;
4595
4596 if (MI.isCall() || MI.isInlineAsm())
4597 return true; // conservative assumption
4598
4599 // Assume that barrier interactions are only intended with active lanes.
4600 if (isBarrier(Opcode))
4601 return true;
4602
4603 // A mode change is a scalar operation that influences vector instructions.
4604 if (modifiesModeRegister(MI))
4605 return true;
4606
4607 // These are like SALU instructions in terms of effects, so it's questionable
4608 // whether we should return true for those.
4609 //
4610 // However, executing them with EXEC = 0 causes them to operate on undefined
4611 // data, which we avoid by returning true here.
4612 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4613 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4614 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4615 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4616 return true;
4617
4618 return false;
4619}
4620
4621bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4622 const MachineInstr &MI) const {
4623 if (MI.isMetaInstruction())
4624 return false;
4625
4626 // This won't read exec if this is an SGPR->SGPR copy.
4627 if (MI.isCopyLike()) {
4628 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4629 return true;
4630
4631 // Make sure this isn't copying exec as a normal operand
4632 return MI.readsRegister(AMDGPU::EXEC, &RI);
4633 }
4634
4635 // Make a conservative assumption about the callee.
4636 if (MI.isCall())
4637 return true;
4638
4639 // Be conservative with any unhandled generic opcodes.
4640 if (!isTargetSpecificOpcode(MI.getOpcode()))
4641 return true;
4642
4643 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4644}
4645
4646bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4647 switch (Imm.getBitWidth()) {
4648 case 1: // This likely will be a condition code mask.
4649 return true;
4650
4651 case 32:
4652 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4653 ST.hasInv2PiInlineImm());
4654 case 64:
4655 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4656 ST.hasInv2PiInlineImm());
4657 case 16:
4658 return ST.has16BitInsts() &&
4659 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4660 ST.hasInv2PiInlineImm());
4661 default:
4662 llvm_unreachable("invalid bitwidth");
4663 }
4664}
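// For reference, the inline (free) constants are the integers -16..64 and the
// floats 0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0, plus 1/(2*pi) on subtargets with
// hasInv2PiInlineImm(); anything else must be encoded as a literal. For
// example, isInlineConstant(APInt(32, 64)) is true, while
// isInlineConstant(APInt(32, 65)) is false.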
4665
4666bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4667 APInt IntImm = Imm.bitcastToAPInt();
4668 int64_t IntImmVal = IntImm.getSExtValue();
4669 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4670 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4671 default:
4672 llvm_unreachable("invalid fltSemantics");
4673 case APFloat::S_IEEEsingle:
4674 case APFloat::S_IEEEdouble:
4675 return isInlineConstant(IntImm);
4676 case APFloat::S_BFloat:
4677 return ST.has16BitInsts() &&
4678 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4679 case APFloat::S_IEEEhalf:
4680 return ST.has16BitInsts() &&
4681 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4682 }
4683}
4684
4685bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4686 // MachineOperand provides no way to tell the true operand size, since it only
4687 // records a 64-bit value. We need to know the size to determine if a 32-bit
4688 // floating point immediate bit pattern is legal for an integer immediate. It
4689 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4690 switch (OperandType) {
4700 int32_t Trunc = static_cast<int32_t>(Imm);
4701 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4702 }
4708 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4711 // We would expect inline immediates to not be concerned with an integer/fp
4712 // distinction. However, in the case of 16-bit integer operations, the
4713 // "floating point" values appear to not work. It seems read the low 16-bits
4714 // of 32-bit immediates, which happens to always work for the integer
4715 // values.
4716 //
4717 // See llvm bugzilla 46302.
4718 //
4719 // TODO: Theoretically we could use op-sel to use the high bits of the
4720 // 32-bit FP values.
4732 return false;
4735 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4736 // A few special case instructions have 16-bit operands on subtargets
4737 // where 16-bit instructions are not legal.
4738 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4739 // constants in these cases
4740 int16_t Trunc = static_cast<int16_t>(Imm);
4741 return ST.has16BitInsts() &&
4742 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4743 }
4744
4745 return false;
4746 }
4749 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4750 int16_t Trunc = static_cast<int16_t>(Imm);
4751 return ST.has16BitInsts() &&
4752 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4753 }
4754 return false;
4755 }
4759 return false;
4761 return isLegalAV64PseudoImm(Imm);
4764 // Always embedded in the instruction for free.
4765 return true;
4775 // Just ignore anything else.
4776 return true;
4777 default:
4778 llvm_unreachable("invalid operand type");
4779 }
4780}
4781
4782static bool compareMachineOp(const MachineOperand &Op0,
4783 const MachineOperand &Op1) {
4784 if (Op0.getType() != Op1.getType())
4785 return false;
4786
4787 switch (Op0.getType()) {
4789 return Op0.getReg() == Op1.getReg();
4791 return Op0.getImm() == Op1.getImm();
4792 default:
4793 llvm_unreachable("Didn't expect to be comparing these operand types");
4794 }
4795}
4796
4797bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4798 const MCOperandInfo &OpInfo) const {
4799 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4800 return true;
4801
4802 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4803 return false;
4804
4805 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4806 return true;
4807
4808 return ST.hasVOP3Literal();
4809}
4810
4811bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4812 int64_t ImmVal) const {
4813 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4814 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4815 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4816 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4817 AMDGPU::OpName::src2))
4818 return false;
4819 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4820 }
4821
4822 return isLiteralOperandLegal(InstDesc, OpInfo);
4823}
4824
4825bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4826 const MachineOperand &MO) const {
4827 if (MO.isImm())
4828 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4829
4830 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4831 "unexpected imm-like operand kind");
4832 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4833 return isLiteralOperandLegal(InstDesc, OpInfo);
4834}
4835
4836bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4837 // 2 32-bit inline constants packed into one.
4838 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4839 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4840}
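// For example, 0x3F80000000000040 is a legal AV64 pseudo immediate: the high
// half 0x3F800000 is the inline constant 1.0f and the low half 0x40 is the
// inline integer 64. 0x1234567800000000 is not, since 0x12345678 is not an
// inline constant.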
4841
4842bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4843 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4844 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4845 return false;
4846
4847 int Op32 = AMDGPU::getVOPe32(Opcode);
4848 if (Op32 == -1)
4849 return false;
4850
4851 return pseudoToMCOpcode(Op32) != -1;
4852}
4853
4854bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4855 // The src0_modifier operand is present on all instructions
4856 // that have modifiers.
4857
4858 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4859}
4860
4861bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4862 AMDGPU::OpName OpName) const {
4863 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4864 return Mods && Mods->getImm();
4865}
4866
4867bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4868 return any_of(ModifierOpNames,
4869 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4870}
4871
4872bool SIInstrInfo::canShrink(const MachineInstr &MI,
4873 const MachineRegisterInfo &MRI) const {
4874 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4875 // Can't shrink instruction with three operands.
4876 if (Src2) {
4877 switch (MI.getOpcode()) {
4878 default: return false;
4879
4880 case AMDGPU::V_ADDC_U32_e64:
4881 case AMDGPU::V_SUBB_U32_e64:
4882 case AMDGPU::V_SUBBREV_U32_e64: {
4883 const MachineOperand *Src1
4884 = getNamedOperand(MI, AMDGPU::OpName::src1);
4885 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4886 return false;
4887 // Additional verification is needed for sdst/src2.
4888 return true;
4889 }
4890 case AMDGPU::V_MAC_F16_e64:
4891 case AMDGPU::V_MAC_F32_e64:
4892 case AMDGPU::V_MAC_LEGACY_F32_e64:
4893 case AMDGPU::V_FMAC_F16_e64:
4894 case AMDGPU::V_FMAC_F16_t16_e64:
4895 case AMDGPU::V_FMAC_F16_fake16_e64:
4896 case AMDGPU::V_FMAC_F32_e64:
4897 case AMDGPU::V_FMAC_F64_e64:
4898 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4899 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4900 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4901 return false;
4902 break;
4903
4904 case AMDGPU::V_CNDMASK_B32_e64:
4905 break;
4906 }
4907 }
4908
4909 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4910 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4911 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4912 return false;
4913
4914 // We don't need to check src0, all input types are legal, so just make sure
4915 // src0 isn't using any modifiers.
4916 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4917 return false;
4918
4919 // Can it be shrunk to a valid 32 bit opcode?
4920 if (!hasVALU32BitEncoding(MI.getOpcode()))
4921 return false;
4922
4923 // Check output modifiers
4924 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4925 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4926 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4927 // TODO: Can we avoid checking bound_ctrl/fi here?
4928 // They are only used by permlane*_swap special case.
4929 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4930 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4931}
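// For example, a V_ADD_F32_e64 whose sources are plain VGPRs and which uses no
// clamp or output modifier can be shrunk to V_ADD_F32_e32, while a form that
// uses abs/neg source modifiers (or an SGPR in src1) cannot.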
4932
4933// Set VCC operand with all flags from \p Orig, except for setting it as
4934// implicit.
4935static void copyFlagsToImplicitVCC(MachineInstr &MI,
4936 const MachineOperand &Orig) {
4937
4938 for (MachineOperand &Use : MI.implicit_operands()) {
4939 if (Use.isUse() &&
4940 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4941 Use.setIsUndef(Orig.isUndef());
4942 Use.setIsKill(Orig.isKill());
4943 return;
4944 }
4945 }
4946}
4947
4948MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4949 unsigned Op32) const {
4950 MachineBasicBlock *MBB = MI.getParent();
4951
4952 const MCInstrDesc &Op32Desc = get(Op32);
4953 MachineInstrBuilder Inst32 =
4954 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4955 .setMIFlags(MI.getFlags());
4956
4957 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4958 // For VOPC instructions, this is replaced by an implicit def of vcc.
4959
4960 // We assume the defs of the shrunk opcode are in the same order, and the
4961 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4962 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4963 Inst32.add(MI.getOperand(I));
4964
4965 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4966
4967 int Idx = MI.getNumExplicitDefs();
4968 for (const MachineOperand &Use : MI.explicit_uses()) {
4969 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4970 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4971 continue;
4972
4973 if (&Use == Src2) {
4974 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4975 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4976 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4977 // of vcc was already added during the initial BuildMI, but we
4978 // 1) may need to change vcc to vcc_lo to preserve the original register
4979 // 2) have to preserve the original flags.
4980 copyFlagsToImplicitVCC(*Inst32, *Src2);
4981 continue;
4982 }
4983 }
4984
4985 Inst32.add(Use);
4986 }
4987
4988 // FIXME: Losing implicit operands
4989 fixImplicitOperands(*Inst32);
4990 return Inst32;
4991}
4992
4993bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4994 // Null is free
4995 Register Reg = RegOp.getReg();
4996 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4997 return false;
4998
4999 // SGPRs use the constant bus
5000
5001 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5002 // physical register operands should also count, except for exec.
5003 if (RegOp.isImplicit())
5004 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5005
5006 // SGPRs use the constant bus
5007 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5008 AMDGPU::SReg_64RegClass.contains(Reg);
5009}
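// Context for the checks above: most VALU encodings may read only a limited
// number of values over the scalar constant bus in a single instruction
// (ST.getConstantBusLimit(), e.g. one on pre-GFX10 targets and two on GFX10+),
// and SGPRs, literals and the implicit VCC/M0 reads handled here all count
// towards that limit.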
5010
5012 const MachineRegisterInfo &MRI) const {
5013 Register Reg = RegOp.getReg();
5014 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5015 : physRegUsesConstantBus(RegOp);
5016}
5017
5018bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
5019 const MachineOperand &MO,
5020 const MCOperandInfo &OpInfo) const {
5021 // Literal constants use the constant bus.
5022 if (!MO.isReg())
5023 return !isInlineConstant(MO, OpInfo);
5024
5025 Register Reg = MO.getReg();
5026 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5027 : physRegUsesConstantBus(MO);
5028}
5029
5030Register SIInstrInfo::findImplicitSGPRRead(const MachineInstr &MI) const {
5031 for (const MachineOperand &MO : MI.implicit_operands()) {
5032 // We only care about reads.
5033 if (MO.isDef())
5034 continue;
5035
5036 switch (MO.getReg()) {
5037 case AMDGPU::VCC:
5038 case AMDGPU::VCC_LO:
5039 case AMDGPU::VCC_HI:
5040 case AMDGPU::M0:
5041 case AMDGPU::FLAT_SCR:
5042 return MO.getReg();
5043
5044 default:
5045 break;
5046 }
5047 }
5048
5049 return Register();
5050}
5051
5052static bool shouldReadExec(const MachineInstr &MI) {
5053 if (SIInstrInfo::isVALU(MI)) {
5054 switch (MI.getOpcode()) {
5055 case AMDGPU::V_READLANE_B32:
5056 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5057 case AMDGPU::V_WRITELANE_B32:
5058 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5059 return false;
5060 }
5061
5062 return true;
5063 }
5064
5065 if (MI.isPreISelOpcode() ||
5066 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5067 SIInstrInfo::isSALU(MI) ||
5068 SIInstrInfo::isSMRD(MI))
5069 return false;
5070
5071 return true;
5072}
5073
5074static bool isRegOrFI(const MachineOperand &MO) {
5075 return MO.isReg() || MO.isFI();
5076}
5077
5078static bool isSubRegOf(const SIRegisterInfo &TRI,
5079 const MachineOperand &SuperVec,
5080 const MachineOperand &SubReg) {
5081 if (SubReg.getReg().isPhysical())
5082 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5083
5084 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5085 SubReg.getReg() == SuperVec.getReg();
5086}
5087
5088// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5089bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5090 const MachineRegisterInfo &MRI,
5091 StringRef &ErrInfo) const {
5092 Register DstReg = MI.getOperand(0).getReg();
5093 Register SrcReg = MI.getOperand(1).getReg();
5094 // This is a check for copy from vector register to SGPR
5095 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5096 ErrInfo = "illegal copy from vector register to SGPR";
5097 return false;
5098 }
5099 return true;
5100}
5101
5102bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5103 StringRef &ErrInfo) const {
5104 uint16_t Opcode = MI.getOpcode();
5105 const MachineFunction *MF = MI.getMF();
5106 const MachineRegisterInfo &MRI = MF->getRegInfo();
5107
5108 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5109 // Find a better property to recognize the point where instruction selection
5110 // is just done.
5111 // We can only enforce this check after SIFixSGPRCopies pass so that the
5112 // illegal copies are legalized and thereafter we don't expect a pass
5113 // inserting similar copies.
5114 if (!MRI.isSSA() && MI.isCopy())
5115 return verifyCopy(MI, MRI, ErrInfo);
5116
5117 if (SIInstrInfo::isGenericOpcode(Opcode))
5118 return true;
5119
5120 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5121 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5122 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5123 int Src3Idx = -1;
5124 if (Src0Idx == -1) {
5125 // VOPD V_DUAL_* instructions use different operand names.
5126 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5127 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5128 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5129 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5130 }
5131
5132 // Make sure the number of operands is correct.
5133 const MCInstrDesc &Desc = get(Opcode);
5134 if (!Desc.isVariadic() &&
5135 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5136 ErrInfo = "Instruction has wrong number of operands.";
5137 return false;
5138 }
5139
5140 if (MI.isInlineAsm()) {
5141 // Verify register classes for inlineasm constraints.
5142 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5143 I != E; ++I) {
5144 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5145 if (!RC)
5146 continue;
5147
5148 const MachineOperand &Op = MI.getOperand(I);
5149 if (!Op.isReg())
5150 continue;
5151
5152 Register Reg = Op.getReg();
5153 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5154 ErrInfo = "inlineasm operand has incorrect register class.";
5155 return false;
5156 }
5157 }
5158
5159 return true;
5160 }
5161
5162 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5163 ErrInfo = "missing memory operand from image instruction.";
5164 return false;
5165 }
5166
5167 // Make sure the register classes are correct.
5168 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5169 const MachineOperand &MO = MI.getOperand(i);
5170 if (MO.isFPImm()) {
5171 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5172 "all fp values to integers.";
5173 return false;
5174 }
5175
5176 const MCOperandInfo &OpInfo = Desc.operands()[i];
5177 int16_t RegClass = getOpRegClassID(OpInfo);
5178
5179 switch (OpInfo.OperandType) {
5181 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5182 ErrInfo = "Illegal immediate value for operand.";
5183 return false;
5184 }
5185 break;
5198 break;
5200 break;
5201 break;
5215 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5216 ErrInfo = "Illegal immediate value for operand.";
5217 return false;
5218 }
5219 break;
5220 }
5222 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5223 ErrInfo = "Expected inline constant for operand.";
5224 return false;
5225 }
5226 break;
5230 break;
5235 // Check if this operand is an immediate.
5236 // FrameIndex operands will be replaced by immediates, so they are
5237 // allowed.
5238 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5239 ErrInfo = "Expected immediate, but got non-immediate";
5240 return false;
5241 }
5242 break;
5246 break;
5247 default:
5248 if (OpInfo.isGenericType())
5249 continue;
5250 break;
5251 }
5252
5253 if (!MO.isReg())
5254 continue;
5255 Register Reg = MO.getReg();
5256 if (!Reg)
5257 continue;
5258
5259 // FIXME: Ideally we would have separate instruction definitions with the
5260 // aligned register constraint.
5261 // FIXME: We do not verify inline asm operands, but custom inline asm
5262 // verification is broken anyway
5263 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5264 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5265 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5266 if (const TargetRegisterClass *SubRC =
5267 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5268 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5269 if (RC)
5270 RC = SubRC;
5271 }
5272 }
5273
5274 // Check that this is the aligned version of the class.
5275 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5276 ErrInfo = "Subtarget requires even aligned vector registers";
5277 return false;
5278 }
5279 }
5280
5281 if (RegClass != -1) {
5282 if (Reg.isVirtual())
5283 continue;
5284
5285 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5286 if (!RC->contains(Reg)) {
5287 ErrInfo = "Operand has incorrect register class.";
5288 return false;
5289 }
5290 }
5291 }
5292
5293 // Verify SDWA
5294 if (isSDWA(MI)) {
5295 if (!ST.hasSDWA()) {
5296 ErrInfo = "SDWA is not supported on this target";
5297 return false;
5298 }
5299
5300 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5301 AMDGPU::OpName::dst_sel}) {
5302 const MachineOperand *MO = getNamedOperand(MI, Op);
5303 if (!MO)
5304 continue;
5305 int64_t Imm = MO->getImm();
5306 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5307 ErrInfo = "Invalid SDWA selection";
5308 return false;
5309 }
5310 }
5311
5312 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5313
5314 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5315 if (OpIdx == -1)
5316 continue;
5317 const MachineOperand &MO = MI.getOperand(OpIdx);
5318
5319 if (!ST.hasSDWAScalar()) {
5320 // Only VGPRs on VI
5321 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5322 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5323 return false;
5324 }
5325 } else {
5326 // No immediates on GFX9
5327 if (!MO.isReg()) {
5328 ErrInfo =
5329 "Only reg allowed as operands in SDWA instructions on GFX9+";
5330 return false;
5331 }
5332 }
5333 }
5334
5335 if (!ST.hasSDWAOmod()) {
5336 // No omod allowed on VI
5337 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5338 if (OMod != nullptr &&
5339 (!OMod->isImm() || OMod->getImm() != 0)) {
5340 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5341 return false;
5342 }
5343 }
5344
5345 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5346 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5347 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5348 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5349 const MachineOperand *Src0ModsMO =
5350 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5351 unsigned Mods = Src0ModsMO->getImm();
5352 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5353 Mods & SISrcMods::SEXT) {
5354 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5355 return false;
5356 }
5357 }
5358
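// SDWA opcodes are variants of a basic VOP opcode; recover that basic opcode
// so the VOPC-specific dst/clamp/omod rules below can be applied to it.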
5359 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5360 if (isVOPC(BasicOpcode)) {
5361 if (!ST.hasSDWASdst() && DstIdx != -1) {
5362 // Only vcc allowed as dst on VI for VOPC
5363 const MachineOperand &Dst = MI.getOperand(DstIdx);
5364 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5365 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5366 return false;
5367 }
5368 } else if (!ST.hasSDWAOutModsVOPC()) {
5369 // No clamp allowed on GFX9 for VOPC
5370 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5371 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5372 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5373 return false;
5374 }
5375
5376 // No omod allowed on GFX9 for VOPC
5377 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5378 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5379 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5380 return false;
5381 }
5382 }
5383 }
5384
5385 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5386 if (DstUnused && DstUnused->isImm() &&
5387 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5388 const MachineOperand &Dst = MI.getOperand(DstIdx);
5389 if (!Dst.isReg() || !Dst.isTied()) {
5390 ErrInfo = "Dst register should have tied register";
5391 return false;
5392 }
5393
5394 const MachineOperand &TiedMO =
5395 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5396 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5397 ErrInfo =
5398 "Dst register should be tied to implicit use of preserved register";
5399 return false;
5400 }
5401 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5402 ErrInfo = "Dst register should use same physical register as preserved";
5403 return false;
5404 }
5405 }
5406 }
5407
5408 // Verify MIMG / VIMAGE / VSAMPLE
5409 if (isImage(Opcode) && !MI.mayStore()) {
5410 // Ensure that the return type used is large enough for all the options
5411 // being used. TFE/LWE require an extra result register.
5412 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5413 if (DMask) {
5414 uint64_t DMaskImm = DMask->getImm();
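// Gather4 always returns four components regardless of dmask; otherwise one
// 32-bit result register is needed per bit set in dmask.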
5415 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5416 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5417 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5418 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5419
5420 // Adjust for packed 16 bit values
5421 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5422 RegCount = divideCeil(RegCount, 2);
5423
5424 // Adjust if using LWE or TFE
5425 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5426 RegCount += 1;
5427
5428 const uint32_t DstIdx =
5429 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5430 const MachineOperand &Dst = MI.getOperand(DstIdx);
5431 if (Dst.isReg()) {
5432 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5433 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5434 if (RegCount > DstSize) {
5435 ErrInfo = "Image instruction returns too many registers for dst "
5436 "register class";
5437 return false;
5438 }
5439 }
5440 }
5441 }
5442
5443 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5444 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5445 unsigned ConstantBusCount = 0;
5446 bool UsesLiteral = false;
5447 const MachineOperand *LiteralVal = nullptr;
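// SGPR operands and literal constants are both read over the scalar constant
// bus, and literals count toward the same limit. Pre-GFX10 parts allow a
// single read per VALU instruction; later generations allow more, as reported
// by getConstantBusLimit().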
5448
5449 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5450 if (ImmIdx != -1) {
5451 ++ConstantBusCount;
5452 UsesLiteral = true;
5453 LiteralVal = &MI.getOperand(ImmIdx);
5454 }
5455
5456 SmallVector<Register, 2> SGPRsUsed;
5457 Register SGPRUsed;
5458
5459 // Only look at the true operands. Only a real operand can use the constant
5460 // bus, and we don't want to check pseudo-operands like the source modifier
5461 // flags.
5462 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5463 if (OpIdx == -1)
5464 continue;
5465 const MachineOperand &MO = MI.getOperand(OpIdx);
5466 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5467 if (MO.isReg()) {
5468 SGPRUsed = MO.getReg();
5469 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5470 ++ConstantBusCount;
5471 SGPRsUsed.push_back(SGPRUsed);
5472 }
5473 } else if (!MO.isFI()) { // Treat FI like a register.
5474 if (!UsesLiteral) {
5475 ++ConstantBusCount;
5476 UsesLiteral = true;
5477 LiteralVal = &MO;
5478 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5479 assert(isVOP2(MI) || isVOP3(MI));
5480 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5481 return false;
5482 }
5483 }
5484 }
5485 }
5486
5487 SGPRUsed = findImplicitSGPRRead(MI);
5488 if (SGPRUsed) {
5489 // Implicit uses may safely overlap true operands
5490 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5491 return !RI.regsOverlap(SGPRUsed, SGPR);
5492 })) {
5493 ++ConstantBusCount;
5494 SGPRsUsed.push_back(SGPRUsed);
5495 }
5496 }
5497
5498 // v_writelane_b32 is an exception to the constant bus restriction:
5499 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5500 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5501 Opcode != AMDGPU::V_WRITELANE_B32) {
5502 ErrInfo = "VOP* instruction violates constant bus restriction";
5503 return false;
5504 }
5505
5506 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5507 ErrInfo = "VOP3 instruction uses literal";
5508 return false;
5509 }
5510 }
5511
5512 // Special case for writelane - this can break the multiple constant bus rule,
5513 // but still can't use more than one SGPR register
5514 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5515 unsigned SGPRCount = 0;
5516 Register SGPRUsed;
5517
5518 for (int OpIdx : {Src0Idx, Src1Idx}) {
5519 if (OpIdx == -1)
5520 break;
5521
5522 const MachineOperand &MO = MI.getOperand(OpIdx);
5523
5524 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5525 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5526 if (MO.getReg() != SGPRUsed)
5527 ++SGPRCount;
5528 SGPRUsed = MO.getReg();
5529 }
5530 }
5531 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5532 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5533 return false;
5534 }
5535 }
5536 }
5537
5538 // Verify misc. restrictions on specific instructions.
5539 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5540 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5541 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5542 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5543 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5544 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5545 if (!compareMachineOp(Src0, Src1) &&
5546 !compareMachineOp(Src0, Src2)) {
5547 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5548 return false;
5549 }
5550 }
5551 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5552 SISrcMods::ABS) ||
5553 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5554 SISrcMods::ABS) ||
5555 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5556 SISrcMods::ABS)) {
5557 ErrInfo = "ABS not allowed in VOP3B instructions";
5558 return false;
5559 }
5560 }
5561
5562 if (isSOP2(MI) || isSOPC(MI)) {
5563 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5564 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5565
5566 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5567 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5568 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5569 !Src0.isIdenticalTo(Src1)) {
5570 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5571 return false;
5572 }
5573 }
5574
5575 if (isSOPK(MI)) {
5576 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5577 if (Desc.isBranch()) {
5578 if (!Op->isMBB()) {
5579 ErrInfo = "invalid branch target for SOPK instruction";
5580 return false;
5581 }
5582 } else {
5583 uint64_t Imm = Op->getImm();
5584 if (sopkIsZext(Opcode)) {
5585 if (!isUInt<16>(Imm)) {
5586 ErrInfo = "invalid immediate for SOPK instruction";
5587 return false;
5588 }
5589 } else {
5590 if (!isInt<16>(Imm)) {
5591 ErrInfo = "invalid immediate for SOPK instruction";
5592 return false;
5593 }
5594 }
5595 }
5596 }
5597
5598 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5599 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5600 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5601 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5602 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5603 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5604
5605 const unsigned StaticNumOps =
5606 Desc.getNumOperands() + Desc.implicit_uses().size();
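// V_MOVRELD writes into the indexed vector, so it needs an implicit def and a
// tied implicit use of the whole vector; V_MOVRELS only needs the implicit use.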
5607 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5608
5609 // Require additional implicit operands. This allows a fixup done by the
5610 // post RA scheduler where the main implicit operand is killed and
5611 // implicit-defs are added for sub-registers that remain live after this
5612 // instruction.
5613 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5614 ErrInfo = "missing implicit register operands";
5615 return false;
5616 }
5617
5618 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5619 if (IsDst) {
5620 if (!Dst->isUse()) {
5621 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5622 return false;
5623 }
5624
5625 unsigned UseOpIdx;
5626 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5627 UseOpIdx != StaticNumOps + 1) {
5628 ErrInfo = "movrel implicit operands should be tied";
5629 return false;
5630 }
5631 }
5632
5633 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5634 const MachineOperand &ImpUse
5635 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5636 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5637 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5638 ErrInfo = "src0 should be subreg of implicit vector use";
5639 return false;
5640 }
5641 }
5642
5643 // Make sure we aren't losing exec uses in the td files. This mostly requires
5644 // being careful when using let Uses to try to add other use registers.
5645 if (shouldReadExec(MI)) {
5646 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5647 ErrInfo = "VALU instruction does not implicitly read exec mask";
5648 return false;
5649 }
5650 }
5651
5652 if (isSMRD(MI)) {
5653 if (MI.mayStore() &&
5654 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5655 // The register offset form of scalar stores may only use m0 as the
5656 // soffset register.
5657 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5658 if (Soff && Soff->getReg() != AMDGPU::M0) {
5659 ErrInfo = "scalar stores must use m0 as offset register";
5660 return false;
5661 }
5662 }
5663 }
5664
5665 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5666 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5667 if (Offset->getImm() != 0) {
5668 ErrInfo = "subtarget does not support offsets in flat instructions";
5669 return false;
5670 }
5671 }
5672
5673 if (isDS(MI) && !ST.hasGDS()) {
5674 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5675 if (GDSOp && GDSOp->getImm() != 0) {
5676 ErrInfo = "GDS is not supported on this subtarget";
5677 return false;
5678 }
5679 }
5680
5681 if (isImage(MI)) {
5682 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5683 if (DimOp) {
5684 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5685 AMDGPU::OpName::vaddr0);
5686 AMDGPU::OpName RSrcOpName =
5687 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5688 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5689 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5690 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5691 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5692 const AMDGPU::MIMGDimInfo *Dim =
5693 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5694
5695 if (!Dim) {
5696 ErrInfo = "dim is out of range";
5697 return false;
5698 }
5699
5700 bool IsA16 = false;
5701 if (ST.hasR128A16()) {
5702 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5703 IsA16 = R128A16->getImm() != 0;
5704 } else if (ST.hasA16()) {
5705 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5706 IsA16 = A16->getImm() != 0;
5707 }
5708
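// With NSA (non-sequential address) encodings every address component is a
// separate vaddr operand, so more than one operand between vaddr0 and the
// resource descriptor implies NSA.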
5709 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5710
5711 unsigned AddrWords =
5712 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5713
5714 unsigned VAddrWords;
5715 if (IsNSA) {
5716 VAddrWords = RsrcIdx - VAddr0Idx;
5717 if (ST.hasPartialNSAEncoding() &&
5718 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5719 unsigned LastVAddrIdx = RsrcIdx - 1;
5720 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5721 }
5722 } else {
5723 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5724 if (AddrWords > 12)
5725 AddrWords = 16;
5726 }
5727
5728 if (VAddrWords != AddrWords) {
5729 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5730 << " but got " << VAddrWords << "\n");
5731 ErrInfo = "bad vaddr size";
5732 return false;
5733 }
5734 }
5735 }
5736
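// Verify DPP: reject dpp_ctrl encodings that are unused in the ISA or that
// are not available on this generation.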
5737 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5738 if (DppCt) {
5739 using namespace AMDGPU::DPP;
5740
5741 unsigned DC = DppCt->getImm();
5742 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5743 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5744 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5745 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5746 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5747 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5748 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5749 ErrInfo = "Invalid dpp_ctrl value";
5750 return false;
5751 }
5752 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5753 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5754 ErrInfo = "Invalid dpp_ctrl value: "
5755 "wavefront shifts are not supported on GFX10+";
5756 return false;
5757 }
5758 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5759 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5760 ErrInfo = "Invalid dpp_ctrl value: "
5761 "broadcasts are not supported on GFX10+";
5762 return false;
5763 }
5764 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5765 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5766 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5767 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5768 !ST.hasGFX90AInsts()) {
5769 ErrInfo = "Invalid dpp_ctrl value: "
5770 "row_newbroadcast/row_share is not supported before "
5771 "GFX90A/GFX10";
5772 return false;
5773 }
5774 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5775 ErrInfo = "Invalid dpp_ctrl value: "
5776 "row_share and row_xmask are not supported before GFX10";
5777 return false;
5778 }
5779 }
5780
5781 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5783 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5784 ErrInfo = "Invalid dpp_ctrl value: "
5785 "DP ALU dpp only support row_newbcast";
5786 return false;
5787 }
5788 }
5789
5790 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5791 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5792 AMDGPU::OpName DataName =
5793 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5794 const MachineOperand *Data = getNamedOperand(MI, DataName);
5795 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5796 if (Data && !Data->isReg())
5797 Data = nullptr;
5798
5799 if (ST.hasGFX90AInsts()) {
5800 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5801 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5802 ErrInfo = "Invalid register class: "
5803 "vdata and vdst should be both VGPR or AGPR";
5804 return false;
5805 }
5806 if (Data && Data2 &&
5807 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5808 ErrInfo = "Invalid register class: "
5809 "both data operands should be VGPR or AGPR";
5810 return false;
5811 }
5812 } else {
5813 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5814 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5815 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5816 ErrInfo = "Invalid register class: "
5817 "agpr loads and stores not supported on this GPU";
5818 return false;
5819 }
5820 }
5821 }
5822
5823 if (ST.needsAlignedVGPRs()) {
5824 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5825 const MachineOperand *Op = getNamedOperand(MI, OpName);
5826 if (!Op)
5827 return true;
5828 Register Reg = Op->getReg();
5829 if (Reg.isPhysical())
5830 return !(RI.getHWRegIndex(Reg) & 1);
5831 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5832 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5833 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5834 };
5835
5836 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5837 Opcode == AMDGPU::DS_GWS_BARRIER) {
5838
5839 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5840 ErrInfo = "Subtarget requires even aligned vector registers "
5841 "for DS_GWS instructions";
5842 return false;
5843 }
5844 }
5845
5846 if (isMIMG(MI)) {
5847 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5848 ErrInfo = "Subtarget requires even aligned vector registers "
5849 "for vaddr operand of image instructions";
5850 return false;
5851 }
5852 }
5853 }
5854
5855 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5856 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5857 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5858 ErrInfo = "Invalid register class: "
5859 "v_accvgpr_write with an SGPR is not supported on this GPU";
5860 return false;
5861 }
5862 }
5863
5864 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5865 const MachineOperand &SrcOp = MI.getOperand(1);
5866 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5867 ErrInfo = "pseudo expects only physical SGPRs";
5868 return false;
5869 }
5870 }
5871
5872 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5873 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5874 if (!ST.hasScaleOffset()) {
5875 ErrInfo = "Subtarget does not support offset scaling";
5876 return false;
5877 }
5878 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5879 ErrInfo = "Instruction does not support offset scaling";
5880 return false;
5881 }
5882 }
5883 }
5884
5885 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5886 // information.
5887 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5888 for (unsigned I = 0; I < 3; ++I) {
5890 return false;
5891 }
5892 }
5893
5894 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5895 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5896 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5897 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5898 &AMDGPU::SReg_64RegClass) ||
5899 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5900 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5901 return false;
5902 }
5903 }
5904
5905 return true;
5906}
5907
5908// It is more readable to list mapped opcodes on the same line.
5909// clang-format off
5910
5911 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5912 switch (MI.getOpcode()) {
5913 default: return AMDGPU::INSTRUCTION_LIST_END;
5914 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5915 case AMDGPU::COPY: return AMDGPU::COPY;
5916 case AMDGPU::PHI: return AMDGPU::PHI;
5917 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5918 case AMDGPU::WQM: return AMDGPU::WQM;
5919 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5920 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5921 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5922 case AMDGPU::S_MOV_B32: {
5923 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5924 return MI.getOperand(1).isReg() ||
5925 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5926 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5927 }
5928 case AMDGPU::S_ADD_I32:
5929 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5930 case AMDGPU::S_ADDC_U32:
5931 return AMDGPU::V_ADDC_U32_e32;
5932 case AMDGPU::S_SUB_I32:
5933 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5934 // FIXME: These are not consistently handled, and selected when the carry is
5935 // used.
5936 case AMDGPU::S_ADD_U32:
5937 return AMDGPU::V_ADD_CO_U32_e32;
5938 case AMDGPU::S_SUB_U32:
5939 return AMDGPU::V_SUB_CO_U32_e32;
5940 case AMDGPU::S_ADD_U64_PSEUDO:
5941 return AMDGPU::V_ADD_U64_PSEUDO;
5942 case AMDGPU::S_SUB_U64_PSEUDO:
5943 return AMDGPU::V_SUB_U64_PSEUDO;
5944 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5945 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5946 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5947 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5948 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5949 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5950 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5951 case AMDGPU::S_XNOR_B32:
5952 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5953 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5954 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5955 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5956 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5957 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5958 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5959 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5960 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5961 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5962 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5963 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5964 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5965 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5966 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5967 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5968 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5969 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5970 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5971 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5972 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5973 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5974 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5975 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5976 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5977 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5978 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5979 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5980 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5981 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5982 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5983 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5984 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5985 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5986 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5987 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5988 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5989 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5990 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5991 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5992 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5993 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5994 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5995 case AMDGPU::S_CVT_F32_F16:
5996 case AMDGPU::S_CVT_HI_F32_F16:
5997 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5998 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5999 case AMDGPU::S_CVT_F16_F32:
6000 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6001 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6002 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6003 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6004 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6005 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6006 case AMDGPU::S_CEIL_F16:
6007 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6008 : AMDGPU::V_CEIL_F16_fake16_e64;
6009 case AMDGPU::S_FLOOR_F16:
6010 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6011 : AMDGPU::V_FLOOR_F16_fake16_e64;
6012 case AMDGPU::S_TRUNC_F16:
6013 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6014 : AMDGPU::V_TRUNC_F16_fake16_e64;
6015 case AMDGPU::S_RNDNE_F16:
6016 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6017 : AMDGPU::V_RNDNE_F16_fake16_e64;
6018 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6019 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6020 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6021 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6022 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6023 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6024 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6025 case AMDGPU::S_ADD_F16:
6026 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6027 : AMDGPU::V_ADD_F16_fake16_e64;
6028 case AMDGPU::S_SUB_F16:
6029 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6030 : AMDGPU::V_SUB_F16_fake16_e64;
6031 case AMDGPU::S_MIN_F16:
6032 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6033 : AMDGPU::V_MIN_F16_fake16_e64;
6034 case AMDGPU::S_MAX_F16:
6035 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6036 : AMDGPU::V_MAX_F16_fake16_e64;
6037 case AMDGPU::S_MINIMUM_F16:
6038 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6039 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6040 case AMDGPU::S_MAXIMUM_F16:
6041 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6042 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6043 case AMDGPU::S_MUL_F16:
6044 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6045 : AMDGPU::V_MUL_F16_fake16_e64;
6046 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6047 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6048 case AMDGPU::S_FMAC_F16:
6049 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6050 : AMDGPU::V_FMAC_F16_fake16_e64;
6051 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6052 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6053 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6054 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6055 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6056 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6057 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6058 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6059 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6060 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6061 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6062 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6063 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6064 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6065 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6066 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6067 case AMDGPU::S_CMP_LT_F16:
6068 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6069 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6070 case AMDGPU::S_CMP_EQ_F16:
6071 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6072 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6073 case AMDGPU::S_CMP_LE_F16:
6074 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6075 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6076 case AMDGPU::S_CMP_GT_F16:
6077 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6078 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6079 case AMDGPU::S_CMP_LG_F16:
6080 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6081 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6082 case AMDGPU::S_CMP_GE_F16:
6083 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6084 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6085 case AMDGPU::S_CMP_O_F16:
6086 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6087 : AMDGPU::V_CMP_O_F16_fake16_e64;
6088 case AMDGPU::S_CMP_U_F16:
6089 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6090 : AMDGPU::V_CMP_U_F16_fake16_e64;
6091 case AMDGPU::S_CMP_NGE_F16:
6092 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6093 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6094 case AMDGPU::S_CMP_NLG_F16:
6095 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6096 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6097 case AMDGPU::S_CMP_NGT_F16:
6098 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6099 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6100 case AMDGPU::S_CMP_NLE_F16:
6101 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6102 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6103 case AMDGPU::S_CMP_NEQ_F16:
6104 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6105 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6106 case AMDGPU::S_CMP_NLT_F16:
6107 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6108 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6109 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6110 case AMDGPU::V_S_EXP_F16_e64:
6111 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6112 : AMDGPU::V_EXP_F16_fake16_e64;
6113 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6114 case AMDGPU::V_S_LOG_F16_e64:
6115 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6116 : AMDGPU::V_LOG_F16_fake16_e64;
6117 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6118 case AMDGPU::V_S_RCP_F16_e64:
6119 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6120 : AMDGPU::V_RCP_F16_fake16_e64;
6121 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6122 case AMDGPU::V_S_RSQ_F16_e64:
6123 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6124 : AMDGPU::V_RSQ_F16_fake16_e64;
6125 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6126 case AMDGPU::V_S_SQRT_F16_e64:
6127 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6128 : AMDGPU::V_SQRT_F16_fake16_e64;
6129 }
6131 "Unexpected scalar opcode without corresponding vector one!");
6132}
6133
6134// clang-format on
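// Example: when moveToVALU has to rewrite a uniform S_AND_B32 whose operands
// became divergent, getVALUOp returns V_AND_B32_e64 as the replacement opcode;
// INSTRUCTION_LIST_END means no vector equivalent exists.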
6135
6136 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6137 MachineBasicBlock &MBB,
6138 MachineBasicBlock::iterator MBBI,
6139 const DebugLoc &DL, Register Reg,
6140 bool IsSCCLive,
6141 SlotIndexes *Indexes) const {
6142 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6143 const SIInstrInfo *TII = ST.getInstrInfo();
6145 if (IsSCCLive) {
6146 // Insert two move instructions, one to save the original value of EXEC and
6147 // the other to turn on all bits in EXEC. This is required as we can't use
6148 // a single S_OR_SAVEEXEC instruction, since it clobbers SCC.
6149 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6150 .addReg(LMC.ExecReg);
6151 auto FlipExecMI =
6152 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6153 if (Indexes) {
6154 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6155 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6156 }
6157 } else {
6158 auto SaveExec =
6159 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6160 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6161 if (Indexes)
6162 Indexes->insertMachineInstrInMaps(*SaveExec);
6163 }
6164}
6165
6166 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6167 MachineBasicBlock::iterator MBBI,
6168 const DebugLoc &DL, Register Reg,
6169 SlotIndexes *Indexes) const {
6171 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6172 .addReg(Reg, RegState::Kill);
6173 if (Indexes)
6174 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6175}
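// Typical pairing (a sketch, not taken from a specific caller; helper names as
// declared above): save the live exec mask into a scratch SGPR, emit code that
// must run with all lanes enabled, then reinstall the saved mask:
//   Register Saved = MRI.createVirtualRegister(RI.getBoolRC());
//   TII->insertScratchExecCopy(MF, MBB, I, DL, Saved, /*IsSCCLive=*/false,
//                              /*Indexes=*/nullptr);
//   ... whole-wave code ...
//   TII->restoreExec(MF, MBB, I, DL, Saved, /*Indexes=*/nullptr);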
6176
6180 "Not a whole wave func");
6181 MachineBasicBlock &MBB = *MF.begin();
6182 for (MachineInstr &MI : MBB)
6183 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6184 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6185 return &MI;
6186
6187 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6188}
6189
6190 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6191 unsigned OpNo) const {
6192 const MCInstrDesc &Desc = get(MI.getOpcode());
6193 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6194 Desc.operands()[OpNo].RegClass == -1) {
6195 Register Reg = MI.getOperand(OpNo).getReg();
6196
6197 if (Reg.isVirtual()) {
6198 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6199 return MRI.getRegClass(Reg);
6200 }
6201 return RI.getPhysRegBaseClass(Reg);
6202 }
6203
6204 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6205 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6206}
6207
6208 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6209 MachineBasicBlock::iterator I = MI;
6210 MachineBasicBlock *MBB = MI.getParent();
6211 MachineOperand &MO = MI.getOperand(OpIdx);
6212 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6213 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6214 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6215 unsigned Size = RI.getRegSizeInBits(*RC);
6216 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6217 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6218 : AMDGPU::V_MOV_B32_e32;
6219 if (MO.isReg())
6220 Opcode = AMDGPU::COPY;
6221 else if (RI.isSGPRClass(RC))
6222 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6223
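// Materialize the operand into a fresh virtual register of the equivalent
// VGPR class and rewrite the instruction to use that register instead.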
6224 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6225 Register Reg = MRI.createVirtualRegister(VRC);
6226 DebugLoc DL = MBB->findDebugLoc(I);
6227 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6228 MO.ChangeToRegister(Reg, false);
6229}
6230
6233 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6234 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6235 if (!SuperReg.getReg().isVirtual())
6236 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6237
6238 MachineBasicBlock *MBB = MI->getParent();
6239 const DebugLoc &DL = MI->getDebugLoc();
6240 Register SubReg = MRI.createVirtualRegister(SubRC);
6241
6242 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6243 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6244 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6245 return SubReg;
6246}
6247
6248 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6249 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6250 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6251 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6252 if (Op.isImm()) {
6253 if (SubIdx == AMDGPU::sub0)
6254 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6255 if (SubIdx == AMDGPU::sub1)
6256 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6257
6258 llvm_unreachable("Unhandled register index for immediate");
6259 }
6260
6261 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6262 SubIdx, SubRC);
6263 return MachineOperand::CreateReg(SubReg, false);
6264}
6265
6266// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6267void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6268 assert(Inst.getNumExplicitOperands() == 3);
6269 MachineOperand Op1 = Inst.getOperand(1);
6270 Inst.removeOperand(1);
6271 Inst.addOperand(Op1);
6272}
6273
6274 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6275 const MCOperandInfo &OpInfo,
6276 const MachineOperand &MO) const {
6277 if (!MO.isReg())
6278 return false;
6279
6280 Register Reg = MO.getReg();
6281
6282 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6283 if (Reg.isPhysical())
6284 return DRC->contains(Reg);
6285
6286 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6287
6288 if (MO.getSubReg()) {
6289 const MachineFunction *MF = MO.getParent()->getMF();
6290 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6291 if (!SuperRC)
6292 return false;
6293 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6294 }
6295
6296 return RI.getCommonSubClass(DRC, RC) != nullptr;
6297}
6298
6299 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6300 const MachineOperand &MO) const {
6301 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6302 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6303 unsigned Opc = MI.getOpcode();
6304
6305 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6306 // information.
6307 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6308 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6309 constexpr AMDGPU::OpName OpNames[] = {
6310 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6311
6312 for (auto [I, OpName] : enumerate(OpNames)) {
6313 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6314 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6316 return false;
6317 }
6318 }
6319
6320 if (!isLegalRegOperand(MRI, OpInfo, MO))
6321 return false;
6322
6323 // check Accumulate GPR operand
6324 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6325 if (IsAGPR && !ST.hasMAIInsts())
6326 return false;
6327 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6328 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6329 return false;
6330 // Atomics should have both vdst and vdata either vgpr or agpr.
6331 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6332 const int DataIdx = AMDGPU::getNamedOperandIdx(
6333 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6334 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6335 MI.getOperand(DataIdx).isReg() &&
6336 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6337 return false;
6338 if ((int)OpIdx == DataIdx) {
6339 if (VDstIdx != -1 &&
6340 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6341 return false;
6342 // DS instructions with 2 src operands also must have tied RC.
6343 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6344 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6345 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6346 return false;
6347 }
6348
6349 // Check V_ACCVGPR_WRITE_B32_e64
6350 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6351 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6352 RI.isSGPRReg(MRI, MO.getReg()))
6353 return false;
6354
6355 if (ST.hasFlatScratchHiInB64InstHazard() &&
6356 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6357 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6358 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6359 64)
6360 return false;
6361 }
6362 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6363 return false;
6364 }
6365
6366 return true;
6367}
6368
6369 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6370 const MCOperandInfo &OpInfo,
6371 const MachineOperand &MO) const {
6372 if (MO.isReg())
6373 return isLegalRegOperand(MRI, OpInfo, MO);
6374
6375 // Handle non-register types that are treated like immediates.
6376 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6377 return true;
6378}
6379
6380 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6381 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6382 const MachineOperand *MO) const {
6383 constexpr unsigned NumOps = 3;
6384 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6385 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6386 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6387 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6388
6389 assert(SrcN < NumOps);
6390
6391 if (!MO) {
6392 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6393 if (SrcIdx == -1)
6394 return true;
6395 MO = &MI.getOperand(SrcIdx);
6396 }
6397
6398 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6399 return true;
6400
6401 int ModsIdx =
6402 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6403 if (ModsIdx == -1)
6404 return true;
6405
6406 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6407 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6408 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6409
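// An SGPR source of a GFX12+ packed FP32 instruction is only legal when
// neither op_sel nor op_sel_hi is set for that source.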
6410 return !OpSel && !OpSelHi;
6411}
6412
6413 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6414 const MachineOperand *MO) const {
6415 const MachineFunction &MF = *MI.getMF();
6416 const MachineRegisterInfo &MRI = MF.getRegInfo();
6417 const MCInstrDesc &InstDesc = MI.getDesc();
6418 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6419 int64_t RegClass = getOpRegClassID(OpInfo);
6420 const TargetRegisterClass *DefinedRC =
6421 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6422 if (!MO)
6423 MO = &MI.getOperand(OpIdx);
6424
6425 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6426
6427 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6428 const MachineOperand *UsedLiteral = nullptr;
6429
6430 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6431 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6432
6433 // TODO: Be more permissive with frame indexes.
6434 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6435 if (!LiteralLimit--)
6436 return false;
6437
6438 UsedLiteral = MO;
6439 }
6440
6441 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6442 if (MO->isReg())
6443 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6444
6445 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6446 if (i == OpIdx)
6447 continue;
6448 const MachineOperand &Op = MI.getOperand(i);
6449 if (Op.isReg()) {
6450 if (Op.isUse()) {
6451 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6452 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6453 if (--ConstantBusLimit <= 0)
6454 return false;
6455 }
6456 }
6457 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6458 !isInlineConstant(Op, InstDesc.operands()[i])) {
6459 // The same literal may be used multiple times.
6460 if (!UsedLiteral)
6461 UsedLiteral = &Op;
6462 else if (UsedLiteral->isIdenticalTo(Op))
6463 continue;
6464
6465 if (!LiteralLimit--)
6466 return false;
6467 if (--ConstantBusLimit <= 0)
6468 return false;
6469 }
6470 }
6471 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6472 // There can be at most one literal operand, but it can be repeated.
6473 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6474 if (i == OpIdx)
6475 continue;
6476 const MachineOperand &Op = MI.getOperand(i);
6477 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6478 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6479 !Op.isIdenticalTo(*MO))
6480 return false;
6481
6482 // Do not fold a non-inlineable and non-register operand into an
6483 // instruction that already has a frame index. The frame index handling
6484 // code cannot handle a frame index that co-exists with another
6485 // non-register operand, unless that operand is an inlineable immediate.
6486 if (Op.isFI())
6487 return false;
6488 }
6489 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6490 isF16PseudoScalarTrans(MI.getOpcode())) {
6491 return false;
6492 }
6493
6494 if (MO->isReg()) {
6495 if (!DefinedRC)
6496 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6497 return isLegalRegOperand(MI, OpIdx, *MO);
6498 }
6499
6500 if (MO->isImm()) {
6501 uint64_t Imm = MO->getImm();
6502 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6503 bool Is64BitOp = Is64BitFPOp ||
6504 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6505 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6506 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6507 if (Is64BitOp &&
6508 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6509 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6510 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6511 return false;
6512
6513 // FIXME: We can use sign extended 64-bit literals, but only for signed
6514 // operands. At the moment we do not know if an operand is signed.
6515 // Such operand will be encoded as its low 32 bits and then either
6516 // correctly sign extended or incorrectly zero extended by HW.
6517 // If 64-bit literals are supported and the literal will be encoded
6518 // as full 64 bit we still can use it.
6519 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6520 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6521 return false;
6522 }
6523 }
6524
6525 // Handle non-register types that are treated like immediates.
6526 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6527
6528 if (!DefinedRC) {
6529 // This operand expects an immediate.
6530 return true;
6531 }
6532
6533 return isImmOperandLegal(MI, OpIdx, *MO);
6534}
6535
6537 bool IsGFX950Only = ST.hasGFX950Insts();
6538 bool IsGFX940Only = ST.hasGFX940Insts();
6539
6540 if (!IsGFX950Only && !IsGFX940Only)
6541 return false;
6542
6543 if (!isVALU(MI))
6544 return false;
6545
6546 // V_COS, V_EXP, V_RCP, etc.
6547 if (isTRANS(MI))
6548 return true;
6549
6550 // DOT2, DOT2C, DOT4, etc.
6551 if (isDOT(MI))
6552 return true;
6553
6554 // MFMA, SMFMA
6555 if (isMFMA(MI))
6556 return true;
6557
6558 unsigned Opcode = MI.getOpcode();
6559 switch (Opcode) {
6560 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6561 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6562 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6563 case AMDGPU::V_MQSAD_U32_U8_e64:
6564 case AMDGPU::V_PK_ADD_F16:
6565 case AMDGPU::V_PK_ADD_F32:
6566 case AMDGPU::V_PK_ADD_I16:
6567 case AMDGPU::V_PK_ADD_U16:
6568 case AMDGPU::V_PK_ASHRREV_I16:
6569 case AMDGPU::V_PK_FMA_F16:
6570 case AMDGPU::V_PK_FMA_F32:
6571 case AMDGPU::V_PK_FMAC_F16_e32:
6572 case AMDGPU::V_PK_FMAC_F16_e64:
6573 case AMDGPU::V_PK_LSHLREV_B16:
6574 case AMDGPU::V_PK_LSHRREV_B16:
6575 case AMDGPU::V_PK_MAD_I16:
6576 case AMDGPU::V_PK_MAD_U16:
6577 case AMDGPU::V_PK_MAX_F16:
6578 case AMDGPU::V_PK_MAX_I16:
6579 case AMDGPU::V_PK_MAX_U16:
6580 case AMDGPU::V_PK_MIN_F16:
6581 case AMDGPU::V_PK_MIN_I16:
6582 case AMDGPU::V_PK_MIN_U16:
6583 case AMDGPU::V_PK_MOV_B32:
6584 case AMDGPU::V_PK_MUL_F16:
6585 case AMDGPU::V_PK_MUL_F32:
6586 case AMDGPU::V_PK_MUL_LO_U16:
6587 case AMDGPU::V_PK_SUB_I16:
6588 case AMDGPU::V_PK_SUB_U16:
6589 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6590 return true;
6591 default:
6592 return false;
6593 }
6594}
6595
6596 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6597 MachineInstr &MI) const {
6598 unsigned Opc = MI.getOpcode();
6599 const MCInstrDesc &InstrDesc = get(Opc);
6600
6601 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6602 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6603
6604 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6605 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6606
6607 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6608 // we need to only have one constant bus use before GFX10.
6609 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6610 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6611 RI.isSGPRReg(MRI, Src0.getReg()))
6612 legalizeOpWithMove(MI, Src0Idx);
6613
6614 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6615 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6616 // src0/src1 with V_READFIRSTLANE.
6617 if (Opc == AMDGPU::V_WRITELANE_B32) {
6618 const DebugLoc &DL = MI.getDebugLoc();
6619 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6620 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6621 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6622 .add(Src0);
6623 Src0.ChangeToRegister(Reg, false);
6624 }
6625 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6626 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6627 const DebugLoc &DL = MI.getDebugLoc();
6628 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6629 .add(Src1);
6630 Src1.ChangeToRegister(Reg, false);
6631 }
6632 return;
6633 }
6634
6635 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6636 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6637 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6638 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6639 legalizeOpWithMove(MI, Src2Idx);
6640 }
6641
6642 // VOP2 src0 instructions support all operand types, so we don't need to check
6643 // their legality. If src1 is already legal, we don't need to do anything.
6644 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6645 return;
6646
6647 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6648 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6649 // select is uniform.
6650 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6651 RI.isVGPR(MRI, Src1.getReg())) {
6652 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6653 const DebugLoc &DL = MI.getDebugLoc();
6654 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6655 .add(Src1);
6656 Src1.ChangeToRegister(Reg, false);
6657 return;
6658 }
6659
6660 // We do not use commuteInstruction here because it is too aggressive and will
6661 // commute if it is possible. We only want to commute here if it improves
6662 // legality. This can be called a fairly large number of times so don't waste
6663 // compile time pointlessly swapping and checking legality again.
6664 if (HasImplicitSGPR || !MI.isCommutable()) {
6665 legalizeOpWithMove(MI, Src1Idx);
6666 return;
6667 }
6668
6669 // If src0 can be used as src1, commuting will make the operands legal.
6670 // Otherwise we have to give up and insert a move.
6671 //
6672 // TODO: Other immediate-like operand kinds could be commuted if there was a
6673 // MachineOperand::ChangeTo* for them.
6674 if ((!Src1.isImm() && !Src1.isReg()) ||
6675 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6676 legalizeOpWithMove(MI, Src1Idx);
6677 return;
6678 }
6679
6680 int CommutedOpc = commuteOpcode(MI);
6681 if (CommutedOpc == -1) {
6682 legalizeOpWithMove(MI, Src1Idx);
6683 return;
6684 }
6685
6686 MI.setDesc(get(CommutedOpc));
6687
6688 Register Src0Reg = Src0.getReg();
6689 unsigned Src0SubReg = Src0.getSubReg();
6690 bool Src0Kill = Src0.isKill();
6691
6692 if (Src1.isImm())
6693 Src0.ChangeToImmediate(Src1.getImm());
6694 else if (Src1.isReg()) {
6695 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6696 Src0.setSubReg(Src1.getSubReg());
6697 } else
6698 llvm_unreachable("Should only have register or immediate operands");
6699
6700 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6701 Src1.setSubReg(Src0SubReg);
6703}
6704
6705// Legalize VOP3 operands. All operand types are supported for any operand
6706// but only one literal constant and only starting from GFX10.
6707 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6708 MachineInstr &MI) const {
6709 unsigned Opc = MI.getOpcode();
6710
6711 int VOP3Idx[3] = {
6712 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6713 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6714 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6715 };
6716
6717 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6718 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6719 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6720 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6721 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6722 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6723 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6724 // src1 and src2 must be scalar
6725 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6726 const DebugLoc &DL = MI.getDebugLoc();
6727 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6728 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6729 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6730 .add(Src1);
6731 Src1.ChangeToRegister(Reg, false);
6732 }
6733 if (VOP3Idx[2] != -1) {
6734 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6735 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6736 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6737 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6738 .add(Src2);
6739 Src2.ChangeToRegister(Reg, false);
6740 }
6741 }
6742 }
6743
6744 // Find the one SGPR operand we are allowed to use.
6745 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6746 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6747 SmallDenseSet<unsigned> SGPRsUsed;
6748 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6749 if (SGPRReg) {
6750 SGPRsUsed.insert(SGPRReg);
6751 --ConstantBusLimit;
6752 }
6753
6754 for (int Idx : VOP3Idx) {
6755 if (Idx == -1)
6756 break;
6757 MachineOperand &MO = MI.getOperand(Idx);
6758
6759 if (!MO.isReg()) {
6760 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6761 continue;
6762
6763 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6764 --LiteralLimit;
6765 --ConstantBusLimit;
6766 continue;
6767 }
6768
6769 --LiteralLimit;
6770 --ConstantBusLimit;
6771 legalizeOpWithMove(MI, Idx);
6772 continue;
6773 }
6774
6775 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6776 continue; // VGPRs are legal
6777
6778 // We can use one SGPR in each VOP3 instruction prior to GFX10
6779 // and two starting from GFX10.
6780 if (SGPRsUsed.count(MO.getReg()))
6781 continue;
6782 if (ConstantBusLimit > 0) {
6783 SGPRsUsed.insert(MO.getReg());
6784 --ConstantBusLimit;
6785 continue;
6786 }
6787
6788 // If we make it this far, then the operand is not legal and we must
6789 // legalize it.
6790 legalizeOpWithMove(MI, Idx);
6791 }
6792
6793 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6794 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6795 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6796 legalizeOpWithMove(MI, VOP3Idx[2]);
6797
6798 // Fix the register class of packed FP32 instructions on gfx12+. See
6799 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6801 for (unsigned I = 0; I < 3; ++I) {
6803 legalizeOpWithMove(MI, VOP3Idx[I]);
6804 }
6805 }
6806}
6807
6808 Register SIInstrInfo::readlaneVGPRToSGPR(
6809 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6810 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6811 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6812 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6813 if (DstRC)
6814 SRC = RI.getCommonSubClass(SRC, DstRC);
6815
6816 Register DstReg = MRI.createVirtualRegister(SRC);
6817 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6818
6819 if (RI.hasAGPRs(VRC)) {
6820 VRC = RI.getEquivalentVGPRClass(VRC);
6821 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6822 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6823 get(TargetOpcode::COPY), NewSrcReg)
6824 .addReg(SrcReg);
6825 SrcReg = NewSrcReg;
6826 }
6827
6828 if (SubRegs == 1) {
6829 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6830 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6831 .addReg(SrcReg);
6832 return DstReg;
6833 }
6834
6835 SmallVector<Register, 8> SRegs;
6836 for (unsigned i = 0; i < SubRegs; ++i) {
6837 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6838 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6839 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6840 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6841 SRegs.push_back(SGPR);
6842 }
6843
6844 MachineInstrBuilder MIB =
6845 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6846 get(AMDGPU::REG_SEQUENCE), DstReg);
6847 for (unsigned i = 0; i < SubRegs; ++i) {
6848 MIB.addReg(SRegs[i]);
6849 MIB.addImm(RI.getSubRegFromChannel(i));
6850 }
6851 return DstReg;
6852}
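// Example (illustrative, simplified; virtual register names are made up): for
// a 64-bit VGPR source the code above emits roughly
//   %s0 = V_READFIRSTLANE_B32 %vsrc.sub0
//   %s1 = V_READFIRSTLANE_B32 %vsrc.sub1
//   %dst = REG_SEQUENCE %s0, sub0, %s1, sub1
// with one V_READFIRSTLANE_B32 per 32-bit channel for wider registers.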
6853
6854 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6855 MachineInstr &MI) const {
6856
6857 // If the pointer is stored in VGPRs, then we need to move it to
6858 // SGPRs using v_readfirstlane. This is safe because we only select
6859 // loads with uniform pointers to SMRD instructions, so we know the
6860 // pointer value is uniform.
6861 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6862 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6863 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6864 SBase->setReg(SGPR);
6865 }
6866 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6867 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6868 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6869 SOff->setReg(SGPR);
6870 }
6871}
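// Example (illustrative, simplified): if sbase of an S_LOAD was fed from a
// 64-bit VGPR holding a uniform pointer, the code above rewrites
//   %val = S_LOAD_DWORD_IMM %vgpr_ptr, 0
// into
//   %sgpr_ptr = <per-channel V_READFIRSTLANE_B32 + REG_SEQUENCE>
//   %val = S_LOAD_DWORD_IMM %sgpr_ptr, 0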
6872
6873 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6874 unsigned Opc = Inst.getOpcode();
6875 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6876 if (OldSAddrIdx < 0)
6877 return false;
6878
6879 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6880
6881 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6882 if (NewOpc < 0)
6883 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6884 if (NewOpc < 0)
6885 return false;
6886
6887 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6888 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6889 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6890 return false;
6891
6892 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6893 if (NewVAddrIdx < 0)
6894 return false;
6895
6896 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6897
6898 // Check vaddr; it must be zero or absent.
6899 MachineInstr *VAddrDef = nullptr;
6900 if (OldVAddrIdx >= 0) {
6901 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6902 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6903 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6904 !VAddrDef->getOperand(1).isImm() ||
6905 VAddrDef->getOperand(1).getImm() != 0)
6906 return false;
6907 }
6908
6909 const MCInstrDesc &NewDesc = get(NewOpc);
6910 Inst.setDesc(NewDesc);
6911
6912 // Callers expect iterator to be valid after this call, so modify the
6913 // instruction in place.
6914 if (OldVAddrIdx == NewVAddrIdx) {
6915 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6916 // Clear use list from the old vaddr holding a zero register.
6917 MRI.removeRegOperandFromUseList(&NewVAddr);
6918 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6919 Inst.removeOperand(OldSAddrIdx);
6920 // Update the use list with the pointer we have just moved from vaddr to
6921 // saddr position. Otherwise new vaddr will be missing from the use list.
6922 MRI.removeRegOperandFromUseList(&NewVAddr);
6923 MRI.addRegOperandToUseList(&NewVAddr);
6924 } else {
6925 assert(OldSAddrIdx == NewVAddrIdx);
6926
6927 if (OldVAddrIdx >= 0) {
6928 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6929 AMDGPU::OpName::vdst_in);
6930
6931 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6932 // it asserts. Untie the operands for now and retie them afterwards.
6933 if (NewVDstIn != -1) {
6934 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6935 Inst.untieRegOperand(OldVDstIn);
6936 }
6937
6938 Inst.removeOperand(OldVAddrIdx);
6939
6940 if (NewVDstIn != -1) {
6941 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6942 Inst.tieOperands(NewVDst, NewVDstIn);
6943 }
6944 }
6945 }
6946
6947 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6948 VAddrDef->eraseFromParent();
6949
6950 return true;
6951}
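// Example (illustrative, simplified; register names are made up): assuming the
// SADDR operand actually lives in a VGPR, something like
//   %zero = V_MOV_B32 0
//   GLOBAL_LOAD_DWORD_SADDR %dst, %zero, %vgpr_addr, 0
// is rewritten above into the plain VADDR form
//   GLOBAL_LOAD_DWORD %dst, %vgpr_addr, 0
// and the now-dead zero materialization is erased.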
6952
6953// FIXME: Remove this when SelectionDAG is obsoleted.
6954 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6955 MachineInstr &MI) const {
6956 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6957 return;
6958
6959 // Fix up SGPR operands that ended up in VGPRs. We only select these when the
6960 // DAG divergence analysis thinks they are uniform, so a readfirstlane should be valid.
6961 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6962 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6963 return;
6964
6965 if (moveFlatAddrToVGPR(MI))
6966 return;
6967
6968 const TargetRegisterClass *DeclaredRC =
6969 getRegClass(MI.getDesc(), SAddr->getOperandNo());
6970
6971 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6972 SAddr->setReg(ToSGPR);
6973}
6974
6975 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6976 MachineBasicBlock::iterator I,
6977 const TargetRegisterClass *DstRC,
6978 MachineOperand &Op,
6979 MachineRegisterInfo &MRI,
6980 const DebugLoc &DL) const {
6981 Register OpReg = Op.getReg();
6982 unsigned OpSubReg = Op.getSubReg();
6983
6984 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6985 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6986
6987 // Check if operand is already the correct register class.
6988 if (DstRC == OpRC)
6989 return;
6990
6991 Register DstReg = MRI.createVirtualRegister(DstRC);
6992 auto Copy =
6993 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6994 Op.setReg(DstReg);
6995
6996 MachineInstr *Def = MRI.getVRegDef(OpReg);
6997 if (!Def)
6998 return;
6999
7000 // Try to eliminate the copy if it is copying an immediate value.
7001 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7002 foldImmediate(*Copy, *Def, OpReg, &MRI);
7003
7004 bool ImpDef = Def->isImplicitDef();
7005 while (!ImpDef && Def && Def->isCopy()) {
7006 if (Def->getOperand(1).getReg().isPhysical())
7007 break;
7008 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
7009 ImpDef = Def && Def->isImplicitDef();
7010 }
7011 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
7012 !ImpDef)
7013 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
7014}
7015
7016// Emit the actual waterfall loop, executing the wrapped instruction for each
7017// unique value of \p ScalarOps across all lanes. In the best case we execute 1
7018// iteration, in the worst case we execute 64 (once per lane).
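// Example (illustrative, simplified; wave-size specific opcodes omitted): for
// a single 32-bit scalar operand each iteration emitted below looks roughly
// like
//   %cur  = V_READFIRSTLANE_B32 %vscalarop
//   %cond = V_CMP_EQ_U32 %cur, %vscalarop
//   ; conditions for multiple scalar operands are ANDed together
// EXEC is then narrowed to the matching lanes (saving the old mask) so only
// they execute the wrapped instruction; the remaining lanes loop again.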
7019static void
7020 emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
7021 MachineRegisterInfo &MRI,
7022 MachineBasicBlock &LoopBB,
7023 MachineBasicBlock &BodyBB,
7024 const DebugLoc &DL,
7025 ArrayRef<MachineOperand *> ScalarOps) {
7026 MachineFunction &MF = *LoopBB.getParent();
7027 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7028 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7030 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7031
7033 Register CondReg;
7034
7035 for (MachineOperand *ScalarOp : ScalarOps) {
7036 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7037 unsigned NumSubRegs = RegSize / 32;
7038 Register VScalarOp = ScalarOp->getReg();
7039
7040 if (NumSubRegs == 1) {
7041 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7042
7043 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7044 .addReg(VScalarOp);
7045
7046 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7047
7048 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7049 .addReg(CurReg)
7050 .addReg(VScalarOp);
7051
7052 // Combine the comparison results with AND.
7053 if (!CondReg) // First.
7054 CondReg = NewCondReg;
7055 else { // If not the first, we create an AND.
7056 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7057 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7058 .addReg(CondReg)
7059 .addReg(NewCondReg);
7060 CondReg = AndReg;
7061 }
7062
7063 // Update ScalarOp operand to use the SGPR ScalarOp.
7064 ScalarOp->setReg(CurReg);
7065 ScalarOp->setIsKill();
7066 } else {
7067 SmallVector<Register, 8> ReadlanePieces;
7068 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7069 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7070 "Unhandled register size");
7071
7072 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7073 Register CurRegLo =
7074 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7075 Register CurRegHi =
7076 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7077
7078 // Read the next variant <- also loop target.
7079 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7080 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7081
7082 // Read the next variant <- also loop target.
7083 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7084 .addReg(VScalarOp, VScalarOpUndef,
7085 TRI->getSubRegFromChannel(Idx + 1));
7086
7087 ReadlanePieces.push_back(CurRegLo);
7088 ReadlanePieces.push_back(CurRegHi);
7089
7090 // Comparison is to be done as 64-bit.
7091 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7092 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7093 .addReg(CurRegLo)
7094 .addImm(AMDGPU::sub0)
7095 .addReg(CurRegHi)
7096 .addImm(AMDGPU::sub1);
7097
7098 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7099 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7100 NewCondReg)
7101 .addReg(CurReg);
7102 if (NumSubRegs <= 2)
7103 Cmp.addReg(VScalarOp);
7104 else
7105 Cmp.addReg(VScalarOp, VScalarOpUndef,
7106 TRI->getSubRegFromChannel(Idx, 2));
7107
7108 // Combine the comparison results with AND.
7109 if (!CondReg) // First.
7110 CondReg = NewCondReg;
7111 else { // If not the first, we create an AND.
7112 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7113 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7114 .addReg(CondReg)
7115 .addReg(NewCondReg);
7116 CondReg = AndReg;
7117 }
7118 } // End for loop.
7119
7120 const auto *SScalarOpRC =
7121 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7122 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7123
7124 // Build scalar ScalarOp.
7125 auto Merge =
7126 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7127 unsigned Channel = 0;
7128 for (Register Piece : ReadlanePieces) {
7129 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7130 }
7131
7132 // Update ScalarOp operand to use the SGPR ScalarOp.
7133 ScalarOp->setReg(SScalarOp);
7134 ScalarOp->setIsKill();
7135 }
7136 }
7137
7138 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7139 MRI.setSimpleHint(SaveExec, CondReg);
7140
7141 // Update EXEC to matching lanes, saving original to SaveExec.
7142 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7143 .addReg(CondReg, RegState::Kill);
7144
7145 // The original instruction is here; we insert the terminators after it.
7146 I = BodyBB.end();
7147
7148 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7149 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7150 .addReg(LMC.ExecReg)
7151 .addReg(SaveExec);
7152
7153 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7154}
7155
7156// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7157// with SGPRs by iterating over all unique values across all lanes.
7158// Returns the loop basic block that now contains \p MI.
7159static MachineBasicBlock *
7160 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
7161 ArrayRef<MachineOperand *> ScalarOps,
7162 MachineDominatorTree *MDT,
7163 MachineBasicBlock::iterator Begin = nullptr,
7164 MachineBasicBlock::iterator End = nullptr) {
7165 MachineBasicBlock &MBB = *MI.getParent();
7166 MachineFunction &MF = *MBB.getParent();
7167 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7168 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7169 MachineRegisterInfo &MRI = MF.getRegInfo();
7170 if (!Begin.isValid())
7171 Begin = &MI;
7172 if (!End.isValid()) {
7173 End = &MI;
7174 ++End;
7175 }
7176 const DebugLoc &DL = MI.getDebugLoc();
7178 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7179
7180 // Save SCC. The waterfall loop may overwrite SCC.
7181 Register SaveSCCReg;
7182
7183 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7184 // rather than doing an unbounded scan everywhere.
7185 bool SCCNotDead =
7186 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7187 std::numeric_limits<unsigned>::max()) !=
7188 MachineBasicBlock::LQR_Dead;
7189 if (SCCNotDead) {
7190 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7191 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7192 .addImm(1)
7193 .addImm(0);
7194 }
7195
7196 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7197
7198 // Save the EXEC mask
7199 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7200
7201 // Killed uses in the instruction we are waterfalling around will be
7202 // incorrect due to the added control-flow.
7203 MachineBasicBlock::iterator AfterMI = MI;
7204 ++AfterMI;
7205 for (auto I = Begin; I != AfterMI; I++) {
7206 for (auto &MO : I->all_uses())
7207 MRI.clearKillFlags(MO.getReg());
7208 }
7209
7210 // To insert the loop we need to split the block. Move everything after this
7211 // point to a new block, and insert a new empty block between the two.
7214 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7216 ++MBBI;
7217
7218 MF.insert(MBBI, LoopBB);
7219 MF.insert(MBBI, BodyBB);
7220 MF.insert(MBBI, RemainderBB);
7221
7222 LoopBB->addSuccessor(BodyBB);
7223 BodyBB->addSuccessor(LoopBB);
7224 BodyBB->addSuccessor(RemainderBB);
7225
7226 // Move the instructions from Begin up to and including MI into BodyBB, and
7227 // the remainder of the block into RemainderBB.
7228 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7229 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7230 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7231
7232 MBB.addSuccessor(LoopBB);
7233
7234 // Update dominators. We know that MBB immediately dominates LoopBB, that
7235 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7236 // RemainderBB. RemainderBB immediately dominates all of the successors
7237 // transferred to it from MBB that MBB used to properly dominate.
7238 if (MDT) {
7239 MDT->addNewBlock(LoopBB, &MBB);
7240 MDT->addNewBlock(BodyBB, LoopBB);
7241 MDT->addNewBlock(RemainderBB, BodyBB);
7242 for (auto &Succ : RemainderBB->successors()) {
7243 if (MDT->properlyDominates(&MBB, Succ)) {
7244 MDT->changeImmediateDominator(Succ, RemainderBB);
7245 }
7246 }
7247 }
7248
7249 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7250
7251 MachineBasicBlock::iterator First = RemainderBB->begin();
7252 // Restore SCC
7253 if (SCCNotDead) {
7254 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7255 .addReg(SaveSCCReg, RegState::Kill)
7256 .addImm(0);
7257 }
7258
7259 // Restore the EXEC mask
7260 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7261 .addReg(SaveExec);
7262 return BodyBB;
7263}
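// Illustrative sketch of the control flow produced above (not a verbatim dump):
//   MBB -> LoopBB -> BodyBB -> RemainderBB
//            ^----------'
// BodyBB branches back to LoopBB until every lane's value has been handled;
// EXEC (and SCC, if it was live) is saved in MBB and restored at the top of
// RemainderBB.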
7264
7265 // Extract the pointer from Rsrc and return a replacement Rsrc with a zero base pointer.
7266static std::tuple<unsigned, unsigned>
7267 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7268 MachineBasicBlock &MBB = *MI.getParent();
7269 MachineFunction &MF = *MBB.getParent();
7270 MachineRegisterInfo &MRI = MF.getRegInfo();
7271
7272 // Extract the ptr from the resource descriptor.
7273 unsigned RsrcPtr =
7274 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7275 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7276
7277 // Create an empty resource descriptor
7278 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7279 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7280 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7281 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7282 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7283
7284 // Zero64 = 0
7285 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7286 .addImm(0);
7287
7288 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7289 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7290 .addImm(Lo_32(RsrcDataFormat));
7291
7292 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7293 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7294 .addImm(Hi_32(RsrcDataFormat));
7295
7296 // NewSRsrc = {Zero64, SRsrcFormat}
7297 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7298 .addReg(Zero64)
7299 .addImm(AMDGPU::sub0_sub1)
7300 .addReg(SRsrcFormatLo)
7301 .addImm(AMDGPU::sub2)
7302 .addReg(SRsrcFormatHi)
7303 .addImm(AMDGPU::sub3);
7304
7305 return std::tuple(RsrcPtr, NewSRsrc);
7306}
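// Illustrative note: the replacement descriptor built above is laid out as
//   NewSRsrc = { base pointer = 0 (sub0_sub1),
//                RSRC_DATA_FORMAT[31:0] (sub2), RSRC_DATA_FORMAT[63:32] (sub3) }
// so the extracted RsrcPtr must be folded into the address by the caller.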
7307
7308 MachineBasicBlock *
7309 SIInstrInfo::legalizeOperands(MachineInstr &MI,
7310 MachineDominatorTree *MDT) const {
7311 MachineFunction &MF = *MI.getMF();
7312 MachineRegisterInfo &MRI = MF.getRegInfo();
7313 MachineBasicBlock *CreatedBB = nullptr;
7314
7315 // Legalize VOP2
7316 if (isVOP2(MI) || isVOPC(MI)) {
7317 legalizeOperandsVOP2(MRI, MI);
7318 return CreatedBB;
7319 }
7320
7321 // Legalize VOP3
7322 if (isVOP3(MI)) {
7323 legalizeOperandsVOP3(MRI, MI);
7324 return CreatedBB;
7325 }
7326
7327 // Legalize SMRD
7328 if (isSMRD(MI)) {
7329 legalizeOperandsSMRD(MRI, MI);
7330 return CreatedBB;
7331 }
7332
7333 // Legalize FLAT
7334 if (isFLAT(MI)) {
7335 legalizeOperandsFLAT(MRI, MI);
7336 return CreatedBB;
7337 }
7338
7339 // Legalize REG_SEQUENCE and PHI
7340 // The register class of the operands must match the register class of the
7341 // output.
7342 if (MI.getOpcode() == AMDGPU::PHI) {
7343 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7344 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7345 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7346 continue;
7347 const TargetRegisterClass *OpRC =
7348 MRI.getRegClass(MI.getOperand(i).getReg());
7349 if (RI.hasVectorRegisters(OpRC)) {
7350 VRC = OpRC;
7351 } else {
7352 SRC = OpRC;
7353 }
7354 }
7355
7356 // If any of the operands are VGPR registers, then they must all be VGPRs;
7357 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7358 // them.
7359 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7360 if (!VRC) {
7361 assert(SRC);
7362 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7363 VRC = &AMDGPU::VReg_1RegClass;
7364 } else
7365 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7366 ? RI.getEquivalentAGPRClass(SRC)
7367 : RI.getEquivalentVGPRClass(SRC);
7368 } else {
7369 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7370 ? RI.getEquivalentAGPRClass(VRC)
7371 : RI.getEquivalentVGPRClass(VRC);
7372 }
7373 RC = VRC;
7374 } else {
7375 RC = SRC;
7376 }
7377
7378 // Update all the operands so they have the same type.
7379 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7380 MachineOperand &Op = MI.getOperand(I);
7381 if (!Op.isReg() || !Op.getReg().isVirtual())
7382 continue;
7383
7384 // MI is a PHI instruction.
7385 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7386 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7387
7388 // Avoid creating no-op copies with the same src and dst reg class. These
7389 // confuse some of the machine passes.
7390 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7391 }
7392 }
7393
7394 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7395 // VGPR dest type and SGPR sources, insert copies so all operands are
7396 // VGPRs. This seems to help operand folding / the register coalescer.
7397 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7398 MachineBasicBlock *MBB = MI.getParent();
7399 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7400 if (RI.hasVGPRs(DstRC)) {
7401 // Update all the operands so they are VGPR register classes. These may
7402 // not be the same register class because REG_SEQUENCE supports mixing
7403 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7404 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7405 MachineOperand &Op = MI.getOperand(I);
7406 if (!Op.isReg() || !Op.getReg().isVirtual())
7407 continue;
7408
7409 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7410 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7411 if (VRC == OpRC)
7412 continue;
7413
7414 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7415 Op.setIsKill();
7416 }
7417 }
7418
7419 return CreatedBB;
7420 }
7421
7422 // Legalize INSERT_SUBREG
7423 // src0 must have the same register class as dst
7424 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7425 Register Dst = MI.getOperand(0).getReg();
7426 Register Src0 = MI.getOperand(1).getReg();
7427 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7428 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7429 if (DstRC != Src0RC) {
7430 MachineBasicBlock *MBB = MI.getParent();
7431 MachineOperand &Op = MI.getOperand(1);
7432 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7433 }
7434 return CreatedBB;
7435 }
7436
7437 // Legalize SI_INIT_M0
7438 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7439 MachineOperand &Src = MI.getOperand(0);
7440 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7441 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7442 return CreatedBB;
7443 }
7444
7445 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7446 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7447 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7448 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7449 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7450 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7451 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7452 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7453 MachineOperand &Src = MI.getOperand(1);
7454 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7455 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7456 return CreatedBB;
7457 }
7458
7459 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7460 //
7461 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7462 // scratch memory access. In both cases, the legalization never involves
7463 // conversion to the addr64 form.
7464 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7465 (isMUBUF(MI) || isMTBUF(MI)))) {
7466 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7467 ? AMDGPU::OpName::rsrc
7468 : AMDGPU::OpName::srsrc;
7469 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7470 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7471 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7472
7473 AMDGPU::OpName SampOpName =
7474 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7475 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7476 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7477 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7478
7479 return CreatedBB;
7480 }
7481
7482 // Legalize SI_CALL
7483 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7484 MachineOperand *Dest = &MI.getOperand(0);
7485 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7486 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, together with
7487 // the following copies; we also need to move copies to and from physical
7488 // registers into the loop block.
7489 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7490 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7491
7492 // Also move the copies to physical registers into the loop block
7493 MachineBasicBlock &MBB = *MI.getParent();
7494 MachineBasicBlock::iterator Start(&MI);
7495 while (Start->getOpcode() != FrameSetupOpcode)
7496 --Start;
7497 MachineBasicBlock::iterator End(&MI);
7498 while (End->getOpcode() != FrameDestroyOpcode)
7499 ++End;
7500 // Also include following copies of the return value
7501 ++End;
7502 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7503 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7504 ++End;
7505 CreatedBB =
7506 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7507 }
7508 }
7509
7510 // Legalize s_sleep_var.
7511 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7512 const DebugLoc &DL = MI.getDebugLoc();
7513 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7514 int Src0Idx =
7515 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7516 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7517 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7518 .add(Src0);
7519 Src0.ChangeToRegister(Reg, false);
7520 return nullptr;
7521 }
7522
7523 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7524 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7525 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7526 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7527 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7528 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7529 for (MachineOperand &Src : MI.explicit_operands()) {
7530 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7531 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7532 }
7533 return CreatedBB;
7534 }
7535
7536 // Legalize MUBUF instructions.
7537 bool isSoffsetLegal = true;
7538 int SoffsetIdx =
7539 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7540 if (SoffsetIdx != -1) {
7541 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7542 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7543 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7544 isSoffsetLegal = false;
7545 }
7546 }
7547
7548 bool isRsrcLegal = true;
7549 int RsrcIdx =
7550 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7551 if (RsrcIdx != -1) {
7552 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7553 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7554 isRsrcLegal = false;
7555 }
7556
7557 // The operands are legal.
7558 if (isRsrcLegal && isSoffsetLegal)
7559 return CreatedBB;
7560
7561 if (!isRsrcLegal) {
7562 // Legalize a VGPR Rsrc
7563 //
7564 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7565 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7566 // a zero-value SRsrc.
7567 //
7568 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7569 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7570 // above.
7571 //
7572 // Otherwise we are on non-ADDR64 hardware, and/or we have
7573 // idxen/offen/bothen and we fall back to a waterfall loop.
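// Example (illustrative, simplified): for an _ADDR64 MUBUF such as
// BUFFER_LOAD_DWORD_ADDR64 the branch below keeps the opcode and only rewrites
//   vaddr <- vaddr + (base pointer pulled out of the VGPR rsrc)
//   srsrc <- zero-based descriptor from extractRsrcPtr()
// so no extra control flow is needed; only the final waterfall case splits the
// block.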
7574
7575 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7576 MachineBasicBlock &MBB = *MI.getParent();
7577
7578 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7579 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7580 // This is already an ADDR64 instruction so we need to add the pointer
7581 // extracted from the resource descriptor to the current value of VAddr.
7582 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7583 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7584 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7585
7586 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7587 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7588 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7589
7590 unsigned RsrcPtr, NewSRsrc;
7591 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7592
7593 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7594 const DebugLoc &DL = MI.getDebugLoc();
7595 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7596 .addDef(CondReg0)
7597 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7598 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7599 .addImm(0);
7600
7601 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7602 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7603 .addDef(CondReg1, RegState::Dead)
7604 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7605 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7606 .addReg(CondReg0, RegState::Kill)
7607 .addImm(0);
7608
7609 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7610 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7611 .addReg(NewVAddrLo)
7612 .addImm(AMDGPU::sub0)
7613 .addReg(NewVAddrHi)
7614 .addImm(AMDGPU::sub1);
7615
7616 VAddr->setReg(NewVAddr);
7617 Rsrc->setReg(NewSRsrc);
7618 } else if (!VAddr && ST.hasAddr64()) {
7619 // This instruction is the _OFFSET variant, so we need to convert it to
7620 // ADDR64.
7621 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7622 "FIXME: Need to emit flat atomics here");
7623
7624 unsigned RsrcPtr, NewSRsrc;
7625 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7626
7627 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7628 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7629 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7630 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7631 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7632
7633 // Atomics with return have an additional tied operand and are
7634 // missing some of the special bits.
7635 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7636 MachineInstr *Addr64;
7637
7638 if (!VDataIn) {
7639 // Regular buffer load / store.
7640 MachineInstrBuilder MIB =
7641 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7642 .add(*VData)
7643 .addReg(NewVAddr)
7644 .addReg(NewSRsrc)
7645 .add(*SOffset)
7646 .add(*Offset);
7647
7648 if (const MachineOperand *CPol =
7649 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7650 MIB.addImm(CPol->getImm());
7651 }
7652
7653 if (const MachineOperand *TFE =
7654 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7655 MIB.addImm(TFE->getImm());
7656 }
7657
7658 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7659
7660 MIB.cloneMemRefs(MI);
7661 Addr64 = MIB;
7662 } else {
7663 // Atomics with return.
7664 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7665 .add(*VData)
7666 .add(*VDataIn)
7667 .addReg(NewVAddr)
7668 .addReg(NewSRsrc)
7669 .add(*SOffset)
7670 .add(*Offset)
7671 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7672 .cloneMemRefs(MI);
7673 }
7674
7675 MI.removeFromParent();
7676
7677 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7678 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7679 NewVAddr)
7680 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7681 .addImm(AMDGPU::sub0)
7682 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7683 .addImm(AMDGPU::sub1);
7684 } else {
7685 // Legalize a VGPR Rsrc and soffset together.
7686 if (!isSoffsetLegal) {
7687 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7688 CreatedBB =
7689 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7690 return CreatedBB;
7691 }
7692 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7693 return CreatedBB;
7694 }
7695 }
7696
7697 // Legalize a VGPR soffset.
7698 if (!isSoffsetLegal) {
7699 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7700 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7701 return CreatedBB;
7702 }
7703 return CreatedBB;
7704}
7705
7706 void SIInstrWorklist::insert(MachineInstr *MI) {
7707 InstrList.insert(MI);
7708 // Add MBUF instructions to the deferred list.
7709 int RsrcIdx =
7710 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7711 if (RsrcIdx != -1) {
7712 DeferredList.insert(MI);
7713 }
7714}
7715
7716 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7717 return DeferredList.contains(MI);
7718}
7719
7720 // Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7721 // lowering (change sgpr to vgpr).
7722 // This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7723 // different sizes. We need to legalize the size of the operands during the
7724 // vgpr lowering chain. This can be removed once we have sgpr16 in place.
7725 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7726 MachineRegisterInfo &MRI) const {
7727 if (!ST.useRealTrue16Insts())
7728 return;
7729
7730 unsigned Opcode = MI.getOpcode();
7731 MachineBasicBlock *MBB = MI.getParent();
7732 // Legalize operands and check for size mismatch
7733 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7734 OpIdx >= get(Opcode).getNumOperands() ||
7735 get(Opcode).operands()[OpIdx].RegClass == -1)
7736 return;
7737
7738 MachineOperand &Op = MI.getOperand(OpIdx);
7739 if (!Op.isReg() || !Op.getReg().isVirtual())
7740 return;
7741
7742 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7743 if (!RI.isVGPRClass(CurrRC))
7744 return;
7745
7746 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7747 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7748 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7749 Op.setSubReg(AMDGPU::lo16);
7750 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7751 const DebugLoc &DL = MI.getDebugLoc();
7752 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7753 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7754 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7755 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7756 .addReg(Op.getReg())
7757 .addImm(AMDGPU::lo16)
7758 .addReg(Undef)
7759 .addImm(AMDGPU::hi16);
7760 Op.setReg(NewDstReg);
7761 }
7762}
7763 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7764 MachineRegisterInfo &MRI) const {
7765 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7766 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7767}
7768
7769 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7770 MachineDominatorTree *MDT) const {
7771
7772 while (!Worklist.empty()) {
7773 MachineInstr &Inst = *Worklist.top();
7774 Worklist.erase_top();
7775 // Skip MachineInstr in the deferred list.
7776 if (Worklist.isDeferred(&Inst))
7777 continue;
7778 moveToVALUImpl(Worklist, MDT, Inst);
7779 }
7780
7781 // The deferred list of instructions will be processed once
7782 // all the MachineInstrs in the worklist are done.
7783 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7784 moveToVALUImpl(Worklist, MDT, *Inst);
7785 assert(Worklist.empty() &&
7786 "Deferred MachineInstr are not supposed to re-populate worklist");
7787 }
7788}
7789
7790 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7791 MachineDominatorTree *MDT,
7792 MachineInstr &Inst) const {
7793
7794 MachineBasicBlock *MBB = Inst.getParent();
7795 if (!MBB)
7796 return;
7797 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7798 unsigned Opcode = Inst.getOpcode();
7799 unsigned NewOpcode = getVALUOp(Inst);
7800 const DebugLoc &DL = Inst.getDebugLoc();
7801
7802 // Handle some special cases
7803 switch (Opcode) {
7804 default:
7805 break;
7806 case AMDGPU::S_ADD_I32:
7807 case AMDGPU::S_SUB_I32: {
7808 // FIXME: The u32 versions currently selected use the carry.
7809 bool Changed;
7810 MachineBasicBlock *CreatedBBTmp = nullptr;
7811 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7812 if (Changed)
7813 return;
7814
7815 // Default handling
7816 break;
7817 }
7818
7819 case AMDGPU::S_MUL_U64:
7820 if (ST.hasVectorMulU64()) {
7821 NewOpcode = AMDGPU::V_MUL_U64_e64;
7822 break;
7823 }
7824 // Split s_mul_u64 into 32-bit vector multiplications.
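// Illustrative note: writing a = a_lo + 2^32*a_hi and b = b_lo + 2^32*b_hi,
// the low 64 bits of the product are
//   a*b (mod 2^64) = a_lo*b_lo + 2^32*(a_lo*b_hi + a_hi*b_lo)
// so the split needs one 32x32->64 multiply for a_lo*b_lo plus two 32x32->32
// multiplies whose sum is added into the high half.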
7825 splitScalarSMulU64(Worklist, Inst, MDT);
7826 Inst.eraseFromParent();
7827 return;
7828
7829 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7830 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7831 // This is a special case of s_mul_u64 where all the operands are either
7832 // zero extended or sign extended.
7833 splitScalarSMulPseudo(Worklist, Inst, MDT);
7834 Inst.eraseFromParent();
7835 return;
7836
7837 case AMDGPU::S_AND_B64:
7838 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7839 Inst.eraseFromParent();
7840 return;
7841
7842 case AMDGPU::S_OR_B64:
7843 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7844 Inst.eraseFromParent();
7845 return;
7846
7847 case AMDGPU::S_XOR_B64:
7848 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7849 Inst.eraseFromParent();
7850 return;
7851
7852 case AMDGPU::S_NAND_B64:
7853 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7854 Inst.eraseFromParent();
7855 return;
7856
7857 case AMDGPU::S_NOR_B64:
7858 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7859 Inst.eraseFromParent();
7860 return;
7861
7862 case AMDGPU::S_XNOR_B64:
7863 if (ST.hasDLInsts())
7864 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7865 else
7866 splitScalar64BitXnor(Worklist, Inst, MDT);
7867 Inst.eraseFromParent();
7868 return;
7869
7870 case AMDGPU::S_ANDN2_B64:
7871 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7872 Inst.eraseFromParent();
7873 return;
7874
7875 case AMDGPU::S_ORN2_B64:
7876 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7877 Inst.eraseFromParent();
7878 return;
7879
7880 case AMDGPU::S_BREV_B64:
7881 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7882 Inst.eraseFromParent();
7883 return;
7884
7885 case AMDGPU::S_NOT_B64:
7886 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7887 Inst.eraseFromParent();
7888 return;
7889
7890 case AMDGPU::S_BCNT1_I32_B64:
7891 splitScalar64BitBCNT(Worklist, Inst);
7892 Inst.eraseFromParent();
7893 return;
7894
7895 case AMDGPU::S_BFE_I64:
7896 splitScalar64BitBFE(Worklist, Inst);
7897 Inst.eraseFromParent();
7898 return;
7899
7900 case AMDGPU::S_FLBIT_I32_B64:
7901 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7902 Inst.eraseFromParent();
7903 return;
7904 case AMDGPU::S_FF1_I32_B64:
7905 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7906 Inst.eraseFromParent();
7907 return;
7908
7909 case AMDGPU::S_LSHL_B32:
7910 if (ST.hasOnlyRevVALUShifts()) {
7911 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7912 swapOperands(Inst);
7913 }
7914 break;
7915 case AMDGPU::S_ASHR_I32:
7916 if (ST.hasOnlyRevVALUShifts()) {
7917 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7918 swapOperands(Inst);
7919 }
7920 break;
7921 case AMDGPU::S_LSHR_B32:
7922 if (ST.hasOnlyRevVALUShifts()) {
7923 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7924 swapOperands(Inst);
7925 }
7926 break;
7927 case AMDGPU::S_LSHL_B64:
7928 if (ST.hasOnlyRevVALUShifts()) {
7929 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7930 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7931 : AMDGPU::V_LSHLREV_B64_e64;
7932 swapOperands(Inst);
7933 }
7934 break;
7935 case AMDGPU::S_ASHR_I64:
7936 if (ST.hasOnlyRevVALUShifts()) {
7937 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7938 swapOperands(Inst);
7939 }
7940 break;
7941 case AMDGPU::S_LSHR_B64:
7942 if (ST.hasOnlyRevVALUShifts()) {
7943 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7944 swapOperands(Inst);
7945 }
7946 break;
7947
7948 case AMDGPU::S_ABS_I32:
7949 lowerScalarAbs(Worklist, Inst);
7950 Inst.eraseFromParent();
7951 return;
7952
7953 case AMDGPU::S_ABSDIFF_I32:
7954 lowerScalarAbsDiff(Worklist, Inst);
7955 Inst.eraseFromParent();
7956 return;
7957
7958 case AMDGPU::S_CBRANCH_SCC0:
7959 case AMDGPU::S_CBRANCH_SCC1: {
7960 // Clear unused bits of vcc
7961 Register CondReg = Inst.getOperand(1).getReg();
7962 bool IsSCC = CondReg == AMDGPU::SCC;
7964 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7965 .addReg(LMC.ExecReg)
7966 .addReg(IsSCC ? LMC.VccReg : CondReg);
7967 Inst.removeOperand(1);
7968 } break;
7969
7970 case AMDGPU::S_BFE_U64:
7971 case AMDGPU::S_BFM_B64:
7972 llvm_unreachable("Moving this op to VALU not implemented");
7973
7974 case AMDGPU::S_PACK_LL_B32_B16:
7975 case AMDGPU::S_PACK_LH_B32_B16:
7976 case AMDGPU::S_PACK_HL_B32_B16:
7977 case AMDGPU::S_PACK_HH_B32_B16:
7978 movePackToVALU(Worklist, MRI, Inst);
7979 Inst.eraseFromParent();
7980 return;
7981
7982 case AMDGPU::S_XNOR_B32:
7983 lowerScalarXnor(Worklist, Inst);
7984 Inst.eraseFromParent();
7985 return;
7986
7987 case AMDGPU::S_NAND_B32:
7988 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7989 Inst.eraseFromParent();
7990 return;
7991
7992 case AMDGPU::S_NOR_B32:
7993 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7994 Inst.eraseFromParent();
7995 return;
7996
7997 case AMDGPU::S_ANDN2_B32:
7998 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7999 Inst.eraseFromParent();
8000 return;
8001
8002 case AMDGPU::S_ORN2_B32:
8003 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
8004 Inst.eraseFromParent();
8005 return;
8006
8007 // TODO: remove as soon as everything is ready
8008 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
8009 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8010 // can only be selected from the uniform SDNode.
8011 case AMDGPU::S_ADD_CO_PSEUDO:
8012 case AMDGPU::S_SUB_CO_PSEUDO: {
8013 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8014 ? AMDGPU::V_ADDC_U32_e64
8015 : AMDGPU::V_SUBB_U32_e64;
8016 const auto *CarryRC = RI.getWaveMaskRegClass();
8017
8018 Register CarryInReg = Inst.getOperand(4).getReg();
8019 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
8020 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
8021 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
8022 .addReg(CarryInReg);
8023 }
8024
8025 Register CarryOutReg = Inst.getOperand(1).getReg();
8026
8027 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
8028 MRI.getRegClass(Inst.getOperand(0).getReg())));
8029 MachineInstr *CarryOp =
8030 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
8031 .addReg(CarryOutReg, RegState::Define)
8032 .add(Inst.getOperand(2))
8033 .add(Inst.getOperand(3))
8034 .addReg(CarryInReg)
8035 .addImm(0);
8036 legalizeOperands(*CarryOp);
8037 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8038 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8039 Inst.eraseFromParent();
8040 }
8041 return;
8042 case AMDGPU::S_UADDO_PSEUDO:
8043 case AMDGPU::S_USUBO_PSEUDO: {
8044 MachineOperand &Dest0 = Inst.getOperand(0);
8045 MachineOperand &Dest1 = Inst.getOperand(1);
8046 MachineOperand &Src0 = Inst.getOperand(2);
8047 MachineOperand &Src1 = Inst.getOperand(3);
8048
8049 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8050 ? AMDGPU::V_ADD_CO_U32_e64
8051 : AMDGPU::V_SUB_CO_U32_e64;
8052 const TargetRegisterClass *NewRC =
8053 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8054 Register DestReg = MRI.createVirtualRegister(NewRC);
8055 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8056 .addReg(Dest1.getReg(), RegState::Define)
8057 .add(Src0)
8058 .add(Src1)
8059 .addImm(0); // clamp bit
8060
8061 legalizeOperands(*NewInstr, MDT);
8062 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8063 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8064 Inst.eraseFromParent();
8065 }
8066 return;
8067 case AMDGPU::S_LSHL1_ADD_U32:
8068 case AMDGPU::S_LSHL2_ADD_U32:
8069 case AMDGPU::S_LSHL3_ADD_U32:
8070 case AMDGPU::S_LSHL4_ADD_U32: {
8071 MachineOperand &Dest = Inst.getOperand(0);
8072 MachineOperand &Src0 = Inst.getOperand(1);
8073 MachineOperand &Src1 = Inst.getOperand(2);
8074 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8075 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8076 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8077 : 4);
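// Illustrative note: e.g. S_LSHL2_ADD_U32 computes dst = (src0 << 2) + src1,
// which maps directly onto the V_LSHL_ADD_U32 dst, src0, 2, src1 built below.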
8078
8079 const TargetRegisterClass *NewRC =
8080 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8081 Register DestReg = MRI.createVirtualRegister(NewRC);
8082 MachineInstr *NewInstr =
8083 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8084 .add(Src0)
8085 .addImm(ShiftAmt)
8086 .add(Src1);
8087
8088 legalizeOperands(*NewInstr, MDT);
8089 MRI.replaceRegWith(Dest.getReg(), DestReg);
8090 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8091 Inst.eraseFromParent();
8092 }
8093 return;
8094 case AMDGPU::S_CSELECT_B32:
8095 case AMDGPU::S_CSELECT_B64:
8096 lowerSelect(Worklist, Inst, MDT);
8097 Inst.eraseFromParent();
8098 return;
8099 case AMDGPU::S_CMP_EQ_I32:
8100 case AMDGPU::S_CMP_LG_I32:
8101 case AMDGPU::S_CMP_GT_I32:
8102 case AMDGPU::S_CMP_GE_I32:
8103 case AMDGPU::S_CMP_LT_I32:
8104 case AMDGPU::S_CMP_LE_I32:
8105 case AMDGPU::S_CMP_EQ_U32:
8106 case AMDGPU::S_CMP_LG_U32:
8107 case AMDGPU::S_CMP_GT_U32:
8108 case AMDGPU::S_CMP_GE_U32:
8109 case AMDGPU::S_CMP_LT_U32:
8110 case AMDGPU::S_CMP_LE_U32:
8111 case AMDGPU::S_CMP_EQ_U64:
8112 case AMDGPU::S_CMP_LG_U64:
8113 case AMDGPU::S_CMP_LT_F32:
8114 case AMDGPU::S_CMP_EQ_F32:
8115 case AMDGPU::S_CMP_LE_F32:
8116 case AMDGPU::S_CMP_GT_F32:
8117 case AMDGPU::S_CMP_LG_F32:
8118 case AMDGPU::S_CMP_GE_F32:
8119 case AMDGPU::S_CMP_O_F32:
8120 case AMDGPU::S_CMP_U_F32:
8121 case AMDGPU::S_CMP_NGE_F32:
8122 case AMDGPU::S_CMP_NLG_F32:
8123 case AMDGPU::S_CMP_NGT_F32:
8124 case AMDGPU::S_CMP_NLE_F32:
8125 case AMDGPU::S_CMP_NEQ_F32:
8126 case AMDGPU::S_CMP_NLT_F32: {
8127 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8128 auto NewInstr =
8129 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8130 .setMIFlags(Inst.getFlags());
8131 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8132 0) {
8133 NewInstr
8134 .addImm(0) // src0_modifiers
8135 .add(Inst.getOperand(0)) // src0
8136 .addImm(0) // src1_modifiers
8137 .add(Inst.getOperand(1)) // src1
8138 .addImm(0); // clamp
8139 } else {
8140 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8141 }
8142 legalizeOperands(*NewInstr, MDT);
8143 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8144 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8145 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8146 Inst.eraseFromParent();
8147 return;
8148 }
8149 case AMDGPU::S_CMP_LT_F16:
8150 case AMDGPU::S_CMP_EQ_F16:
8151 case AMDGPU::S_CMP_LE_F16:
8152 case AMDGPU::S_CMP_GT_F16:
8153 case AMDGPU::S_CMP_LG_F16:
8154 case AMDGPU::S_CMP_GE_F16:
8155 case AMDGPU::S_CMP_O_F16:
8156 case AMDGPU::S_CMP_U_F16:
8157 case AMDGPU::S_CMP_NGE_F16:
8158 case AMDGPU::S_CMP_NLG_F16:
8159 case AMDGPU::S_CMP_NGT_F16:
8160 case AMDGPU::S_CMP_NLE_F16:
8161 case AMDGPU::S_CMP_NEQ_F16:
8162 case AMDGPU::S_CMP_NLT_F16: {
8163 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8164 auto NewInstr =
8165 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8166 .setMIFlags(Inst.getFlags());
8167 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8168 NewInstr
8169 .addImm(0) // src0_modifiers
8170 .add(Inst.getOperand(0)) // src0
8171 .addImm(0) // src1_modifiers
8172 .add(Inst.getOperand(1)) // src1
8173 .addImm(0); // clamp
8174 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8175 NewInstr.addImm(0); // op_sel0
8176 } else {
8177 NewInstr
8178 .add(Inst.getOperand(0))
8179 .add(Inst.getOperand(1));
8180 }
8181 legalizeOperandsVALUt16(*NewInstr, MRI);
8182 legalizeOperands(*NewInstr, MDT);
8183 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8184 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8185 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8186 Inst.eraseFromParent();
8187 return;
8188 }
8189 case AMDGPU::S_CVT_HI_F32_F16: {
8190 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8191 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8192 if (ST.useRealTrue16Insts()) {
8193 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8194 .add(Inst.getOperand(1));
8195 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8196 .addImm(0) // src0_modifiers
8197 .addReg(TmpReg, 0, AMDGPU::hi16)
8198 .addImm(0) // clamp
8199 .addImm(0) // omod
8200 .addImm(0); // op_sel0
8201 } else {
8202 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8203 .addImm(16)
8204 .add(Inst.getOperand(1));
8205 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8206 .addImm(0) // src0_modifiers
8207 .addReg(TmpReg)
8208 .addImm(0) // clamp
8209 .addImm(0); // omod
8210 }
8211
8212 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8213 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8214 Inst.eraseFromParent();
8215 return;
8216 }
8217 case AMDGPU::S_MINIMUM_F32:
8218 case AMDGPU::S_MAXIMUM_F32: {
8219 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8220 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8221 .addImm(0) // src0_modifiers
8222 .add(Inst.getOperand(1))
8223 .addImm(0) // src1_modifiers
8224 .add(Inst.getOperand(2))
8225 .addImm(0) // clamp
8226 .addImm(0); // omod
8227 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8228
8229 legalizeOperands(*NewInstr, MDT);
8230 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8231 Inst.eraseFromParent();
8232 return;
8233 }
8234 case AMDGPU::S_MINIMUM_F16:
8235 case AMDGPU::S_MAXIMUM_F16: {
8236 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8237 ? &AMDGPU::VGPR_16RegClass
8238 : &AMDGPU::VGPR_32RegClass);
8239 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8240 .addImm(0) // src0_modifiers
8241 .add(Inst.getOperand(1))
8242 .addImm(0) // src1_modifiers
8243 .add(Inst.getOperand(2))
8244 .addImm(0) // clamp
8245 .addImm(0) // omod
8246 .addImm(0); // opsel0
8247 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8248 legalizeOperandsVALUt16(*NewInstr, MRI);
8249 legalizeOperands(*NewInstr, MDT);
8250 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8251 Inst.eraseFromParent();
8252 return;
8253 }
8254 case AMDGPU::V_S_EXP_F16_e64:
8255 case AMDGPU::V_S_LOG_F16_e64:
8256 case AMDGPU::V_S_RCP_F16_e64:
8257 case AMDGPU::V_S_RSQ_F16_e64:
8258 case AMDGPU::V_S_SQRT_F16_e64: {
8259 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8260 ? &AMDGPU::VGPR_16RegClass
8261 : &AMDGPU::VGPR_32RegClass);
8262 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8263 .add(Inst.getOperand(1)) // src0_modifiers
8264 .add(Inst.getOperand(2))
8265 .add(Inst.getOperand(3)) // clamp
8266 .add(Inst.getOperand(4)) // omod
8267 .setMIFlags(Inst.getFlags());
8268 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8269 NewInstr.addImm(0); // opsel0
8270 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8271 legalizeOperandsVALUt16(*NewInstr, MRI);
8272 legalizeOperands(*NewInstr, MDT);
8273 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8274 Inst.eraseFromParent();
8275 return;
8276 }
8277 }
8278
8279 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8280 // We cannot move this instruction to the VALU, so we should try to
8281 // legalize its operands instead.
8282 legalizeOperands(Inst, MDT);
8283 return;
8284 }
8285 // Handle converting generic instructions like COPY-to-SGPR into
8286 // COPY-to-VGPR.
8287 if (NewOpcode == Opcode) {
8288 Register DstReg = Inst.getOperand(0).getReg();
8289 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8290
8291 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8292 // hope for the best.
8293 if (Inst.isCopy() && DstReg.isPhysical() &&
8294 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8295 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8296 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8297 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8298 .add(Inst.getOperand(1));
8299 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8300 DstReg)
8301 .addReg(NewDst);
8302
8303 Inst.eraseFromParent();
8304 return;
8305 }
8306
8307 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8308 Register NewDstReg = Inst.getOperand(1).getReg();
8309 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8310 if (const TargetRegisterClass *CommonRC =
8311 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8312 // Instead of creating a copy where src and dst are the same register
8313 // class, we just replace all uses of dst with src. These kinds of
8314 // copies interfere with the heuristics MachineSink uses to decide
8315 // whether or not to split a critical edge, since the pass assumes
8316 // that copies will end up as machine instructions and not be
8317 // eliminated.
8318 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8319 MRI.replaceRegWith(DstReg, NewDstReg);
8320 MRI.clearKillFlags(NewDstReg);
8321 Inst.getOperand(0).setReg(DstReg);
8322
8323 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8324 llvm_unreachable("failed to constrain register");
8325
8326 Inst.eraseFromParent();
8327 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8328 for (MachineOperand &MO :
8329 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8330 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8331 }
8332
8333 return;
8334 }
8335 }
8336
8337 // If this is a v2s copy between a 16-bit and a 32-bit register,
8338 // replace the vgpr copy with a reg_sequence/extract_subreg.
8339 // This can be removed after we have sgpr16 in place.
8340 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8341 Inst.getOperand(1).getReg().isVirtual() &&
8342 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8343 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8344 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8345 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8346 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8347 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8348 get(AMDGPU::IMPLICIT_DEF), Undef);
8349 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8350 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8351 .addReg(Inst.getOperand(1).getReg())
8352 .addImm(AMDGPU::lo16)
8353 .addReg(Undef)
8354 .addImm(AMDGPU::hi16);
8355 Inst.eraseFromParent();
8356 MRI.replaceRegWith(DstReg, NewDstReg);
8357 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8358 return;
8359 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8360 AMDGPU::lo16)) {
8361 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8362 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8363 MRI.replaceRegWith(DstReg, NewDstReg);
8364 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8365 return;
8366 }
8367 }
8368
8369 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8370 MRI.replaceRegWith(DstReg, NewDstReg);
8371 legalizeOperands(Inst, MDT);
8372 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8373 return;
8374 }
8375
8376 // Use the new VALU Opcode.
8377 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8378 .setMIFlags(Inst.getFlags());
8379 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8380 // Intersperse VOP3 modifiers among the SALU operands.
8381 NewInstr->addOperand(Inst.getOperand(0));
8382 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8383 AMDGPU::OpName::src0_modifiers) >= 0)
8384 NewInstr.addImm(0);
8385 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8386 const MachineOperand &Src = Inst.getOperand(1);
8387 NewInstr->addOperand(Src);
8388 }
8389
8390 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8391 // We are converting these to a BFE, so we need to add the missing
8392 // operands for the size and offset.
8393 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8394 NewInstr.addImm(0);
8395 NewInstr.addImm(Size);
8396 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8397 // The VALU version adds the second operand to the result, so insert an
8398 // extra 0 operand.
8399 NewInstr.addImm(0);
8400 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8401 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8402 // If we need to move this to VGPRs, we need to unpack the second
8403 // operand back into the 2 separate ones for bit offset and width.
8404 assert(OffsetWidthOp.isImm() &&
8405 "Scalar BFE is only implemented for constant width and offset");
8406 uint32_t Imm = OffsetWidthOp.getImm();
8407
8408 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8409 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
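// Illustrative example: an immediate of 0x00100008 encodes Offset = 8 and
// BitWidth = 16, i.e. the field occupying bits [23:8] of the source.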
8410 NewInstr.addImm(Offset);
8411 NewInstr.addImm(BitWidth);
8412 } else {
8413 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8414 AMDGPU::OpName::src1_modifiers) >= 0)
8415 NewInstr.addImm(0);
8416 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8417 NewInstr->addOperand(Inst.getOperand(2));
8418 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8419 AMDGPU::OpName::src2_modifiers) >= 0)
8420 NewInstr.addImm(0);
8421 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8422 NewInstr->addOperand(Inst.getOperand(3));
8423 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8424 NewInstr.addImm(0);
8425 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8426 NewInstr.addImm(0);
8427 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8428 NewInstr.addImm(0);
8429 }
8430 } else {
8431 // Just copy the SALU operands.
8432 for (const MachineOperand &Op : Inst.explicit_operands())
8433 NewInstr->addOperand(Op);
8434 }
8435
8436 // Remove any references to SCC. Vector instructions can't read from it, and
8437 // we're just about to add the implicit use / defs of VCC; we don't want
8438 // both.
8439 for (MachineOperand &Op : Inst.implicit_operands()) {
8440 if (Op.getReg() == AMDGPU::SCC) {
8441 // Only propagate through live-def of SCC.
8442 if (Op.isDef() && !Op.isDead())
8443 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8444 if (Op.isUse())
8445 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8446 }
8447 }
8448 Inst.eraseFromParent();
8449 Register NewDstReg;
8450 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8451 Register DstReg = NewInstr->getOperand(0).getReg();
8452 assert(DstReg.isVirtual());
8453 // Update the destination register class.
8454 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8455 assert(NewDstRC);
8456 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8457 MRI.replaceRegWith(DstReg, NewDstReg);
8458 }
8459 fixImplicitOperands(*NewInstr);
8460
8461 legalizeOperandsVALUt16(*NewInstr, MRI);
8462
8463 // Legalize the operands
8464 legalizeOperands(*NewInstr, MDT);
8465 if (NewDstReg)
8466 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8467}
8468
8469// Add/sub require special handling to deal with carry outs.
8470std::pair<bool, MachineBasicBlock *>
8471SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8472 MachineDominatorTree *MDT) const {
8473 if (ST.hasAddNoCarry()) {
8474 // Assume there is no user of scc since we don't select this in that case.
8475 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8476 // is used.
8477
8478 MachineBasicBlock &MBB = *Inst.getParent();
8479 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8480
8481 Register OldDstReg = Inst.getOperand(0).getReg();
8482 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8483
8484 unsigned Opc = Inst.getOpcode();
8485 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8486
8487 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8488 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8489
8490 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8491 Inst.removeOperand(3);
8492
8493 Inst.setDesc(get(NewOpc));
8494 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8495 Inst.addImplicitDefUseOperands(*MBB.getParent());
8496 MRI.replaceRegWith(OldDstReg, ResultReg);
8497 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8498
8499 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8500 return std::pair(true, NewBB);
8501 }
8502
8503 return std::pair(false, nullptr);
8504}
8505
8506void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8507 MachineDominatorTree *MDT) const {
8508
8509 MachineBasicBlock &MBB = *Inst.getParent();
8510 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8511 MachineBasicBlock::iterator MII = Inst;
8512 const DebugLoc &DL = Inst.getDebugLoc();
8513
8514 MachineOperand &Dest = Inst.getOperand(0);
8515 MachineOperand &Src0 = Inst.getOperand(1);
8516 MachineOperand &Src1 = Inst.getOperand(2);
8517 MachineOperand &Cond = Inst.getOperand(3);
8518
8519 Register CondReg = Cond.getReg();
8520 bool IsSCC = (CondReg == AMDGPU::SCC);
8521
8522 // If this is a trivial select where the condition is effectively not SCC
8523 // (CondReg is the source of a copy to SCC), then the select is semantically
8524 // equivalent to copying CondReg. Hence, there is no need to create a
8525 // V_CNDMASK; we can just use CondReg and bail out.
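// For example (with a hypothetical lane-mask vreg %mask feeding the copy to
// SCC): selecting -1 when the mask bit is set and 0 otherwise reproduces
// %mask itself, so the replaceRegWith below forwards %mask to every user of
// the select's result without emitting a new instruction.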
8526 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8527 (Src1.getImm() == 0)) {
8528 MRI.replaceRegWith(Dest.getReg(), CondReg);
8529 return;
8530 }
8531
8532 Register NewCondReg = CondReg;
8533 if (IsSCC) {
8534 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8535 NewCondReg = MRI.createVirtualRegister(TC);
8536
8537 // Now look for the closest SCC def if it is a copy
8538 // replacing the CondReg with the COPY source register
8539 bool CopyFound = false;
8540 for (MachineInstr &CandI :
8541 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8542 Inst.getParent()->rend())) {
8543 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8544 -1) {
8545 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8546 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8547 .addReg(CandI.getOperand(1).getReg());
8548 CopyFound = true;
8549 }
8550 break;
8551 }
8552 }
8553 if (!CopyFound) {
8554 // SCC def is not a copy
8555 // Insert a trivial select instead of creating a copy, because a copy from
8556 // SCC would semantically mean just copying a single bit, but we may need
8557 // the result to be a vector condition mask that needs preserving.
8558 unsigned Opcode =
8559 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8560 auto NewSelect =
8561 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8562 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8563 }
8564 }
8565
8566 Register NewDestReg = MRI.createVirtualRegister(
8567 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8568 MachineInstr *NewInst;
8569 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8570 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8571 .addImm(0)
8572 .add(Src1) // False
8573 .addImm(0)
8574 .add(Src0) // True
8575 .addReg(NewCondReg);
8576 } else {
8577 NewInst =
8578 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8579 .add(Src1) // False
8580 .add(Src0) // True
8581 .addReg(NewCondReg);
8582 }
8583 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8584 legalizeOperands(*NewInst, MDT);
8585 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8586}
8587
8588void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8589 MachineInstr &Inst) const {
8590 MachineBasicBlock &MBB = *Inst.getParent();
8591 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8592 MachineBasicBlock::iterator MII = Inst;
8593 const DebugLoc &DL = Inst.getDebugLoc();
8594
8595 MachineOperand &Dest = Inst.getOperand(0);
8596 MachineOperand &Src = Inst.getOperand(1);
8597 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8598 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8599
8600 unsigned SubOp = ST.hasAddNoCarry() ?
8601 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8602
8603 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8604 .addImm(0)
8605 .addReg(Src.getReg());
8606
8607 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8608 .addReg(Src.getReg())
8609 .addReg(TmpReg);
8610
8611 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8612 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8613}
8614
8615void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8616 MachineInstr &Inst) const {
8617 MachineBasicBlock &MBB = *Inst.getParent();
8618 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8619 MachineBasicBlock::iterator MII = Inst;
8620 const DebugLoc &DL = Inst.getDebugLoc();
8621
8622 MachineOperand &Dest = Inst.getOperand(0);
8623 MachineOperand &Src1 = Inst.getOperand(1);
8624 MachineOperand &Src2 = Inst.getOperand(2);
8625 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8626 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8627 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8628
8629 unsigned SubOp =
8630 ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8631
8632 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8633 .addReg(Src1.getReg())
8634 .addReg(Src2.getReg());
8635
8636 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8637
8638 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8639 .addReg(SubResultReg)
8640 .addReg(TmpReg);
8641
8642 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8643 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8644}
8645
8646void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8647 MachineInstr &Inst) const {
8648 MachineBasicBlock &MBB = *Inst.getParent();
8649 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8650 MachineBasicBlock::iterator MII = Inst;
8651 const DebugLoc &DL = Inst.getDebugLoc();
8652
8653 MachineOperand &Dest = Inst.getOperand(0);
8654 MachineOperand &Src0 = Inst.getOperand(1);
8655 MachineOperand &Src1 = Inst.getOperand(2);
8656
8657 if (ST.hasDLInsts()) {
8658 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8659 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8660 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8661
8662 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8663 .add(Src0)
8664 .add(Src1);
8665
8666 MRI.replaceRegWith(Dest.getReg(), NewDest);
8667 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8668 } else {
8669 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8670 // invert either source and then perform the XOR. If either source is a
8671 // scalar register, then we can leave the inversion on the scalar unit to
8672 // achieve a better distribution of scalar and vector instructions.
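// Bitwise example of the identity (arbitrary 32-bit values): with x = 0x0000000A
// and y = 0x00000006, ~(x ^ y) = ~0x0000000C = 0xFFFFFFF3, and
// (~x) ^ y = 0xFFFFFFF5 ^ 0x00000006 = 0xFFFFFFF3 as well.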
8673 bool Src0IsSGPR = Src0.isReg() &&
8674 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8675 bool Src1IsSGPR = Src1.isReg() &&
8676 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8677 MachineInstr *Xor;
8678 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8679 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8680
8681 // Build a pair of scalar instructions and add them to the work list.
8682 // The next iteration over the work list will lower these to the vector
8683 // unit as necessary.
8684 if (Src0IsSGPR) {
8685 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8686 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8687 .addReg(Temp)
8688 .add(Src1);
8689 } else if (Src1IsSGPR) {
8690 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8691 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8692 .add(Src0)
8693 .addReg(Temp);
8694 } else {
8695 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8696 .add(Src0)
8697 .add(Src1);
8698 MachineInstr *Not =
8699 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8700 Worklist.insert(Not);
8701 }
8702
8703 MRI.replaceRegWith(Dest.getReg(), NewDest);
8704
8705 Worklist.insert(Xor);
8706
8707 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8708 }
8709}
8710
8711void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8712 MachineInstr &Inst,
8713 unsigned Opcode) const {
8714 MachineBasicBlock &MBB = *Inst.getParent();
8715 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8716 MachineBasicBlock::iterator MII = Inst;
8717 const DebugLoc &DL = Inst.getDebugLoc();
8718
8719 MachineOperand &Dest = Inst.getOperand(0);
8720 MachineOperand &Src0 = Inst.getOperand(1);
8721 MachineOperand &Src1 = Inst.getOperand(2);
8722
8723 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8724 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8725
8726 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8727 .add(Src0)
8728 .add(Src1);
8729
8730 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8731 .addReg(Interm);
8732
8733 Worklist.insert(&Op);
8734 Worklist.insert(&Not);
8735
8736 MRI.replaceRegWith(Dest.getReg(), NewDest);
8737 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8738}
8739
8740void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8741 MachineInstr &Inst,
8742 unsigned Opcode) const {
8743 MachineBasicBlock &MBB = *Inst.getParent();
8744 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8745 MachineBasicBlock::iterator MII = Inst;
8746 const DebugLoc &DL = Inst.getDebugLoc();
8747
8748 MachineOperand &Dest = Inst.getOperand(0);
8749 MachineOperand &Src0 = Inst.getOperand(1);
8750 MachineOperand &Src1 = Inst.getOperand(2);
8751
8752 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8753 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8754
8755 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8756 .add(Src1);
8757
8758 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8759 .add(Src0)
8760 .addReg(Interm);
8761
8762 Worklist.insert(&Not);
8763 Worklist.insert(&Op);
8764
8765 MRI.replaceRegWith(Dest.getReg(), NewDest);
8766 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8767}
8768
8769void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8770 MachineInstr &Inst, unsigned Opcode,
8771 bool Swap) const {
8772 MachineBasicBlock &MBB = *Inst.getParent();
8773 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8774
8775 MachineOperand &Dest = Inst.getOperand(0);
8776 MachineOperand &Src0 = Inst.getOperand(1);
8777 const DebugLoc &DL = Inst.getDebugLoc();
8778
8779 MachineBasicBlock::iterator MII = Inst;
8780
8781 const MCInstrDesc &InstDesc = get(Opcode);
8782 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8783 MRI.getRegClass(Src0.getReg()) :
8784 &AMDGPU::SGPR_32RegClass;
8785
8786 const TargetRegisterClass *Src0SubRC =
8787 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8788
8789 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8790 AMDGPU::sub0, Src0SubRC);
8791
8792 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8793 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8794 const TargetRegisterClass *NewDestSubRC =
8795 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8796
8797 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8798 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8799
8800 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8801 AMDGPU::sub1, Src0SubRC);
8802
8803 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8804 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8805
8806 if (Swap)
8807 std::swap(DestSub0, DestSub1);
8808
8809 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8810 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8811 .addReg(DestSub0)
8812 .addImm(AMDGPU::sub0)
8813 .addReg(DestSub1)
8814 .addImm(AMDGPU::sub1);
8815
8816 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8817
8818 Worklist.insert(&LoHalf);
8819 Worklist.insert(&HiHalf);
8820
8821 // We don't need to legalizeOperands here because for a single operand, src0
8822 // will support any kind of input.
8823
8824 // Move all users of this moved value.
8825 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8826}
8827
8828// There is not a vector equivalent of s_mul_u64. For this reason, we need to
8829// split the s_mul_u64 in 32-bit vector multiplications.
8830void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8831 MachineInstr &Inst,
8832 MachineDominatorTree *MDT) const {
8833 MachineBasicBlock &MBB = *Inst.getParent();
8834 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8835
8836 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8837 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8838 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8839
8840 MachineOperand &Dest = Inst.getOperand(0);
8841 MachineOperand &Src0 = Inst.getOperand(1);
8842 MachineOperand &Src1 = Inst.getOperand(2);
8843 const DebugLoc &DL = Inst.getDebugLoc();
8844 MachineBasicBlock::iterator MII = Inst;
8845
8846 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8847 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8848 const TargetRegisterClass *Src0SubRC =
8849 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8850 if (RI.isSGPRClass(Src0SubRC))
8851 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8852 const TargetRegisterClass *Src1SubRC =
8853 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8854 if (RI.isSGPRClass(Src1SubRC))
8855 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8856
8857 // First, we extract the low 32-bit and high 32-bit values from each of the
8858 // operands.
8859 MachineOperand Op0L =
8860 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8861 MachineOperand Op1L =
8862 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8863 MachineOperand Op0H =
8864 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8865 MachineOperand Op1H =
8866 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8867
8868 // The multiplication is done as follows:
8869 //
8870 // Op1H Op1L
8871 // * Op0H Op0L
8872 // --------------------
8873 // Op1H*Op0L Op1L*Op0L
8874 // + Op1H*Op0H Op1L*Op0H
8875 // -----------------------------------------
8876 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8877 //
8878 // We drop Op1H*Op0H because it only contributes to bits above bit 63,
8879 // which do not fit in the 64-bit result.
8880 // The low 32-bit value is Op1L*Op0L.
8881 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
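// Worked example (arbitrary small operands): Op0 = 0x00000002'00000003 and
// Op1 = 0x00000004'00000005. Op1L*Op0L = 5*3 = 15 with no carry out, and
// Op1H*Op0L + Op1L*Op0H = 4*3 + 5*2 = 22, so the 64-bit product is
// 0x00000016'0000000F.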
8882
8883 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8884 MachineInstr *Op1L_Op0H =
8885 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8886 .add(Op1L)
8887 .add(Op0H);
8888
8889 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8890 MachineInstr *Op1H_Op0L =
8891 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8892 .add(Op1H)
8893 .add(Op0L);
8894
8895 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8896 MachineInstr *Carry =
8897 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8898 .add(Op1L)
8899 .add(Op0L);
8900
8901 MachineInstr *LoHalf =
8902 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8903 .add(Op1L)
8904 .add(Op0L);
8905
8906 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8907 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8908 .addReg(Op1L_Op0H_Reg)
8909 .addReg(Op1H_Op0L_Reg);
8910
8911 MachineInstr *HiHalf =
8912 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8913 .addReg(AddReg)
8914 .addReg(CarryReg);
8915
8916 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8917 .addReg(DestSub0)
8918 .addImm(AMDGPU::sub0)
8919 .addReg(DestSub1)
8920 .addImm(AMDGPU::sub1);
8921
8922 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8923
8924 // Try to legalize the operands in case we need to swap the order to keep it
8925 // valid.
8926 legalizeOperands(*Op1L_Op0H, MDT);
8927 legalizeOperands(*Op1H_Op0L, MDT);
8928 legalizeOperands(*Carry, MDT);
8929 legalizeOperands(*LoHalf, MDT);
8930 legalizeOperands(*Add, MDT);
8931 legalizeOperands(*HiHalf, MDT);
8932
8933 // Move all users of this moved value.
8934 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8935}
8936
8937// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
8938// multiplications.
8939void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8940 MachineInstr &Inst,
8941 MachineDominatorTree *MDT) const {
8942 MachineBasicBlock &MBB = *Inst.getParent();
8943 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8944
8945 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8946 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8947 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8948
8949 MachineOperand &Dest = Inst.getOperand(0);
8950 MachineOperand &Src0 = Inst.getOperand(1);
8951 MachineOperand &Src1 = Inst.getOperand(2);
8952 const DebugLoc &DL = Inst.getDebugLoc();
8953 MachineBasicBlock::iterator MII = Inst;
8954
8955 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8956 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8957 const TargetRegisterClass *Src0SubRC =
8958 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8959 if (RI.isSGPRClass(Src0SubRC))
8960 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8961 const TargetRegisterClass *Src1SubRC =
8962 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8963 if (RI.isSGPRClass(Src1SubRC))
8964 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8965
8966 // First, we extract the low 32-bit and high 32-bit values from each of the
8967 // operands.
8968 MachineOperand Op0L =
8969 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8970 MachineOperand Op1L =
8971 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8972
8973 unsigned Opc = Inst.getOpcode();
8974 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8975 ? AMDGPU::V_MUL_HI_U32_e64
8976 : AMDGPU::V_MUL_HI_I32_e64;
8977 MachineInstr *HiHalf =
8978 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8979
8980 MachineInstr *LoHalf =
8981 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8982 .add(Op1L)
8983 .add(Op0L);
8984
8985 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8986 .addReg(DestSub0)
8987 .addImm(AMDGPU::sub0)
8988 .addReg(DestSub1)
8989 .addImm(AMDGPU::sub1);
8990
8991 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8992
8993 // Try to legalize the operands in case we need to swap the order to keep it
8994 // valid.
8995 legalizeOperands(*HiHalf, MDT);
8996 legalizeOperands(*LoHalf, MDT);
8997
8998 // Move all users of this moved value.
8999 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9000}
9001
9002void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
9003 MachineInstr &Inst, unsigned Opcode,
9004 MachineDominatorTree *MDT) const {
9005 MachineBasicBlock &MBB = *Inst.getParent();
9006 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9007
9008 MachineOperand &Dest = Inst.getOperand(0);
9009 MachineOperand &Src0 = Inst.getOperand(1);
9010 MachineOperand &Src1 = Inst.getOperand(2);
9011 const DebugLoc &DL = Inst.getDebugLoc();
9012
9013 MachineBasicBlock::iterator MII = Inst;
9014
9015 const MCInstrDesc &InstDesc = get(Opcode);
9016 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9017 MRI.getRegClass(Src0.getReg()) :
9018 &AMDGPU::SGPR_32RegClass;
9019
9020 const TargetRegisterClass *Src0SubRC =
9021 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9022 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9023 MRI.getRegClass(Src1.getReg()) :
9024 &AMDGPU::SGPR_32RegClass;
9025
9026 const TargetRegisterClass *Src1SubRC =
9027 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9028
9029 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9030 AMDGPU::sub0, Src0SubRC);
9031 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9032 AMDGPU::sub0, Src1SubRC);
9033 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9034 AMDGPU::sub1, Src0SubRC);
9035 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9036 AMDGPU::sub1, Src1SubRC);
9037
9038 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9039 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9040 const TargetRegisterClass *NewDestSubRC =
9041 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9042
9043 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9044 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
9045 .add(SrcReg0Sub0)
9046 .add(SrcReg1Sub0);
9047
9048 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9049 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
9050 .add(SrcReg0Sub1)
9051 .add(SrcReg1Sub1);
9052
9053 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9054 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9055 .addReg(DestSub0)
9056 .addImm(AMDGPU::sub0)
9057 .addReg(DestSub1)
9058 .addImm(AMDGPU::sub1);
9059
9060 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9061
9062 Worklist.insert(&LoHalf);
9063 Worklist.insert(&HiHalf);
9064
9065 // Move all users of this moved value.
9066 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9067}
9068
9069void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9070 MachineInstr &Inst,
9071 MachineDominatorTree *MDT) const {
9072 MachineBasicBlock &MBB = *Inst.getParent();
9073 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9074
9075 MachineOperand &Dest = Inst.getOperand(0);
9076 MachineOperand &Src0 = Inst.getOperand(1);
9077 MachineOperand &Src1 = Inst.getOperand(2);
9078 const DebugLoc &DL = Inst.getDebugLoc();
9079
9080 MachineBasicBlock::iterator MII = Inst;
9081
9082 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9083
9084 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9085
9086 MachineOperand* Op0;
9087 MachineOperand* Op1;
9088
9089 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9090 Op0 = &Src0;
9091 Op1 = &Src1;
9092 } else {
9093 Op0 = &Src1;
9094 Op1 = &Src0;
9095 }
9096
9097 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9098 .add(*Op0);
9099
9100 Register NewDest = MRI.createVirtualRegister(DestRC);
9101
9102 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9103 .addReg(Interm)
9104 .add(*Op1);
9105
9106 MRI.replaceRegWith(Dest.getReg(), NewDest);
9107
9108 Worklist.insert(&Xor);
9109}
9110
9111void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9112 MachineInstr &Inst) const {
9113 MachineBasicBlock &MBB = *Inst.getParent();
9114 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9115
9116 MachineBasicBlock::iterator MII = Inst;
9117 const DebugLoc &DL = Inst.getDebugLoc();
9118
9119 MachineOperand &Dest = Inst.getOperand(0);
9120 MachineOperand &Src = Inst.getOperand(1);
9121
9122 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9123 const TargetRegisterClass *SrcRC = Src.isReg() ?
9124 MRI.getRegClass(Src.getReg()) :
9125 &AMDGPU::SGPR_32RegClass;
9126
9127 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9128 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9129
9130 const TargetRegisterClass *SrcSubRC =
9131 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9132
9133 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9134 AMDGPU::sub0, SrcSubRC);
9135 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9136 AMDGPU::sub1, SrcSubRC);
9137
9138 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9139
9140 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9141
9142 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9143
9144 // We don't need to legalize operands here. src0 for either instruction can be
9145 // an SGPR, and the second input is unused or determined here.
9146 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9147}
9148
9149void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9150 MachineInstr &Inst) const {
9151 MachineBasicBlock &MBB = *Inst.getParent();
9152 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9153 MachineBasicBlock::iterator MII = Inst;
9154 const DebugLoc &DL = Inst.getDebugLoc();
9155
9156 MachineOperand &Dest = Inst.getOperand(0);
9157 uint32_t Imm = Inst.getOperand(2).getImm();
9158 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9159 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9160
9161 (void) Offset;
9162
9163 // Only sext_inreg cases handled.
9164 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9165 Offset == 0 && "Not implemented");
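// For example (hypothetical input): a sign_extend_inreg from i8 (BitWidth 8)
// with src.lo = 0x00000080 yields lo = V_BFE_I32(0x80, 0, 8) = 0xFFFFFF80 and
// hi = ashr(lo, 31) = 0xFFFFFFFF, i.e. the 64-bit value -128.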
9166
9167 if (BitWidth < 32) {
9168 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9169 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9170 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9171
9172 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9173 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
9174 .addImm(0)
9175 .addImm(BitWidth);
9176
9177 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9178 .addImm(31)
9179 .addReg(MidRegLo);
9180
9181 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9182 .addReg(MidRegLo)
9183 .addImm(AMDGPU::sub0)
9184 .addReg(MidRegHi)
9185 .addImm(AMDGPU::sub1);
9186
9187 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9188 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9189 return;
9190 }
9191
9192 MachineOperand &Src = Inst.getOperand(1);
9193 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9194 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9195
9196 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9197 .addImm(31)
9198 .addReg(Src.getReg(), 0, AMDGPU::sub0);
9199
9200 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9201 .addReg(Src.getReg(), 0, AMDGPU::sub0)
9202 .addImm(AMDGPU::sub0)
9203 .addReg(TmpReg)
9204 .addImm(AMDGPU::sub1);
9205
9206 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9207 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9208}
9209
9210void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9211 MachineInstr &Inst, unsigned Opcode,
9212 MachineDominatorTree *MDT) const {
9213 // (S_FLBIT_I32_B64 hi:lo) ->
9214 // (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9215 // (S_FF1_I32_B64 hi:lo) ->
9216 // (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
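// For example, for the cttz-style S_FF1_I32_B64 with lo == 0 (hypothetical
// input): V_FFBL(lo) returns 0xffffffff, so the umin selects
// uaddsat(V_FFBL(hi), 32), i.e. cttz(hi) + 32, and if hi is also zero the
// saturated add keeps the all-ones "no bit found" result.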
9217
9218 MachineBasicBlock &MBB = *Inst.getParent();
9219 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9220 MachineBasicBlock::iterator MII = Inst;
9221 const DebugLoc &DL = Inst.getDebugLoc();
9222
9223 MachineOperand &Dest = Inst.getOperand(0);
9224 MachineOperand &Src = Inst.getOperand(1);
9225
9226 const MCInstrDesc &InstDesc = get(Opcode);
9227
9228 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9229 unsigned OpcodeAdd =
9230 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
9231
9232 const TargetRegisterClass *SrcRC =
9233 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9234 const TargetRegisterClass *SrcSubRC =
9235 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9236
9237 MachineOperand SrcRegSub0 =
9238 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9239 MachineOperand SrcRegSub1 =
9240 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9241
9242 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9243 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9244 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9245 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9246
9247 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9248
9249 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9250
9251 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9252 .addReg(IsCtlz ? MidReg1 : MidReg2)
9253 .addImm(32)
9254 .addImm(1); // enable clamp
9255
9256 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9257 .addReg(MidReg3)
9258 .addReg(IsCtlz ? MidReg2 : MidReg1);
9259
9260 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9261
9262 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9263}
9264
9265void SIInstrInfo::addUsersToMoveToVALUWorklist(
9267 SIInstrWorklist &Worklist) const {
9268 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9269 MachineInstr &UseMI = *MO.getParent();
9270
9271 unsigned OpNo = 0;
9272
9273 switch (UseMI.getOpcode()) {
9274 case AMDGPU::COPY:
9275 case AMDGPU::WQM:
9276 case AMDGPU::SOFT_WQM:
9277 case AMDGPU::STRICT_WWM:
9278 case AMDGPU::STRICT_WQM:
9279 case AMDGPU::REG_SEQUENCE:
9280 case AMDGPU::PHI:
9281 case AMDGPU::INSERT_SUBREG:
9282 break;
9283 default:
9284 OpNo = MO.getOperandNo();
9285 break;
9286 }
9287
9288 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9289 MRI.constrainRegClass(DstReg, OpRC);
9290
9291 if (!RI.hasVectorRegisters(OpRC))
9292 Worklist.insert(&UseMI);
9293 else
9294 // Legalization could change user list.
9296 }
9297}
9298
9299void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9301 MachineInstr &Inst) const {
9302 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9303 MachineBasicBlock *MBB = Inst.getParent();
9304 MachineOperand &Src0 = Inst.getOperand(1);
9305 MachineOperand &Src1 = Inst.getOperand(2);
9306 const DebugLoc &DL = Inst.getDebugLoc();
9307
9308 if (ST.useRealTrue16Insts()) {
9309 Register SrcReg0, SrcReg1;
9310 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9311 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9312 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
9313 } else {
9314 SrcReg0 = Src0.getReg();
9315 }
9316
9317 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9318 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9319 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
9320 } else {
9321 SrcReg1 = Src1.getReg();
9322 }
9323
9324 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9325 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9326
9327 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9328 switch (Inst.getOpcode()) {
9329 case AMDGPU::S_PACK_LL_B32_B16:
9330 NewMI
9331 .addReg(SrcReg0, 0,
9332 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9333 .addImm(AMDGPU::lo16)
9334 .addReg(SrcReg1, 0,
9335 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9336 .addImm(AMDGPU::hi16);
9337 break;
9338 case AMDGPU::S_PACK_LH_B32_B16:
9339 NewMI
9340 .addReg(SrcReg0, 0,
9341 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9342 .addImm(AMDGPU::lo16)
9343 .addReg(SrcReg1, 0, AMDGPU::hi16)
9344 .addImm(AMDGPU::hi16);
9345 break;
9346 case AMDGPU::S_PACK_HL_B32_B16:
9347 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9348 .addImm(AMDGPU::lo16)
9349 .addReg(SrcReg1, 0,
9350 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9351 .addImm(AMDGPU::hi16);
9352 break;
9353 case AMDGPU::S_PACK_HH_B32_B16:
9354 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9355 .addImm(AMDGPU::lo16)
9356 .addReg(SrcReg1, 0, AMDGPU::hi16)
9357 .addImm(AMDGPU::hi16);
9358 break;
9359 default:
9360 llvm_unreachable("unhandled s_pack_* instruction");
9361 }
9362
9363 MachineOperand &Dest = Inst.getOperand(0);
9364 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9365 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9366 return;
9367 }
9368
9369 switch (Inst.getOpcode()) {
9370 case AMDGPU::S_PACK_LL_B32_B16: {
9371 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9372 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9373
9374 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9375 // 0.
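// For reference, S_PACK_LL_B32_B16 computes (Src1 << 16) | (Src0 & 0xffff);
// e.g. Src0 = 0x00011234, Src1 = 0x0000abcd (arbitrary values) packs to
// 0xabcd1234 via the AND and V_LSHL_OR below.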
9376 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9377 .addImm(0xffff);
9378
9379 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9380 .addReg(ImmReg, RegState::Kill)
9381 .add(Src0);
9382
9383 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9384 .add(Src1)
9385 .addImm(16)
9386 .addReg(TmpReg, RegState::Kill);
9387 break;
9388 }
9389 case AMDGPU::S_PACK_LH_B32_B16: {
9390 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9391 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9392 .addImm(0xffff);
9393 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9394 .addReg(ImmReg, RegState::Kill)
9395 .add(Src0)
9396 .add(Src1);
9397 break;
9398 }
9399 case AMDGPU::S_PACK_HL_B32_B16: {
9400 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9401 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9402 .addImm(16)
9403 .add(Src0);
9404 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9405 .add(Src1)
9406 .addImm(16)
9407 .addReg(TmpReg, RegState::Kill);
9408 break;
9409 }
9410 case AMDGPU::S_PACK_HH_B32_B16: {
9411 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9412 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9413 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9414 .addImm(16)
9415 .add(Src0);
9416 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9417 .addImm(0xffff0000);
9418 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9419 .add(Src1)
9420 .addReg(ImmReg, RegState::Kill)
9421 .addReg(TmpReg, RegState::Kill);
9422 break;
9423 }
9424 default:
9425 llvm_unreachable("unhandled s_pack_* instruction");
9426 }
9427
9428 MachineOperand &Dest = Inst.getOperand(0);
9429 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9430 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9431}
9432
9433void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9434 MachineInstr &SCCDefInst,
9435 SIInstrWorklist &Worklist,
9436 Register NewCond) const {
9437
9438 // Ensure that def inst defines SCC, which is still live.
9439 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9440 !Op.isDead() && Op.getParent() == &SCCDefInst);
9441 SmallVector<MachineInstr *, 4> CopyToDelete;
9442 // This assumes that all the users of SCC are in the same block
9443 // as the SCC def.
9444 for (MachineInstr &MI : // Skip the def inst itself.
9445 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9446 SCCDefInst.getParent()->end())) {
9447 // Check if SCC is used first.
9448 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9449 if (SCCIdx != -1) {
9450 if (MI.isCopy()) {
9451 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9452 Register DestReg = MI.getOperand(0).getReg();
9453
9454 MRI.replaceRegWith(DestReg, NewCond);
9455 CopyToDelete.push_back(&MI);
9456 } else {
9457
9458 if (NewCond.isValid())
9459 MI.getOperand(SCCIdx).setReg(NewCond);
9460
9461 Worklist.insert(&MI);
9462 }
9463 }
9464 // Exit if we find another SCC def.
9465 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9466 break;
9467 }
9468 for (auto &Copy : CopyToDelete)
9469 Copy->eraseFromParent();
9470}
9471
9472// Instructions that use SCC may be converted to VALU instructions. When that
9473// happens, the SCC register is changed to VCC_LO. The instruction that defines
9474// SCC must be changed to an instruction that defines VCC. This function makes
9475// sure that the instruction that defines SCC is added to the moveToVALU
9476// worklist.
9477void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9478 SIInstrWorklist &Worklist) const {
9479 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9480 // then there is nothing to do because the defining instruction has been
9481 // converted to a VALU already. If SCC then that instruction needs to be
9482 // converted to a VALU.
9483 for (MachineInstr &MI :
9484 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9485 SCCUseInst->getParent()->rend())) {
9486 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9487 break;
9488 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9489 Worklist.insert(&MI);
9490 break;
9491 }
9492 }
9493}
9494
9495const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9496 const MachineInstr &Inst) const {
9497 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9498
9499 switch (Inst.getOpcode()) {
9500 // For target instructions, getOpRegClass just returns the virtual register
9501 // class associated with the operand, so we need to find an equivalent VGPR
9502 // register class in order to move the instruction to the VALU.
9503 case AMDGPU::COPY:
9504 case AMDGPU::PHI:
9505 case AMDGPU::REG_SEQUENCE:
9506 case AMDGPU::INSERT_SUBREG:
9507 case AMDGPU::WQM:
9508 case AMDGPU::SOFT_WQM:
9509 case AMDGPU::STRICT_WWM:
9510 case AMDGPU::STRICT_WQM: {
9511 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9512 if (RI.isAGPRClass(SrcRC)) {
9513 if (RI.isAGPRClass(NewDstRC))
9514 return nullptr;
9515
9516 switch (Inst.getOpcode()) {
9517 case AMDGPU::PHI:
9518 case AMDGPU::REG_SEQUENCE:
9519 case AMDGPU::INSERT_SUBREG:
9520 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9521 break;
9522 default:
9523 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9524 }
9525
9526 if (!NewDstRC)
9527 return nullptr;
9528 } else {
9529 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9530 return nullptr;
9531
9532 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9533 if (!NewDstRC)
9534 return nullptr;
9535 }
9536
9537 return NewDstRC;
9538 }
9539 default:
9540 return NewDstRC;
9541 }
9542}
9543
9544// Find the one SGPR operand we are allowed to use.
9545Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9546 int OpIndices[3]) const {
9547 const MCInstrDesc &Desc = MI.getDesc();
9548
9549 // Find the one SGPR operand we are allowed to use.
9550 //
9551 // First we need to consider the instruction's operand requirements before
9552 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9553 // of VCC, but we are still bound by the constant bus requirement to only use
9554 // one.
9555 //
9556 // If the operand's class is an SGPR, we can never move it.
9557
9558 Register SGPRReg = findImplicitSGPRRead(MI);
9559 if (SGPRReg)
9560 return SGPRReg;
9561
9562 Register UsedSGPRs[3] = {Register()};
9563 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9564
9565 for (unsigned i = 0; i < 3; ++i) {
9566 int Idx = OpIndices[i];
9567 if (Idx == -1)
9568 break;
9569
9570 const MachineOperand &MO = MI.getOperand(Idx);
9571 if (!MO.isReg())
9572 continue;
9573
9574 // Is this operand statically required to be an SGPR based on the operand
9575 // constraints?
9576 const TargetRegisterClass *OpRC =
9577 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9578 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9579 if (IsRequiredSGPR)
9580 return MO.getReg();
9581
9582 // If this could be a VGPR or an SGPR, check the dynamic register class.
9583 Register Reg = MO.getReg();
9584 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9585 if (RI.isSGPRClass(RegRC))
9586 UsedSGPRs[i] = Reg;
9587 }
9588
9589 // We don't have a required SGPR operand, so we have a bit more freedom in
9590 // selecting operands to move.
9591
9592 // Try to select the most used SGPR. If an SGPR is equal to one of the
9593 // others, we choose that.
9594 //
9595 // e.g.
9596 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9597 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9598
9599 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9600 // prefer those.
9601
9602 if (UsedSGPRs[0]) {
9603 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9604 SGPRReg = UsedSGPRs[0];
9605 }
9606
9607 if (!SGPRReg && UsedSGPRs[1]) {
9608 if (UsedSGPRs[1] == UsedSGPRs[2])
9609 SGPRReg = UsedSGPRs[1];
9610 }
9611
9612 return SGPRReg;
9613}
9614
9616 AMDGPU::OpName OperandName) const {
9617 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9618 return nullptr;
9619
9620 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9621 if (Idx == -1)
9622 return nullptr;
9623
9624 return &MI.getOperand(Idx);
9625}
9626
9628 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9629 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9632 return (Format << 44) |
9633 (1ULL << 56) | // RESOURCE_LEVEL = 1
9634 (3ULL << 60); // OOB_SELECT = 3
9635 }
9636
9637 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9638 if (ST.isAmdHsaOS()) {
9639 // Set ATC = 1. GFX9 doesn't have this bit.
9640 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9641 RsrcDataFormat |= (1ULL << 56);
9642
9643 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9644 // Note that it disables TC L2 caching and therefore decreases performance.
9645 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9646 RsrcDataFormat |= (2ULL << 59);
9647 }
9648
9649 return RsrcDataFormat;
9650}
9651
9655 0xffffffff; // Size;
9656
9657 // GFX9 doesn't have ELEMENT_SIZE.
9658 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9659 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9660 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9661 }
9662
9663 // IndexStride = 64 for wave64 or 32 for wave32, encoded as 3 or 2.
9664 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9665 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9666
9667 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9668 // Clear them unless we want a huge stride.
9669 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9670 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9671 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9672
9673 return Rsrc23;
9674}
9675
9677 unsigned Opc = MI.getOpcode();
9678
9679 return isSMRD(Opc);
9680}
9681
9683 return get(Opc).mayLoad() &&
9684 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9685}
9686
9688 int &FrameIndex) const {
9689 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9690 if (!Addr || !Addr->isFI())
9691 return Register();
9692
9693 assert(!MI.memoperands_empty() &&
9694 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9695
9696 FrameIndex = Addr->getIndex();
9697 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9698}
9699
9701 int &FrameIndex) const {
9702 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9703 assert(Addr && Addr->isFI());
9704 FrameIndex = Addr->getIndex();
9705 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9706}
9707
9709 int &FrameIndex) const {
9710 if (!MI.mayLoad())
9711 return Register();
9712
9713 if (isMUBUF(MI) || isVGPRSpill(MI))
9714 return isStackAccess(MI, FrameIndex);
9715
9716 if (isSGPRSpill(MI))
9717 return isSGPRStackAccess(MI, FrameIndex);
9718
9719 return Register();
9720}
9721
9723 int &FrameIndex) const {
9724 if (!MI.mayStore())
9725 return Register();
9726
9727 if (isMUBUF(MI) || isVGPRSpill(MI))
9728 return isStackAccess(MI, FrameIndex);
9729
9730 if (isSGPRSpill(MI))
9731 return isSGPRStackAccess(MI, FrameIndex);
9732
9733 return Register();
9734}
9735
9737 unsigned Size = 0;
9739 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9740 while (++I != E && I->isInsideBundle()) {
9741 assert(!I->isBundle() && "No nested bundle!");
9742 Size += getInstSizeInBytes(*I);
9743 }
9744
9745 return Size;
9746}
9747
9749 unsigned Opc = MI.getOpcode();
9751 unsigned DescSize = Desc.getSize();
9752
9753 // If we have a definitive size, we can use it. Otherwise we need to inspect
9754 // the operands to know the size.
9755 if (isFixedSize(MI)) {
9756 unsigned Size = DescSize;
9757
9758 // If we hit the buggy offset, an extra nop will be inserted in MC, so
9759 // estimate the worst case.
9760 if (MI.isBranch() && ST.hasOffset3fBug())
9761 Size += 4;
9762
9763 return Size;
9764 }
9765
9766 // Instructions may have a 32-bit literal encoded after them. Check
9767 // operands that could ever be literals.
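// For example (hypothetical instruction): v_add_f32 v0, 0x3e800000, v1 needs a
// 4-byte literal after the 4-byte VOP2 encoding, so it is reported as 8 bytes;
// with 64-bit literal support, a value that does not fit in 32 bits adds 8
// bytes instead.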
9768 if (isVALU(MI) || isSALU(MI)) {
9769 if (isDPP(MI))
9770 return DescSize;
9771 bool HasLiteral = false;
9772 unsigned LiteralSize = 4;
9773 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9774 const MachineOperand &Op = MI.getOperand(I);
9775 const MCOperandInfo &OpInfo = Desc.operands()[I];
9776 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9777 HasLiteral = true;
9778 if (ST.has64BitLiterals()) {
9779 switch (OpInfo.OperandType) {
9780 default:
9781 break;
9783 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9784 LiteralSize = 8;
9785 break;
9787 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9788 LiteralSize = 8;
9789 break;
9790 }
9791 }
9792 break;
9793 }
9794 }
9795 return HasLiteral ? DescSize + LiteralSize : DescSize;
9796 }
9797
9798 // Check whether we have extra NSA words.
9799 if (isMIMG(MI)) {
9800 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9801 if (VAddr0Idx < 0)
9802 return 8;
9803
9804 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
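// RSrcIdx - VAddr0Idx gives the number of vaddr operands; e.g. with 5 vaddr
// operands this is 8 + 4 * ((5 + 2) / 4) = 12 bytes (one extra NSA dword).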
9805 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9806 }
9807
9808 switch (Opc) {
9809 case TargetOpcode::BUNDLE:
9810 return getInstBundleSize(MI);
9811 case TargetOpcode::INLINEASM:
9812 case TargetOpcode::INLINEASM_BR: {
9813 const MachineFunction *MF = MI.getMF();
9814 const char *AsmStr = MI.getOperand(0).getSymbolName();
9815 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9816 }
9817 default:
9818 if (MI.isMetaInstruction())
9819 return 0;
9820
9821 // If D16 Pseudo inst, get correct MC code size
9822 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9823 if (D16Info) {
9824 // Assume the d16_lo/hi variants are always the same size.
9825 unsigned LoInstOpcode = D16Info->LoOp;
9826 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9827 DescSize = Desc.getSize();
9828 }
9829
9830 // If FMA Pseudo inst, get correct MC code size
9831 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9832 // All potential lowerings are the same size; arbitrarily pick one.
9833 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9834 DescSize = Desc.getSize();
9835 }
9836
9837 return DescSize;
9838 }
9839}
9840
9842 if (!isFLAT(MI))
9843 return false;
9844
9845 if (MI.memoperands_empty())
9846 return true;
9847
9848 for (const MachineMemOperand *MMO : MI.memoperands()) {
9849 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9850 return true;
9851 }
9852 return false;
9853}
9854
9857 static const std::pair<int, const char *> TargetIndices[] = {
9858 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9859 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9860 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9861 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9862 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9863 return ArrayRef(TargetIndices);
9864}
9865
9866/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9867/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9873
9874/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9875/// pass.
9880
9881// Called during:
9882// - pre-RA scheduling and post-RA scheduling
9885 const ScheduleDAGMI *DAG) const {
9886 // Borrowed from Arm Target
9887 // We would like to restrict this hazard recognizer to only
9888 // post-RA scheduling; we can tell that we're post-RA because we don't
9889 // track VRegLiveness.
9890 if (!DAG->hasVRegLiveness())
9891 return new GCNHazardRecognizer(DAG->MF);
9893}
9894
9895std::pair<unsigned, unsigned>
9897 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9898}
9899
9902 static const std::pair<unsigned, const char *> TargetFlags[] = {
9903 {MO_GOTPCREL, "amdgpu-gotprel"},
9904 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9905 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9906 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9907 {MO_REL32_LO, "amdgpu-rel32-lo"},
9908 {MO_REL32_HI, "amdgpu-rel32-hi"},
9909 {MO_REL64, "amdgpu-rel64"},
9910 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9911 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9912 {MO_ABS64, "amdgpu-abs64"},
9913 };
9914
9915 return ArrayRef(TargetFlags);
9916}
9917
9920 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9921 {
9922 {MONoClobber, "amdgpu-noclobber"},
9923 {MOLastUse, "amdgpu-last-use"},
9924 {MOCooperative, "amdgpu-cooperative"},
9925 };
9926
9927 return ArrayRef(TargetFlags);
9928}
9929
9931 const MachineFunction &MF) const {
9933 assert(SrcReg.isVirtual());
9934 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9935 return AMDGPU::WWM_COPY;
9936
9937 return AMDGPU::COPY;
9938}
9939
9941 uint16_t Opcode = MI.getOpcode();
9942 // Check if it is SGPR spill or wwm-register spill Opcode.
9943 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
9944 return true;
9945
9946 const MachineFunction *MF = MI.getMF();
9947 const MachineRegisterInfo &MRI = MF->getRegInfo();
9949
9950 // See if this is a live-range split instruction inserted for an SGPR or
9951 // wwm-register. The implicit defs inserted for wwm-registers should also be
9952 // included, as they can appear at the beginning of a basic block.
9953 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
9954 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
9955 return false;
9956
9957 Register Reg = MI.getOperand(0).getReg();
9958 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
9959 return IsLRSplitInst;
9960
9961 return MFI->isWWMReg(Reg);
9962}
9963
9965 Register Reg) const {
9966 // We need to handle instructions which may be inserted during register
9967 // allocation to handle the prolog. The initial prolog instruction may have
9968 // been separated from the start of the block by spills and copies inserted
9969 // for the prolog. However, the insertions for scalar registers can
9970 // always be placed at the BB top as they are independent of the exec mask
9971 // value.
9972 bool IsNullOrVectorRegister = true;
9973 if (Reg) {
9974 const MachineFunction *MF = MI.getMF();
9975 const MachineRegisterInfo &MRI = MF->getRegInfo();
9976 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9977 }
9978
9979 return IsNullOrVectorRegister &&
9980 (canAddToBBProlog(MI) ||
9981 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
9982 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9983}
9984
9988 const DebugLoc &DL,
9989 Register DestReg) const {
9990 if (ST.hasAddNoCarry())
9991 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9992
9993 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9994 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9995 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9996
9997 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9998 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9999}
10000
10003 const DebugLoc &DL,
10004 Register DestReg,
10005 RegScavenger &RS) const {
10006 if (ST.hasAddNoCarry())
10007 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
10008
10009 // If available, prefer to use vcc.
10010 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
10011 ? Register(RI.getVCC())
10012 : RS.scavengeRegisterBackwards(
10013 *RI.getBoolRC(), I, /* RestoreAfter */ false,
10014 0, /* AllowSpill */ false);
10015
10016 // TODO: Users need to deal with this.
10017 if (!UnusedCarry.isValid())
10018 return MachineInstrBuilder();
10019
10020 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10021 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10022}
10023
10024bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10025 switch (Opcode) {
10026 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10027 case AMDGPU::SI_KILL_I1_TERMINATOR:
10028 return true;
10029 default:
10030 return false;
10031 }
10032}
10033
10035 switch (Opcode) {
10036 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10037 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10038 case AMDGPU::SI_KILL_I1_PSEUDO:
10039 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10040 default:
10041 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10042 }
10043}
10044
10045bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10046 return Imm <= getMaxMUBUFImmOffset(ST);
10047}
10048
10050 // GFX12 field is non-negative 24-bit signed byte offset.
10051 const unsigned OffsetBits =
10052 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10053 return (1 << OffsetBits) - 1;
10054}
10055
10057 if (!ST.isWave32())
10058 return;
10059
10060 if (MI.isInlineAsm())
10061 return;
10062
10063 for (auto &Op : MI.implicit_operands()) {
10064 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10065 Op.setReg(AMDGPU::VCC_LO);
10066 }
10067}
10068
10070 if (!isSMRD(MI))
10071 return false;
10072
10073 // Check that it is using a buffer resource.
10074 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10075 if (Idx == -1) // e.g. s_memtime
10076 return false;
10077
10078 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10079 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10080}
10081
10082// Given Imm, split it into the values to put into the SOffset and ImmOffset
10083// fields in an MUBUF instruction. Return false if it is not possible (due to a
10084// hardware bug needing a workaround).
10085//
10086// The required alignment ensures that individual address components remain
10087// aligned if they are aligned to begin with. It also ensures that additional
10088// offsets within the given alignment can be added to the resulting ImmOffset.
10090 uint32_t &ImmOffset, Align Alignment) const {
10091 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10092 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10093 uint32_t Overflow = 0;
10094
10095 if (Imm > MaxImm) {
10096 if (Imm <= MaxImm + 64) {
10097 // Use an SOffset inline constant for 4..64
10098 Overflow = Imm - MaxImm;
10099 Imm = MaxImm;
10100 } else {
10101 // Try to keep the same value in SOffset for adjacent loads, so that
10102 // the corresponding register contents can be re-used.
10103 //
10104 // Load values with all low-bits (except for alignment bits) set into
10105 // SOffset, so that a larger range of values can be covered using
10106 // s_movk_i32.
10107 //
10108 // Atomic operations fail to work correctly when individual address
10109 // components are unaligned, even if their sum is aligned.
10110 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10111 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10112 Imm = Low;
10113 Overflow = High - Alignment.value();
10114 }
10115 }
10116
10117 if (Overflow > 0) {
10118 // There is a hardware bug in SI and CI which prevents address clamping in
10119 // MUBUF instructions from working correctly with SOffsets. The immediate
10120 // offset is unaffected.
10121 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10122 return false;
10123
10124 // It is not possible to set an immediate in the SOffset field on some targets.
10125 if (ST.hasRestrictedSOffset())
10126 return false;
10127 }
10128
10129 ImmOffset = Imm;
10130 SOffset = Overflow;
10131 return true;
10132}
10133
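// Illustrative sketch for splitMUBUFOffset above (for exposition only; it
// assumes a pre-GFX12 target where getMaxMUBUFImmOffset() returns 4095, a
// requested alignment of 4 so MaxImm = alignDown(4095, 4) = 4092, the usual
// splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment) parameter order, and
// TII pointing at a const SIInstrInfo):
//
//   uint32_t SOffset = 0, ImmOffset = 0;
//   TII->splitMUBUFOffset(/*Imm=*/4100, SOffset, ImmOffset, Align(4));
//   // SOffset inline-constant path: ImmOffset = 4092, SOffset = 8.
//   TII->splitMUBUFOffset(/*Imm=*/8192, SOffset, ImmOffset, Align(4));
//   // s_movk_i32-friendly path: High = (8192 + 4) & ~4095 = 8192,
//   // Low = (8192 + 4) & 4095 = 4, so ImmOffset = 4 and SOffset = 8188.
//
// In both cases ImmOffset + SOffset reconstructs the original offset. When an
// SOffset would be required, the function instead returns false on SI/CI
// (the SOffset clamping bug) and on targets with a restricted SOffset field.
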
10134// Depending on the used address space and instructions, some immediate offsets
10135// are allowed and some are not.
10136// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10137// scratch instruction offsets can also be negative. On GFX12, offsets can be
10138// negative for all variants.
10139//
10140// There are several bugs related to these offsets:
10141// On gfx10.1, flat instructions that go into the global address space cannot
10142// use an offset.
10143//
10144// For scratch instructions, the address can be either an SGPR or a VGPR.
10145// The following offsets can be used, depending on the architecture (x means
10146// cannot be used):
10147// +----------------------------+------+------+
10148// | Address-Mode | SGPR | VGPR |
10149// +----------------------------+------+------+
10150// | gfx9 | | |
10151// | negative, 4-aligned offset | x | ok |
10152// | negative, unaligned offset | x | ok |
10153// +----------------------------+------+------+
10154// | gfx10 | | |
10155// | negative, 4-aligned offset | ok | ok |
10156// | negative, unaligned offset | ok | x |
10157// +----------------------------+------+------+
10158// | gfx10.3 | | |
10159// | negative, 4-aligned offset | ok | ok |
10160// | negative, unaligned offset | ok | ok |
10161// +----------------------------+------+------+
10162//
10163// This function ignores the addressing mode, so if an offset cannot be used in
10164// one addressing mode, it is considered illegal.
10165bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10166 uint64_t FlatVariant) const {
10167 // TODO: Should 0 be special cased?
10168 if (!ST.hasFlatInstOffsets())
10169 return false;
10170
10171 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10172 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10173 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10174 return false;
10175
10176 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10177 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10178 (Offset % 4) != 0) {
10179 return false;
10180 }
10181
10182 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10183 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10184 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10185}
10186
10187// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what is not.
10188std::pair<int64_t, int64_t>
10189SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10190 uint64_t FlatVariant) const {
10191 int64_t RemainderOffset = COffsetVal;
10192 int64_t ImmField = 0;
10193
10194 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10195 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10196
10197 if (AllowNegative) {
10198 // Use signed division by a power of two to truncate towards 0.
10199 int64_t D = 1LL << NumBits;
10200 RemainderOffset = (COffsetVal / D) * D;
10201 ImmField = COffsetVal - RemainderOffset;
10202
10203 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10204 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10205 (ImmField % 4) != 0) {
10206 // Make ImmField a multiple of 4
10207 RemainderOffset += ImmField % 4;
10208 ImmField -= ImmField % 4;
10209 }
10210 } else if (COffsetVal >= 0) {
10211 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10212 RemainderOffset = COffsetVal - ImmField;
10213 }
10214
10215 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10216 assert(RemainderOffset + ImmField == COffsetVal);
10217 return {ImmField, RemainderOffset};
10218}
10219
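// Illustrative sketch for splitFlatOffset above (for exposition only; it
// assumes, purely for illustration, a target where
// AMDGPU::getNumFlatOffsetBits(ST) returns 13, so NumBits = 12 and the
// truncation step uses D = 1 << 12 = 4096):
//
//   // Negative offsets allowed (and no unaligned-negative-offset bug):
//   //   COffsetVal = -5000  =>  {ImmField = -904, RemainderOffset = -4096}
//   // Negative offsets not allowed:
//   //   COffsetVal =  5000  =>  {ImmField =  904, RemainderOffset =  4096}
//
// In both cases ImmField + RemainderOffset == COffsetVal and ImmField is legal
// per isLegalFLATOffset, matching the asserts at the end of the function.
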
10221 if (ST.hasNegativeScratchOffsetBug() &&
10222 FlatVariant == SIInstrFlags::FlatScratch)
10223 return false;
10224
10225 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10226}
10227
10228static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10229 switch (ST.getGeneration()) {
10230 default:
10231 break;
10234 return SIEncodingFamily::SI;
10237 return SIEncodingFamily::VI;
10243 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10245 }
10246 llvm_unreachable("Unknown subtarget generation!");
10247}
10248
10249bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10250 switch(MCOp) {
10251 // These opcodes use indirect register addressing so
10252 // they need special handling by codegen (currently missing).
10253 // Therefore it is too risky to allow these opcodes
10254 // to be selected by dpp combiner or sdwa peepholer.
10255 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10256 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10257 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10258 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10259 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10260 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10261 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10262 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10263 return true;
10264 default:
10265 return false;
10266 }
10267}
10268
10269#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10270 case OPCODE##_dpp: \
10271 case OPCODE##_e32: \
10272 case OPCODE##_e64: \
10273 case OPCODE##_e64_dpp: \
10274 case OPCODE##_sdwa:
10275
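// Illustrative expansion (for exposition only): a single use of the macro
// above, e.g. GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32), expands to the
// five case labels:
//
//   case AMDGPU::V_ADD_U32_dpp:
//   case AMDGPU::V_ADD_U32_e32:
//   case AMDGPU::V_ADD_U32_e64:
//   case AMDGPU::V_ADD_U32_e64_dpp:
//   case AMDGPU::V_ADD_U32_sdwa:
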
10276static bool isRenamedInGFX9(int Opcode) {
10277 switch (Opcode) {
10278 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10279 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10280 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10281 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10282 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10283 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10284 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10285 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10286 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10287 //
10288 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10289 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10290 case AMDGPU::V_FMA_F16_gfx9_e64:
10291 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10292 case AMDGPU::V_INTERP_P2_F16:
10293 case AMDGPU::V_MAD_F16_e64:
10294 case AMDGPU::V_MAD_U16_e64:
10295 case AMDGPU::V_MAD_I16_e64:
10296 return true;
10297 default:
10298 return false;
10299 }
10300}
10301
10302int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10303 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10304 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10305
10306 unsigned Gen = subtargetEncodingFamily(ST);
10307
10308 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10310
10311 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10312 // subtarget has UnpackedD16VMem feature.
10313 // TODO: remove this when we discard GFX80 encoding.
10314 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10316
10317 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10318 switch (ST.getGeneration()) {
10319 default:
10321 break;
10324 break;
10327 break;
10328 }
10329 }
10330
10331 if (isMAI(Opcode)) {
10332 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10333 if (MFMAOp != -1)
10334 Opcode = MFMAOp;
10335 }
10336
10337 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10338
10339 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10341
10342 // -1 means that Opcode is already a native instruction.
10343 if (MCOp == -1)
10344 return Opcode;
10345
10346 if (ST.hasGFX90AInsts()) {
10347 uint16_t NMCOp = (uint16_t)-1;
10348 if (ST.hasGFX940Insts())
10350 if (NMCOp == (uint16_t)-1)
10352 if (NMCOp == (uint16_t)-1)
10354 if (NMCOp != (uint16_t)-1)
10355 MCOp = NMCOp;
10356 }
10357
10358 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10359 // no encoding in the given subtarget generation.
10360 if (MCOp == (uint16_t)-1)
10361 return -1;
10362
10363 if (isAsmOnlyOpcode(MCOp))
10364 return -1;
10365
10366 return MCOp;
10367}
10368
10369static
10371 assert(RegOpnd.isReg());
10372 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10373 getRegSubRegPair(RegOpnd);
10374}
10375
10378 assert(MI.isRegSequence());
10379 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10380 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10381 auto &RegOp = MI.getOperand(1 + 2 * I);
10382 return getRegOrUndef(RegOp);
10383 }
10385}
10386
10387// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10388// Following a subreg of reg:subreg isn't supported
10391 if (!RSR.SubReg)
10392 return false;
10393 switch (MI.getOpcode()) {
10394 default: break;
10395 case AMDGPU::REG_SEQUENCE:
10396 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10397 return true;
10398 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10399 case AMDGPU::INSERT_SUBREG:
10400 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10401 // inserted the subreg we're looking for
10402 RSR = getRegOrUndef(MI.getOperand(2));
10403 else { // the subreg in the rest of the reg
10404 auto R1 = getRegOrUndef(MI.getOperand(1));
10405 if (R1.SubReg) // subreg of subreg isn't supported
10406 return false;
10407 RSR.Reg = R1.Reg;
10408 }
10409 return true;
10410 }
10411 return false;
10412}
10413
10415 const MachineRegisterInfo &MRI) {
10416 assert(MRI.isSSA());
10417 if (!P.Reg.isVirtual())
10418 return nullptr;
10419
10420 auto RSR = P;
10421 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10422 while (auto *MI = DefInst) {
10423 DefInst = nullptr;
10424 switch (MI->getOpcode()) {
10425 case AMDGPU::COPY:
10426 case AMDGPU::V_MOV_B32_e32: {
10427 auto &Op1 = MI->getOperand(1);
10428 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10429 if (Op1.isUndef())
10430 return nullptr;
10431 RSR = getRegSubRegPair(Op1);
10432 DefInst = MRI.getVRegDef(RSR.Reg);
10433 }
10434 break;
10435 }
10436 default:
10437 if (followSubRegDef(*MI, RSR)) {
10438 if (!RSR.Reg)
10439 return nullptr;
10440 DefInst = MRI.getVRegDef(RSR.Reg);
10441 }
10442 }
10443 if (!DefInst)
10444 return MI;
10445 }
10446 return nullptr;
10447}
10448
10450 Register VReg,
10451 const MachineInstr &DefMI,
10452 const MachineInstr &UseMI) {
10453 assert(MRI.isSSA() && "Must be run on SSA");
10454
10455 auto *TRI = MRI.getTargetRegisterInfo();
10456 auto *DefBB = DefMI.getParent();
10457
10458 // Don't bother searching between blocks, although it is possible this block
10459 // doesn't modify exec.
10460 if (UseMI.getParent() != DefBB)
10461 return true;
10462
10463 const int MaxInstScan = 20;
10464 int NumInst = 0;
10465
10466 // Stop scan at the use.
10467 auto E = UseMI.getIterator();
10468 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10469 if (I->isDebugInstr())
10470 continue;
10471
10472 if (++NumInst > MaxInstScan)
10473 return true;
10474
10475 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10476 return true;
10477 }
10478
10479 return false;
10480}
10481
10483 Register VReg,
10484 const MachineInstr &DefMI) {
10485 assert(MRI.isSSA() && "Must be run on SSA");
10486
10487 auto *TRI = MRI.getTargetRegisterInfo();
10488 auto *DefBB = DefMI.getParent();
10489
10490 const int MaxUseScan = 10;
10491 int NumUse = 0;
10492
10493 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10494 auto &UseInst = *Use.getParent();
10495 // Don't bother searching between blocks, although it is possible this block
10496 // doesn't modify exec.
10497 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10498 return true;
10499
10500 if (++NumUse > MaxUseScan)
10501 return true;
10502 }
10503
10504 if (NumUse == 0)
10505 return false;
10506
10507 const int MaxInstScan = 20;
10508 int NumInst = 0;
10509
10510 // Stop scan when we have seen all the uses.
10511 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10512 assert(I != DefBB->end());
10513
10514 if (I->isDebugInstr())
10515 continue;
10516
10517 if (++NumInst > MaxInstScan)
10518 return true;
10519
10520 for (const MachineOperand &Op : I->operands()) {
10521 // We don't check reg masks here as they're used only on calls:
10522 // 1. EXEC is only considered const within one BB
10523 // 2. Call should be a terminator instruction if present in a BB
10524
10525 if (!Op.isReg())
10526 continue;
10527
10528 Register Reg = Op.getReg();
10529 if (Op.isUse()) {
10530 if (Reg == VReg && --NumUse == 0)
10531 return false;
10532 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10533 return true;
10534 }
10535 }
10536}
10537
10540 const DebugLoc &DL, Register Src, Register Dst) const {
10541 auto Cur = MBB.begin();
10542 if (Cur != MBB.end())
10543 do {
10544 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10545 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10546 ++Cur;
10547 } while (Cur != MBB.end() && Cur != LastPHIIt);
10548
10549 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10550 Dst);
10551}
10552
10555 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10556 if (InsPt != MBB.end() &&
10557 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10558 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10559 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10560 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10561 InsPt++;
10562 return BuildMI(MBB, InsPt, DL,
10563 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10564 .addReg(Src, 0, SrcSubReg)
10565 .addReg(AMDGPU::EXEC, RegState::Implicit);
10566 }
10567 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10568 Dst);
10569}
10570
10571bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10572
10575 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10576 VirtRegMap *VRM) const {
10577 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10578 //
10579 // %0:sreg_32 = COPY $m0
10580 //
10581 // We explicitly chose SReg_32 for the virtual register so such a copy might
10582 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10583 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10584 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10585 // TargetInstrInfo::foldMemoryOperand() is going to try.
10586 // A similar issue also exists with spilling and reloading $exec registers.
10587 //
10588 // To prevent that, constrain the %0 register class here.
10589 if (isFullCopyInstr(MI)) {
10590 Register DstReg = MI.getOperand(0).getReg();
10591 Register SrcReg = MI.getOperand(1).getReg();
10592 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10593 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10595 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10596 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10597 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10598 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10599 return nullptr;
10600 }
10601 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10602 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10603 return nullptr;
10604 }
10605 }
10606 }
10607
10608 return nullptr;
10609}
10610
10612 const MachineInstr &MI,
10613 unsigned *PredCost) const {
10614 if (MI.isBundle()) {
10616 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10617 unsigned Lat = 0, Count = 0;
10618 for (++I; I != E && I->isBundledWithPred(); ++I) {
10619 ++Count;
10620 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10621 }
10622 return Lat + Count - 1;
10623 }
10624
10625 return SchedModel.computeInstrLatency(&MI);
10626}
10627
10628const MachineOperand &
10630 if (const MachineOperand *CallAddrOp =
10631 getNamedOperand(MI, AMDGPU::OpName::src0))
10632 return *CallAddrOp;
10634}
10635
10638 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10639 unsigned Opcode = MI.getOpcode();
10640
10641 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10642 Register Dst = MI.getOperand(0).getReg();
10643 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10644 : MI.getOperand(1).getReg();
10645 LLT DstTy = MRI.getType(Dst);
10646 LLT SrcTy = MRI.getType(Src);
10647 unsigned DstAS = DstTy.getAddressSpace();
10648 unsigned SrcAS = SrcTy.getAddressSpace();
10649 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10650 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10651 ST.hasGloballyAddressableScratch()
10654 };
10655
10656 // If the target supports globally addressable scratch, the mapping from
10657 // scratch memory to the flat aperture changes therefore an address space cast
10658 // is no longer uniform.
10659 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10660 return HandleAddrSpaceCast(MI);
10661
10662 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10663 auto IID = GI->getIntrinsicID();
10668
10669 switch (IID) {
10670 case Intrinsic::amdgcn_addrspacecast_nonnull:
10671 return HandleAddrSpaceCast(MI);
10672 case Intrinsic::amdgcn_if:
10673 case Intrinsic::amdgcn_else:
10674 // FIXME: Uniform if second result
10675 break;
10676 }
10677
10679 }
10680
10681 // Loads from the private and flat address spaces are divergent, because
10682 // threads can execute the load instruction with the same inputs and get
10683 // different results.
10684 //
10685 // All other loads are not divergent, because if threads issue loads with the
10686 // same arguments, they will always get the same result.
10687 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10688 Opcode == AMDGPU::G_SEXTLOAD) {
10689 if (MI.memoperands_empty())
10690 return InstructionUniformity::NeverUniform; // conservative assumption
10691
10692 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10693 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10694 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10695 })) {
10696 // At least one MMO in a non-global address space.
10698 }
10700 }
10701
10702 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10703 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10704 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10705 AMDGPU::isGenericAtomic(Opcode)) {
10707 }
10709}
10710
10713
10714 if (isNeverUniform(MI))
10716
10717 unsigned opcode = MI.getOpcode();
10718 if (opcode == AMDGPU::V_READLANE_B32 ||
10719 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10720 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10722
10723 if (isCopyInstr(MI)) {
10724 const MachineOperand &srcOp = MI.getOperand(1);
10725 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10726 const TargetRegisterClass *regClass =
10727 RI.getPhysRegBaseClass(srcOp.getReg());
10728 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10730 }
10732 }
10733
10734 // GMIR handling
10735 if (MI.isPreISelOpcode())
10737
10738 // Atomics are divergent because they are executed sequentially: when an
10739 // atomic operation refers to the same address in each thread, then each
10740 // thread after the first sees the value written by the previous thread as
10741 // the original value.
10742
10743 if (isAtomic(MI))
10745
10746 // Loads from the private and flat address spaces are divergent, because
10747 // threads can execute the load instruction with the same inputs and get
10748 // different results.
10749 if (isFLAT(MI) && MI.mayLoad()) {
10750 if (MI.memoperands_empty())
10751 return InstructionUniformity::NeverUniform; // conservative assumption
10752
10753 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10754 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10755 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10756 })) {
10757 // At least one MMO in a non-global address space.
10759 }
10760
10762 }
10763
10764 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10765 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10766
10767 // FIXME: It's conceptually broken to report this for an instruction, and not
10768 // a specific def operand. For inline asm in particular, there could be mixed
10769 // uniform and divergent results.
10770 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10771 const MachineOperand &SrcOp = MI.getOperand(I);
10772 if (!SrcOp.isReg())
10773 continue;
10774
10775 Register Reg = SrcOp.getReg();
10776 if (!Reg || !SrcOp.readsReg())
10777 continue;
10778
10779 // If RegBank is null, this is unassigned or an unallocatable special
10780 // register, which are all scalars.
10781 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10782 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10784 }
10785
10786 // TODO: Uniformity check conditions above can be rearranged for more
10787 // readability
10788
10789 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10790 // currently turned into no-op COPYs by SelectionDAG ISel and are
10791 // therefore no longer recognizable.
10792
10794}
10795
10797 switch (MF.getFunction().getCallingConv()) {
10799 return 1;
10801 return 2;
10803 return 3;
10807 const Function &F = MF.getFunction();
10808 F.getContext().diagnose(DiagnosticInfoUnsupported(
10809 F, "ds_ordered_count unsupported for this calling conv"));
10810 [[fallthrough]];
10811 }
10814 case CallingConv::C:
10815 case CallingConv::Fast:
10816 default:
10817 // Assume other calling conventions are various compute callable functions
10818 return 0;
10819 }
10820}
10821
10823 Register &SrcReg2, int64_t &CmpMask,
10824 int64_t &CmpValue) const {
10825 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10826 return false;
10827
10828 switch (MI.getOpcode()) {
10829 default:
10830 break;
10831 case AMDGPU::S_CMP_EQ_U32:
10832 case AMDGPU::S_CMP_EQ_I32:
10833 case AMDGPU::S_CMP_LG_U32:
10834 case AMDGPU::S_CMP_LG_I32:
10835 case AMDGPU::S_CMP_LT_U32:
10836 case AMDGPU::S_CMP_LT_I32:
10837 case AMDGPU::S_CMP_GT_U32:
10838 case AMDGPU::S_CMP_GT_I32:
10839 case AMDGPU::S_CMP_LE_U32:
10840 case AMDGPU::S_CMP_LE_I32:
10841 case AMDGPU::S_CMP_GE_U32:
10842 case AMDGPU::S_CMP_GE_I32:
10843 case AMDGPU::S_CMP_EQ_U64:
10844 case AMDGPU::S_CMP_LG_U64:
10845 SrcReg = MI.getOperand(0).getReg();
10846 if (MI.getOperand(1).isReg()) {
10847 if (MI.getOperand(1).getSubReg())
10848 return false;
10849 SrcReg2 = MI.getOperand(1).getReg();
10850 CmpValue = 0;
10851 } else if (MI.getOperand(1).isImm()) {
10852 SrcReg2 = Register();
10853 CmpValue = MI.getOperand(1).getImm();
10854 } else {
10855 return false;
10856 }
10857 CmpMask = ~0;
10858 return true;
10859 case AMDGPU::S_CMPK_EQ_U32:
10860 case AMDGPU::S_CMPK_EQ_I32:
10861 case AMDGPU::S_CMPK_LG_U32:
10862 case AMDGPU::S_CMPK_LG_I32:
10863 case AMDGPU::S_CMPK_LT_U32:
10864 case AMDGPU::S_CMPK_LT_I32:
10865 case AMDGPU::S_CMPK_GT_U32:
10866 case AMDGPU::S_CMPK_GT_I32:
10867 case AMDGPU::S_CMPK_LE_U32:
10868 case AMDGPU::S_CMPK_LE_I32:
10869 case AMDGPU::S_CMPK_GE_U32:
10870 case AMDGPU::S_CMPK_GE_I32:
10871 SrcReg = MI.getOperand(0).getReg();
10872 SrcReg2 = Register();
10873 CmpValue = MI.getOperand(1).getImm();
10874 CmpMask = ~0;
10875 return true;
10876 }
10877
10878 return false;
10879}
10880
10882 for (MachineBasicBlock *S : MBB->successors()) {
10883 if (S->isLiveIn(AMDGPU::SCC))
10884 return false;
10885 }
10886 return true;
10887}
10888
10889// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
10890// (incoming SCC) = !(SCC defined by SCCDef).
10891// Return true if all uses can be re-written, false otherwise.
10892bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
10893 MachineBasicBlock *MBB = SCCDef->getParent();
10894 SmallVector<MachineInstr *> InvertInstr;
10895 bool SCCIsDead = false;
10896
10897 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
10898 constexpr unsigned ScanLimit = 12;
10899 unsigned Count = 0;
10900 for (MachineInstr &MI :
10901 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
10902 if (++Count > ScanLimit)
10903 return false;
10904 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
10905 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
10906 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
10907 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10908 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
10909 InvertInstr.push_back(&MI);
10910 else
10911 return false;
10912 }
10913 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
10914 SCCIsDead = true;
10915 break;
10916 }
10917 }
10918 if (!SCCIsDead && isSCCDeadOnExit(MBB))
10919 SCCIsDead = true;
10920
10921 // SCC may have more uses. Can't invert all of them.
10922 if (!SCCIsDead)
10923 return false;
10924
10925 // Invert uses
10926 for (MachineInstr *MI : InvertInstr) {
10927 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
10928 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
10929 swapOperands(*MI);
10930 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10931 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
10932 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
10933 ? AMDGPU::S_CBRANCH_SCC1
10934 : AMDGPU::S_CBRANCH_SCC0));
10935 } else {
10936 llvm_unreachable("SCC used but no inversion handling");
10937 }
10938 }
10939 return true;
10940}
10941
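// Schematic (MIR-flavored) sketch of the rewrites performed by invertSCCUse
// above (for exposition only). When the incoming SCC is known to be the
// inverse of the value SCCDef would produce, the recognized users are flipped:
//
//   %x = S_CSELECT_B32 %a, %b, implicit $scc   -->  S_CSELECT_B32 %b, %a, ...
//   S_CBRANCH_SCC0 %bb.1, implicit $scc        -->  S_CBRANCH_SCC1 %bb.1, ...
//
// Any other reader of SCC, or SCC still live once the 12-instruction scan
// limit is exceeded, makes the function return false and blocks the caller's
// optimization.
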
10942// SCC is already valid after SCCValid.
10943// SCCRedefine will redefine SCC to the same value already available after
10944// SCCValid. If there are no intervening SCC conflicts, delete SCCRedefine and
10945// update kill/dead flags if necessary.
10946bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
10947 bool NeedInversion) const {
10948 MachineInstr *KillsSCC = nullptr;
10949 if (SCCValid->getParent() != SCCRedefine->getParent())
10950 return false;
10951 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
10952 SCCRedefine->getIterator())) {
10953 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10954 return false;
10955 if (MI.killsRegister(AMDGPU::SCC, &RI))
10956 KillsSCC = &MI;
10957 }
10958 if (NeedInversion && !invertSCCUse(SCCRedefine))
10959 return false;
10960 if (MachineOperand *SccDef =
10961 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
10962 SccDef->setIsDead(false);
10963 if (KillsSCC)
10964 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
10965 SCCRedefine->eraseFromParent();
10966 return true;
10967}
10968
10969static bool foldableSelect(const MachineInstr &Def) {
10970 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
10971 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
10972 return false;
10973 bool Op1IsNonZeroImm =
10974 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
10975 bool Op2IsZeroImm =
10976 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
10977 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
10978 return false;
10979 return true;
10980}
10981
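// Illustrative pattern (for exposition only): the select accepted by
// foldableSelect above is, schematically,
//
//   %sel = S_CSELECT_B32 <non-zero imm>, 0, implicit $scc
//
// i.e. a non-zero immediate in operand 1 and a zero immediate in operand 2, so
// the result is non-zero exactly when the incoming SCC was set and an
// s_cmp_lg of that result against 0 is redundant.
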
10983 Register SrcReg2, int64_t CmpMask,
10984 int64_t CmpValue,
10985 const MachineRegisterInfo *MRI) const {
10986 if (!SrcReg || SrcReg.isPhysical())
10987 return false;
10988
10989 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10990 return false;
10991
10992 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
10993 this](bool NeedInversion) -> bool {
10994 if (CmpValue != 0)
10995 return false;
10996
10997 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10998 if (!Def)
10999 return false;
11000
11001 // For S_OP that set SCC = DST!=0, do the transformation
11002 //
11003 // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
11004
11005 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11006 // for S_CSELECT* already has the same value that will be calculated by
11007 // s_cmp_lg_*
11008 //
11009 // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
11010 // imm), 0)
11011 if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
11012 return false;
11013
11014 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11015 return false;
11016
11017 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11018 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11019 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11020 // sX = s_cselect_b64 (non-zero imm), 0
11021 // sLo = copy sX.sub0
11022 // sHi = copy sX.sub1
11023 // sY = s_or_b32 sLo, sHi
11024 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11025 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11026 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11027 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11028 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11029 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11030 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
11031 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11032 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11033 Def2->getOperand(1).isReg() &&
11034 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11035 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11036 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11037 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
11038 if (Select && foldableSelect(*Select))
11039 optimizeSCC(Select, Def, false);
11040 }
11041 }
11042 }
11043 return true;
11044 };
11045
11046 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11047 this](int64_t ExpectedValue, unsigned SrcSize,
11048 bool IsReversible, bool IsSigned) -> bool {
11049 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11050 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11051 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11052 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11053 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11054 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11055 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11056 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11057 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11058 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11059 //
11060 // Signed ge/gt are not used for the sign bit.
11061 //
11062 // If result of the AND is unused except in the compare:
11063 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11064 //
11065 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11066 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11067 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11068 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11069 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11070 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11071
11072 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11073 if (!Def)
11074 return false;
11075
11076 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11077 Def->getOpcode() != AMDGPU::S_AND_B64)
11078 return false;
11079
11080 int64_t Mask;
11081 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11082 if (MO->isImm())
11083 Mask = MO->getImm();
11084 else if (!getFoldableImm(MO, Mask))
11085 return false;
11086 Mask &= maxUIntN(SrcSize);
11087 return isPowerOf2_64(Mask);
11088 };
11089
11090 MachineOperand *SrcOp = &Def->getOperand(1);
11091 if (isMask(SrcOp))
11092 SrcOp = &Def->getOperand(2);
11093 else if (isMask(&Def->getOperand(2)))
11094 SrcOp = &Def->getOperand(1);
11095 else
11096 return false;
11097
11098 // A valid Mask is required to have a single bit set, hence a non-zero and
11099 // power-of-two value. This verifies that we will not do 64-bit shift below.
11100 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11101 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11102 if (IsSigned && BitNo == SrcSize - 1)
11103 return false;
11104
11105 ExpectedValue <<= BitNo;
11106
11107 bool IsReversedCC = false;
11108 if (CmpValue != ExpectedValue) {
11109 if (!IsReversible)
11110 return false;
11111 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11112 if (!IsReversedCC)
11113 return false;
11114 }
11115
11116 Register DefReg = Def->getOperand(0).getReg();
11117 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11118 return false;
11119
11120 if (!optimizeSCC(Def, &CmpInstr, false))
11121 return false;
11122
11123 if (!MRI->use_nodbg_empty(DefReg)) {
11124 assert(!IsReversedCC);
11125 return true;
11126 }
11127
11128 // Replace AND with unused result with a S_BITCMP.
11129 MachineBasicBlock *MBB = Def->getParent();
11130
11131 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11132 : AMDGPU::S_BITCMP1_B32
11133 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11134 : AMDGPU::S_BITCMP1_B64;
11135
11136 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11137 .add(*SrcOp)
11138 .addImm(BitNo);
11139 Def->eraseFromParent();
11140
11141 return true;
11142 };
11143
11144 switch (CmpInstr.getOpcode()) {
11145 default:
11146 break;
11147 case AMDGPU::S_CMP_EQ_U32:
11148 case AMDGPU::S_CMP_EQ_I32:
11149 case AMDGPU::S_CMPK_EQ_U32:
11150 case AMDGPU::S_CMPK_EQ_I32:
11151 return optimizeCmpAnd(1, 32, true, false) ||
11152 optimizeCmpSelect(/*NeedInversion=*/true);
11153 case AMDGPU::S_CMP_GE_U32:
11154 case AMDGPU::S_CMPK_GE_U32:
11155 return optimizeCmpAnd(1, 32, false, false);
11156 case AMDGPU::S_CMP_GE_I32:
11157 case AMDGPU::S_CMPK_GE_I32:
11158 return optimizeCmpAnd(1, 32, false, true);
11159 case AMDGPU::S_CMP_EQ_U64:
11160 return optimizeCmpAnd(1, 64, true, false);
11161 case AMDGPU::S_CMP_LG_U32:
11162 case AMDGPU::S_CMP_LG_I32:
11163 case AMDGPU::S_CMPK_LG_U32:
11164 case AMDGPU::S_CMPK_LG_I32:
11165 return optimizeCmpAnd(0, 32, true, false) ||
11166 optimizeCmpSelect(/*NeedInversion=*/false);
11167 case AMDGPU::S_CMP_GT_U32:
11168 case AMDGPU::S_CMPK_GT_U32:
11169 return optimizeCmpAnd(0, 32, false, false);
11170 case AMDGPU::S_CMP_GT_I32:
11171 case AMDGPU::S_CMPK_GT_I32:
11172 return optimizeCmpAnd(0, 32, false, true);
11173 case AMDGPU::S_CMP_LG_U64:
11174 return optimizeCmpAnd(0, 64, true, false) ||
11175 optimizeCmpSelect(/*NeedInversion=*/false);
11176 }
11177
11178 return false;
11179}
11180
11182 AMDGPU::OpName OpName) const {
11183 if (!ST.needsAlignedVGPRs())
11184 return;
11185
11186 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11187 if (OpNo < 0)
11188 return;
11189 MachineOperand &Op = MI.getOperand(OpNo);
11190 if (getOpSize(MI, OpNo) > 4)
11191 return;
11192
11193 // Add implicit aligned super-reg to force alignment on the data operand.
11194 const DebugLoc &DL = MI.getDebugLoc();
11195 MachineBasicBlock *BB = MI.getParent();
11197 Register DataReg = Op.getReg();
11198 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11199 Register Undef = MRI.createVirtualRegister(
11200 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11201 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11202 Register NewVR =
11203 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11204 : &AMDGPU::VReg_64_Align2RegClass);
11205 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11206 .addReg(DataReg, 0, Op.getSubReg())
11207 .addImm(AMDGPU::sub0)
11208 .addReg(Undef)
11209 .addImm(AMDGPU::sub1);
11210 Op.setReg(NewVR);
11211 Op.setSubReg(AMDGPU::sub0);
11212 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11213}
11214
11216 if (isIGLP(*MI))
11217 return false;
11218
11220}
11221
11223 if (!isWMMA(MI) && !isSWMMAC(MI))
11224 return false;
11225
11226 if (AMDGPU::isGFX1250(ST))
11227 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11228
11229 return true;
11230}
11231
11233 unsigned Opcode = MI.getOpcode();
11234
11235 if (AMDGPU::isGFX12Plus(ST))
11236 return isDOT(MI) || isXDLWMMA(MI);
11237
11238 if (!isMAI(MI) || isDGEMM(Opcode) ||
11239 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11240 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11241 return false;
11242
11243 if (!ST.hasGFX940Insts())
11244 return true;
11245
11246 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11247}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:144
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1571
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasAddNoCarry() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
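A minimal sketch of the in-place operand mutators above, assuming MI is a MachineInstr in scope whose operand at index 1 is a register use known to hold the constant 42 (the index and value are illustrative only):
  MachineOperand &MO = MI.getOperand(1);
  if (MO.isReg())
    MO.ChangeToImmediate(42); // rewrite the register use into an immediate operand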
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
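A minimal sketch of querying a selected node, assuming N is an SDNode* produced by instruction selection and that its operand 1 happens to be a ConstantSDNode (an assumption for illustration):
  if (N->isMachineOpcode()) {
    unsigned Opc = N->getMachineOpcode();       // post-isel MachineInstr opcode
    uint64_t Off = N->getConstantOperandVal(1); // constant operand, zero-extended
  }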
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const final
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const override
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst for 16-bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by the assembler.
static bool setsSCCifResultIsNonZero(const MachineInstr &MI)
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
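A minimal sketch, assuming TII is the const SIInstrInfo* and Offset is an int64_t byte offset being considered for a global FLAT access (both names are assumptions):
  if (TII->isLegalFLATOffset(Offset, AMDGPUAS::GLOBAL_ADDRESS,
                             SIInstrFlags::FlatGlobal)) {
    // Offset can be encoded directly in the instruction's immediate field.
  }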
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
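A minimal sketch, assuming TII is the const SIInstrInfo* and MI is an instruction whose opcode actually has a src0 operand:
  const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (Src0 && Src0->isImm()) {
    int64_t Val = Src0->getImm(); // immediate source value
    // ... use Val ...
  }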
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand for MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
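A minimal sketch, assuming ST is the GCNSubtarget: small integer values such as 64 are encodable as inline constants, so they need no literal slot and no constant-bus use.
  bool Inline = AMDGPU::isInlinableLiteral32(64, ST.hasInv2PiInlineImm()); // true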
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU-specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point inline constants.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:587
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:589
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:586
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:588
@ TI_CONSTDATA_START
Definition AMDGPU.h:585
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
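A minimal sketch of the range form, assuming MI is a MachineInstr in scope:
  bool AllRegs = all_of(MI.explicit_operands(),
                        [](const MachineOperand &MO) { return MO.isReg(); });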
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
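A minimal sketch using one of the BuildMI overloads, assuming MBB, I, DL, TII, DestReg, and SrcReg are in scope and the target is AMDGPU:
  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addReg(SrcReg, getKillRegState(true));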
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
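For example, the check is usable at compile time:
  static_assert(isInt<16>(32767) && !isInt<16>(32768),
                "32767 fits in a signed 16-bit field, 32768 does not");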
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
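A small illustration of the two bit utilities above:
  int SetBits  = popcount(0xF0u);    // 4 bits set
  int FirstSet = countr_zero(0xF0u); // lowest set bit is bit 4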
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
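For example, splitting a 64-bit immediate into the two 32-bit halves that a pair of moves would materialize:
  uint64_t Imm = 0x1234567890ABCDEFULL;
  uint32_t Lo = Lo_32(Imm); // 0x90ABCDEF
  uint32_t Hi = Hi_32(Imm); // 0x12345678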
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
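A small illustration combining maskTrailingOnes with SignExtend64 (documented above), the usual pattern when decoding a narrow signed offset field:
  uint32_t Low12    = maskTrailingOnes<uint32_t>(12); // 0xFFF
  int64_t  MinusOne = SignExtend64<16>(0xFFFFu);      // -1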
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store a worklist of machine instructions.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands reads the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.